def tikaGeoExtract(esHit, geoField):
    # Pull the text to geotag from the Elasticsearch hit, then send it to the
    # Tika server's /rmeta endpoint as application/geotopic.
    text = getField(geoField, esHit['_source'])
    if text is None:
        return None
    res = callServer('put', ServerEndpoint, '/rmeta', text,
                     {'Accept': 'application/json',
                      'Content-Type': 'application/geotopic'},
                     False)
    if res[0] != 200:
        return None
    jsonParse = json.loads(res[1])
    return jsonParse[0]
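# A minimal usage sketch for the function above. ServerEndpoint and getField
# are assumed to come from elsewhere in this module, the hit layout and the
# 'description' field name are purely illustrative, and the metadata key names
# follow the GeoTopicParser wiki (they may differ in your Tika setup):
hit = {'_source': {'description': 'Flooding was reported near Lagos and Abuja.'}}
geo = tikaGeoExtract(hit, 'description')
if geo is not None:
    print(geo.get('Geographic_NAME'),
          geo.get('Geographic_LATITUDE'),
          geo.get('Geographic_LONGITUDE'))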
def parse(self, option: str, url_or_path: str, server_endpoint: str = None,
          verbose: int = 0, tika_server_jar: str = None,
          response_mime_type: str = 'application/json',
          services: dict = None, raw_response: bool = False,
          extra_headers: Dict[str, str] = None) -> Dict:
    """
    Called from parse_file_on_server to parse a file by calling Tika as a server.

    :param option: command line option to send to Tika's server
    :param url_or_path: local path (or URL) to the file being parsed
    :param server_endpoint: Tika server's URL
    :param verbose: make Tika produce verbose logs
    :param tika_server_jar: path to Tika's JAR file
    :param response_mime_type: response format (application/json) for plain text + metadata in JSON format
    :param services: mapping of option names ('meta', 'text', 'all') to Tika endpoints
    :param raw_response: get raw response from Tika (text + metadata + warnings), False by default
    :param extra_headers: extra request headers
    :return: dictionary with "content" (text) and "metadata" (another dictionary) keys
    """
    services = services if services else \
        {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
    server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

    path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
    service = services.get(option, services['all'])
    if service == '/tika':
        response_mime_type = 'text/plain'
    content_path = self.make_content_disposition_header(path)

    headers = {
        'Accept': response_mime_type,
        'Content-Disposition': content_path
    }
    if extra_headers:
        headers = {**headers, **extra_headers}

    status, response = callServer('put', server_endpoint, service,
                                  open(path, 'rb'), headers, verbose,
                                  tika_server_jar, rawResponse=raw_response)
    if file_type == 'remote':
        os.unlink(path)
    return _parse((status, response))
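# A minimal usage sketch for the parse() method above. The owning class is
# called TikaParser here purely for illustration (the real class name is not
# shown in this snippet), and a Tika server must be reachable at the
# configured endpoint:
parser = TikaParser()
result = parser.parse('all', '/tmp/report.pdf')
print(result['metadata'].get('Content-Type'))
print(result['content'][:200])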
def __call(self, params={}):
    (status, response) = callServer('put', TIKA_SERVER, '/rmeta',
                                    open(self.file), params)
    if status != 200:
        raise Exception("Tika Parse Exception")
    d = self.modules["json"].loads(response)[0]
    # /rmeta returns a list of metadata dicts; the extracted text lives under
    # the 'X-TIKA:content' key, everything else is metadata.
    if 'X-TIKA:content' in d:
        content = d.pop('X-TIKA:content')
    else:
        content = ''
    return {'metadata': d, 'content': content}
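# For reference, one element of the /rmeta JSON array looks roughly like the
# sketch below; the exact key set depends on the document type and the Tika
# version, so this is an illustrative assumption rather than a schema:
#
# {
#     "Content-Type": "application/pdf",
#     "X-TIKA:content": "the extracted plain text ...",
#     "dc:title": "Example title"
# }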
def find_location(request, file_name):
    '''
    Find location name from extracted text using Geograpy.
    '''
    if "none" in IndexStatus("locations", file_name):
        text_content = QueryText(file_name)
        if text_content:
            parsed = callServer('put', TIKA_SERVER, '/rmeta', text_content,
                                {'Accept': 'application/json',
                                 'Content-Type': 'application/geotopic'},
                                False)
            # decode the JSON response from Tika (assumes json is imported)
            points = parse_lat_lon(json.loads(parsed[1])[0])
            status = IndexLocationName(file_name, points)
            if status[0]:
                return HttpResponse(status=200,
                                    content="Location(s) found and indexed to Solr.")
            else:
                return HttpResponse(status=400, content=status[1])
        else:
            return HttpResponse(status=400, content="Cannot find location.")
    else:
        return HttpResponse(status=200, content="Loading...")
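# A rough sketch of how this view might be wired up in a Django 1.x urls.py;
# the URL pattern and module path are assumptions for illustration only:
#
# from django.conf.urls import url
# from . import views
#
# urlpatterns = [
#     url(r'^find_location/(?P<file_name>[^/]+)/$', views.find_location),
# ]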
def query_crawled_index(request, domain_name, indexed_path):
    '''
    Query crawled data that has been indexed into Solr or Elasticsearch and
    return location names.
    '''
    if "solr" in indexed_path.lower():
        # Query the admin core to get core information for the
        # (domain_name, indexed_path) combination.
        core_name, username, passwd = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2 run GeotopicParser on each doc, one at a time
            # 3 keep appending results
            # 4 save them in the local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except:
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers,
                                 auth=HTTPBasicAuth(username, passwd))
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)
                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(numFound)
                # gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound), QUERY_RANGE):
                    # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name,
                                                 indexed_path, numFound)
                        if khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ:
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1}".format(row, khooshe_gen_freq_l)
                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers,
                                     auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:
                        # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                # Flatten every field value of the Solr doc into
                                # one ASCII string for the geotopic parser.
                                for v in t.values():
                                    if hasattr(v, '__iter__'):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif isinstance(v, unicode):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += ' ' + a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())
                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer('put', TIKA_SERVER, '/rmeta',
                                                text_content,
                                                {'Accept': 'application/json',
                                                 'Content-Type': 'application/geotopic'},
                                                False)
                            # decode the JSON response (assumes json is imported)
                            location_names = parse_lat_lon(json.loads(parsed[1])[0])
                            for key, values in location_names.iteritems():
                                try:
                                    # TODO - ADD META DATA
                                    points.append({'loc_name': smart_str(key),
                                                   'position': {'x': smart_str(values[0]),
                                                                'y': smart_str(values[1])}})
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                                    pass
                            print "Found {0} coordinates..".format(len(points))
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()
                            pass
                        docs[str(t['id'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                return HttpResponse(status=200,
                                    content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(status=500,
                                    content="Error while geo parsing crawled data.")
    else:
        return HttpResponse(status=500,
                            content="Only solr indexes supported for now")
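# For reference, the mapping handed to IndexCrawledPoints above is keyed by
# document id and looks roughly like this (names and coordinate values are
# illustrative):
#
# {
#     '<doc id>': [
#         {'loc_name': 'Lagos', 'position': {'x': '6.45', 'y': '3.39'}},
#     ],
# }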
def extract_geo_from_doc(doc, tika):
    """
    Uses Tika to extract geo data and returns an updated copy of the document.

    This script utilizes GeoTopicParser:
    https://wiki.apache.org/tika/GeoTopicParser
    Make sure to follow the set up steps at the link above to set up
    location-ner-model and geotopic-mime.

    This script queries documents from the Solr source core and extracts geo
    data from the document's 'content' field. GeoTopicParser returns a set of
    location names and coordinates. This information is added to each document
    in the fields 'location_names' and 'location_coordinates' respectively,
    and the new document is indexed to the destination core.

    Make sure you have:
    - a lucene-geo-gazetteer server running.
    - a Tika server running with location-ner-model and geotopic-mime in the
      classpath. Set the Tika server url below.
    - a Solr server running. Set the source core url (the core of documents
      you want to process) and the destination core url (where the enriched
      documents will go).

    This script assumes Tika is set up with GeoTopicParser. If any of the
    above is not satisfied, documents will still be indexed into the
    destination core, but will not have any new data. If you do not configure
    your own Tika server, tika-python will attempt to start its own server,
    which will not have the necessary items in the classpath. Additionally,
    the gazetteer must be running for the Geo extractions to be performed.

    :param doc: A document from solr, as a dict
    :return: Updated document, as a dict, or None if error
    """
    try:
        names = []
        coords = []
        if 'content' in doc:
            content = doc['content']
            if type(content) is list:
                for c in content:
                    res = callServer('put', tika, '/rmeta', c,
                                     {'Accept': 'application/json',
                                      'Content-Type': 'application/geotopic'},
                                     False)
                    if res[0] == 200:
                        parsed = res[1]
                        parsed_json = json.loads(parsed)
                        for item in parsed_json:
                            # Group long/lat/name by index
                            geo_groups = {}
                            for key in item.keys():
                                reg = re.findall(r'Optional_([a-zA-Z]+)(\d+)', key)
                                if reg:
                                    attr = str(reg[0][0]).lower()
                                    n = str(reg[0][1])
                                    if n not in geo_groups.keys():
                                        geo_groups[n] = {}
                                    geo_groups[n][attr] = item[key]
                            for key, value in geo_groups.iteritems():
                                geokeys = value.keys()
                                if 'name' in geokeys:
                                    names.append(value['name'])
                                    lat = ""
                                    longd = ""
                                    if 'latitude' in geokeys:
                                        lat = str(value['latitude'])
                                    if 'longitude' in geokeys:
                                        longd = str(value['longitude'])
                                    coords.append(lat + ',' + longd)
        # now we have all names grouped, all coordinates grouped
        enriched = copy.deepcopy(doc)
        enriched['location_name'] = names
        enriched['location_coordinates'] = coords
        return enriched
    except Exception as e:
        print(e.message)
        return None
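# A minimal usage sketch for the function above. The Tika URL and the document
# layout are assumptions for illustration; a real run also needs the
# lucene-geo-gazetteer and the GeoTopicParser models described in the
# docstring:
doc = {'id': 'doc-1', 'content': ['Heavy rain flooded parts of Chennai and Mumbai.']}
enriched = extract_geo_from_doc(doc, 'http://localhost:9998')
if enriched:
    print(enriched['location_name'])
    print(enriched['location_coordinates'])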
def query_crawled_index(request, domain_name, indexed_path, username, passwd):
    '''
    Query crawled data that has been indexed into Solr or Elasticsearch and
    return location names.
    '''
    if "solr" in indexed_path.lower():
        # Query the admin core to get core information for the
        # (domain_name, indexed_path) combination.
        core_name = get_index_core(domain_name, indexed_path)
        print core_name
        if create_core(core_name):
            # 1 query solr QUERY_RANGE records at a time
            # 2 run GeotopicParser on each doc, one at a time
            # 3 keep appending results
            # 4 save them in the local solr instance
            rows_processed = 0
            try:
                rows_processed = GetIndexSize(core_name)
            except:
                pass
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                r = requests.get(url, headers=headers,
                                 auth=HTTPBasicAuth(username, passwd))
                if r.status_code != 200:
                    return HttpResponse(status=r.status_code, content=r.reason)
                response = r.json()
                numFound = response['response']['numFound']
                print "Total number of records to be geotagged {0}".format(numFound)
                # gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                khooshe_gen_freq_l = rows_processed
                for row in range(rows_processed, int(numFound), QUERY_RANGE):
                    # loop solr query
                    if row <= khooshe_gen_freq_l <= (row + QUERY_RANGE):
                        print "Generating khooshe tiles.."
                        gen_khooshe_update_admin(core_name, domain_name,
                                                 indexed_path, numFound)
                        if khooshe_gen_freq_l >= KHOOSHE_GEN_FREQ:
                            khooshe_gen_freq_l += KHOOSHE_GEN_FREQ
                        else:
                            khooshe_gen_freq_l = (row + QUERY_RANGE) * 2
                    else:
                        print "Skip generating khooshe tiles.. row - {0}, next scheduled - {1}".format(row, khooshe_gen_freq_l)
                    docs = {}
                    url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, QUERY_RANGE)
                    print "solr query - {0}".format(url)
                    r = requests.get(url, headers=headers,
                                     auth=HTTPBasicAuth(username, passwd))
                    response = r.json()
                    text = response['response']['docs']
                    docCount = 0
                    for t in text:
                        # loop tika server starts
                        points = []
                        try:
                            docCount += 1
                            text_content = ''
                            try:
                                # Flatten every field value of the Solr doc into
                                # one ASCII string for the geotopic parser.
                                for v in t.values():
                                    if hasattr(v, '__iter__'):
                                        a = u' '.join(unicode(e) for e in v)
                                    elif isinstance(v, unicode):
                                        a = v.encode('ascii', 'ignore')
                                    else:
                                        a = str(v)
                                    text_content += a.encode('ascii', 'ignore')
                            except Exception as e:
                                print traceback.format_exc()
                                text_content = str(t.values())
                            # simplify text
                            text_content = ' '.join(text_content.split())
                            parsed = callServer('put', TIKA_SERVER, '/rmeta',
                                                text_content,
                                                {'Accept': 'application/json',
                                                 'Content-Type': 'application/geotopic'},
                                                False)
                            # decode the JSON response (assumes json is imported)
                            location_names = parse_lat_lon(json.loads(parsed[1])[0])
                            for key, values in location_names.iteritems():
                                try:
                                    # TODO - ADD META DATA
                                    points.append({'loc_name': smart_str(key),
                                                   'position': {'x': smart_str(values[0]),
                                                                'y': smart_str(values[1])}})
                                except Exception as e:
                                    print "Error while transforming points"
                                    print e
                                    pass
                            print "Found {0} coordinates..".format(len(points))
                            # print docs
                        except Exception as e:
                            print traceback.format_exc()
                            pass
                        docs[str(t['doi'])] = points
                        # loop tika server ends
                    status = IndexCrawledPoints(core_name, docs)
                    print status
                # loop solr query ends
                gen_khooshe_update_admin(core_name, domain_name, indexed_path, numFound)
                return HttpResponse(status=200,
                                    content="Crawled data geo parsed successfully.")
            except Exception as e:
                print traceback.format_exc()
                print e
                return HttpResponse(status=500,
                                    content="Error while geo parsing crawled data.")
    else:
        return HttpResponse(status=500,
                            content="Only solr indexes supported for now")