def _basic_params(self): self.name = "extract" self.pdftotext = self.extra_args[0] self.force_update = False if len(self.extra_args) > 1: self.force_update = True jarLoad = classPathHacker() tikaPath = os.path.join(self.cwd, "lib", "tika-app-1.2.jar") if os.path.exists(tikaPath): jarLoad.addFile(tikaPath) from org.apache.tika import Tika self.tika = Tika()
def _basic_params(self): self.name = 'extract' self.pdftotext = self.extra_args[0] self.force_update = False if len(self.extra_args) > 1: self.force_update = True jarLoad = classPathHacker() tikaPath = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar') if os.path.exists(tikaPath): jarLoad.addFile(tikaPath) from org.apache.tika import Tika self.tika = Tika()
def _basic_params(self):
    self.name = 'extract'
    # first extra argument is the path to the pdftotext binary
    self.pdftotext = self.extra_args[0]
    if not os.path.exists(self.pdftotext):
        logging.error('pdftotext not found!')
        sys.exit(1)
    # any further argument forces re-extraction of existing text
    self.force_update = False
    if len(self.extra_args) > 1:
        self.force_update = True
    # put the bundled Tika jar on the classpath (Jython) and keep a
    # Tika facade around for text extraction
    jarLoad = classPathHacker()
    tikaPath = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
    if os.path.exists(tikaPath):
        jarLoad.addFile(tikaPath)
        from org.apache.tika import Tika
        self.tika = Tika()
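
# --- illustrative only ---
# A minimal sketch of how the Tika facade initialized in _basic_params
# might be used under Jython to pull plain text from one PDF. This helper
# is not part of the original module; its name and signature are
# assumptions.
def _tika_extract_sketch(self, pdf_path, txt_path):
    import codecs
    from java.io import File
    # Tika's facade parses a java.io.File straight to a unicode string
    text = self.tika.parseToString(File(pdf_path))
    with codecs.open(txt_path, 'w', encoding='utf8') as out:
        out.write(text)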
def run_geoparser(self):
    import __builtin__

    # put the SQLite JDBC driver on the classpath so geodict can query
    # its gazetteer database from Jython
    jarLoad = classPathHacker()
    sqlitePath = os.path.join(self.cwd, 'lib', 'geodict', 'sqlite-jdbc-3.7.2.jar')
    jarLoad.addFile(sqlitePath)
    import lib.geodict.geodict_config
    self.database_path = os.path.join(self.cwd, 'lib', 'geodict', 'geodict.db')
    from lib.geodict.geodict_lib import GeodictParser

    geo_parsed = {}
    places_by_entityURI = {}

    # cache of GeoNames lookups, persisted across runs
    self.cache_filename = os.path.join(self.out_dir, 'geoparser.cache')
    if os.path.exists(self.cache_filename):
        self.cache = json.load(open(self.cache_filename))
    else:
        self.cache = {}

    for filename in self.files:
        logging.info('processing ' + filename)
        self.update_progress()

        file_geoparsed = filename.replace('.txt', '_geoparse.json')
        contexts_json = filename.replace('.txt', '_contexts.json')

        # default object, so the bookkeeping below cannot hit a NameError
        # on dry runs or when a cached result fails to load
        geoparse_obj = {'places_by_entityURI': {}, 'references': {}}

        # reuse an earlier geoparse result if it is intact
        if os.path.exists(file_geoparsed):
            try:
                geoparse_obj = json.load(open(file_geoparsed))
                if 'places_by_entityURI' in geoparse_obj:
                    if not os.path.exists(contexts_json):
                        self.contexts_from_geoparse_obj(geoparse_obj, filename)
                    continue
                else:
                    os.remove(file_geoparsed)
            except:
                logging.error('File ' + file_geoparsed + ' could not be read.')
                logging.error(traceback.format_exc())

        if not self.dry_run:
            geoparse_obj = {'places_by_entityURI': {}, 'references': {}}
            try:
                itemid = self.metadata[filename]['itemID']

                # prepend the metadata place string so its matches can be
                # distinguished (by offset) from matches in the text body
                str_to_parse = self.metadata[filename]['place']
                last_index = len(str_to_parse)
                str_to_parse += codecs.open(filename, 'rU', encoding='utf8').read()

                city = None
                places = set()

                # run geodict once per file and cache its raw output
                json_filename = filename.replace('.txt', '_geodict.json')
                if not os.path.exists(json_filename):
                    parser = GeodictParser(self.database_path)
                    places_found = list(self.get_places(str_to_parse, parser.find_locations_in_text))
                    with codecs.open(json_filename, 'w', encoding='utf8') as json_file:
                        json.dump(places_found, json_file)
                else:
                    with codecs.open(json_filename, 'r', encoding='utf8') as json_file:
                        places_found = json.load(json_file)

                for (place, reference) in places_found:
                    entityURI = place['entityURI']
                    geoparse_obj['places_by_entityURI'][entityURI] = {
                        'name': place['name'],
                        'type': place['type'],
                        'coordinates': [place['longitude'], place['latitude']],
                    }
                    if reference[0] < last_index:
                        # the match came from the metadata place string
                        city = entityURI
                    else:
                        places.add(entityURI)
                        if entityURI not in geoparse_obj['references']:
                            geoparse_obj['references'][entityURI] = []
                        # store offsets relative to the start of the text body
                        geoparse_obj['references'][entityURI].append(
                            (reference[0] - last_index, reference[1] - last_index))

                # fall back to a GeoNames search when geodict found no city
                if city is None and self.metadata[filename]['place'] != '':
                    try:
                        query_str = self.metadata[filename]['place']
                        place_dict = geoparse_obj['places_by_entityURI']
                        if query_str in self.cache:
                            result = self.cache.get(query_str)
                            if result is not None:
                                place_dict[result['entityURI']] = {
                                    'name': result['name'],
                                    'type': result['fcodeName'],
                                    'coordinates': [result['lng'], result['lat']],
                                }
                                places.add(result['entityURI'])
                                city = result['entityURI']
                        else:
                            search_for = {'q': query_str}
                            query_url = 'http://ws.geonames.org/searchJSON?%s' % urllib.urlencode(search_for)
                            result_obj = json.load(urllib2.urlopen(query_url))
                            result_places = result_obj.get('geonames', [])
                            if len(result_places) > 0:
                                result_place = result_places[0]
                                self.cache[query_str] = result_place
                                result = self.cache[query_str]
                                # derive a stable URI from the numeric GeoNames id
                                uri = 'http://sws.geonames.org/' + str(result_place.get('geonameId'))
                                result['entityURI'] = uri
                                place_dict[uri] = {
                                    'name': result['name'],
                                    'type': result['fcodeName'],
                                    'coordinates': [result['lng'], result['lat']],
                                }
                                places.add(uri)
                                city = uri
                            else:
                                # remember misses so we do not re-query GeoNames
                                self.cache[query_str] = None
                            # persist the cache after every fresh lookup
                            with open(self.cache_filename, 'w') as cache_f:
                                json.dump(self.cache, cache_f)
                    except:
                        logging.error('No city found for %s' % itemid)
                        logging.error(traceback.format_exc())

                geoparse_obj['places'] = list(places)
                geoparse_obj['city'] = city
                with open(file_geoparsed, 'w') as f:
                    json.dump(geoparse_obj, f)
                if not os.path.exists(contexts_json):
                    self.contexts_from_geoparse_obj(geoparse_obj, filename)
                # stay polite to the GeoNames web service
                time.sleep(0.2)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                logging.error(traceback.format_exc())

        geo_parsed[filename] = geoparse_obj.get('places', [])
        self.metadata[filename]['city'] = geoparse_obj.get('city')
        for (entityURI, data) in geoparse_obj.get('places_by_entityURI', {}).iteritems():
            places_by_entityURI[entityURI] = data

    # aggregate mention counts per place, keyed by publication year
    places = {}
    for (filename, entityURIs) in geo_parsed.iteritems():
        year = self.metadata[filename]['year']
        for entityURI in entityURIs:
            if entityURI in places_by_entityURI:
                entity = places_by_entityURI[entityURI]
                if entityURI not in places:
                    places[entityURI] = {
                        'name': entity['name'],
                        'type': entity['type'],
                        'coordinates': entity['coordinates'],
                        'weight': {year: 1},
                    }
                elif year not in places[entityURI]['weight']:
                    places[entityURI]['weight'][year] = 1
                else:
                    places[entityURI]['weight'][year] += 1

    self.geo_parsed = geo_parsed
    self.places = places
    self.places_by_entityURI = places_by_entityURI
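
# For reference, the aggregation at the end of run_geoparser yields entries
# shaped like this (sample values only, not real output; year keys appear
# exactly as stored in the metadata):
#
# self.places = {
#     'http://sws.geonames.org/5128581': {
#         'name': 'New York City',
#         'type': 'city',
#         'coordinates': [-74.006, 40.714],   # [longitude, latitude]
#         'weight': {2010: 3, 2011: 1},       # mention counts per year
#     }
# }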