def generate_map_from_pdf(job):
    print("starting generate_map_from_pdf with", job)

    # unpack the job dictionary: key, file, and maybe filepath
    key = job['key']
    filename = job['filename']
    countries = job['countries'] if "countries" in job else []
    directory = "/home/usrfd/maps/" + key + "/"
    file_obj = job['file']

    if 'filepath' not in job:
        # make a directory to store the uploaded PDF and the maps
        mkdir(directory)
        filepath = directory + filename
        print("filepath =", filepath)

        # save the file to disk chunk by chunk
        with open(filepath, 'wb+') as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)
        print("wrote file")

    resolve_locations(
        location_extractor.extract_locations_with_context(file_obj),
        order_id=job['order_id'],
        max_seconds=10,
        countries=countries)
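# Illustrative sketch, not called anywhere in this module: a minimal job dict
# for generate_map_from_pdf. The key names come from the lookups in the
# function above; the values are made up, and job['file'] is assumed to be a
# Django-style UploadedFile (anything exposing a .chunks() iterator).
def example_pdf_job(uploaded_file):
    return {
        "key": "a1b2c3",            # used to name /home/usrfd/maps/<key>/
        "filename": "report.pdf",   # saved inside that directory
        "countries": ["Swaziland"], # optional filter; defaults to []
        "order_id": 42,             # passed through to resolve_locations
        "file": uploaded_file,      # file-like object with .chunks()
    }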
def generate_map_from_text(job):
    try:
        print("starting generate_map_from_text with", job)
        key = job['key']
        max_seconds = int(job['max_seconds']) if 'max_seconds' in job else 10
        text = job['text']
        countries = job['countries'] if "countries" in job else []

        # This is basically a hack: when someone pastes in text, assume any
        # capitalized phrase could be a place name (see the demo below).
        # TODO: is there something we can do here for Arabic?
        names = [name for name in set(findall("(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}", text)) if len(name) > 3]
        print("names are", names)

        number_of_names = len(names)
        print("number_of_names:", number_of_names)
        if number_of_names < 100:
            location_extractor.load_non_locations()
            names = [name for name in names if name not in location_extractor.nonlocations]

        resolve_locations(
            location_extractor.extract_locations_with_context(text, names),
            order_id=job['order_id'],
            max_seconds=max_seconds,
            countries=countries)
        finish_order(key)
    except Exception as e:
        print(e)
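# A standalone demonstration of the capitalized-phrase regex used above, since
# its behavior isn't obvious at a glance. Documentation sketch only; the
# sample sentence is made up.
def demo_capitalized_name_regex():
    from re import findall
    pattern = "(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}"
    sample = "Flooding hit Rio de Janeiro and Sao Paulo while The Mayor spoke."
    print(findall(pattern, sample))
    # ['Flooding', 'Rio de Janeiro', 'Sao Paulo', 'The Mayor']
    # Note the false positives ('Flooding', 'The Mayor'); that is why the
    # pipeline filters candidates against location_extractor.nonlocations.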
def create_map_from_docx(job):
    print("starting create_map_from_docx with", job)
    directory = "/home/usrfd/maps/" + job['key'] + "/"
    filename = job['filename']

    if 'filepath' not in job:
        file_obj = job['file']
        # make a directory to store the uploaded .docx file and the maps
        mkdir(directory)
        filepath = directory + filename
        # save the file to disk chunk by chunk
        with open(filepath, 'wb+') as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)
        print("wrote file")
    else:
        filepath = job['filepath']

    # use filepath, which is set in both branches (job['file'] may be absent)
    document = Document(filepath)
    print("parsed document")
    text = "\r\n\r\n".join(paragraph.text for paragraph in document.paragraphs)
    print("text is", text[:500])

    # convert each table to a list of lists of column values
    for table in document.tables:
        columns = []
        for column in table.columns:
            values = [cell.text for cell in column.cells]
            columns.append(values)
        print("columns are", columns)

    locations = location_extractor.extract_locations_with_context(text)
    print("in views, locations are", len(locations))
    features = resolve_locations(locations, max_seconds=10)
    print("in views, features are", len(features))

    featureCollection = FeatureCollection(features)
    serialized = geojson.dumps(featureCollection, sort_keys=True)

    # write the serialized FeatureCollection next to the uploaded file
    path_to_geojson = directory + job['key'] + ".geojson"
    with open(path_to_geojson, "w") as f:
        f.write(serialized)
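# A minimal sketch of the geojson serialization used above, assuming the
# `geojson` package's Feature/Point/FeatureCollection types. Standalone and
# unused by the pipeline; the name, count, and coordinates are made up.
def demo_feature_collection():
    import geojson
    feature = geojson.Feature(
        geometry=geojson.Point((31.14, -26.32)),  # (lon, lat) order
        properties={"name": "Mbabane", "count": 1})
    feature_collection = geojson.FeatureCollection([feature])
    return geojson.dumps(feature_collection, sort_keys=True)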
def extract_locations_from_webpage(url, html=None, max_seconds=5, debug_level=0):
    try:
        print("starting extract_locations_from_webpage with debug_level", debug_level)
        if debug_level > 0:
            print("\turl:", url)

        # NOTE: filename is computed for parity with the other functions but not used here
        filename = url.replace("/", "_").replace("\\", "_").replace("'", "_").replace('"', "_").replace(".", "_").replace(":", "_").replace("__", "_")

        if not html:
            headers = {"User-Agent": getRandomUserAgentString()}
            html = get(url, headers=headers).text
            if debug_level > 0:
                print("\tgot html")

        # try newspaper's article extraction first
        article = Article(url)
        article.download()
        article.parse()
        if debug_level > 0:
            print("\tparsed article")

        if len(article.text) > 500:
            text = article.text
            print("got text using newspaper")
        else:
            # fall back to stripping the raw html
            text = bnlp_clean(html)
        if debug_level > 0:
            print("\tgot text")
            print("[extract_locations_from_webpage] text:", type(text))

        locations = extract_locations_with_context_from_html_tables(html)
        if debug_level > 0:
            print("[extract_locations_from_webpage] locations:", locations)
        names_from_tables = [location['name'] for location in locations]
        if debug_level > 0:
            print("[extract_locations_from_webpage] names_from_tables:", names_from_tables)

        locations_extracted_from_context = extract_locations_with_context(text, ignore_these_names=names_from_tables, debug=False, max_seconds=max_seconds - 2)
        print("locations_extracted_from_context:", locations_extracted_from_context)

        # only add locations whose names weren't already found in the tables
        for location in locations_extracted_from_context:
            name = location['name']
            skip = False
            for l in locations:
                if name == l['name']:
                    skip = True
            if not skip:
                locations.append(location)

        if not locations:
            print("no features, so try with selenium")
            text = getTextContentViaMarionette(url)
            locations = extract_locations_with_context(text)

        return locations
    except Exception as e:
        print("Caught Exception in extract_locations_from_webpage:", e)
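# The O(n*m) "skip if name already present" loops above (and the similar one in
# generate_map_from_urls_to_webpages below) could share one helper. This is
# just a sketch of that refactor, not wired in anywhere.
def merge_locations_by_name(primary, secondary):
    """Append locations from secondary whose names don't already appear in primary."""
    seen = {location['name'] for location in primary}
    merged = list(primary)
    for location in secondary:
        if location['name'] not in seen:
            seen.add(location['name'])
            merged.append(location)
    return merged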
def extract_locations_from_text(text, case_insensitive=None, debug=False):
    try:
        print("starting extract_locations_from_text")
        if debug:
            print(" [extractor] case_insensitive:", case_insensitive)

        pattern = "(?:[A-Z][a-z\u00ed]{1,15} )*(?:de )?[A-Z][a-z\u00ed]{1,15}"
        flags = IGNORECASE if case_insensitive else 0
        # NOTE: str.lstrip("de ") strips the *characters* 'd', 'e', and ' ',
        # not the literal prefix "de ", so remove the prefix explicitly
        # (see the demo below)
        names = [
            name[3:] if name.startswith("de ") else name
            for name in set(findall(pattern, text, flags))
            if len(name) > 3
        ]
        try:
            print(" [extractor] names from pattern:", [names])
        except Exception:
            # printing can raise UnicodeEncodeError on some terminals
            pass

        location_extractor.load_non_locations()
        nonlocations = [location.lower() for location in location_extractor.nonlocations]
        names = [name for name in names if name.lower() not in nonlocations]
        try:
            print(" names after filtering out nonlocations:", names)
        except Exception:
            pass

        location_extractor.load_language_into_dictionary_of_keywords("English")
        for possible_abbreviation in set(findall("[A-Z]{2}", text)):
            print("possible_abbreviation:", possible_abbreviation)
            print("keys:", list(location_extractor.dictionary_of_keywords['English']['abbreviations'].keys()))
            #name = location_extractor.dictionary_of_keywords['English']['abbreviations'].get(possible_abbreviation, None)
            #if name:
            #    names.append({ "abbreviation": possible_abbreviation, "name": name })
            #if possible_abbreviation in location_extractor.dictionary_of_keywords['English']['abbreviations']:
            #    names.append(possible_abbreviation)

        # sometimes tweets are grammatically incorrect, so also treat individual
        # words (and, for short texts, bigrams) as candidate names
        text_length = len(text)
        if text_length < 1e5:
            splat = text.split()
            names.extend([word.strip().strip(",").strip(";").strip(".") for word in splat])
            if text_length < 500:
                names.extend([ngram.strip().strip(",").strip(";").strip(".") for ngram in find_ngrams(splat, 2)])

        # filter out nonlocations again and remove duplicates
        names = list(set(name for name in names if name.lower() not in nonlocations and len(name) > 3))
        try:
            print("names are:", names)
        except Exception:
            pass

        results = location_extractor.extract_locations_with_context(
            text,
            names,
            debug=debug,
            return_abbreviations=True,
            case_insensitive=case_insensitive)
        try:
            print("results:", results)
        except Exception:
            pass
        return results
    except Exception as e:
        print("EXCEPTION in extractor:", e)
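# Why the prefix removal above avoids str.lstrip: with case-insensitive
# matching, findall can return names like "denver", and lstrip("de ") strips
# any leading run of the characters 'd', 'e', ' ' rather than the literal
# prefix "de ". A standalone demonstration of the pitfall:
def demo_lstrip_pitfall():
    print("denver".lstrip("de "))       # 'nver'    -- characters stripped, wrong
    print("de Janeiro".lstrip("de "))   # 'Janeiro' -- happens to look right
    name = "denver"
    print(name[3:] if name.startswith("de ") else name)  # 'denver' -- prefix check, right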
def generate_map_from_urls_to_webpages(job):
    try:
        print("starting generate_map_from_urls_to_webpages with", job['key'])
        key = job['key']
        countries = job['countries'] if 'countries' in job else []
        admin1limits = job['admin1limits'] if 'admin1limits' in job else []
        print("in gen, countries:", countries)
        print("in gen, admin1limits:", admin1limits)

        # make a directory to store the saved webpages and the maps
        directory = "/home/usrfd/maps/" + key + "/"
        mkdir(directory)

        print("urls:", job['urls'])
        locations = []
        all_text = ""
        filenames_and_urls = []
        for url in job['urls']:
            url = url.strip().strip('"').strip("'")
            if url:
                # we want to respect Google, so we avoid adding an automated click-through
                # by extracting the target url and getting it directly (see the demo below)
                if url.startswith("https://www.google.com/url?"):
                    url = unquote(search("(?<=&url=)[^&]{10,}", url).group(0))
                if not url.startswith("http"):
                    print("we assume that the user didn't include the protocol")
                    url = "http://" + url
                filename = url.replace("/", "_").replace("\\", "_").replace("'", "_").replace('"', "_").replace(".", "_").replace(":", "_").replace("__", "_")
                filenames_and_urls.append({"url": url, "filename": filename})

        for filename_and_url in filenames_and_urls:
            filename = filename_and_url['filename']
            url = filename_and_url['url']
            headers = {"User-Agent": getRandomUserAgentString()}
            text = get(url, headers=headers).text
            soup = soupify(text)

            # site-specific parsers; fall back to the raw page text
            if match(r"https?://(www\.)?bbc\.com", url):
                try:
                    selected_text = soup.select(".story-body")[0].text
                    if len(selected_text) > 100:
                        text = selected_text
                        print("got text using bbc parser:", len(text))
                except Exception as e:
                    print(e)
            elif match(r"https?://(www\.)?dw\.com", url):
                try:
                    selected_text = soup.select("#bodyContent .intro")[0].text + soup.select("#bodyContent .group")[0].text
                    if len(selected_text) > 500:
                        text = selected_text
                        print("got text using dw parser:", len(text))
                except Exception as e:
                    print(e)
            elif match(r"https?://(www\.)?observer\.org\.sz", url):
                try:
                    selected_text = soup.select("#article_holder")[0].text
                    if len(selected_text) > 500:
                        text = selected_text
                        print("got text using observer parser:", len(text))
                except Exception as e:
                    print(e)
            # TODO: for cnn, we will need a custom parser that gets the text
            # and then parses out the headline and javascript
            elif match(r"https?://(www\.)?reuters\.com/article/", url):
                try:
                    selected_text = soup.select("#article-text")[0].text
                    if len(selected_text) > 100:
                        text = selected_text
                        print("got text using reuters parser:", len(text))
                except Exception as e:
                    print(e)
            elif match(r"https?://(www\.)?nytimes\.com/", url):
                try:
                    selected_text = soup.select("article#story")[0].text
                    if len(selected_text) > 100:
                        text = selected_text
                        print("got text using NYTimes parser:", len(text))
                except Exception as e:
                    print(e)

            text = bnlp_clean(text)
            with open(directory + filename, "wb") as f:
                f.write(text.encode('utf-8'))
            all_text += text
            print("all_text:", type(all_text))

            names = [name for name in set(findall("(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}", text)) if len(name) > 3]
            print("names are", names)
            number_of_names = len(names)
            print("number_of_names:", number_of_names)

            location_extractor.load_language_into_dictionary_of_keywords("English")
            # NOTE: abbreviations is loaded but not yet used
            abbreviations = location_extractor.dictionary_of_keywords['English']['abbreviations']

            tables = soup.select("table")
            for table in tables:
                rows = table.select("tr")
                if len(rows) > 10:
                    print("more than 10 rows in table!")
                    header = [th.text for th in table.select("thead tr th")]

                    # find the location and admin1 (state) columns
                    location_column_index = None
                    admin1_column_index = None
                    for column_index, head in enumerate(header):
                        head = head.strip().lower()
                        print("head:", [head])
                        if head == "city":
                            location_column_index = column_index
                        elif head == "state":
                            admin1_column_index = column_index
                    print("location_column_index:", location_column_index)
                    print("admin1_column_index:", admin1_column_index)

                    # skip tables where we couldn't identify both columns
                    if location_column_index is not None and admin1_column_index is not None:
                        for row in table.select("tbody tr"):
                            tds = [td.text for td in row.select("td")]
                            print("tds:", tds)
                            location = tds[location_column_index].strip()
                            admin1 = tds[admin1_column_index].strip()
                            # just try to directly resolve this location and, if we can't, pass it to the ai
                            locations.append({"name": location, "count": 1, "admin1code": admin1})
        print("locations from html table:", locations)

        # don't add a location if we already found it via a table
        for location in location_extractor.extract_locations_with_context(all_text):
            name = location['name']
            skip = False
            for l in locations:
                if name == l['name']:
                    skip = True
            if not skip:
                locations.append(location)

        if not resolve_locations(locations, order_id=job['order_id'], max_seconds=10, countries=countries, admin1codes=admin1limits):
            print("no features, so try with selenium")
            all_text = ""
            for filename_and_url in filenames_and_urls:
                text = getTextContentViaMarionette(filename_and_url['url'])
                with open(directory + filename_and_url['filename'], "wb") as f:
                    f.write(text.encode("utf-8"))
                all_text += text
            resolve_locations(location_extractor.extract_locations_with_context(all_text), order_id=job['order_id'], max_seconds=10, countries=countries, admin1codes=admin1limits)

        finish_order(key)
    except Exception as e:
        print("ERROR in generate_map_from_urls_to_webpages:", e)
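# A standalone illustration of the Google-redirect unwrapping above, since the
# lookbehind regex is terse. The wrapped URL is made up.
def demo_unwrap_google_redirect():
    from re import search
    from urllib.parse import unquote  # Python 3; Python 2 used urllib.unquote
    wrapped = "https://www.google.com/url?sa=t&url=http%3A%2F%2Fexample.com%2Fnews&usg=x"
    # grab everything after "&url=" up to the next "&", then percent-decode it
    print(unquote(search("(?<=&url=)[^&]{10,}", wrapped).group(0)))
    # http://example.com/news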