Example 1
def generate_map_from_pdf(job):

    print "starting generate_map_from_pdf with", job

    # unpack the job dictionary: key, filename, optional countries, and the uploaded file (or filepath)
    key = job['key']
    filename = job['filename']
    countries = job['countries'] if "countries" in job else []

    directory = "/home/usrfd/maps/" + key + "/"
    file_obj = job['file']

    if 'filepath' not in job:
        # make a directory to store the uploaded file and the generated maps
        mkdir(directory)

        filepath = directory + "/" + filename
        print "filepath = ", filepath

        # save file to disk
        with open(filepath, 'wb+') as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)
        print "wrote file"

    resolve_locations(location_extractor.extract_locations_with_context(file_obj), order_id=job['order_id'], max_seconds=10, countries=countries)
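
A hedged sketch of how a caller might assemble the job dictionary this function expects; the stub class and sample values below are assumptions, since the real caller is not shown here (job['file'] is presumably a Django-style UploadedFile, given the call to .chunks()):

class _UploadStub(object):
    """Illustrative stand-in for an uploaded file object exposing .chunks()."""
    def __init__(self, path):
        self.path = path

    def chunks(self, chunk_size=64 * 1024):
        # yield the file in fixed-size chunks, mimicking Django's UploadedFile.chunks()
        with open(self.path, "rb") as f:
            while True:
                data = f.read(chunk_size)
                if not data:
                    break
                yield data

job = {
    "key": "abc123",                      # becomes /home/usrfd/maps/abc123/
    "filename": "report.pdf",
    "file": _UploadStub("/tmp/report.pdf"),
    "order_id": 42,
    "countries": ["Kenya"],               # optional; defaults to [] inside the function
}
# generate_map_from_pdf(job)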
Example 2
def generate_map_from_text(job):

    try:

        print "starting generate_map_from_text with", job

        key = job['key']
        max_seconds = int(job['max_seconds']) if 'max_seconds' in job else 10
        text = job['text']
        countries = job['countries'] if "countries" in job else []
    
        # basically this is a hack, so that if you paste in text
        # it assumes everything that is capitalized could be a place
        # is there something we can do here for Arabic?
        names = [name for name in list(set(findall("(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}", text))) if len(name) > 3]
        print "names are", names
        number_of_names = len(names)
        print "number_of_names:", number_of_names
        if number_of_names < 100:
            location_extractor.load_non_locations()
            names = [name for name in names if name not in location_extractor.nonlocations]
            resolve_locations(location_extractor.extract_locations_with_context(text, names), order_id=job['order_id'], max_seconds=max_seconds, countries=countries)

        finish_order(key)

    except Exception as e:
        print e
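
To illustrate the capitalization heuristic above, a small self-contained example with a made-up sentence (note the false positive, which is why nonlocations are filtered out afterwards):

from re import findall

sample = "Flooding hit Rio de Janeiro while New York and Paris stayed dry."
names = [name for name in set(findall("(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}", sample))
         if len(name) > 3]
print(sorted(names))
# -> ['Flooding', 'New York', 'Paris', 'Rio de Janeiro']
# 'Flooding' is a false positive, later removed by the nonlocations filter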
Example 3
def create_map_from_docx(job):
    print "starting create_from_docx with", job
    directory = "/home/usrfd/maps/" + job['key'] + "/"
    filename = job['filename']
    if 'filepath' not in job:
        file_obj = job['file']

        # make a directory to store the uploaded file and the generated maps
        mkdir(directory)

        filepath = directory + "/" + filename

        # save file to disk
        with open(filepath, 'wb+') as destination:
            for chunk in file_obj.chunks():
                destination.write(chunk)
        print "wrote file"
    else:
        filepath = job['filepath']

    document = Document(job['file'])
    print "documenbt = document"
    text = "\r\n\r\n".join([paragraph.text for paragraph in document.paragraphs])
    print "text is", text[:500]
    # convert to list of list of column values
    for table in document.tables:
        columns = []
        for column in table.columns:
            values = [cell.text for cell in column.cells]
            columns.append(values)

        print "columns are ", columns
    locations = location_extractor.extract_locations_with_context(text)
    print "in views,  locations are", len(locations)
    features = resolve_locations(locations, max_seconds=10)
    print "in views, features are", len(features)
   
    featureCollection = FeatureCollection(features)
    serialized = geojson.dumps(featureCollection, sort_keys=True)
 
    # write the GeoJSON output into the map directory
    path_to_geojson = directory + job['key'] + ".geojson"
    with open(path_to_geojson, "wb") as f:
        f.write(serialized)
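
The GeoJSON serialization at the end of this example, shown in isolation as a sketch of the geojson package calls being used; the single feature below (Nairobi) is invented purely for illustration:

import geojson
from geojson import Feature, FeatureCollection, Point

# one made-up feature; in the example above, resolve_locations builds the real ones
features = [Feature(geometry=Point((36.82, -1.29)), properties={"name": "Nairobi"})]
serialized = geojson.dumps(FeatureCollection(features), sort_keys=True)

with open("/tmp/example.geojson", "w") as f:
    f.write(serialized)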
Example 4
def extract_locations_from_webpage(url,
                                   html=None,
                                   max_seconds=5,
                                   debug_level=0):
    try:

        print("starting extract_locations_from_webpage with debug_level",
              debug_level)

        if debug_level > 0: print("\turl:", url)

        filename = (url.replace("/", "_").replace("\\", "_").replace("'", "_")
                    .replace('"', "_").replace(".", "_").replace(":", "_")
                    .replace("__", "_"))

        if not html:
            headers = {"User-Agent": getRandomUserAgentString()}
            html = get(url, headers=headers).text
            if debug_level > 0:
                print('\tgot html')

        article = Article(url)
        article.download()
        article.parse()
        if debug_level > 0: print("\tparsed article")
        if len(article.text) > 500:
            text = article.text
            print("got text using newspaper")
        else:
            headers = {"User-Agent": getRandomUserAgentString()}
            text = bnlp_clean(html)
        if debug_level > 0: print("\tgot text")

        if debug_level > 0:
            print("[extract_locations_from_webpage] text:", type(text))
        locations = extract_locations_with_context_from_html_tables(html)
        if debug_level > 0:
            print("[extract_locations_from_webpage] locations:", locations)
        names_from_tables = [location['name'] for location in locations]
        if debug_level > 0:
            print("[extract_locations_from_webpage] names_from_tables:",
                  names_from_tables)

        locations_extracted_from_context = extract_locations_with_context(
            text,
            ignore_these_names=names_from_tables,
            debug=False,
            max_seconds=max_seconds - 2)
        print("locations_extracted_from_context:",
              locations_extracted_from_context)
        for location in locations_extracted_from_context:
            name = location['name']
            skip = False
            for l in locations:
                if name == l['name']:
                    skip = True
            if not skip:
                locations.append(location)

        if not locations:
            print("no locations found, so try again with selenium")
            # use the text rendered by the browser rather than the raw html
            text = getTextContentViaMarionette(url)
            locations = extract_locations_with_context(text)

        return locations

    except Exception as e:
        print("Caught Exception in extract_locations_from_webpage:", e)
Example 5
def extract_locations_from_text(text, case_insensitive=None, debug=False):

    try:

        print("starting extract_locations_from_text")
        if debug:
            print("    [extractor] case_insensitive:", case_insensitive)

        pattern = "(?:[A-Z][a-z\u00ed]{1,15} )*(?:de )?[A-Z][a-z\u00ed]{1,15}"
        flags = IGNORECASE if case_insensitive else 0
        names = [
            # drop a leading "de " prefix from each candidate name
            name[3:] if name.startswith("de ") else name
            for name in set(findall(pattern, text, flags))
            if len(name) > 3
        ]
        try:
            print("    [extractor] names from pattern:", [names])
        except:
            pass
        location_extractor.load_non_locations()
        nonlocations = [
            location.lower() for location in location_extractor.nonlocations
        ]
        names = [name for name in names if name.lower() not in nonlocations]
        try:
            print("    names after filtering out nonlocations:", names)
        except:
            pass

        location_extractor.load_language_into_dictionary_of_keywords("English")
        for possible_abbreviation in list(set(findall("[A-Z]{2}", text))):
            print("possible_abbreviation:", possible_abbreviation)
            print(
                "keys:",
                list(location_extractor.dictionary_of_keywords['English']
                     ['abbreviations'].keys()))
            #name = location_extractor.dictionary_of_keywords['English']['abbreviations'].get(possible_abbreviation, None)
            #if name:
            #    names.append({ "abbreviation": possible_abbreviation, "name": name })
            #if possible_abbreviation in location_extractor.dictionary_of_keywords['English']['abbreviations']:
            #    names.append(possible_abbreviation)

        # tweets are often not capitalized consistently, so also treat individual
        # words (and, for short texts, bigrams) as candidate names
        text_length = len(text)
        if text_length < 1e5:
            splat = text.split()
            names.extend([
                word.strip().strip(",").strip(";").strip(".") for word in splat
            ])

            if text_length < 500:
                names.extend([
                    ngram.strip().strip(",").strip(";").strip(".")
                    for ngram in find_ngrams(splat, 2)
                ])

        #filter out nonlocations again
        names = [
            name for name in names
            if name.lower() not in nonlocations and len(name) > 3
        ]

        # remove duplicates
        names = list(set(names))

        try:
            print("names are yeah:", names)
        except:
            pass

        results = location_extractor.extract_locations_with_context(
            text,
            names,
            debug=debug,
            return_abbreviations=True,
            case_insensitive=case_insensitive)

        try:
            print("results:", results)
        except:
            pass

        return results

    except Exception as e:
        print("EXCEPTION in extractor:", e)
Example 6
def generate_map_from_urls_to_webpages(job):

  try:

    print "starting generate_map_from_links_to_urls with", job['key']

    key = job['key']
    countries = job['countries'] if 'countries' in job else []
    admin1limits = job['admin1limits'] if 'admin1limits' in job else []
    print "in gen, countries:", countries
    print "in gen, admin1limits:", admin1limits

    # make directory to store saved webpage and maps
    directory = "/home/usrfd/maps/" + key + "/"
    mkdir(directory)

    print "urls:", job['urls']
    
    locations = []

    all_text = ""
    filenames_and_urls = []
    for url in job['urls']:
        url = url.strip().strip('"').strip("'")
        if url:

            # Google result links wrap the destination URL; extract it directly
            # rather than sending an automated click-through to Google
            if url.startswith("https://www.google.com/url?"):
                url = unquote(search("(?<=&url=)[^&]{10,}", url).group(0))

            if not url.startswith("http"):
                print "we assume that the user didn't include the protocol"
                url = "http://" + url

            filename = url.replace("/","_").replace("\\","_").replace("'","_").replace('"',"_").replace(".","_").replace(":","_").replace("__","_")
            filenames_and_urls.append({"url": url, "filename": filename})

    for filename_and_url in filenames_and_urls:
        filename = filename_and_url['filename']
        url = filename_and_url['url']
        headers = {"User-Agent": getRandomUserAgentString()}
        text = get(url, headers=headers).text
        soup = soupify(text)
        if match("https?://(www.)?bbc.com", url):
            try:
                selected_text = soup.select(".story-body")[0].text
                if len(selected_text) > 100:
                    text = selected_text
                    print "got text using bbc parser:", len(text)
            except Exception as e:
                print e
        elif match("https?://(www.)?dw.com", url):
            try:
                selected_text = soup.select("#bodyContent .intro")[0].text + soup.select("#bodyContent .group")[0].text
                if len(selected_text) > 500:
                    text = selected_text
                    print "got text using bbc parser:", len(text)
            except Exception as e:
                print e
        elif match("https?://(www.)?observer.org.sz", url):
            try:
                selected_text = soup.select("#article_holder")[0].text
                if len(selected_text) > 500:
                    text = selected_text
                    print "got text using bbc parser:", len(text)
            except Exception as e:
                print e
 
        # for cnn we will need a custom parser that gets the text and then strips out the headline and javascript
        #elif match("https?://(www.)?cnn.com", url):XXX
        #    try:
        #        selected_text = BeautifulSoup(text).select(".story-body")[0].text
        #        if len(selected_text) > 100:
        #            text = selected_text
        #            print "got text using bbc parser:", len(text)
        #    except Exception as e:
        #        print e
        elif match("https?://(www.)?reuters.com/article/", url):
            try:
                selected_text = soup.select("#article-text")[0].text
                if len(selected_text) > 100:
                    text = selected_text
                    print "got text using reuters parser:", len(text)
            except Exception as e:
                print e
        elif match("https?://(www.)?nytimes.com/", url):
            try:
                selected_text = soup.select("article#story")[0].text
                if len(selected_text) > 100:
                    text = selected_text
                    print "got text using NYTimes parser:", len(text)
            except Exception as e:
                print e


        text = bnlp_clean(text)

        with open(directory + filename, "wb") as f:
            f.write(text.encode('utf-8'))

        all_text += text
    print "all_text:", type(all_text)

    names = [name for name in list(set(findall("(?:[A-Z][a-z]{1,15} )*(?:de )?[A-Z][a-z]{1,15}", all_text))) if len(name) > 3]
    print "names are", names
    number_of_names = len(names)
    print "number_of_names:", number_of_names

    location_extractor.load_language_into_dictionary_of_keywords("English")
    abbreviations = location_extractor.dictionary_of_keywords['English']['abbreviations']


    tables = soup.select("table")
    for table in tables:
        rows = table.select("tr")
        if len(rows) > 10:
            print "more than 10 rows in table!"
            header = [th.text for th in table.select("thead tr th")]

            #get location column
            location_column_index = None
            admin1_column_index = None
            for column_index, head in enumerate(header):
                head = head.strip().lower()
                print "head:", [head]
                if head == "city":
                    location_column_index = column_index
                elif head == "state":
                    admin1_column_index = column_index
            print "location_column_index:", location_column_index
            print "admin1_column_index:", admin1_column_index

            for row in table.select("tbody tr"):
                tds = [td.text for td in row.select("td")] 
                print "tds:", tds
                location = tds[location_column_index].strip()
                admin1 = tds[admin1_column_index].strip()

                # just try to resolve this location directly and, if that fails, pass it to the AI
                locations.append({"name": location, "count": 1, "admin1code": admin1})

            print "locations from html table:", locations
           

    # skip locations that were already found via the HTML tables
    for location in location_extractor.extract_locations_with_context(all_text):
        name = location['name']
        skip = False
        for l in locations:
            if name == l['name']:
                skip = True
        if not skip:
            locations.append(location)

    if not resolve_locations(locations, order_id=job['order_id'], max_seconds=10, countries=countries, admin1codes=admin1limits):
        print "no features, so try with selenium"
        all_text = ""
        for filename_and_url in filenames_and_urls:
            text = getTextContentViaMarionette(filename_and_url['url'])
            with open(directory + filename_and_url['filename'], "wb") as f:
                f.write(text.encode("utf-8"))
            all_text += text
        resolve_locations(location_extractor.extract_locations_with_context(all_text), order_id=job['order_id'], max_seconds=10, countries=countries, admin1codes=admin1limits)

    finish_order(key)

  except Exception as e:
    print "ERROR in generate_map_from_urls_to_webpages:", e