Esempio n. 1
0
def get_countries(places, spellcheck=False):
    """Collect the country names mentioned among tagged entities.

    Args:
        places: iterable of ``(place, label)`` pairs. Only entries labelled
            ``LOCATION``, ``PERSON`` or ``ORGANIZATION`` are examined.
        spellcheck: when true, each place is resolved through
            ``matching_countries`` instead of requiring an exact hit in
            ``country_names``. Off by default: correcting spelling
            introduces some false positives, and the likelihood of official
            government documents being spelled incorrectly is low.

    Returns:
        dict mapping every recorded name to
        ``{'probability': ..., 'count': ...}``, where ``count`` is the
        number of times the name was recorded and ``probability`` is the
        largest value recorded for that name.
    """
    entity_labels = ('LOCATION', 'PERSON', 'ORGANIZATION')
    countries = []
    for place, label in places:
        if label not in entity_labels:
            continue
        place = correct_country_mispelling(place)
        if spellcheck:
            match = matching_countries(place.lower())
            if match:
                # match[1] is stored as the probability -- presumably the
                # match score returned by matching_countries; confirm.
                countries.append((place, match[1]))
        elif place.lower() in country_names:
            # An exact hit on a known country name is treated as certain.
            countries.append((textual.titlecase(place), 1.0))

    # Counter is indexed directly, so this works on Python 2 and 3 alike
    # (the former ``.iteritems()`` call only exists on Python 2).
    occurrences = Counter(name for name, _ in countries)

    c_dict = {}
    # The pairs are visited in sorted order, so for a repeated name the last
    # (largest) probability wins.  Names are matched exactly: a substring
    # test would let e.g. "Guinea" overwrite "Equatorial Guinea".
    for name, probability in sorted(countries):
        c_dict[name] = {
            'probability': probability,
            'count': occurrences[name],
        }
    return c_dict
def get_countries(places, spellcheck=False):
    """Collect the country names mentioned among tagged entities.

    Args:
        places: iterable of ``(place, label)`` pairs. Only entries labelled
            ``LOCATION``, ``PERSON`` or ``ORGANIZATION`` are examined.
        spellcheck: when true, each place is resolved through
            ``matching_countries`` instead of requiring an exact hit in
            ``country_names``. Off by default: correcting spelling
            introduces some false positives, and the likelihood of official
            government documents being spelled incorrectly is low.

    Returns:
        dict mapping every recorded name to
        ``{'probability': ..., 'count': ...}``, where ``count`` is the
        number of times the name was recorded and ``probability`` is the
        largest value recorded for that name.
    """
    entity_labels = ('LOCATION', 'PERSON', 'ORGANIZATION')
    countries = []
    for place, label in places:
        if label not in entity_labels:
            continue
        place = correct_country_mispelling(place)
        if spellcheck:
            match = matching_countries(place.lower())
            if match:
                # match[1] is stored as the probability -- presumably the
                # match score returned by matching_countries; confirm.
                countries.append((place, match[1]))
        elif place.lower() in country_names:
            # An exact hit on a known country name is treated as certain.
            countries.append((textual.titlecase(place), 1.0))

    # Counter is indexed directly, so this works on Python 2 and 3 alike
    # (the former ``.iteritems()`` call only exists on Python 2).
    occurrences = Counter(name for name, _ in countries)

    c_dict = {}
    # The pairs are visited in sorted order, so for a repeated name the last
    # (largest) probability wins.  Names are matched exactly: a substring
    # test would let e.g. "Guinea" overwrite "Equatorial Guinea".
    for name, probability in sorted(countries):
        c_dict[name] = {
            'probability': probability,
            'count': occurrences[name],
        }
    return c_dict
Esempio n. 3
0
def context_adjustment(place, possible_countries, probability, text):
    """Re-weight the candidate countries of an ambiguous place using nearby text.

    For every index yielded by ``textual.find_all(text, place)`` a window of
    up to 60 characters on either side is taken from ``text``.  The windows
    are scanned in order for tokens (single words, bigrams and trigrams)
    that equal a ``subdivision`` value in ``almost_everything``; the first
    window that yields any such match supplies the contextual clue and the
    remaining windows are ignored.

    Args:
        place: the ambiguous place name to look for in ``text``.
        possible_countries: the countries ``place`` might belong to
            (iterated more than once -- presumably a list; confirm).
        probability: the probability currently given to each candidate.
        text: the document text that is searched.

    Returns:
        dict mapping country -> ``{'count': ..., 'probability': ...}``.
        When no contextual clue is found every candidate keeps
        ``probability`` with a count of 1.
    """
    window = 60
    indices = list(textual.find_all(text, place))
    # Clamp each window to the bounds of the text.
    contexts = [
        text[max(start - window, 0):min(start + window, len(text))]
        for start in indices
    ]

    new_probabilities = []
    # Stop at the first window that gives a clue rather than gathering every
    # possible clue, which can lead to false positives when the same error
    # shows up several times.
    for snippet in contexts:
        words = nltk.word_tokenize(textual.remove_word(snippet, place))
        # All-uppercase alphabetic tokens, kept so that their capitalization
        # can be restored below (the original comment calls them state codes).
        codes = [w for w in words if w.isalpha() and w == w.upper()]

        # Window edges are likely to cut words in half, so edge tokens are
        # dropped.  NOTE(review): the slice removes the first token and the
        # last *two*; the original comment said "first and last" -- confirm.
        lowered = [w.lower() for w in words if w.isalpha()][1:-2]
        pairs = bigrams(lowered)
        triples = trigrams(lowered)
        phrases = list(lowered)
        phrases.extend(' '.join(gram) for gram in pairs)
        phrases.extend(' '.join(gram) for gram in triples)

        context_countries = []
        for phrase in phrases:
            shouted = phrase.upper()
            if shouted in codes:
                candidate = shouted
            else:
                candidate = textual.titlecase(phrase)
            # Check whether the contextual token is a country subdivision.
            rows = almost_everything[almost_everything.subdivision == candidate]
            if not rows.empty:
                context_countries += rows.country_name.tolist()

        if not context_countries:
            continue

        # Count how often each candidate country is implied by the context
        # and let adjust_probabilities turn those counts into probabilities.
        context_count = Counter(context_countries)
        ambiguous_country_counts = zip(
            possible_countries,
            (context_count[country] for country in possible_countries))
        new_probabilities.extend(
            adjust_probabilities(probability, ambiguous_country_counts))
        break

    if not new_probabilities:
        # No contextual clue: every candidate keeps the incoming probability.
        return {country: {'count': 1, 'probability': probability}
                for country in possible_countries}

    # Merge repeated entries into one count and probability per country.
    dict_ = {}
    for country in {entry[0] for entry in new_probabilities}:
        probs = [entry[1] for entry in new_probabilities
                 if entry[0] == country]
        combined = probs[0]
        for other in probs[1:]:
            combined = probabilities.independent_either_probability(
                combined, other)
        dict_[country] = {'count': len(probs), 'probability': combined}
    return dict_
def context_adjustment(place, possible_countries, probability, text):
    """Re-weight the candidate countries of an ambiguous place using nearby text.

    For every index yielded by ``textual.find_all(text, place)`` a window of
    up to 60 characters on either side is taken from ``text``.  The windows
    are scanned in order for tokens (single words, bigrams and trigrams)
    that equal a ``subdivision`` value in ``almost_everything``; the first
    window that yields any such match supplies the contextual clue and the
    remaining windows are ignored.

    Args:
        place: the ambiguous place name to look for in ``text``.
        possible_countries: the countries ``place`` might belong to
            (iterated more than once -- presumably a list; confirm).
        probability: the probability currently given to each candidate.
        text: the document text that is searched.

    Returns:
        dict mapping country -> ``{'count': ..., 'probability': ...}``.
        When no contextual clue is found every candidate keeps
        ``probability`` with a count of 1.
    """
    window = 60
    indices = list(textual.find_all(text, place))
    # Clamp each window to the bounds of the text.
    contexts = [
        text[max(start - window, 0):min(start + window, len(text))]
        for start in indices
    ]

    new_probabilities = []
    # Stop at the first window that gives a clue rather than gathering every
    # possible clue, which can lead to false positives when the same error
    # shows up several times.
    for snippet in contexts:
        words = nltk.word_tokenize(textual.remove_word(snippet, place))
        # All-uppercase alphabetic tokens, kept so that their capitalization
        # can be restored below (the original comment calls them state codes).
        codes = [w for w in words if w.isalpha() and w == w.upper()]

        # Window edges are likely to cut words in half, so edge tokens are
        # dropped.  NOTE(review): the slice removes the first token and the
        # last *two*; the original comment said "first and last" -- confirm.
        lowered = [w.lower() for w in words if w.isalpha()][1:-2]
        pairs = bigrams(lowered)
        triples = trigrams(lowered)
        phrases = list(lowered)
        phrases.extend(' '.join(gram) for gram in pairs)
        phrases.extend(' '.join(gram) for gram in triples)

        context_countries = []
        for phrase in phrases:
            shouted = phrase.upper()
            if shouted in codes:
                candidate = shouted
            else:
                candidate = textual.titlecase(phrase)
            # Check whether the contextual token is a country subdivision.
            rows = almost_everything[almost_everything.subdivision == candidate]
            if not rows.empty:
                context_countries += rows.country_name.tolist()

        if not context_countries:
            continue

        # Count how often each candidate country is implied by the context
        # and let adjust_probabilities turn those counts into probabilities.
        context_count = Counter(context_countries)
        ambiguous_country_counts = zip(
            possible_countries,
            (context_count[country] for country in possible_countries))
        new_probabilities.extend(
            adjust_probabilities(probability, ambiguous_country_counts))
        break

    if not new_probabilities:
        # No contextual clue: every candidate keeps the incoming probability.
        return {country: {'count': 1, 'probability': probability}
                for country in possible_countries}

    # Merge repeated entries into one count and probability per country.
    dict_ = {}
    for country in {entry[0] for entry in new_probabilities}:
        probs = [entry[1] for entry in new_probabilities
                 if entry[0] == country]
        combined = probs[0]
        for other in probs[1:]:
            combined = probabilities.independent_either_probability(
                combined, other)
        dict_[country] = {'count': len(probs), 'probability': combined}
    return dict_