Example #1
 def __init__(self):
     self.stopwords = stopwords.words('english')
     self.uscities = set([w.lower() for w in gazetteers.words('uscities.txt')])
     self.usstates = set([w.lower() for w in gazetteers.words('usstates.txt')])
     self.countries = set([w.lower() for w in gazetteers.words('countries.txt')])
     self.basicwords = set(words.words('en-basic'))
     self.paragraph_tokens = []
     self.texts = []
    def __init__(self, corpus, outfile, tokens_dir, parses_dir, depparses_dir,
                 train=False):
        self.relations = list()
        self.train = train
        self.corpus = corpus
        self.outfile = outfile
        self.tokenized_sents, self.tok_sents_pos = self.process_tokens_dir(tokens_dir)
        self.parses = self.process_parses_dir(parses_dir)
        self.depparses = self.process_dparses_dir(depparses_dir)
        self.clusterdict = self.make_cluster_dict('50mpaths2')
        self.pronouns = ["I", "me", "my", "mine", "myself", "you", "your", "yours", "yourself",
                        "he", "him", "his", "his", "himself", "she", "her", "hers", "herself", 
                        "it", "its", "itself", "we", "us", "our", "ours", "ourselves", "you", "your", 
                        "yours", "yourselves", "they", "them", "their", "theirs", "themselves"]

        self.locations = set([c.lower() for c in gazetteers.words('countries.txt')] + 
                             [s.lower() for s in gazetteers.words('usstates.txt')])
        self.names = set([name.lower() for name in names.words('male.txt')] +
                 [name.lower() for name in names.words('female.txt')])

        self.feat_fns = [self.words,    #good
                         self.word_types, #good
                         self.pronoun, #good
                         self.name, #good
                         #self.place, #look to get a better list
                         self.num_words_between, #good
                         self.words_between_words, #good
                         self.prev_word, #good
                         #self.post_word, #really bad feature
                         #self.prev_word_pos, #bad
                         self.post_word_pos, #good
                         self.first_word_after_w1, #good
                         self.words_between_POSs, #good 
                         #self.last_word_before_w2
                         self.w1clust, #good
                         self.w2clust, #good
                         self.tree_path,
                         #self.w1pref, #bad
                         #self.w1suf,
                         #self.w2pref,
                         #self.w2suf,
                         #self.w1bow,
                         #self.w2bow
                         self.et1dw1,
                         self.et2dw2,
                         self.h1dw1,
                         self.h2dw2
                         ]
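Each entry in feat_fns is a bound feature-extraction method. A minimal sketch of what one such method might look like, assuming each takes the two candidate mention strings and returns a "name=value" feature (the signature and body are illustrative, not the author's code):

    def w1clust(self, w1, w2):
        # Brown-cluster path of the first mention, 'NONE' if it was never clustered.
        return 'w1clust=' + self.clusterdict.get(w1.lower(), 'NONE')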
Example #3
 def __init__(self):
     self.locations = set(gazetteers.words())
     self.lookahead = 0
     for loc in self.locations:
         nwords = loc.count(' ')
         if nwords > self.lookahead:
             self.lookahead = nwords
    def __init__(self):
        self.gazetteers = [x.lower() for x in gazetteers.words()]
        self.stopwords = [x.lower() for x in stopwords.words('english')]

        self.rx_space = r'\s+'
        self.rx_email = r'[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][-.0-9a-zA-Z]*\.[a-zA-Z]+'
        self.rx_url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        self.rx_hashtag = r'#(\w+)'
        self.rx_mention = r'@(\w+)'
        self.rx_empty = "empty"
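A hypothetical usage sketch (not from the source, and assuming `re` is imported) showing how these patterns might strip noise from a tweet before gazetteer and stopword lookups:

    def clean_tweet(self, text):
        # Drop URLs and emails, keep only the text of mentions/hashtags,
        # then collapse runs of whitespace.
        text = re.sub(self.rx_url, ' ', text)
        text = re.sub(self.rx_email, ' ', text)
        text = re.sub(self.rx_mention, r'\1', text)
        text = re.sub(self.rx_hashtag, r'\1', text)
        return re.sub(self.rx_space, ' ', text).strip()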
Example #5
	def __init__(self):
		# gazetteers is a WordListCorpusReader of many different location words
		self.locations = set(gazetteers.words())
		self.lookahead = 0
		# need to know how many words to lookahead in the tagged sentence to find a location
		for loc in self.locations:
			nwords = loc.count(' ')
			
			if nwords > self.lookahead:
				self.lookahead = nwords
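The lookahead value bounds how many tokens a gazetteer entry can span. A minimal standalone sketch (not the author's tagger) of how it might drive multi-word matching over a token list:

def find_locations(tokens, locations, lookahead):
    # Try the longest window first at each position, so 'New York City'
    # wins over 'New York'; lookahead is the max number of spaces seen
    # in any gazetteer entry, hence lookahead + 1 words.
    found, i = [], 0
    while i < len(tokens):
        for span in range(lookahead + 1, 0, -1):
            candidate = ' '.join(tokens[i:i + span])
            if candidate in locations:
                found.append((i, candidate))
                i += span
                break
        else:
            i += 1
    return found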
Example #6
 def __init__(self):
     self.train_path = "../data/train"
     self.dev_path = "../data/dev"
     self.beta = 0
     self.max_iter = 0
     # self.classifier = None
     self.dict_classifiers = {}
     self.locations = set(gazetteers.words())
     self.names = set(names.words())
     self.pos = None
     self.previous_labels = None
    def __init__(self, lm, confSet, myInfltSet):
        """initializes the language model."""
        self.languageModel = lm
        self.languageModelSQL = LM("web1t")
        self.confusionSet = confSet
        self.inflectionSet = myInfltSet

        self.dict = enchant.Dict("en")
        self.myDict = {}
        for word in names.words():
            self.myDict[word] = 1
        for word in gazetteers.words():
            self.myDict[word] = 2
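Because the gazetteer loop runs last, a word that is both a first name and a place ends up mapped to 2. A hypothetical helper (not from the source) built on that encoding:

    def word_kind(self, word):
        # 1 = personal name, 2 = gazetteer place, 0 = neither.
        return self.myDict.get(word, 0)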
Example #8
 def __init__(self):
     self.train_path = "../data/train"
     self.dev_path = "../data/dev"
     self.beta = 0
     self.max_iter = 0
     self.classifier = None
     self.nltk_names = set(names.words())
     self.nltk_stopwords = set(stopwords.words())
     self.titles = [
         'Master', 'Mr.', 'Mr', 'Miss.', 'Miss', 'Mrs.', 'Mrs', 'Ms.', 'Ms',
         'Mx.', 'Mx', 'Sir', 'Gentleman', 'Sire', 'Mistress', 'Madam',
         'Dame', 'Lord', 'Lady', 'Esq', 'Excellency', 'Dr', 'Professor',
         'QC', 'Cl', 'SCl', 'Eur Lng', 'Chancellor', 'Vice-Chancellor',
         'Principal', 'President', 'Minister', 'Warden', 'Dean', 'Regent',
         'Rector', 'Provost', 'Director', 'Chief Executive', 'manager',
         'chairman', 'secretary', 'leader'
     ]
     self.say = ['say', 'said', 'says']
     # 'speak', 'spoke', 'speaks'
     # 'talk', 'told', 'talks',
     # 'discuss', 'discusses', 'discussed',
     # 'mention', 'mentioned', 'mentions']
     self.gazetteers = set(gazetteers.words())
Example #9
##########################

basedir = "stanford-full-pipeline"

all_stanford = LazyDict(basedir, stanford_general_opener)
RAW_SENTENCES = SuperLazyDict(all_stanford, stanford_raw_reader)
POS_SENTENCES = SuperLazyDict(all_stanford, stanford_pos_reader)
SYNTAX_PARSE_SENTENCES = SuperLazyDict(all_stanford, stanford_tree_reader)
NONPARENTED_SENTENCES = SuperLazyDict(all_stanford,
                                      stanford_nonparented_tree_reader)
COREF = SuperLazyDict(all_stanford, stanford_coref_reader)
PRONOUN_SET = set(pronoun_reader())
entity_types = gather_entities()
AUGMENTED_TREES = augmented_tree_reader()
RELATIONSHIPS_AND_GROUPS = set(rels_and_groups_reader())
COUNTRIES = set(gz.words('countries.txt'))
NATIONALITIES = set(gz.words('nationalities.txt'))
OFFICIALS = officials_reader()  # these are a bit silly; will probably discard
DEPENDENCIES = stanford_dependency_reader()
POSSESSIVE_PRONOUNS = [
    'my', 'mine', 'your', 'yours', 'her', 'hers', 'his', 'our', 'ours',
    'their', 'theirs'
]
TITLE_SET = {
    "chairman", "Chairman", "director", "Director", "president", "President",
    "manager", "managers", "Manager", "executive", "CEO", "Officer", "officer",
    "consultant", "CFO", "COO", "CTO", "CMO", "founder", "shareholder",
    "researcher", "professor", "principal", "Principal", "minister",
    "Minister", "prime", "Prime", "chief", "Chief", "prosecutor", "Prosecutor",
    "queen", "Queen", "leader", "Leader", "secretary", "Secretary",
    "ex-Leader", "ex-leader", "coach", "Coach", "composer", "Composer", "head",
  
ORDINALS = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 
            'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']
  
DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 
        'friday', 'saturday', 'sunday']
  
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
          'august', 'september', 'october', 'november', 'december',
          'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
          'oct', 'nov', 'dec']  
  
NAMES = set([name.lower() for filename in ('male.txt', 'female.txt') for name
             in names.words(filename)])
  
USCITIES = set(gazetteers.words('uscities.txt'))
  
# [XX] contains some non-ascii chars
COUNTRIES = set([country for filename in ('isocountries.txt','countries.txt')
                 for country in gazetteers.words(filename)])
  
# States in North America
NA_STATES = set([state.lower() for filename in
                 ('usstates.txt','mexstates.txt','caprovinces.txt') for state in
                 gazetteers.words(filename)])
  
US_STATE_ABBREVIATIONS = set(gazetteers.words('usstateabbrev.txt'))
  
NATIONALITIES = set(gazetteers.words('nationalities.txt'))
  
PERSON_PREFIXES = ['mr', 'mrs', 'ms', 'miss', 'dr', 'rev', 'judge',
Example #11
    for name in names.words(filename)
])
PERSON_PREFIXES = [
    'mr', 'mrs', 'ms', 'miss', 'dr', 'rev', 'judge', 'justice', 'honorable',
    'hon', 'rep', 'sen', 'sec', 'minister', 'chairman', 'succeeding', 'says',
    'president'
]
PERSON_SUFFIXES = ['sr', 'jr', 'phd', 'md']
ORG_SUFFIXES = [
    'ltd', 'inc', 'co', 'corp', 'plc', 'llc', 'llp', 'gmbh', 'corporation',
    'associates', 'partners', 'committee', 'institute', 'commission',
    'university', 'college', 'airlines', 'magazine'
]
COUNTRIES = set([
    country for filename in ('isocountries.txt', 'countries.txt')
    for country in gazetteers.words(filename)
])

lancaster_stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=True,
                       strip_handles=False,
                       reduce_len=False)

#train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
#test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))


def get_tuples(dspath):
    sentences = []
    s = ''
 def __init__(self, corpus):
     self.places = set(gazetteers.words())
     self.people = set(names.words())
     self.stop_words = self.load_stop_words()
     self.corpus = corpus
Example #13
def firstPassGrouping():
    words = []

    stemmed = []
    features = {}
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    clean = re.compile(r"[()/']")
    split = re.compile("[/]")
    grams = []
    with open('data/features.txt', 'r') as featureIn:
        for line in map(cleanFeatures, featureIn):
            ws = []
            for w in tokenizer.tokenize(clean.sub(' ', line[1])):
                if w not in engStop:
                    stemmed.append((eng.stem(w).lower(), line[1]))
                    words.append((w.lower(), line[1]))
                    ws.append(w.lower())

            grams.append((list(everygrams(ws, min_len=2, max_len=2)), line[1]))
            features[line[0]] = line[1]


    # cuisine, style, price, atmosphere, and occasion


    noGrams = set(map(lambda x: x[1], filter(lambda x: len(x[0]) == 0, grams)))

    grams = list(filter(lambda x: len(x[0]) > 0, grams))
    groupedw = seq(grams) \
        .flat_map(lambda x: set([(w, x[1]) for w in seq(x[0]).flat_map(lambda y: list(y)).to_list()])) \
        .group_by(lambda w: w[0]) \
        .map(lambda x: (x[0], list(map(lambda y: y[1], x[1])))) \
        .to_dict()

    noGramsId = {}
    for g in noGrams:
        noGramsId[g] = g
    simGrouped = {}
    simular = set()
    for k, v in sorted(groupedw.items(), key=lambda x: x[0]):
        # print(k, v)
        nl = v.copy()
        match = noGramsId.get(k, None)
        for nk in noGramsId.keys():
            if len(nk) > 1:
                if nk in v:
                    nl.append(nk)
                    simular.add(nk)
                for vv in v:
                    if nk in vv:
                        nl.append(nk)
                        simular.add(nk)

        if match is not None:
            nl.append(match)
            simGrouped[k] = list(set(nl))
            simular.add(match)
        else:
            if len(k) > 1:
                simGrouped[k] = v

    noSim = noGrams - simular
    #
    nationalities = gazetteers.words()

    featureNationality = []
    for nosim in noSim:
        didConvert = convert(nosim)
        if didConvert is not None:
            if didConvert in nationalities:
                featureNationality.append(nosim)
        else:
            if nosim in nationalities:
                featureNationality.append(nosim)
            else:
                split = nosim.split('-')
                for sp in split:
                    if sp in nationalities:
                        featureNationality.append(nosim)

    # print("-----------------")


    noSim = noSim - set(featureNationality)
    # occasions = ['monday']
    # # cuisine, style, price, atmosphere, and occasion
    for k, v in sorted(simGrouped.items(), key=lambda x: x[0]):
        # print(k,v)
        if k in nationalities:
            featureNationality.append(k)
            featureNationality.extend(v)
            simGrouped.pop(k)
            continue  # k is already handled; avoid popping it again below
        didConvert = convert(k)
        if didConvert is not None:
            if didConvert in nationalities:
                simGrouped.pop(k)
                featureNationality.append(k)
                featureNationality.extend(v)

    with open('q1/noSim.json', 'w+') as nsOut:
        nsOut.write(json.dumps(list(noSim), indent=2, sort_keys=True))

    with open('q1/featureNationality.json', 'w+') as nsOut:
        nsOut.write(json.dumps(featureNationality, indent=2, sort_keys=True))

    with open('q1/grouped.json', 'w+') as nsOut:
        nsOut.write(json.dumps(simGrouped, indent=2, sort_keys=True))
#!/usr/bin/python
'''
NPR 2017-11-12
https://www.npr.org/2017/11/12/563367879/sunday-puzzle-move-around-to-find-new-meaning

Take the name of a U.S. state capital. Immediately to the right of it write the name 
of a world capital. If you have the right ones, the name of a U.S. state will be 
embedded in consecutive letters within that letter string. What three places are these?
'''

from nltk.corpus import wordnet as wn, gazetteers

#%%
# US states
US_STATES = frozenset(gazetteers.words('usstates.txt'))
US_STATES_LOWER = frozenset(x.lower().replace(' ','') for x in US_STATES)

# COUNTRIES
COUNTRIES = frozenset(gazetteers.words('countries.txt'))

# State and world capitals
state_capitals = set(); world_capitals = set()
for s in wn.all_synsets():
    d = s.definition()
    if 'capital' in d:
        for state in US_STATES:
            if state in d:
                for l in s.lemma_names():
                    if l[0] == l[0].upper() and 'capital' not in l:
                        state_capitals.add(l.lower())
        for country in COUNTRIES:
Example #15
from sklearn.cluster import KMeans,MiniBatchKMeans
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import *
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus   import words
from nltk.corpus   import stopwords,gazetteers,names
from sklearn.feature_selection import *
eng_words = set([ w.lower() for w in words.words('en') ])

qn_words = set(['who','what','what',
				'when','where','how',
				'is','should','do',
				'if','would','should'])
stopwords = [ w for w in stopwords.words('english') if  w not in qn_words ]
places = set([ w.lower() for w in gazetteers.words() ])
names  = set([ w.lower() for w in names.words() ])


class Extractor:
	def __init__(self,fun):
		self.extractor = fun
	def fit(self,X,Y):
		pass
	def transform(self,X):
		return [ self.extractor(x) for x in X ] 
	def fit_transform(self,X,_):
		return self.transform(X)

class ToArray:
	def __init__(self):
#%%
"""
NPR 2019-01-06
https://www.npr.org/2019/01/06/682575357/sunday-puzzle-stuck-in-the-middle

Name a major U.S. city in 10 letters. If you have the right one, you can rearrange its letters to get two 5-letter words that are synonyms. What are they?
"""

import sys
sys.path.append('..')
import nprcommontools as nct
from nltk.corpus import gazetteers

#%%
COMMON_WORDS = frozenset(x for x in nct.get_common_words() if len(x) == 5)

#%%
US_CITIES = set(nct.alpha_only(x.lower()) for x in gazetteers.words('uscities.txt') if len(nct.alpha_only(x)) == 10)
city_dict = nct.make_sorted_dict(US_CITIES)

#%%
for c1 in COMMON_WORDS:
    my_synonyms = nct.get_synonyms(c1)
    for c2 in my_synonyms:
        sort_word = nct.sort_string(''.join(c1+c2))
        if sort_word in city_dict:
            print(c1,c2,city_dict[sort_word])
Example #17
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from nltk.corpus import gazetteers, names

import brown_driver
import math
import json

locations = gazetteers.words()
proper_names = names.words()


class Tagger:
    def __init__(self):
        self.brown_clusters = brown_driver.cluster_driver()
        self.brown_clusters.init_clusters('paths_100')
        self.import_wiki_data('wiki_outfile.json')

    def import_wiki_data(self, wiki_import):
        wiki_data = open(wiki_import, 'r')
        self.wiki_data = json.load(wiki_data)

    def read_in_data(self, file_name):
        sents = []
        infile = open(file_name)
        for line in infile.readlines():
            pieces = line.split()
            if len(pieces) == 0:
                continue
            data = tuple(pieces[1:])
            if pieces[0] == '0':  # New sentence
def loadData(files, path):
    data = []
    for f in files:
        data.append(gazetteers.words(path + '/' + f))
    return data
#!/usr/bin/python
'''
NPR 2017-04-09
Name a well-known U.S. city in two words.  Replace each of these words with a 
word that rhymes with it, and you'll name a large sea creature in two words.
What is it?
'''
import sys
sys.path.append('..')
from nprcommontools import get_category_members
import rhyme
from nltk.corpus import gazetteers
#%%
ANIMALS = frozenset([x for x in get_category_members('animal') if x.count('_') == 1])
USCITIES = set([x.lower() for x in gazetteers.words('uscities.txt') if x.count(' ') == 1])
# Cheating but honestly why wasn't this in there?
USCITIES.add('santa fe')

#%%
for city in USCITIES:
    c1,c2 = city.split(' ')
    c1_rhymes = rhyme.all_rhymes(c1)
    c2_rhymes = rhyme.all_rhymes(c2)
    for a1 in c1_rhymes:
        for a2 in c2_rhymes:
            if a1 + '_' + a2 in ANIMALS:
                print(city, a1, a2)
'''
NPR 2018-07-08

https://www.npr.org/2018/07/08/626992499/sunday-puzzle-hot-hot-hot

Name part of the human body. Switch the first two letters to get a 
two-word phrase for something that is worrisome. What is it? 
'''
import sys
sys.path.append('..')
import nprcommontools as nct
        
from nltk.corpus import gazetteers

#%%
US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt')])
US_STATE_ABBREVIATIONS = set([state.lower() for state in
    gazetteers.words('usstateabbrev.txt')])
US_STATES = set([state.lower() for state in gazetteers.words('usstates.txt')])

#%%
for city in US_CITIES:
    city2 = nct.alpha_only(city)
    if len(city2) % 2 == 0:
        continue
    good_flag = True
    while len(city2) > 1:
        abbrev,city2 = city2[:2],city2[2:]
        if abbrev not in US_STATE_ABBREVIATIONS:
            good_flag = False
            break
 def premod_countries(self, mention):
     for word in self.premod(mention):
         if word in gazetteers.words('countries.txt'):
             return True
     return False
Name a state capital. Drop one of its letters. 
The remaining letters can be rearranged to name of another major city 
in the United States. What is it? 
There are two different answers, and you should find both of them.
'''

import sys
sys.path.append('..')
from nprcommontools import sort_string

from nltk.corpus import wordnet as wn, gazetteers
import re

# U.S. States
states = set(gazetteers.words('usstates.txt'))

# capitals and major cities
cities = set(); capitals = set()
for synset in wn.all_synsets():
    d = synset.definition()
    for state in states:
        if state in d and 'city' in d:
            for l in synset.lemma_names():
                if l[0] == l[0].upper():
                    cities.add(l)
        if state in d and 'capital' in d:
            for l in synset.lemma_names():
                if l[0] == l[0].upper():
                    capitals.add(l)
Example #23
    "feb",
    "mar",
    "apr",
    "jun",
    "jul",
    "aug",
    "sep",
    "sept",
    "oct",
    "nov",
    "dec",
]

NAMES = set([name.lower() for filename in ("male.txt", "female.txt") for name in names.words(filename)])

US_CITIES = set([city.lower() for city in gazetteers.words("uscities.txt")])

# [XX] contains some non-ascii chars
COUNTRIES = set(
    [country.lower() for filename in ("isocountries.txt", "countries.txt") for country in gazetteers.words(filename)]
)

# States in North America
NA_STATES = set(
    [
        state.lower()
        for filename in ("usstates.txt", "mexstates.txt", "caprovinces.txt")
        for state in gazetteers.words(filename)
    ]
)
#!/usr/bin/python
'''
NPR 2017-07-16
http://www.npr.org/2017/07/16/537225382/sunday-puzzle-wehn-wrods-get-rearearngd

Name a U.S. city and its state — 12 letters altogether. 
Change two letters in the state's name. 
The result will be the two-word title of a classic novel. What is it?
'''

from nltk.corpus import gazetteers
import re

us_states = frozenset(gazetteers.words('usstates.txt'))

#%%
# via http://norvig.com/spell-correct.html
def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    return set(replaces)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))
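Because only replacements are generated, every candidate keeps the original length, which is what the change-two-letters puzzle needs. A quick illustration on a hypothetical word:

assert 'color' in edits1('colon')   # one letter replaced
assert 'colic' in edits2('colon')   # two letters replaced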

#%%
# Need the ranked Wikipedia entries from http://crosswordnexus.com/wiki
# Read in city, state combinations and also anything else that is 12 letters long
Example #25
from util import edict, pdict, normalize_title, load_stoplist
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gazetteers, names
from collections import Counter
from fever_io import titles_to_jsonl_num, load_split_trainset, titles_to_tf, load_doc_tf
import pickle
from tqdm import tqdm
import numpy as np

places = set(gazetteers.words())
people = set(names.words())
stop = load_stoplist()


def title_edict(t2jnum={}):
    edocs = edict()
    for title in t2jnum:
        l_txt = normalize_title(title)
        if len(l_txt) > 0:
            if edocs[l_txt][0] is None:
                edocs[l_txt] = []
            edocs[l_txt][0].append(title)
    return edocs


def find_titles_in_claim(claim="", edocs=edict()):
    find = pdict(edocs)
    docset = {}
    ctoks = word_tokenize(claim)
    for word in ctoks:
        for dlist, phrase, start in find[word]:
'''
NPR Puzzle 2016-06-26

http://www.npr.org/2016/06/26/483521883/welcome-to-an-affair-of-phrases-each-entwined-by-a-tiny-of

Think of two well-known American cities, each five letters long. 
The first two letters of the first city are the state postal abbreviation of the second city. 
And the first two letters of the second city are the state postal abbreviation of the first city. 
What two cities are these?
'''

from nltk.corpus import gazetteers

# Get list of abbreviations from Gazetteers
state_abbrs = frozenset(abbr for abbr in gazetteers.words('usstateabbrev.txt') if len(abbr) ==2)

# Get list of cities from Gazetteers
cities = frozenset(city for city in gazetteers.words('uscities.txt') if len(city) == 5)

for city in cities:
    if city.upper()[:2] in state_abbrs:
        print(city)
#!/usr/bin/python
'''
NPR 2018-04-08
http://www.npr.org/puzzle

Name part of the human body, insert a speech hesitation, and you'll name a country — what is it?
'''

from nltk.corpus import gazetteers
import nprcommontools as nct

#%%
BODY_PARTS = nct.get_category_members('body_part')

# COUNTRIES
COUNTRIES = frozenset([x.lower() for x in gazetteers.words('countries.txt')])

#%%
for c in COUNTRIES:
    for b in BODY_PARTS:
        if c.startswith(b[0]) and c.endswith(b[-1]):
            for i in range(1,len(b)-1):
                if c.startswith(b[:i]) and c.endswith(b[i:]):
                    print(b, c)

    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i>0:
        c,k = best_match(i)
        assert c == cost[i]
        out.append(s[i-k:i])
        i -= k

    return " ".join(reversed(out))

#%%
# Country names
countries = set([country.lower() for filename in ('isocountries.txt','countries.txt') \
                    for country in gazetteers.words(filename)])
    
#%%
# Words associated with Henry Ford
ford_words = set(_ for _ in get_synonyms('car') if '_' not in _)
ford_words.add('car')

good_countries = frozenset([c for w in ford_words for c in countries if w in c])
#%%
def sentence_score(s):
    """
    Score a sentence based on how common its words are
    """
    score = 0
    for w in s.split(' '):
        if w not in stop_words:
Example #29
#from ABBYY import CloudOCR
import os
import re
import nltk
from nltk.corpus import gazetteers
from shutil import move


scrapingFileNames = r'C:\scraping\Isaac\toBeScraped\namesOfScrapingFiles\\'
scrapingDir = r'C:\scraping\Isaac\toBeScraped\csvTxts\\'
os.chdir(scrapingDir)
scrapingLogs = r'C:\scraping\Isaac\logs'

for file in os.listdir (scrapingFileNames):
	filename = file[:-4]
	fileDirectory=filename

	placelist = gazetteers.words('countries.txt')
	currencyList=gazetteers.words('currencyList.txt')
	filename=filename+'.txt'

	#name of project
	# print filename
	f = open(filename, 'r').read()
	projectCandidates = re.findall('(?:[A-Z][\w-]*\s)+Project', f)
	ProjectName = ''
	projectDict = nltk.defaultdict(int)
	for project in projectCandidates:
		if project == 'The Project' or project == 'Mineral Project':
			continue
		else:
		    projectDict[project] += 1
		    if len(ProjectName) == 0:
Example #30
 for i in range(len(label_class)):
     if label_class[i] == 'code share indicator':
         for w in s.lower().split():
             if w in ['no', 'not']:
                 print(label_class[i] + ': ' + 'no')
     if label_class[i] in ['commission', 'infant commission']:
         for w in s.lower().split():
             if w in ['no', 'not']:
                 print(label_class[i] + ': ' + 'no')
         match = re.search('(\d+%)', s)
         if match:
             pct = match.group(1)
             print(label_class[i] + ': ' + pct)
     if label_class[i] == 'sale restriction':
         for w in s.split():
             if w in gazetteers.words('countries.txt'):
                 print(label_class[i] + ': ' + w)
                 break
     if label_class[i] == 'tour code':
         for j in range(len(s.split())):
             if s.lower().split()[j] == 'code':
                 w = s.split()[j+1]
                 if not enchant.Dict("en_US").check(w):
                     print(label_class[i] + ': ' + w)
     if label_class[i] in ['ticketing period', 'travelling period']:
         w = s.split()
         nw = []
         for j in range(len(w)):
             # Process case like "RELEASED: DEC 29, 201514-"
             if w[j].lower() == 'released':
                 if w[j+1].lower() in months or w[j+2].lower() in months:
Example #31
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
    'sunday'
}

MONTHS = {
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december', 'jan', 'feb', 'mar', 'apr',
    'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec'
}

NAMES = set([
    name for filename in ('male.txt', 'female.txt')
    for name in names.words(filename)
])

USCITIES = set([city for city in gazetteers.words('uscities.txt')])

# [XX] contains some non-ascii chars
COUNTRIES = set([
    country for filename in ('isocountries.txt', 'countries.txt')
    for country in gazetteers.words(filename)
])

# States in North America
NA_STATES = set([
    state for filename in ('usstates.txt', 'mexstates.txt', 'caprovinces.txt')
    for state in gazetteers.words(filename)
])

NATIONALITIES = set(
    [nationality for nationality in gazetteers.words('nationalities.txt')])
Example #32
# Run specific parsers according to each type of information we want to extract?

# ## Locations

# In[94]:

### Locations
#https://www.geeksforgeeks.org/nlp-location-tags-extraction/
import numpy as np
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree
from nltk.corpus import gazetteers

#sent_pos = nltk.pos_tag(data_lower)
words_tagged = t2.tag(filtered_sent_2)
place_lower = [w.lower() for w in gazetteers.words()]

loc_tag = words_tagged

cnt = 0
for cnt in np.arange(1, len(words_tagged) - 1):
    if words_tagged[cnt][0] in place_lower:
        if words_tagged[cnt][1] == 'NN':
            print(words_tagged[cnt][0])
            loc_tag[cnt] = (words_tagged[cnt][0], 'LOCATION')

    link_place = words_tagged[cnt][0] + ' ' + words_tagged[cnt + 1][0]
    if link_place in place_lower:
        if words_tagged[cnt][1] in ['JJ', 'NN'
                                    ] and words_tagged[cnt + 1][1] == 'NN':
            print(link_place)
    def get_features(self, index, sentence, postags, chunktags):
        word = sentence[index]
        idxf, idxl = 0, len(sentence) - 1
        prevword = '' if index == idxf else sentence[index - 1]
        nextword = '' if index == idxl else sentence[index + 1]

        return {
            'word':
            word,
            'prev_word':
            prevword,
            'next_word':
            nextword,
            'word_len':
            len(word),
            'prev_word_len':
            len(prevword),
            'next_word_len':
            len(nextword),
            'prefix-1':
            word[0].lower(),
            'prefix-2':
            word[:2].lower(),
            'prefix-3':
            word[:3].lower(),
            'prefix-4':
            word[:4].lower(),
            'suffix-1':
            word[-1].lower(),
            'suffix-2':
            word[-2:].lower(),
            'suffix-3':
            word[-3:].lower(),
            'suffix-4':
            word[-4:].lower(),
            'wordshape':
            hp.get_wordshape(word),
            'prev_wordshape':
            hp.get_wordshape(prevword),
            'next_wordshape':
            hp.get_wordshape(nextword),
            'shortwordshape':
            hp.get_shortwordshape(word),
            'prev_shortwordshape':
            hp.get_shortwordshape(prevword),
            'next_shortwordshape':
            hp.get_shortwordshape(nextword),
            'postag':
            postags[index],
            'prev_postag':
            '' if index == idxf else postags[index - 1],
            'next_postag':
            '' if index == idxl else postags[index + 1],
            'chunktag':
            chunktags[index],
            'prev_chunktag':
            '' if index == idxf else chunktags[index - 1],
            'next_chunktag':
            '' if index == idxl else chunktags[index + 1],
            'isupper':
            word.isupper(),
            'prev_isupper':
            '' if index == idxf else prevword.isupper(),
            'next_isupper':
            '' if index == idxl else nextword.isupper(),
            'islower':
            word.islower(),
            'prev_islower':
            '' if index == idxf else prevword.islower(),
            'next_islower':
            '' if index == idxl else nextword.islower(),
            'istitle':
            word.istitle(),
            'prev_istitle':
            '' if index == idxf else prevword.istitle(),
            'next_istitle':
            '' if index == idxl else nextword.istitle(),
            'has_hyphen':
            '-' in word,
            'has_period':
            '.' in word,
            'has_comma':
            ',' in word,
            'allsymbol':
            hp.get_allsymbol(word),
            'allnumber':
            hp.get_allnumber(word),
            'allcharacter':
            hp.get_allcharacter(word),
            'isalnum':
            word.isalnum(),
            'hasnumber':
            hp.get_hasnumber(word),
            'hascharacter':
            hp.get_hascharacter(word),
            'hassymbol':
            hp.get_hassymbol(word),
            'isgazetteer':
            word in gazetteers.words(),
            'prev_isgazetteer':
            prevword in gazetteers.words(),
            'next_isgazetteer':
            nextword in gazetteers.words(),
            'isstopword':
            word.lower() in stopwords.words('english'),
            'prev_isstopword':
            prevword.lower() in stopwords.words('english'),
            'next_isstopword':
            nextword.lower() in stopwords.words('english'),
            'porterstemmer':
            PorterStemmer().stem(word),
            'prev_porterstemmer':
            '' if index == idxf else PorterStemmer().stem(prevword),
            'next_porterstemmer':
            '' if index == idxl else PorterStemmer().stem(nextword),
            'lemmatize':
            WordNetLemmatizer().lemmatize(word),
            'prev_lemmatize':
            '' if index == idxf else WordNetLemmatizer().lemmatize(prevword),
            'next_lemmatize':
            '' if index == idxl else WordNetLemmatizer().lemmatize(nextword)
        }
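The membership tests above ('isgazetteer', 'isstopword') call gazetteers.words() and stopwords.words('english') once per token, re-reading and scanning the corpora each time. A hedged refactoring sketch, using module-level names of my own choosing:

# Build the lookup sets once at import time (hypothetical names).
GAZETTEER_SET = set(gazetteers.words())
STOPWORD_SET = set(stopwords.words('english'))

# ...then inside get_features the corresponding entries become:
#     'isgazetteer': word in GAZETTEER_SET,
#     'isstopword': word.lower() in STOPWORD_SET,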
"certain stories" — and the first word rhymes with something found in those stories. What city is it?
"""

import sys
sys.path.append('..')
from nprcommontools import alpha_only, get_category_members
import rhyme
import json
from nltk.corpus import gazetteers
#%%
with open('../plurals.json','rb') as fid:
    plurals = json.load(fid)

#%%
# U.S. cities from gazetteers
US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt') if city.count(' ') == 1])
# cheating
US_CITIES.add('coral gables')

#%%
# Words that mean "kind of story"
stories = get_category_members('story')
story_plurals = set()
for x in stories:
    try:
        for y in plurals[x]:
            story_plurals.add(y)
    except KeyError:
        pass
    
for city in US_CITIES:
Example #35
ORDINALS = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 
            'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']

DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 
        'friday', 'saturday', 'sunday']

MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
          'august', 'september', 'october', 'november', 'december',
          'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
          'oct', 'nov', 'dec']

NAMES = set([name.lower() for filename in ('male.txt', 'female.txt') for name
             in names.words(filename)])

US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt')])

# [XX] contains some non-ascii chars
COUNTRIES = set([country.lower() for filename in ('isocountries.txt','countries.txt')
                 for country in gazetteers.words(filename)])

# States in North America
NA_STATES = set([state.lower() for filename in
    ('usstates.txt','mexstates.txt','caprovinces.txt') for state in
    gazetteers.words(filename)])
                     
US_STATE_ABBREVIATIONS = set([state.lower() for state in 
    gazetteers.words('usstateabbrev.txt')])

NATIONALITIES = set([nat.lower() for nat in 
    gazetteers.words('nationalities.txt')])
Example #36
import re
from nltk.corpus import gazetteers


USCITIES = set(gazetteers.words('uscities.txt'))
COUNTRIES = set([country for filename in ('isocountries.txt','countries.txt')
                 for country in gazetteers.words(filename)])

US_STATES = set([state.lower() for filename in
                 ('usstates.txt','usstateabbrev.txt') for state in
                 gazetteers.words(filename)])



#print(USCITIES)
print(US_STATES)
#print(COUNTRIES)
##########################

basedir = "stanford-full-pipeline"


all_stanford = LazyDict(basedir, stanford_general_opener)
RAW_SENTENCES = SuperLazyDict(all_stanford, stanford_raw_reader)
POS_SENTENCES = SuperLazyDict(all_stanford, stanford_pos_reader)
SYNTAX_PARSE_SENTENCES = SuperLazyDict(all_stanford, stanford_tree_reader)
NONPARENTED_SENTENCES = SuperLazyDict(all_stanford, stanford_nonparented_tree_reader)
COREF = SuperLazyDict(all_stanford, stanford_coref_reader)
PRONOUN_SET = set(pronoun_reader())
entity_types=gather_entities()
AUGMENTED_TREES=augmented_tree_reader()
RELATIONSHIPS_AND_GROUPS=set(rels_and_groups_reader())
COUNTRIES=set(gz.words('countries.txt'))
NATIONALITIES=set(gz.words('nationalities.txt'))
OFFICIALS=officials_reader() #these are a bit silly; will probably discard
DEPENDENCIES=stanford_dependency_reader()
POSSESSIVE_PRONOUNS=['my','mine','your','yours','her','hers','his','our','ours','their','theirs']
TITLE_SET= {"chairman", "Chairman", "director", "Director", "president", "President", "manager", "managers","Manager", "executive",
            "CEO", "Officer", "officer", "consultant", "CFO", "COO", "CTO", "CMO", "founder", "shareholder",
            "researcher", "professor", "principal", "Principal", "minister", "Minister", "prime", "Prime", "chief",
            "Chief", "prosecutor", "Prosecutor", "queen", "Queen", "leader", "Leader", "secretary", "Secretary",
            "ex-Leader", "ex-leader", "coach", "Coach", "composer", "Composer", "head", "Head", "governor", "Governor",
            "judge", "Judge", "democrat", "Democrat", "republican", "Republican", "senator", "Senator", "congressman",
            "Congressman", "congresswoman", "Congresswoman", "analyst", "Analyst", "sen", "Sen", "Rep", "rep", "MP",
            "mp", "justice", "Justice", "co-chairwoman", "co-chair", "co-chairman", "Mr.", "mr.", "Mr", "mr", "Ms.",
            "ms.", "Mrs.", "mrs.","secretary-general","Secretary-General","doctor","Doctor"}

#obtained from WordNet by getting hypernyms of hypernyms of hypernyms of 'professional.n.01'
Example #38
ORDINALS = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 
            'seventh', 'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']

DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 
        'friday', 'saturday', 'sunday']

MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
          'august', 'september', 'october', 'november', 'december',
          'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
          'oct', 'nov', 'dec']

                     
NAMES = set([name.lower() for filename in ('male.txt', 'female.txt') for name
             in names.words(filename)])

USCITIES = set(gazetteers.words('uscities.txt'))

# [XX] contains some non-ascii chars
COUNTRIES = set([country for filename in ('isocountries.txt','countries.txt')
                 for country in gazetteers.words(filename)])

# States in North America
NA_STATES = set([state.lower() for filename in
                 ('usstates.txt','mexstates.txt','caprovinces.txt') for state in
                 gazetteers.words(filename)])
                     
US_STATE_ABBREVIATIONS = set(gazetteers.words('usstateabbrev.txt'))

NATIONALITIES = set(gazetteers.words('nationalities.txt'))
                     
PERSON_PREFIXES = ['mr', 'mrs', 'ms', 'miss', 'dr', 'rev', 'judge',