def __init__(self):
    self.stopwords = stopwords.words('english')
    self.uscities = set([w.lower() for w in gazetteers.words('uscities.txt')])
    self.usstates = set([w.lower() for w in gazetteers.words('usstates.txt')])
    self.countries = set([w.lower() for w in gazetteers.words('countries.txt')])
    self.basicwords = set(words.words('en-basic'))
    self.paragraph_tokens = []
    self.texts = []
def __init__(self, corpus, outfile, tokens_dir, parses_dir, depparses_dir, train=False):
    self.relations = list()
    self.train = train
    self.corpus = corpus
    self.outfile = outfile
    self.tokenized_sents, self.tok_sents_pos = self.process_tokens_dir(tokens_dir)
    self.parses = self.process_parses_dir(parses_dir)
    self.depparses = self.process_dparses_dir(depparses_dir)
    self.clusterdict = self.make_cluster_dict('50mpaths2')
    self.pronouns = ["I", "me", "my", "mine", "myself",
                     "you", "your", "yours", "yourself",
                     "he", "him", "his", "his", "himself",
                     "she", "her", "hers", "herself",
                     "it", "its", "itself",
                     "we", "us", "our", "ours", "ourselves",
                     "you", "your", "yours", "yourselves",
                     "they", "them", "their", "theirs", "themselves"]
    self.locations = set([c.lower() for c in gazetteers.words('countries.txt')] +
                         [s.lower() for s in gazetteers.words('usstates.txt')])
    self.names = set([name.lower() for name in names.words('male.txt')] +
                     [name.lower() for name in names.words('female.txt')])
    self.feat_fns = [self.words,                 # good
                     self.word_types,            # good
                     self.pronoun,               # good
                     self.name,                  # good
                     # self.place,               # look to get a better list
                     self.num_words_between,     # good
                     self.words_between_words,   # good
                     self.prev_word,             # good
                     # self.post_word,           # really bad feature
                     # self.prev_word_pos,       # bad
                     self.post_word_pos,         # good
                     self.first_word_after_w1,   # good
                     self.words_between_POSs,    # good
                     # self.last_word_before_w2,
                     self.w1clust,               # good
                     self.w2clust,               # good
                     self.tree_path,
                     # self.w1pref,              # bad
                     # self.w1suf,
                     # self.w2pref,
                     # self.w2suf,
                     # self.w1bow,
                     # self.w2bow,
                     self.et1dw1,
                     self.et2dw2,
                     self.h1dw1,
                     self.h2dw2]
def __init__(self):
    self.locations = set(gazetteers.words())
    self.lookahead = 0
    for loc in self.locations:
        nwords = loc.count(' ')
        if nwords > self.lookahead:
            self.lookahead = nwords
def __init__(self):
    self.gazetteers = [x.lower() for x in gazetteers.words()]
    self.stopwords = [x.lower() for x in stopwords.words('english')]
    self.rx_space = r'\s+'
    # Hyphen and TLD dot escaped: the original '[.-0-9a-zA-Z]' defined an
    # accidental '.'-to-'0' range, and the unescaped '.' matched any char.
    self.rx_email = r'[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][.\-0-9a-zA-Z]*\.[a-zA-Z]+'
    self.rx_url = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    self.rx_hashtag = r'#(\w+)'
    self.rx_mention = r'@(\w+)'
    self.rx_empty = "empty"
def __init__(self):
    # gazetteers is a WordListCorpusReader of many different location words
    self.locations = set(gazetteers.words())
    # need to know how many words to look ahead in the tagged sentence
    # to find a multi-word location
    self.lookahead = 0
    for loc in self.locations:
        nwords = loc.count(' ')
        if nwords > self.lookahead:
            self.lookahead = nwords
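For illustration, a hypothetical sketch of how the computed lookahead could drive multi-word matching; find_location and its arguments are assumptions, not from the source:

def find_location(self, tokens, i):
    # Try the longest candidate span first (lookahead counts the extra
    # words in the longest gazetteer entry), then shrink the window.
    for n in range(self.lookahead, -1, -1):
        candidate = ' '.join(tokens[i:i + n + 1])
        if candidate in self.locations:
            return candidate
    return None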
def __init__(self):
    self.train_path = "../data/train"
    self.dev_path = "../data/dev"
    self.beta = 0
    self.max_iter = 0
    # self.classifier = None
    self.dict_classifiers = {}
    self.locations = set(gazetteers.words())
    self.names = set(names.words())
    self.pos = None
    self.previous_labels = None
def __init__(self, lm, confSet, myInfltSet):
    """Initializes the language model."""
    self.languageModel = lm
    self.languageModelSQL = LM("web1t")
    self.confusionSet = confSet
    self.inflectionSet = myInfltSet
    self.dict = enchant.Dict("en")
    # Mark known personal names with 1 and gazetteer places with 2.
    self.myDict = {}
    for word in names.words():
        self.myDict[word] = 1
    for word in gazetteers.words():
        self.myDict[word] = 2
def __init__(self):
    self.train_path = "../data/train"
    self.dev_path = "../data/dev"
    self.beta = 0
    self.max_iter = 0
    self.classifier = None
    self.nltk_names = set(names.words())
    self.nltk_stopwords = set(stopwords.words())
    self.titles = [
        'Master', 'Mr.', 'Mr', 'Miss.', 'Miss', 'Mrs.', 'Mrs', 'Ms.', 'Ms',
        'Mx.', 'Mx', 'Sir', 'Gentleman', 'Sire', 'Mistress', 'Madam', 'Dame',
        'Lord', 'Lady', 'Esq', 'Excellency', 'Dr', 'Professor', 'QC', 'Cl',
        'SCl', 'Eur Lng', 'Chancellor', 'Vice-Chancellor', 'Principal',
        'President', 'Minister', 'Warden', 'Dean', 'Regent', 'Rector',
        'Provost', 'Director', 'Chief Executive', 'manager', 'chairman',
        'secretary', 'leader'
    ]
    self.say = ['say', 'said', 'says']
    # 'speak', 'spoke', 'speaks',
    # 'talk', 'told', 'talks',
    # 'discuss', 'discusses', 'discussed',
    # 'mention', 'mentioned', 'mentions']
    self.gazetteers = set(gazetteers.words())
##########################
basedir = "stanford-full-pipeline"
all_stanford = LazyDict(basedir, stanford_general_opener)
RAW_SENTENCES = SuperLazyDict(all_stanford, stanford_raw_reader)
POS_SENTENCES = SuperLazyDict(all_stanford, stanford_pos_reader)
SYNTAX_PARSE_SENTENCES = SuperLazyDict(all_stanford, stanford_tree_reader)
NONPARENTED_SENTENCES = SuperLazyDict(all_stanford, stanford_nonparented_tree_reader)
COREF = SuperLazyDict(all_stanford, stanford_coref_reader)
PRONOUN_SET = set(pronoun_reader())
entity_types = gather_entities()
AUGMENTED_TREES = augmented_tree_reader()
RELATIONSHIPS_AND_GROUPS = set(rels_and_groups_reader())
COUNTRIES = set(gz.words('countries.txt'))
NATIONALITIES = set(gz.words('nationalities.txt'))
OFFICIALS = officials_reader()  # these are a bit silly; will probably discard
DEPENDENCIES = stanford_dependency_reader()
POSSESSIVE_PRONOUNS = [
    'my', 'mine', 'your', 'yours', 'her', 'hers', 'his',
    'our', 'ours', 'their', 'theirs'
]
TITLE_SET = {
    "chairman", "Chairman", "director", "Director", "president", "President",
    "manager", "managers", "Manager", "executive", "CEO", "Officer", "officer",
    "consultant", "CFO", "COO", "CTO", "CMO", "founder", "shareholder",
    "researcher", "professor", "principal", "Principal", "minister", "Minister",
    "prime", "Prime", "chief", "Chief", "prosecutor", "Prosecutor",
    "queen", "Queen", "leader", "Leader", "secretary", "Secretary",
    "ex-Leader", "ex-leader", "coach", "Coach", "composer", "Composer",
    "head",
ORDINALS = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
            'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']
DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
          'august', 'september', 'october', 'november', 'december',
          'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
          'oct', 'nov', 'dec']
NAMES = set([name.lower() for filename in ('male.txt', 'female.txt')
             for name in names.words(filename)])
USCITIES = set(gazetteers.words('uscities.txt'))  # [XX] contains some non-ascii chars
COUNTRIES = set([country for filename in ('isocountries.txt', 'countries.txt')
                 for country in gazetteers.words(filename)])
# States in North America
NA_STATES = set([state.lower() for filename in ('usstates.txt', 'mexstates.txt', 'caprovinces.txt')
                 for state in gazetteers.words(filename)])
US_STATE_ABBREVIATIONS = set(gazetteers.words('usstateabbrev.txt'))
NATIONALITIES = set(gazetteers.words('nationalities.txt'))
PERSON_PREFIXES = ['mr', 'mrs', 'ms', 'miss', 'dr', 'rev', 'judge',
NAMES = set([
    name for filename in ('male.txt', 'female.txt')
    for name in names.words(filename)
])
PERSON_PREFIXES = [
    'mr', 'mrs', 'ms', 'miss', 'dr', 'rev', 'judge', 'justice', 'honorable',
    'hon', 'rep', 'sen', 'sec', 'minister', 'chairman', 'succeeding', 'says',
    'president'
]
PERSON_SUFFIXES = ['sr', 'jr', 'phd', 'md']
ORG_SUFFIXES = [
    'ltd', 'inc', 'co', 'corp', 'plc', 'llc', 'llp', 'gmbh', 'corporation',
    'associates', 'partners', 'committee', 'institute', 'commission',
    'university', 'college', 'airlines', 'magazine'
]
COUNTRIES = set([
    country for filename in ('isocountries.txt', 'countries.txt')
    for country in gazetteers.words(filename)
])
lancaster_stemmer = LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
tknzr = TweetTokenizer(preserve_case=True, strip_handles=False, reduce_len=False)
# train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
# test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

def get_tuples(dspath):
    sentences = []
    s = ''
def __init__(self, corpus):
    self.places = set(gazetteers.words())
    self.people = set(names.words())
    self.stop_words = self.load_stop_words()
    self.corpus = corpus
def firstPassGrouping():
    words = []
    stemmed = []
    features = {}
    tokenizer = RegexpTokenizer('\s+', gaps=True)
    clean = re.compile("[()\/']")
    split = re.compile("[/]")
    grams = []
    with open('data/features.txt', 'r') as featureIn:
        for line in map(cleanFeatures, featureIn):
            ws = []
            for w in tokenizer.tokenize(clean.sub(' ', line[1])):
                if w not in engStop:
                    stemmed.append((eng.stem(w).lower(), line[1]))
                    words.append((w.lower(), line[1]))
                    ws.append(w.lower())
            grams.append((list(everygrams(ws, min_len=2, max_len=2)), line[1]))
            features[line[0]] = line[1]

    # cuisine, style, price, atmosphere, and occasion
    noGrams = set(map(lambda x: x[1], filter(lambda x: len(x[0]) == 0, grams)))
    grams = list(filter(lambda x: len(x[0]) > 0, grams))
    groupedw = seq(grams) \
        .flat_map(lambda x: set([(w, x[1]) for w in seq(x[0]).flat_map(lambda y: list(y)).to_list()])) \
        .group_by(lambda w: w[0]) \
        .map(lambda x: (x[0], list(map(lambda y: y[1], x[1])))) \
        .to_dict()
    noGramsId = {}
    for g in noGrams:
        noGramsId[g] = g
    simGrouped = {}
    simular = set()
    for k, v in sorted(groupedw.items(), key=lambda x: x[0]):
        # print(k, v)
        nl = v.copy()
        match = noGramsId.get(k, None)
        for nk in noGramsId.keys():
            if len(nk) > 1:
                if nk in v:
                    nl.append(nk)
                    simular.add(nk)
                for vv in v:
                    if nk in vv:
                        nl.append(nk)
                        simular.add(nk)
        if match is not None:
            nl.append(match)
            simGrouped[k] = list(set(nl))
            simular.add(match)
        else:
            if len(k) > 1:
                simGrouped[k] = v
    noSim = noGrams - simular

    # `nationalities = gazetteers.words()` was commented out in the original;
    # the gazetteer nationalities list is assumed to be what was intended.
    nationalities = set(w.lower() for w in gazetteers.words('nationalities.txt'))
    featureNationality = []
    for nosim in noSim:
        didConvert = convert(nosim)
        if didConvert is not None:
            if didConvert in nationalities:
                featureNationality.append(nosim)
        else:
            if nosim in nationalities:
                featureNationality.append(nosim)
            else:
                split = nosim.split('-')
                for sp in split:
                    if sp in nationalities:
                        featureNationality.append(nosim)
    # print("-----------------")
    noSim = noSim - set(featureNationality)
    # occasions = ['monday']
    # cuisine, style, price, atmosphere, and occasion
    for k, v in sorted(simGrouped.items(), key=lambda x: x[0]):
        # print(k, v)
        if k in nationalities:
            featureNationality.append(k)
            featureNationality.extend(v)
            simGrouped.pop(k)
            continue  # already popped; skip the second pop below to avoid a KeyError
        didConvert = convert(k)
        if didConvert is not None:
            if didConvert in nationalities:
                simGrouped.pop(k)
                featureNationality.append(k)
                featureNationality.extend(v)
    with open('q1/noSim.json', 'w+') as nsOut:
        nsOut.write(json.dumps(list(noSim), indent=2, sort_keys=True))
    with open('q1/featureNationality.json', 'w+') as nsOut:
        nsOut.write(json.dumps(featureNationality, indent=2, sort_keys=True))
    with open('q1/grouped.json', 'w+') as nsOut:
        nsOut.write(json.dumps(simGrouped, indent=2, sort_keys=True))
#!/usr/bin/python
'''
NPR 2017-11-12
https://www.npr.org/2017/11/12/563367879/sunday-puzzle-move-around-to-find-new-meaning

Take the name of a U.S. state capital. Immediately to the right of it write
the name of a world capital. If you have the right ones, the name of a U.S.
state will be embedded in consecutive letters within that letter string.
What three places are these?
'''
from nltk.corpus import wordnet as wn, gazetteers

#%%
# US states
US_STATES = frozenset(gazetteers.words('usstates.txt'))
US_STATES_LOWER = frozenset(x.lower().replace(' ', '') for x in US_STATES)
# Countries
COUNTRIES = frozenset(gazetteers.words('countries.txt'))

# State and world capitals
state_capitals = set()
world_capitals = set()
for s in wn.all_synsets():
    d = s.definition()
    if 'capital' in d:
        for state in US_STATES:
            if state in d:
                for l in s.lemma_names():
                    if l[0] == l[0].upper() and 'capital' not in l:
                        state_capitals.add(l.lower())
        for country in COUNTRIES:
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import *
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords, gazetteers, names
from sklearn.feature_selection import *

eng_words = set([w.lower() for w in words.words('en')])
qn_words = set(['who', 'what', 'when', 'where', 'how',
                'is', 'should', 'do', 'if', 'would'])
stopwords = [w for w in stopwords.words('english') if w not in qn_words]
places = set([w.lower() for w in gazetteers.words()])
names = set([w.lower() for w in names.words()])

class Extractor:
    def __init__(self, fun):
        self.extractor = fun

    def fit(self, X, Y):
        pass

    def transform(self, X):
        return [self.extractor(x) for x in X]

    def fit_transform(self, X, _):
        return self.transform(X)

class ToArray:
    def __init__(self):
#%% """ NPR 2019-01-06 https://www.npr.org/2019/01/06/682575357/sunday-puzzle-stuck-in-the-middle Name a major U.S. city in 10 letters. If you have the right one, you can rearrange its letters to get two 5-letter words that are synonyms. What are they? """ import sys sys.path.append('..') import nprcommontools as nct from nltk.corpus import gazetteers #%% COMMON_WORDS = frozenset(x for x in nct.get_common_words() if len(x) == 5) #%% US_CITIES = set(nct.alpha_only(x.lower()) for x in gazetteers.words('uscities.txt') if len(nct.alpha_only(x)) == 10) city_dict = nct.make_sorted_dict(US_CITIES) #%% for c1 in COMMON_WORDS: my_synonyms = nct.get_synonyms(c1) for c2 in my_synonyms: sort_word = nct.sort_string(''.join(c1+c2)) if sort_word in city_dict: print(c1,c2,city_dict[sort_word])
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from nltk.corpus import gazetteers, names
import brown_driver
import math
import json

locations = gazetteers.words()
proper_names = names.words()

class Tagger:
    def __init__(self):
        self.brown_clusters = brown_driver.cluster_driver()
        self.brown_clusters.init_clusters('paths_100')
        self.import_wiki_data('wiki_outfile.json')

    def import_wiki_data(self, wiki_import):
        wiki_data = open(wiki_import, 'r')
        self.wiki_data = json.load(wiki_data)

    def read_in_data(self, file_name):
        sents = []
        infile = open(file_name)
        for line in infile.readlines():
            pieces = line.split()
            if len(pieces) == 0:
                continue
            data = tuple(pieces[1:])
            if pieces[0] == '0':
                # New sentence
def loadData(files, path):
    data = []
    for f in files:
        data.append(gazetteers.words(path + '/' + f))
    return data
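For reference, a minimal sketch of the underlying NLTK call that loadData wraps; 'countries.txt' is one of the standard gazetteer fileids used throughout these examples:

from nltk.corpus import gazetteers
countries = gazetteers.words('countries.txt')  # one place name per entry
print(len(countries), countries[:3])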
#!/usr/bin/python
'''
NPR 2017-04-09

Name a well-known U.S. city in two words. Replace each of these words with a
word that rhymes with it, and you'll name a large sea creature in two words.
What is it?
'''
import sys
sys.path.append('..')
from nprcommontools import get_category_members
import rhyme
from nltk.corpus import gazetteers

#%%
ANIMALS = frozenset([x for x in get_category_members('animal') if x.count('_') == 1])
USCITIES = set([x.lower() for x in gazetteers.words('uscities.txt') if x.count(' ') == 1])
# Cheating, but honestly, why wasn't this in there?
USCITIES.add('santa fe')
#%%
for city in USCITIES:
    c1, c2 = city.split(' ')
    c1_rhymes = rhyme.all_rhymes(c1)
    c2_rhymes = rhyme.all_rhymes(c2)
    for a1 in c1_rhymes:
        for a2 in c2_rhymes:
            if a1 + '_' + a2 in ANIMALS:
                print(city, a1, a2)
'''
NPR 2018-07-08
https://www.npr.org/2018/07/08/626992499/sunday-puzzle-hot-hot-hot

Name part of the human body. Switch the first two letters to get a two-word
phrase for something that is worrisome. What is it?
'''
import sys
sys.path.append('..')
import nprcommontools as nct
from nltk.corpus import gazetteers

#%%
US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt')])
US_STATE_ABBREVIATIONS = set([state.lower() for state in gazetteers.words('usstateabbrev.txt')])
US_STATES = set([state.lower() for state in gazetteers.words('usstates.txt')])
#%%
for city in US_CITIES:
    city2 = nct.alpha_only(city)
    if len(city2) % 2 == 0:
        continue
    good_flag = True
    while len(city2) > 1:
        abbrev, city2 = city2[:2], city2[2:]
        if abbrev not in US_STATE_ABBREVIATIONS:
            good_flag = False
            break
def premod_countries(self, mention):
    for word in self.premod(mention):
        if word in gazetteers.words('countries.txt'):
            return True
    return False
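Since the membership test above scans the whole corpus word list on every call, a cached-set variant could look like this; COUNTRY_SET is a hypothetical module-level constant, not from the source:

COUNTRY_SET = set(gazetteers.words('countries.txt'))

def premod_countries(self, mention):
    # Same membership test, but against a set built once at import time.
    return any(word in COUNTRY_SET for word in self.premod(mention))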
'''
Name a state capital. Drop one of its letters. The remaining letters can be
rearranged to name another major city in the United States. What is it?
There are two different answers, and you should find both of them.
'''
import sys
sys.path.append('..')
from nprcommontools import sort_string
from nltk.corpus import wordnet as wn, gazetteers
import re

# U.S. states
states = set(gazetteers.words('usstates.txt'))

# capitals and major cities
cities = set()
capitals = set()
for synset in wn.all_synsets():
    d = synset.definition()
    for state in states:
        if state in d and 'city' in d:
            for l in synset.lemma_names():
                if l[0] == l[0].upper():
                    cities.add(l)
        if state in d and 'capital' in d:
            for l in synset.lemma_names():
                if l[0] == l[0].upper():
                    capitals.add(l)
"feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec", ] NAMES = set([name.lower() for filename in ("male.txt", "female.txt") for name in names.words(filename)]) US_CITIES = set([city.lower() for city in gazetteers.words("uscities.txt")]) # [XX] contains some non-ascii chars COUNTRIES = set( [country.lower() for filename in ("isocountries.txt", "countries.txt") for country in gazetteers.words(filename)] ) # States in North America NA_STATES = set( [ state.lower() for filename in ("usstates.txt", "mexstates.txt", "caprovinces.txt") for state in gazetteers.words(filename) ] )
#!/usr/bin/python
'''
NPR 2017-07-16
http://www.npr.org/2017/07/16/537225382/sunday-puzzle-wehn-wrods-get-rearearngd

Name a U.S. city and its state — 12 letters altogether. Change two letters in
the state's name. The result will be the two-word title of a classic novel.
What is it?
'''
from nltk.corpus import gazetteers
import re

us_states = frozenset(gazetteers.words('usstates.txt'))

#%%
# via http://norvig.com/spell-correct.html (replacements only)
def edits1(word):
    "All single-letter replacements that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    return set(replaces)

def edits2(word):
    "All replacements that are two edits away from `word`."
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1))

#%%
# Need the ranked Wikipedia entries from http://crosswordnexus.com/wiki
# Read in city, state combinations and also anything else that is 12 letters long
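As a quick sanity check of the replace-only edits1 above (an assumed interactive session, not from the source):

>>> len(edits1('io'))  # 2 positions x 26 letters = 52, minus the duplicate 'io'
51
>>> 'io' in edits1('io')  # replacing a letter with itself keeps the word
True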
from util import edict, pdict, normalize_title, load_stoplist
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import gazetteers, names
from collections import Counter
from fever_io import titles_to_jsonl_num, load_split_trainset, titles_to_tf, load_doc_tf
import pickle
from tqdm import tqdm
import numpy as np

places = set(gazetteers.words())
people = set(names.words())
stop = load_stoplist()

def title_edict(t2jnum={}):
    edocs = edict()
    for title in t2jnum:
        l_txt = normalize_title(title)
        if len(l_txt) > 0:
            if edocs[l_txt][0] is None:
                edocs[l_txt] = []
            edocs[l_txt][0].append(title)
    return edocs

def find_titles_in_claim(claim="", edocs=edict()):
    find = pdict(edocs)
    docset = {}
    ctoks = word_tokenize(claim)
    for word in ctoks:
        for dlist, phrase, start in find[word]:
'''
NPR Puzzle 2016-06-26
http://www.npr.org/2016/06/26/483521883/welcome-to-an-affair-of-phrases-each-entwined-by-a-tiny-of

Think of two well-known American cities, each five letters long. The first
two letters of the first city are the state postal abbreviation of the
second city. And the first two letters of the second city are the state
postal abbreviation of the first city. What two cities are these?
'''
from nltk.corpus import gazetteers

# Get list of abbreviations from gazetteers
state_abbrs = frozenset(abbr for abbr in gazetteers.words('usstateabbrev.txt') if len(abbr) == 2)
# Get list of cities from gazetteers
cities = frozenset(city for city in gazetteers.words('uscities.txt') if len(city) == 5)

for city in cities:
    if city.upper()[:2] in state_abbrs:
        print(city)
#!/usr/bin/python
'''
NPR 2018-04-08
http://www.npr.org/puzzle

Name part of the human body, insert a speech hesitation, and you'll name a
country — what is it?
'''
from nltk.corpus import gazetteers
import nprcommontools as nct

#%%
BODY_PARTS = nct.get_category_members('body_part')
# Countries
COUNTRIES = frozenset([x.lower() for x in gazetteers.words('countries.txt')])
#%%
for c in COUNTRIES:
    for b in BODY_PARTS:
        if c.startswith(b[0]) and c.endswith(b[-1]):
            for i in range(1, len(b) - 1):
                if c.startswith(b[:i]) and c.endswith(b[i:]):
                    print(b, c)
    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i > 0:
        c, k = best_match(i)
        assert c == cost[i]
        out.append(s[i - k:i])
        i -= k
    return " ".join(reversed(out))

#%%
# Country names
countries = set([country.lower() for filename in ('isocountries.txt', 'countries.txt')
                 for country in gazetteers.words(filename)])
#%%
# Words associated with Henry Ford
ford_words = set(_ for _ in get_synonyms('car') if '_' not in _)
ford_words.add('car')
good_countries = frozenset([c for w in ford_words for c in countries if w in c])

#%%
def sentence_score(s):
    """Score a sentence based on how common its words are."""
    score = 0
    for w in s.split(' '):
        if w not in stop_words:
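For context on the backtracking step at the top of the previous snippet, here is a sketch of the tables it relies on, following the classic minimal-cost word segmentation; wordcost (a word-to-cost mapping, e.g. negative log frequency) and maxword (length of the longest known word) are assumptions, not from the source:

def build_segmentation_tables(s, wordcost, maxword):
    cost = [0]
    def best_match(i):
        # cheapest way to end a word at position i, looking back at most maxword chars
        candidates = enumerate(reversed(cost[max(0, i - maxword):i]))
        return min((c + wordcost.get(s[i - k - 1:i], 9e999), k + 1)
                   for k, c in candidates)
    for i in range(1, len(s) + 1):
        c, k = best_match(i)
        cost.append(c)
    return cost, best_match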
# from ABBYY import CloudOCR
import os  # needed for os.chdir/os.listdir below
import re
from collections import defaultdict  # replaces the legacy nltk.defaultdict
from shutil import move
from nltk.corpus import gazetteers

scrapingFileNames = r'C:\scraping\Isaac\toBeScraped\namesOfScrapingFiles\\'
scrapingDir = r'C:\scraping\Isaac\toBeScraped\csvTxts\\'
os.chdir(scrapingDir)
scrapingLogs = r'C:\scraping\Isaac\logs'

for file in os.listdir(scrapingFileNames):
    filename = file[:-4]
    fileDirectory = filename
    placelist = gazetteers.words('countries.txt')
    currencyList = gazetteers.words('currencyList.txt')
    filename = filename + '.txt'  # name of project
    # print(filename)
    f = open(filename, 'r').read()
    projectCandidates = re.findall('(?:[A-Z][\w-]*\s)+Project', f)
    ProjectName = ''
    projectDict = defaultdict(int)
    for project in projectCandidates:
        if project == 'The Project' or project == 'Mineral Project':
            continue
        else:
            projectDict[project] += 1
            if len(ProjectName) == 0:
for i in range(len(label_class)):
    if label_class[i] == 'code share indicator':
        for w in s.lower().split():
            if w in ['no', 'not']:
                print(label_class[i] + ': ' + 'no')
    if label_class[i] in ['commission', 'infant commission']:
        for w in s.lower().split():
            if w in ['no', 'not']:
                print(label_class[i] + ': ' + 'no')
        match = re.search('(\d+%)', s)
        if match:
            pct = match.group(1)
            print(label_class[i] + ': ' + pct)
    if label_class[i] == 'sale restriction':
        for w in s.split():
            if w in gazetteers.words('countries.txt'):
                print(label_class[i] + ': ' + w)
                break
    if label_class[i] == 'tour code':
        for j in range(len(s.split())):
            if s.lower().split()[j] == 'code':
                w = s.split()[j + 1]
                if not enchant.Dict("en_US").check(w):
                    print(label_class[i] + ': ' + w)
    if label_class[i] in ['ticketing period', 'travelling period']:
        w = s.split()
        nw = []
        for j in range(len(w)):
            # Process case like "RELEASED: DEC 29, 201514-"
            if w[j].lower() == 'released':
                if w[j + 1].lower() in months or w[j + 2].lower() in months:
DAYS = {
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
    'sunday'
}
MONTHS = {
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december', 'jan', 'feb', 'mar',
    'apr', 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec'
}
NAMES = set([
    name for filename in ('male.txt', 'female.txt')
    for name in names.words(filename)
])
USCITIES = set([city for city in gazetteers.words('uscities.txt')])  # [XX] contains some non-ascii chars
COUNTRIES = set([
    country for filename in ('isocountries.txt', 'countries.txt')
    for country in gazetteers.words(filename)
])
# States in North America
NA_STATES = set([
    state for filename in ('usstates.txt', 'mexstates.txt', 'caprovinces.txt')
    for state in gazetteers.words(filename)
])
NATIONALITIES = set(
    [nationality for nationality in gazetteers.words('nationalities.txt')])
# Run specific parsers according to each type of information we want to extract?

## Locations
# https://www.geeksforgeeks.org/nlp-location-tags-extraction/
import numpy as np
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree
from nltk.corpus import gazetteers

# sent_pos = nltk.pos_tag(data_lower)
words_tagged = t2.tag(filtered_sent_2)
place_lower = [w.lower() for w in gazetteers.words()]
loc_tag = words_tagged  # note: alias, so the edits below also mutate words_tagged
cnt = 0
for cnt in np.arange(1, len(words_tagged) - 1):
    if words_tagged[cnt][0] in place_lower:
        if words_tagged[cnt][1] == 'NN':
            print(words_tagged[cnt][0])
            loc_tag[cnt] = (words_tagged[cnt][0], 'LOCATION')
    link_place = words_tagged[cnt][0] + ' ' + words_tagged[cnt + 1][0]
    if link_place in place_lower:
        if words_tagged[cnt][1] in ['JJ', 'NN'] and words_tagged[cnt + 1][1] == 'NN':
            print(link_place)
def get_features(self, index, sentence, postags, chunktags):
    word = sentence[index]
    idxf, idxl = 0, len(sentence) - 1
    prevword = '' if index == idxf else sentence[index - 1]
    nextword = '' if index == idxl else sentence[index + 1]
    # Build the expensive resources once per call instead of once per feature.
    gazetteer = set(gazetteers.words())
    eng_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return {
        'word': word,
        'prev_word': prevword,
        'next_word': nextword,
        'word_len': len(word),
        'prev_word_len': len(prevword),
        'next_word_len': len(nextword),
        'prefix-1': word[0].lower(),
        'prefix-2': word[:2].lower(),
        'prefix-3': word[:3].lower(),
        'prefix-4': word[:4].lower(),
        'suffix-1': word[-1].lower(),
        'suffix-2': word[-2:].lower(),
        'suffix-3': word[-3:].lower(),
        'suffix-4': word[-4:].lower(),
        'wordshape': hp.get_wordshape(word),
        'prev_wordshape': hp.get_wordshape(prevword),
        'next_wordshape': hp.get_wordshape(nextword),
        'shortwordshape': hp.get_shortwordshape(word),
        'prev_shortwordshape': hp.get_shortwordshape(prevword),
        'next_shortwordshape': hp.get_shortwordshape(nextword),
        'postag': postags[index],
        'prev_postag': '' if index == idxf else postags[index - 1],
        'next_postag': '' if index == idxl else postags[index + 1],
        'chunktag': chunktags[index],
        'prev_chunktag': '' if index == idxf else chunktags[index - 1],
        'next_chunktag': '' if index == idxl else chunktags[index + 1],
        'isupper': word.isupper(),
        'prev_isupper': '' if index == idxf else prevword.isupper(),
        'next_isupper': '' if index == idxl else nextword.isupper(),
        'islower': word.islower(),
        'prev_islower': '' if index == idxf else prevword.islower(),
        'next_islower': '' if index == idxl else nextword.islower(),
        'istitle': word.istitle(),
        'prev_istitle': '' if index == idxf else prevword.istitle(),
        'next_istitle': '' if index == idxl else nextword.istitle(),
        'has_hyphen': '-' in word,
        'has_period': '.' in word,
        'has_comma': ',' in word,
        'allsymbol': hp.get_allsymbol(word),
        'allnumber': hp.get_allnumber(word),
        'allcharacter': hp.get_allcharacter(word),
        'isalnum': word.isalnum(),
        'hasnumber': hp.get_hasnumber(word),
        'hascharacter': hp.get_hascharacter(word),
        'hassymbol': hp.get_hassymbol(word),
        'isgazetteer': word in gazetteer,
        'prev_isgazetteer': prevword in gazetteer,
        'next_isgazetteer': nextword in gazetteer,
        'isstopword': word.lower() in eng_stopwords,
        'prev_isstopword': prevword.lower() in eng_stopwords,
        'next_isstopword': nextword.lower() in eng_stopwords,
        'porterstemmer': stemmer.stem(word),
        'prev_porterstemmer': '' if index == idxf else stemmer.stem(prevword),
        'next_porterstemmer': '' if index == idxl else stemmer.stem(nextword),
        'lemmatize': lemmatizer.lemmatize(word),
        'prev_lemmatize': '' if index == idxf else lemmatizer.lemmatize(prevword),
        'next_lemmatize': '' if index == idxl else lemmatizer.lemmatize(nextword),
    }
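A hypothetical call, assuming parallel token/POS/chunk lists (the surrounding class instance and the hp helpers come from the source project):

sent = ['Boston', 'is', 'cold']
pos = ['NNP', 'VBZ', 'JJ']
chunks = ['B-NP', 'B-VP', 'B-ADJP']
feats = extractor.get_features(0, sent, pos, chunks)  # `extractor` is an assumed instance
print(feats['isgazetteer'], feats['prefix-2'])  # should print: True bo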
"certain stories" — and the first word rhymes with something found in those stories. What city is it? """ import sys sys.path.append('..') from nprcommontools import alpha_only, get_category_members import rhyme import json from nltk.corpus import gazetteers #%% with open('../plurals.json','rb') as fid: plurals = json.load(fid) #%% # U.S. cities from gazetteers US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt') if city.count(' ') == 1]) # cheating US_CITIES.add('coral gables') #%% # Words that mean "kind of story" stories = get_category_members('story') story_plurals = set() for x in stories: try: for y in plurals[x]: story_plurals.add(y) except KeyError: pass for city in US_CITIES:
ORDINALS = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh',
            'eighth', 'ninth', 'tenth', 'eleventh', 'twelfth']
DAYS = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june', 'july',
          'august', 'september', 'october', 'november', 'december',
          'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'sept',
          'oct', 'nov', 'dec']
NAMES = set([name.lower() for filename in ('male.txt', 'female.txt')
             for name in names.words(filename)])
US_CITIES = set([city.lower() for city in gazetteers.words('uscities.txt')])  # [XX] contains some non-ascii chars
COUNTRIES = set([country.lower() for filename in ('isocountries.txt', 'countries.txt')
                 for country in gazetteers.words(filename)])
# States in North America
NA_STATES = set([state.lower() for filename in ('usstates.txt', 'mexstates.txt', 'caprovinces.txt')
                 for state in gazetteers.words(filename)])
US_STATE_ABBREVIATIONS = set([state.lower() for state in gazetteers.words('usstateabbrev.txt')])
NATIONALITIES = set([nat.lower() for nat in gazetteers.words('nationalities.txt')])
import re
from nltk.corpus import gazetteers

USCITIES = set(gazetteers.words('uscities.txt'))
COUNTRIES = set([country for filename in ('isocountries.txt', 'countries.txt')
                 for country in gazetteers.words(filename)])
US_STATES = set([state.lower() for filename in ('usstates.txt', 'usstateabbrev.txt')
                 for state in gazetteers.words(filename)])

# print(USCITIES)
print(US_STATES)
# print(COUNTRIES)
##########################
basedir = "stanford-full-pipeline"
all_stanford = LazyDict(basedir, stanford_general_opener)
RAW_SENTENCES = SuperLazyDict(all_stanford, stanford_raw_reader)
POS_SENTENCES = SuperLazyDict(all_stanford, stanford_pos_reader)
SYNTAX_PARSE_SENTENCES = SuperLazyDict(all_stanford, stanford_tree_reader)
NONPARENTED_SENTENCES = SuperLazyDict(all_stanford, stanford_nonparented_tree_reader)
COREF = SuperLazyDict(all_stanford, stanford_coref_reader)
PRONOUN_SET = set(pronoun_reader())
entity_types = gather_entities()
AUGMENTED_TREES = augmented_tree_reader()
RELATIONSHIPS_AND_GROUPS = set(rels_and_groups_reader())
COUNTRIES = set(gz.words('countries.txt'))
NATIONALITIES = set(gz.words('nationalities.txt'))
OFFICIALS = officials_reader()  # these are a bit silly; will probably discard
DEPENDENCIES = stanford_dependency_reader()
POSSESSIVE_PRONOUNS = [
    'my', 'mine', 'your', 'yours', 'her', 'hers', 'his',
    'our', 'ours', 'their', 'theirs'
]
TITLE_SET = {
    "chairman", "Chairman", "director", "Director", "president", "President",
    "manager", "managers", "Manager", "executive", "CEO", "Officer", "officer",
    "consultant", "CFO", "COO", "CTO", "CMO", "founder", "shareholder",
    "researcher", "professor", "principal", "Principal", "minister", "Minister",
    "prime", "Prime", "chief", "Chief", "prosecutor", "Prosecutor",
    "queen", "Queen", "leader", "Leader", "secretary", "Secretary",
    "ex-Leader", "ex-leader", "coach", "Coach", "composer", "Composer",
    "head", "Head", "governor", "Governor", "judge", "Judge",
    "democrat", "Democrat", "republican", "Republican", "senator", "Senator",
    "congressman", "Congressman", "congresswoman", "Congresswoman",
    "analyst", "Analyst", "sen", "Sen", "Rep", "rep", "MP", "mp",
    "justice", "Justice", "co-chairwoman", "co-chair", "co-chairman",
    "Mr.", "mr.", "Mr", "mr", "Ms.", "ms.", "Mrs.", "mrs.",
    "secretary-general", "Secretary-General", "doctor", "Doctor"
}
# obtained from WordNet by getting hypernyms of hypernyms of hypernyms of 'professional.n.01'