def expand(E, depdex, recursive_digging=True):
    """Walk the dependency index outward from the nodes in E and return the
    reachable nodes as a list of lists: the main expansion first, then (if
    recursive_digging is on) one expansion per node buried behind a barrier
    relation."""
    E_out = copy(E)
    buried = []
    while E:
        deps = sum([depdex[e] for e in E if e in depdex], [])
        # this probably seems weird, but it's to remove loops in the graph
        # should they exist (and I think they do). This way, we only handle
        # dependencies once for a point, then boom, never again.
        for e in E:
            if e in depdex:
                del depdex[e]
        # create barriers--that is, stuff in a PP should be a separate entity
        # for our purposes
        deptypedex = ft.indexBy(0, deps)
        bartypes = set([t for t in deptypedex if barriers.match(t)])
        localtypes = set(deptypedex) - bartypes
        buried.extend(
            sum([[f for d, e, f in deptypedex[t]] for t in bartypes], []))
        deps = sum([deptypedex[t] for t in localtypes], [])
        E = [f for d, e, f in deps]
        E_out.extend(E)
    if recursive_digging:
        return [E_out] + sum([expand([e], depdex) for e in buried], [])
    else:
        return [E_out]

def instantiation_factors(doc, schema):
    """Count how often each of the schema's slot pairs ("caps") is
    instantiated among the document's freq_joint pairs."""
    # extract caps from schema
    events, chains = schema["raw_schema"]
    schema_caps = []
    for slots, args in chains:
        slots = sorted(slots)
        for i, slot in enumerate(slots):
            for tlot in slots[i:]:
                schema_caps.append((slot, tlot))
    # extract caps from doc
    doc_caps = [cap for cap, t in doc["freq_joint"] if cap in schema_caps]
    CAP = "CAP"
    counts = ft.histo(ft.indexBy(CAP, [{CAP: x} for x in doc_caps]))
    return counts

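# A minimal usage sketch with made-up inputs. The field names ("raw_schema",
# "freq_joint") follow the code above, but the slot values and the assumption
# that ft.histo returns a key -> count dict (as in the test file below) are
# illustrative, not definitive.
_schema = {"raw_schema": ([],  # events (unused here)
                          [([("arrest", "dobj"), ("murder", "nsubj")], None)])}
_doc = {"freq_joint": [((("arrest", "dobj"), ("murder", "nsubj")), 0.5)]}
# instantiation_factors(_doc, _schema) should then yield something like
#   {(("arrest", "dobj"), ("murder", "nsubj")): 1}
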
def remove_redundant(dependencies):
    """
    This is intended for the cleaners argument of extract_cn.

    So I was a moron, and never removed some redundant data from the corpus
    before co-reference. This worked out a bit gnarly; pairs like this have
    been showing up:

        (((u'do', u'aux'), (u'do', u'rcmod')),
         8.671361343047339, 0.714857142857143, 6.1938295307481),

    Because they both appear with the same coreferent, and they repeat,
    they're getting a high PMI, which is bogus. These are, I suspect, because
    of a bug in the corpus extractor mixing with a bug in CoreNLPPython (just
    turn the double line breaks into periods, that would be sensible! (maybe
    I should have read the manual)), where text joined together without a
    period, but with double linebreaks, behaves like a single sentence. This
    happens at the beginning of documents; maybe elsewhere--that's for
    tweaking as applied.

    (Never mind, this doesn't seem to be the cause of the bug, if it is one
    at all. Nevertheless, this ought to be implemented anyway.)
    """
    dep_table = [{"pair": (a[-1], b[-1]),
                  "original_pair": (a, b),
                  "distance_distance": a[1] - b[1],
                  "rel": rel}
                 for rel, a, b in dependencies]
    depdex = ft.indexBy("pair", dep_table)
    for verb in depdex:
        if len(depdex[verb]) >= 2:
            pprint(depdex[verb])
    return dependencies

docs = ft.tag(
    docs, "tokenized_%s" % tag,
    lambda N: [toker.tokenize(n) if n else "*NONE*" for n in N],
    [tag])

docs = ft.tag(docs, "entities",
              lambda P, L, O: P + L + O,
              ["tokenized_%s" % tag for tag in metatags])

for doc in docs:
    if not doc["entities"]:
        print "No entities for this document."
        continue
    doc = CNLP.prep_tokens(doc)
    entities = NNP_entities(doc)
    entities = [{"entity": e} for e in entities]
    entcounts = ft.histo(ft.indexBy("entity", entities))
    entcounts = sorted([(-c, e) for e, c in entcounts.items()])
    entcounts = entcounts[:n]  #int(avg_doc_entlen)]
    entities = [e for c, e in entcounts]

    print doc["doc-id"]
    pprint(entities)
    pprint(doc["entities"])

    eq_RE = entity_set_eq(entities, doc["entities"])
    eq_ER = entity_set_eq(doc["entities"], entities)
    FP = len(entities) - eq_RE
    FN = len(doc["entities"]) - eq_ER
    quasiTP = entity_intersect(entities, doc["entities"])

def expand_entities(E, dep_raw):
    "Obtain the entire dependency subtree rooted at each entity in E"
    depdex = ft.indexBy(1, dep_raw)
    return sum([expand([e], depdex) for e in E], [])

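# A minimal sketch of the expected dependency format, with made-up triples.
# It assumes the (relation, governor, dependent) layout used by extract_cn
# below, where governor/dependent are (sentence, index, string) triples;
# ft.indexBy(1, ...) then groups edges by governor so expand() can walk
# governor -> dependent links.
_dep_raw = [
    ("det",  (0, 2, "dog"),  (0, 1, "the")),
    ("prep", (0, 2, "dog"),  (0, 3, "with")),
    ("pobj", (0, 3, "with"), (0, 4, "spots")),
]
# expand_entities([(0, 2, "dog")], _dep_raw) would return the root's local
# subtree first (dog, the, ...) and, if "prep" matches the `barriers` regex,
# a separate expansion rooted at the buried dependent (with, spots).
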
import ft, datetime
from pprint import pprint
from copy import deepcopy

# load_csv test
data = ft.load_csv(open("tests/mid_e_verb_morph.csv"))

# PIPE test
print("ft.summary(data): ")
pprint(ft.summary(data))
print("*" * 50)
dex = ft.indexBy("suf", data, pipe=lambda s: s[0] if s else s)
pprint(dex)
print("^^^Indexed by first letter of suf^^^")
print("Counts")
pprint(ft.histo(dex))

# Singleton test
foo = ft.singletons("i", range(1, 10))
foodex = ft.indexBy("i", foo)
print("Singletons")
pprint(foodex)

# Merge test
premerged_data = deepcopy(data)
merged_data = ft.merge(data, differs=["tns", "prs", "num", "mud"])
print()
print("Merge result:")
pprint(merged_data)

import os
import nltk.corpus
import ft
import matplotlib.pyplot as plt
import numpy as np

europarl_path = "../../nltk_data/corpora/europarl_raw/"
path, langs, crap = os.walk(europarl_path).next()

lang_word_counts = {}
rank_counts = {}
for lang in langs:
    reader = nltk.corpus.EuroparlCorpusReader(europarl_path + lang, ".*")
    lang_word_counts[lang] = ft.histo(
        ft.indexBy("word", ft.singletons("word", [w for w in reader.words()])))
    rank_counts[lang] = sorted(lang_word_counts[lang].items(),
                               key=lambda x: -x[1])

for lang in rank_counts:
    plt.loglog(range(1, len(rank_counts[lang]) + 1),
               [c for w, c in rank_counts[lang]],
               label=lang)
plt.legend()
plt.show()

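# Optional follow-up sketch: Zipf's law predicts a roughly straight line of
# slope about -1 on the log-log plot, so a linear fit over log-rank vs.
# log-count gives a quick per-language exponent estimate (this is an added
# check, not part of the original script).
for lang in rank_counts:
    counts = np.array([c for w, c in rank_counts[lang]], dtype=float)
    ranks = np.arange(1, len(counts) + 1)
    slope, intercept = np.polyfit(np.log(ranks), np.log(counts), 1)
    print("%s: fitted Zipf exponent ~ %.2f" % (lang, -slope))
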
for feature_tuple in combos_search:
    print "START", feature_tuple
    print "Recovering data from dump..."
    get = lambda f: condensed_counts[feature_tuple][f]
    c_counts = get("c_counts")
    cj_counts = get("cj_counts")
    freq_raw = get("freq_raw")
    freq = get("freq")

    # Verb Counts for Seeding
    print "Counting verbs for seeding..."
    verb_count_dex = ft.indexBy("verb", [{"verb": v, "counts": c}
                                         for ((v, d), ), c in c_counts.items()])
    local_verb_totals = {v: sum([C["counts"] for C in verb_count_dex[v]])
                         for v in verb_count_dex}
    local_verb_max = {v: max([C["counts"] for C in verb_count_dex[v]])
                      for v in verb_count_dex}
    verb_below_min = set(
        [v for v, c in local_verb_max.items() if c < C_COUNTS_MIN])
    print "Total verb types:", len(local_verb_totals)
    print "Total verb types to remove:", len(verb_below_min)
    print "Removing verbs with fewer than an estimated", C_COUNTS_MIN, "occurrences..."

def condense_c(data):
    "Turns data into a structure ready for approximate_p"
    data_table = [{"pair": d} for d in data]      # creates a free table
    datadex = ft.indexBy("pair", data_table)      # creates a dex
    return ft.histo(datadex)                      # converts the dex to counts

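# A minimal sketch with made-up pairs, assuming (as in the test file above)
# that ft.histo over an ft.indexBy result maps each key to how many rows
# carried it:
_pairs = [("murder", "nsubj"), ("arrest", "dobj"), ("murder", "nsubj")]
# condense_c(_pairs) would then give something like
#   {("murder", "nsubj"): 2, ("arrest", "dobj"): 1}
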
def extract_cn(corefs, words, deps, conjunctions, whole_chains=False, types=[]):
    """
    Extract the joint counts C(e(w,d), e(v,g)) for all w,d and v,g in a doc.

    'C(e(x, d), e(y, f)) is the number of times the two events e(x, d) and
    e(y, f) had a coreferring entity filling the values of the dependencies
    d and f.'

    In other words, it counts how many times something is both a d to an x
    and an f to a y. Like, "John murdered Sandy, so the cops arrested him."
    John is a subject to murdered and an object to arrested, so
    C(e(murder, subject), e(arrest, object)) += 1.

    Args:
        corefs = coreference chains
        words = relevant words for extraction
        deps = dependencies
        conjunctions = how many (verb, dependency) events to combine per
            tuple (e.g. 2 for joint pairs)
        whole_chains = instead of returning combinations of vd pairs, just
            return the chains
        types = if a tag is to be included with the data, wraps the vd
            n-tuples in another tuple with the tag
    """
    # debuc refers to debuccalization--a linguistic process where consonants
    # reduce to glottal sounds. The word means "the process of chopping off
    # the head".
    debuc_words = remove_strings(words)
    word_table = [{"coord": (i, j), "lemma": l} for i, j, l in words]
    worddex = ft.indexBy("coord", word_table)
    lemma = lambda i, j: worddex[(i, j)][0]["lemma"]

    pairs = []
    if not types:
        types = [False] * len(corefs)
    for chain, typ in zip(corefs, types):
        debuc_chain = remove_strings(chain)

        # Find dependencies related to chain
        relevant_dep = []
        for dep in deps:
            rel, a_full, b_full = dep
            a, b = remove_strings([a_full, b_full])
            if a in debuc_chain or b in debuc_chain:
                # Find dependencies that are relevant
                if a in debuc_words:
                    tar = a_full
                elif b in debuc_words:
                    tar = b_full
                else:
                    tar = False
                # Produce pairs and append
                if tar:
                    i, j, raw_string = tar
                    relevant_dep.append((lemma(i, j), rel))

        # Combinatorics--for generating joint probability distributions
        # Use append to retain the source of the collocations
        #pairs.append([x for x in combinations(relevant_dep, conjunctions)])
        # Use extend to collapse all collocations into a single list
        if not whole_chains:
            comb = combinations(set(relevant_dep), conjunctions)
            if not typ:
                pairs.extend([tuple(sorted(x)) for x in comb])
            else:
                pairs.extend([(tuple(sorted(x)), typ) for x in comb])
        elif whole_chains:
            # for whole chains mode, we don't do that.
            pairs.append(set(relevant_dep))
        else:
            raise Exception("Impossible!")
    return pairs

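# A worked sketch of the pairing step above, using the docstring's example
# with made-up (lemma, relation) pairs; it mirrors the conjunctions=2 case
# without running the full coreference machinery.
from itertools import combinations
_relevant_dep = [("murder", "nsubj"), ("arrest", "dobj")]   # "John ... him"
_pairs = [tuple(sorted(x)) for x in combinations(set(_relevant_dep), 2)]
# _pairs == [(("arrest", "dobj"), ("murder", "nsubj"))]
# i.e. C(e(murder, nsubj), e(arrest, dobj)) gets one observation for this chain.
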
def coref_type(coref_bundle, sentences, pref_types=False):
    """
    Determines the argument filler type as per Chambers and Jurafsky 2009,
    with some variants.

    (0) If possible, it prefers spec_types. It looks for these left of the
        head nouns in each DP.
    (1) Next, it tries the named entity recognizer for unambiguous results.
    (2) Next, it uses the single most frequent non-pronominal lemma.
    (3) Next, it uses the pronouns to make a last ditch choice.
    (4) In a last ditch effort, it chooses the shortest head noun in the
        coreference chain.
    (5) At this point, it concludes there's no valid type.

    Args:
        coref_bundle = (coref, c_raw): a flattened coreference chain paired
            with its raw chain
        sentences = the CoreNLP output "sentences"
        pref_types = default False; if specified, a tuple
            (target_words, whole_dps) where
            target_words = a list of preferred head nouns
            whole_dps = the rest of each dp outside its coref
    """
    max_counts = lambda D: max_set(D.items(), lambda x: x[1])

    # pre-requirement
    coref, c_raw = coref_bundle
    try:
        cnlps = get_cnlp(coref, sentences)
    except:
        return TYPE_ERROR
    #print "\n\n"
    #pprint(coref)

    #(0) Apply the pre-counted list of HNs. If one of the most common head
    #    nouns appears left of the head, it is the type.
    if pref_types:
        # Get the proper nouns left of the head
        dp_wholes = coref_kinda_flat(c_raw)
        head_spaces = get_head_space(coref, sentences, dp_wholes)
        head_squished = [w for s, i, w in sum(head_spaces, [])]
        #head_squished = [w for w in head_squished if pref_types(w["Lemma"])]
        head_squished = [w for w in head_squished if pref_types(w)]
        lemmas = ft.histo(ft.indexBy("Lemma", head_squished))
        hell_yea = max_counts(lemmas)
        if len(hell_yea) == 1:
            return hell_yea[0][0].lower()

    #(1-X) preparation
    # we gotta dig deeper for these
    # get corenlp output for each word, and get the parts out we want
    NEcounts = ft.histo(ft.indexBy("NamedEntityTag", cnlps))
    Lcounts_all = ft.histo(ft.indexBy("Lemma", cnlps))
    Lcounts = dict([l for l in Lcounts_all.items() if l[0] not in pn_lemmas])

    # get the max_set of Named Entity counts and Lemma counts
    NE_crap = {"O", "MISC"}
    NE_crapless = dict([l for l in NEcounts.items() if l[0] not in NE_crap])
    NE_max = max_counts(NE_crapless)
    L_max = max_counts(Lcounts)

    # head noun counting
    # (this is done as a weird side effect--a slightly cleaned up version is
    # applied as the first step here.)
    temp = singlitate(L_max)
    if temp in HN_counter:
        HN_counter[temp] += 1
    else:
        HN_counter[temp] = 1

    #pprint(NE_max)
    #pprint(Lcounts)
    #print "\n"
    # Data extraction is finally done.

    #(1) If we have a solid NE instance, return that.
    NE_max = singlitate(NE_max)
    NE_max = NE_max if NE_max not in NE_crap else False
    if NE_max:
        return NE_max

    #(3) We're really hurtin now. It tries to build a type based on pn.
    L_max_pn = set([pn for pn, c in max_counts(Lcounts_all)])
    #L_max_pn = singlitate(L_max_pn)
    # THIS NEEDS SOME TWEAKING
    for pn_tag in pronoun_groups:
        if L_max_pn & pronoun_groups[pn_tag]:
            return pn_tag

    # EMERGENCY ATTEMPT
    # NOTE: this early return makes steps (2), (4), and (5) below unreachable
    # as the code currently stands.
    return "THINGY"

    #(2) Is there a single most frequent, non-pn lemma? Return that.
    L_max = singlitate(L_max)
    if L_max:
        return L_max.lower()

    #(4) Real desperate here.
    # take the shortest head, all lowercase:
    if L_max_pn:
        worst = min(L_max_pn, key=lambda s: len(s)).lower()
        print "Selecting, possibly, the worst possible choice"
        print L_max_pn
        print worst
        print "\n"
        return worst

    #(5) Uh-oh
    print "WARNING: No type for this poor fellow:"
    pprint(coref)
    pprint(NEcounts)
    pprint(NE_max)
    pprint(Lcounts)
    pprint(L_max)
    print "oh, and"
    pprint(L_max_pn)
    print "\n"
    return "NO-VALID-TYPE"

def query_price_guide(query_tuple, coin_ft):
    """
    With a validated query, return one or more coins from the free table
    matching the description

    :param query_tuple: (tuple) output from validate_query
    :param coin_ft: (list(dict)) free table of coin prices, made with
        pcgs_scraper
    :return: list of matching coin entries (ranked by edit distance when
        there is more than one), or None if no match was found
    """
    query_year, query_denom, query_mint, query_orig, query_norm = query_tuple

    # lots of loops, make life easy
    result_found = False
    # index the ft by year to quick search for year
    coins_by_year = ft.indexBy('year_short', coin_ft)
    for year, year_coins in coins_by_year.items():
        if result_found:
            break
        if year == query_year:
            coins_by_denom = ft.indexBy('denom', year_coins)
            for denom, denom_coins in coins_by_denom.items():
                if result_found:
                    break
                if denom == query_denom:
                    if query_mint is not None:
                        coins_by_mint = ft.indexBy('mint', denom_coins)
                        for mint, mint_coins in coins_by_mint.items():
                            if mint == query_mint:
                                # ding ding ding: year, denom, & mint match
                                results = mint_coins
                                result_found = True
                                break
                        else:
                            # mint loop completed without breaks, means no mint
                            # match was found ... return denomination matches
                            # anyway (max 4-5 coins, user can choose)
                            results = denom_coins
                            result_found = True
                            break
                    else:
                        # query_mint is none, return denominations
                        results = denom_coins
                        result_found = True
                        break
            else:
                # denomination loop completed without breaking, no match found
                # perhaps that denomination was not minted in the specified
                # year (guard so a match found on the final denom isn't
                # clobbered)
                if not result_found:
                    results = None
                break
    else:
        # year loop completed without breaking, no year match found
        # perhaps the specified year was out of range, or mistyped
        # (guard so a match found on the final year isn't clobbered)
        if not result_found:
            results = None

    # rank results by Levenshtein Edit Distance
    if results is not None:
        if len(results) > 1:
            results = rank_results(query_norm, results)

    return results

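# A minimal usage sketch with a made-up free table. The field names
# ('year_short', 'denom', 'mint') follow the indexing above; the 'description'
# field and the query values are hypothetical. With a single match,
# rank_results is never reached.
_coin_ft = [
    {'year_short': '1909', 'denom': '1C', 'mint': 'S',
     'description': '1909-S VDB 1C, RD'},
    {'year_short': '1909', 'denom': '1C', 'mint': 'P',
     'description': '1909 VDB 1C, RD'},
]
_query = ('1909', '1C', 'S', '1909-S VDB penny', '1909 s vdb 1c')
# query_price_guide(_query, _coin_ft) would return just the 1909-S entry;
# with query_mint=None it would hand both 1909 cents to rank_results before
# returning, so the user can choose.
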
def merge_grade_bins(filepath):
    """
    Data from scrape_all is separated by grade bins; combine it into a single
    entry for each PCGS number.

    The reason this is a separate function from the scrape function is to
    ensure that the scraped data is saved as soon as possible, to avoid
    errors causing data loss after waiting for all the prices to be scraped.

    :param filepath: path to scraped data pkl from scrape_all
    :return price_guide: (dict) lookup table from PCGS number to its merged
        price entry
    """
    # NOTE: there will be a lot of entries with a None pcgs_num; these are
    # typically the prices for full type sets of a certain coin on the price
    # detail page, which this script currently does not account for. To get
    # that information, capture the title of each subsection of the table and
    # you can relate it
    price_guide = {}
    scraped_data = pickle.load(open(filepath, 'rb'))
    by_pcgs_num = ft.indexBy('pcgs_num', scraped_data)
    for pcgs_num, entries in by_pcgs_num.items():
        if pcgs_num is None:
            continue  # see above comment

        # 1: merge price information into a single dict that points from
        #    grade to price
        assert len(entries) == 3, 'Something went wrong, expected exactly 3 grade bins'
        # make absolutely certain that prices are in the correct order, overkill
        temp_order = [None, None, None]
        desigs = []
        for entry in entries:
            desigs.append(entry['desig'])  # save desig for step 2
            if entry['grades'] == 'grades-1-20':
                temp_order[0] = entry['prices']
            elif entry['grades'] == 'grades-25-60':
                temp_order[1] = entry['prices']
            elif entry['grades'] == 'grades-61-70':
                temp_order[2] = entry['prices']
        this_num_prices = []  # prices for this pcgs number
        for grade_bin in temp_order:
            for price in grade_bin:
                this_num_prices.append(price)
        assert len(this_num_prices) == len(GRADES), \
            f'Wrong number of grades for PCGS#{pcgs_num}: ' \
            f'{len(this_num_prices)}'
        price_by_grade = dict(zip(GRADES, this_num_prices))

        # 2: ensure the desig always has two places if at least one entry does
        merged_desig = []  # start with len == 0
        for desig in desigs:
            if len(desig) > len(merged_desig):  # longest desig wins
                merged_desig = desig

        merged_entry = {
            'pcgs_num': pcgs_num,
            'desig': merged_desig,
            'prices': price_by_grade,
            'merged_from': entries,
        }
        price_guide[pcgs_num] = merged_entry

    print('Saving price guide to pkl and json files...')
    pickle.dump(price_guide, open('data/scraped_pcgs_prices.pkl', 'wb'))
    with open('data/scraped_pcgs_prices.json', 'w') as outfile:
        json.dump(price_guide, outfile)
    return price_guide

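# A minimal sketch of the zip step above with made-up grades and prices; the
# real GRADES constant and the scraped per-bin price lists are longer.
_grades = ('G4', 'VG8', 'F12')
_bin_prices = [[15.0], [22.5], [40.0]]          # one list per grade bin, in order
_flat = [p for grade_bin in _bin_prices for p in grade_bin]
_price_by_grade = dict(zip(_grades, _flat))     # {'G4': 15.0, 'VG8': 22.5, 'F12': 40.0}
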