Example #1
def expand(E, depdex, recursive_digging=True):
    E_out = copy(E)
    buried = []
    while E:
        deps = sum([depdex[e] for e in E if e in depdex], [])

        # This probably seems weird, but it removes loops in the graph,
        # should they exist (and I think they do). This way we only handle
        # a node's dependencies once, then never again.
        for e in E:
            if e in depdex: del depdex[e]

        # create barriers--that is, stuff in a PP should be a separate entity
        # for our purposes
        deptypedex = ft.indexBy(0, deps)
        bartypes = set([t for t in deptypedex if barriers.match(t)])
        localtypes = set(deptypedex) - bartypes
        buried.extend(
            sum([[f for d, e, f in deptypedex[t]] for t in bartypes], []))
        deps = sum([deptypedex[t] for t in localtypes], [])

        E = [f for d, e, f in deps]
        E_out.extend(E)
    if recursive_digging:
        return [E_out] + sum([expand([e], depdex) for e in buried], [])
    else:
        return [E_out]
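The triples indexed here appear to have the shape (relation, governor, dependent); expand then collects everything reachable from the seed nodes, splitting off subtrees behind barrier relations (e.g. material inside a PP). Below is a minimal plain-Python stand-in for the traversal, assuming that shape and ignoring the barrier handling, since ft and barriers are not shown in this snippet:

from collections import defaultdict

# Toy triples in the assumed (relation, governor, dependent) shape.
toy_deps = [
    ("nsubj", "murdered", "John"),
    ("dobj", "murdered", "Sandy"),
    ("det", "Sandy", "the"),
]

# Rough analogue of ft.indexBy(1, ...): group triples by their governor.
toy_depdex = defaultdict(list)
for rel, gov, dep in toy_deps:
    toy_depdex[gov].append((rel, gov, dep))

def collect_subtree(root, depdex):
    "Gather root plus everything reachable below it (no barrier logic)."
    out, frontier = [root], [root]
    while frontier:
        # popping handled nodes mirrors the loop-removal trick above
        frontier = [d for node in frontier for _, _, d in depdex.pop(node, [])]
        out.extend(frontier)
    return out

print(collect_subtree("murdered", dict(toy_depdex)))   # ['murdered', 'John', 'Sandy', 'the']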
Example #2
def instantiation_factors(doc, schema):
    ""
    # extract caps from schema
    events, chains = schema["raw_schema"]

    schema_caps = []
    for slots, args in chains:
        slots = sorted(slots)
        for i, slot in enumerate(slots):
            for tlot in slots[i:]:
                schema_caps.append((slot, tlot))

    # extract caps from doc
    doc_caps = [cap for cap, t in doc["freq_joint"] if cap in schema_caps]

    CAP = "CAP"
    counts = ft.histo(ft.indexBy(CAP, [{CAP: x} for x in doc_caps]))

    return counts
Example #3
def remove_redundant(dependencies):
    """"
    This is intended for the cleaners argument of extract_cn.

    So I was a moron, and never removed some redundant data from the corpus
    before co-reference. This worked out a bit gnarly; pairs like this
    have been showing up. 
    (((u'do', u'aux'), (u'do', u'rcmod')),
          8.671361343047339,
          0.714857142857143,
          6.1938295307481),
    So because they both appear with the same coreferent, and they repeat,
    they're getting a high pmi. This is bullshit.

    These are, I suspect, because of a bug in the corpus extractor mixing 
    with a bug in CoreNLPPython (just turn the double line breaks into periods,
    that would be sensible! (maybe I should have read the manual)), where text
    joined together without a period, but with double linebreaks, behaves
    like a single sentence. This happens at the beginning of documents; maybe
    else where, that's for tweaking as applied.


    (never mind, this doesn't seem to be the cause of the bug, if one at all.
    Never the less, this ought to be implemented anyway.)
    """
    
    dep_table = [{"pair": (a[-1], b[-1]), 
                  "original_pair": (a,b),
                  "distance_distance": a[1]-b[1],
                  "rel": rel} for rel,a,b in dependencies]
    depdex = ft.indexBy("pair", dep_table)
    for verb in depdex:
        if len(depdex[verb]) >= 2:
            pprint(depdex[verb])
    
    # For now this only reports duplicate pairs; the input is returned unchanged.
    return dependencies
Example #4
            docs = ft.tag(
                docs, "tokenized_%s" % tag,
                lambda N: [toker.tokenize(n) if n else "*NONE*" for n in N],
                [tag])

        docs = ft.tag(docs, "entities", lambda P, L, O: P + L + O,
                      ["tokenized_%s" % tag for tag in metatags])
        for doc in docs:
            if not doc["entities"]:
                print "No entities for this document."
                continue

            doc = CNLP.prep_tokens(doc)
            entities = NNP_entities(doc)
            entities = [{"entity": e} for e in entities]
            entcounts = ft.histo(ft.indexBy("entity", entities))

            entcounts = sorted([(-c, e) for e, c in entcounts.items()])
            entcounts = entcounts[:n]  #int(avg_doc_entlen)]
            entities = [e for c, e in entcounts]

            print doc["doc-id"]
            pprint(entities)
            pprint(doc["entities"])

            eq_RE = entity_set_eq(entities, doc["entities"])
            eq_ER = entity_set_eq(doc["entities"], entities)

            FP = len(entities) - eq_RE
            FN = len(doc["entities"]) - eq_ER
            quasiTP = entity_intersect(entities, doc["entities"])
Example #5
def expand_entities(E, dep_raw):
    "Obtain entire subtree with root of e"
    depdex = ft.indexBy(1, dep_raw)
    return sum([expand([e], depdex) for e in E], [])
Example #6
import ft, datetime
from pprint import pprint
from copy import deepcopy

# load_csv test

data = ft.load_csv(open("tests/mid_e_verb_morph.csv"))

# PIPE test
print("ft.summary(data): ")
pprint(ft.summary(data))
print("*"*50)
dex = ft.indexBy("suf", data, pipe=lambda s: s[0] if s else s)
pprint(dex)
print("^^^Indexed by first letter of suf^^^")
print("Counts")
pprint(ft.histo(dex))


# Singleton test
foo = ft.singletons("i", range(1,10))
foodex = ft.indexBy("i", foo)
print("Singletons")
pprint(foodex)

# Merge test
premerged_data = deepcopy(data)
merged_data = ft.merge(data, differs=["tns","prs","num","mud"])
print()
print("Merge result:")
pprint(merged_data)
Example #7
import os
import nltk.corpus
import ft
import matplotlib.pyplot as plt
import numpy as np

europarl_path = "../../nltk_data/corpora/europarl_raw/"
path, langs, crap = os.walk(europarl_path).next()

lang_word_counts = {}
rank_counts = {}
for lang in langs:
    reader = nltk.corpus.EuroparlCorpusReader(europarl_path+lang, ".*")
    lang_word_counts[lang] = ft.histo(
                                ft.indexBy("word", 
                                    ft.singletons("word",
                                        [w for w in reader.words()])))
    rank_counts[lang] = sorted(lang_word_counts[lang].items(), key = lambda x: -x[1])

for lang in rank_counts:
    plt.loglog(range(1, len(rank_counts[lang])+1), 
             [c for w,c in rank_counts[lang]], label = lang)
plt.legend()
plt.show()
Example #8
for feature_tuple in combos_search:

    print "START", feature_tuple

    print "Recovering data from dump..."

    get = lambda f: condensed_counts[feature_tuple][f]
    c_counts = get("c_counts")
    cj_counts = get("cj_counts")
    freq_raw = get("freq_raw")
    freq = get("freq")

    # Verb Counts for Seeding
    print "Counting verbs for seeding..."
    verb_count_dex = ft.indexBy("verb", [{
        "verb": v,
        "counts": c
    } for ((v, d), ), c in c_counts.items()])
    local_verb_totals = {
        v: sum([C["counts"] for C in verb_count_dex[v]])
        for v in verb_count_dex
    }
    local_verb_max = {
        v: max([C["counts"] for C in verb_count_dex[v]])
        for v in verb_count_dex
    }
    verb_below_min = set(
        [v for v, c in local_verb_max.items() if c < C_COUNTS_MIN])

    print "Total verb types:", len(local_verb_totals)
    print "Total verb types to remove:", len(verb_below_min)
    print "Removing verbs that appeared fewer than an estimated", C_COUNTS_MIN, "times..."
Example #9
def condense_c(data):
    "Turns data into a structure ready for approximate_p"
    data_table = [{"pair": d} for d in data] #creates a free table
    datadex = ft.indexBy("pair", data_table) #creates a dex
    return ft.histo(datadex) #converts the dex to counts
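Judging purely from how ft is used across these examples, ft.indexBy seems to group the rows of a free table under a key and ft.histo seems to turn that index into per-key counts. A plain-Python sketch of the same three steps, under that assumption:

from collections import defaultdict

toy_pairs = [("eat", "nsubj"), ("eat", "dobj"), ("eat", "nsubj")]

# Step 1: the free table -- one dict per observation.
table = [{"pair": p} for p in toy_pairs]

# Step 2: what ft.indexBy("pair", table) appears to produce -- key -> rows.
dex = defaultdict(list)
for row in table:
    dex[row["pair"]].append(row)

# Step 3: what ft.histo(dex) appears to produce -- key -> count.
counts = {k: len(rows) for k, rows in dex.items()}
print(counts)   # {('eat', 'nsubj'): 2, ('eat', 'dobj'): 1}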
Example #10
def extract_cn(corefs, words, deps, conjunctions, whole_chains=False, types=[]):
    """
    Extract the joint counts C(e(w,d), e(v,g)) for all w,d and v,g in a doc.
    
    'C(e(x, d), e(y, f)) is the number of times the two events e(x, d) and 
    e(y, f) had a coreferring entity filling the values of the dependencies 
    d and f.'

    In other words, it counts how many times one entity is both a d to an x
    and an f to a y. For example, in
        John murdered Sandy, so the cops arrested him.
    John is the subject of murdered and, via "him", the object of arrested, so
        C(e(murder, subject), e(arrest, object)) += 1
    (A toy walk-through of this count follows the function.)

    Args:
        corefs = coreference chains
        words = relevant words for extraction
        deps = dependencies
        conjunctions = number of combinations of verbs in a chain to count
        whole_chains = instead of returning combinations of vd pairs, just
                        return the chains
        types = if a type is to be included with the data, wraps the vd
                n-tuples in another tuple with that type
    """
    #debuc refers to debuccalization--a linguistic process where consonants
    #lose their oral articulation and become glottal sounds. the word roughly
    #means "removing the mouth"
    debuc_words = remove_strings(words)
    word_table = [{"coord":(i,j), "lemma":l} for i,j,l in words]
    worddex = ft.indexBy("coord", word_table)
    lemma = lambda i,j: worddex[(i,j)][0]["lemma"]
    
    pairs = []
    if not types: types = [False]*len(corefs)
    for chain, typ in zip(corefs, types):
        debuc_chain = remove_strings(chain) 

        # Find dependencies related to chain
        relevant_dep = []
        for dep in deps:
            rel, a_full, b_full = dep
            a,b = remove_strings([a_full,b_full])
            if a in debuc_chain or b in debuc_chain:
                # Find dependencies that are relevant
                if a in debuc_words: tar = a_full
                elif b in debuc_words: tar = b_full
                else: tar = False
                
                # Produce pairs and append
                if tar:
                    i,j,raw_string = tar
                    relevant_dep.append((lemma(i,j), rel))

        #Combinatorics--for generating joint probability distributions
        #Use append to retain the source of the collocations
        #pairs.append([x for x in combinations(relevant_dep, conjunctions)])

        #Use extend to collapse all collocations into a single list
        if not whole_chains:
            comb = combinations(set(relevant_dep), conjunctions)
            if not typ:
                pairs.extend([tuple(sorted(x)) for x in comb])
            else:
                pairs.extend([(tuple(sorted(x)), typ) for x in comb])
                
        elif whole_chains:
            #for whole chains mode, we don't do that. 
            pairs.append(set(relevant_dep))
        else:
            raise Exception("Impossible!")

    return pairs
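To make the docstring's example concrete, here is a self-contained toy walk-through of just the counting step (it skips extract_cn's coreference and dependency plumbing); the (verb, dependency) labels are invented for illustration:

from itertools import combinations

# One coreference chain: the entity John fills the subject of "murder" and,
# via "him", the object of "arrest".
relevant_dep = {("murder", "nsubj"), ("arrest", "dobj")}

# conjunctions=2 joint events, sorted into a canonical key as in extract_cn.
pairs = [tuple(sorted(x)) for x in combinations(relevant_dep, 2)]
print(pairs)
# [(('arrest', 'dobj'), ('murder', 'nsubj'))]
# i.e. C(e(murder, subject), e(arrest, object)) picks up one count from this chain.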
Example #11
def coref_type(coref_bundle, sentences, pref_types = False):
    """
    Determines the argument filler type as per Chambers and Jurafsky 2009, with
    some variants.

    (0) If possible, it prefers pref_types, looking for these left of the
        head nouns in each DP.
    (1) Next, it tries the named entity recognizer for unambiguous results.
    (2) Next, it uses the single most frequent non-pronominal lemma.
    (3) Next, it uses the pronouns to make a choice.
    (4) In a last-ditch effort, it chooses the shortest head noun in the
        coreference chain.
    (5) At this point, it concludes there's no valid type.
    (A condensed sketch of this fallback order follows the function.)

    Args:
        coref_bundle = a (coref, c_raw) pair; coref is a flattened
                       coreference chain
        sentences = the CoreNLP output "sentences"
        pref_types = default False, 
                     if specified, a tuple
                     (target_words, whole_dps)
                     where
                        target_words = a list of preferred head nouns
                        whole_dps = the rest of each dp outside its coref
    """

    max_counts = lambda D: max_set(D.items(), lambda x: x[1])
    # pre-requirement
    coref,c_raw = coref_bundle
    try:
        cnlps = get_cnlp(coref, sentences)
    except:
        return TYPE_ERROR
    #print "\n\n"
    #pprint(coref) 
    
    #(0) Apply the pre-counted list of HNs. If one of the most common head 
    #       nouns appears left of the head, it is the type.
    if pref_types:
        #Get the proper nouns left of the head
        dp_wholes = coref_kinda_flat(c_raw)
        
        head_spaces = get_head_space(coref, sentences, dp_wholes)
        head_squished = [w for s,i,w in sum(head_spaces,[])]
        #head_squished = [w for w in head_squished if pref_types(w["Lemma"])]
        head_squished = [w for w in head_squished if pref_types(w)]
        
        lemmas = ft.histo(ft.indexBy("Lemma", head_squished))
        hell_yea = max_counts(lemmas)

        if len(hell_yea) == 1:
            return hell_yea[0][0].lower()

    #(1-X) preparation 
    # we gotta dig deeper for these

    #get corenlp output for each word, and get the parts out we want
    NEcounts = ft.histo(ft.indexBy("NamedEntityTag", cnlps))
    Lcounts_all = ft.histo(ft.indexBy("Lemma", cnlps))
    Lcounts = dict([l for l in Lcounts_all.items() if l[0] not in pn_lemmas])

    #get the max_set of Named Entity counts and Lemma counts
    NE_crap = {"O", "MISC"}
    NE_crapless = dict([l for l in NEcounts.items() if l[0] not in NE_crap])
    NE_max = max_counts(NE_crapless)
    L_max = max_counts(Lcounts)
    
    #head noun counting
    # (this is done as a weird side effect--a slightly cleaned up version is
    #   applied as the first step here.)
    temp = singlitate(L_max)
    if temp in HN_counter: HN_counter[temp] += 1
    else: HN_counter[temp] = 1

    #pprint(NE_max)
    #pprint(Lcounts)
    #print "\n"

    #Data extraction is finally done.


    #(1) If we have a solid NE instance, return that.
    NE_max = singlitate(NE_max)
    NE_max = NE_max if NE_max not in NE_crap else False
    if NE_max: return NE_max
    
    
    #(3) We're really hurtin now. It tries to build a type based on pn.
    L_max_pn = set([pn for pn, c in max_counts(Lcounts_all)])
    #L_max_pn = singlitate(L_max_pn)
    
    #THIS NEEDS SOME TWEAKING
    for pn_tag in pronoun_groups:
        if L_max_pn & pronoun_groups[pn_tag]: return pn_tag
    
    #EMERGENCY ATTEMPT
    # NOTE: this early return makes steps (2), (4), and (5) below unreachable.
    return "THINGY"

    #(2) Is there a single most frequent, non-pn lemma? Return that.
    L_max = singlitate(L_max)
    if L_max: return L_max.lower()
    
    #(4) Real desperate here.
    # take the shortest head, all lowercase:
    if L_max_pn: 
        worst = min(L_max_pn, key=lambda s: len(s)).lower()
        print "Selecting, possibly, the worst possible choice"
        print L_max_pn
        print worst
        print "\n"
        return worst


    #(5) Uh-oh
    print "WARNING: No type for this poor fellow:"
    pprint(coref)
    pprint(NEcounts)
    pprint(NE_max)
    pprint(Lcounts)
    pprint(L_max)
    print "oh, and"
    pprint(L_max_pn)
    print "\n"
    return "NO-VALID-TYPE"
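The numbered cascade in the docstring is easier to see without the bookkeeping. Below is a condensed sketch of the intended fallback order, with hypothetical helper values standing in for the counting done above (this is not the function's current control flow, which returns "THINGY" before steps (2), (4), and (5)):

def choose_type(pref_hit, ne_best, lemma_best, pronoun_tag, shortest_head):
    "Illustrative priority order only; every argument here is hypothetical."
    if pref_hit:          # (0) a preferred head noun left of the head
        return pref_hit.lower()
    if ne_best:           # (1) an unambiguous named-entity tag
        return ne_best
    if lemma_best:        # (2) a single most frequent non-pronominal lemma
        return lemma_best.lower()
    if pronoun_tag:       # (3) a type inferred from the pronoun group
        return pronoun_tag
    if shortest_head:     # (4) the shortest head noun in the chain
        return shortest_head.lower()
    return "NO-VALID-TYPE"   # (5) give up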
Example #12
def query_price_guide(query_tuple, coin_ft):
    """
    With a validated query, return one or more coins from the free table
    matching the description

    :param query_tuple: (tuple) output from validate_query
    :param coin_ft: (list(dict)) free table of coin prices, made with
        pcgs_scraper
    :return: list of matching coin entries (ranked when there is more than
        one), or None if no match was found
    """
    query_year, query_denom, query_mint, query_orig, query_norm = query_tuple

    # one flag for all the nested loops below, so we can break out of them early
    result_found = False

    # index the ft by year to quick search for year
    coins_by_year = ft.indexBy('year_short', coin_ft)
    for year, year_coins in coins_by_year.items():
        if result_found:
            break
        if year == query_year:
            coins_by_denom = ft.indexBy('denom', year_coins)
            for denom, denom_coins in coins_by_denom.items():
                if result_found:
                    break
                if denom == query_denom:
                    if query_mint is not None:
                        coins_by_mint = ft.indexBy('mint', denom_coins)
                        for mint, mint_coins in coins_by_mint.items():
                            if mint == query_mint:
                                # ding ding ding: year, denom, & mint match
                                results = mint_coins
                                result_found = True
                                break
                        else:
                            # mint loop completed without breaks, means no mint
                            # match was found ... return denomination matches
                            # anyway (max 4-5 coins, user can choose)
                            results = denom_coins
                            result_found = True
                            break
                    else:
                        # query_mint is none, return denominations
                        results = denom_coins
                        result_found = True
                        break
            else:
                # denomination loop completed without breaking, no match found
                # perhaps that denomination was not minted in the specified year
                results = None
                break
    else:
        # year loop completed without breaking, no year match found
        # perhaps the specified year was out of range, or mistyped
        results = None

    # rank results by Levenshtein Edit Distance
    if results is not None:
        if len(results) > 1:
            results = rank_results(query_norm, results)

    return results
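A hypothetical call, assuming validate_query yields the five fields named in the docstring and coin_ft comes from pcgs_scraper; the concrete values below are invented for illustration:

# (year, denomination, mint mark, original query, normalized query) -- invented values
query = ("1909", "1C", "S", "1909-S 1C", "1909 s 1c")

matches = query_price_guide(query, coin_ft)
if matches is None:
    print("No match for that year/denomination.")
else:
    # with more than one match, rank_results has already ordered them by
    # edit distance against the normalized query
    print(matches[0])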
def merge_grade_bins(filepath):
    """
    data from scrape_all is separated by grade_bins, combine into a single
    entry for each pcgs number

    This is kept separate from the scrape function so the scraped data is saved
    as soon as possible; that way an error here cannot cause data loss after the
    long wait for all the prices to be scraped.

    :param filepath: path to scraped data pkl from scrape_all
    :return price_guide: (dict) lookup table from PCGS number to merged entry
    """
    # NOTE: there will be a lot of entries with a None pcgs_num; these are
    # typically the prices for full type sets of a certain coin on the price
    # detail page, which this script currently does not account for. To get
    # that information, capture the title of each subsection of the table and
    # relate the prices back to it.

    price_guide = {}

    scraped_data = pickle.load(open(filepath, 'rb'))
    by_pcgs_num = ft.indexBy('pcgs_num', scraped_data)

    for pcgs_num, entries in by_pcgs_num.items():
        if pcgs_num is None:
            continue  # see above comment

        # 1: merge price information into a single dict that points from # to $
        assert len(entries) == 3, 'Something went wrong, expected exactly 3 grade bins'
        # make absolutely certain that prices are in the correct order, overkill
        temp_order = [None, None, None]
        desigs = []
        for entry in entries:
            desigs.append(entry['desig'])  # save desig for step 2
            if entry['grades'] == 'grades-1-20':
                temp_order[0] = entry['prices']
            elif entry['grades'] == 'grades-25-60':
                temp_order[1] = entry['prices']
            elif entry['grades'] == 'grades-61-70':
                temp_order[2] = entry['prices']
        this_num_prices = []  # prices for this pcgs number
        for grade_bin in temp_order:
            for price in grade_bin:
                this_num_prices.append(price)
        assert len(this_num_prices) == len(GRADES), \
            f'Wrong number of grades for PCGS#{pcgs_num}: ' \
            f'{len(this_num_prices)}'
        price_by_grade = dict(zip(GRADES, this_num_prices))

        # 2: Ensure the desig is always the two-place version if at least one
        #    entry has it (the longest desig wins)
        merged_desig = []  # start with len == 0
        for desig in desigs:
            if len(desig) > len(merged_desig):  # longest desig wins
                merged_desig = desig

        merged_entry = {
            'pcgs_num': pcgs_num,
            'desig': merged_desig,
            'prices': price_by_grade,
            'merged_from': entries,
        }
        price_guide[pcgs_num] = merged_entry

    print('Saving price guide to pkl and json files...')
    pickle.dump(price_guide, open('data/scraped_pcgs_prices.pkl', 'wb'))
    with open('data/scraped_pcgs_prices.json', 'w') as outfile:
        json.dump(price_guide, outfile)
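For reference, each entry in the resulting price_guide appears to take this shape (keys from the code above; the values are invented placeholders):

example_entry = {
    "pcgs_num": "12345",
    "desig": ["RD", "RB"],             # the longest designation list wins
    "prices": {"MS65": 120.0},         # one price per grade in GRADES
    "merged_from": ["<the three grade-bin entries>"],
}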