Esempio n. 1
0
def add_stats(text, anaphor, doc, nouns, head2text):
    """Record statistics for one anaphoric noun phrase.

    Mentions are ignored when their head (as computed by ``getHead``) is
    empty, ends with "%", ends with a digit, is a conjunction according
    to ``utils.isConj``, or is "himself"/"themselves". For every other
    mention ``head2text`` and ``nouns`` are updated in place.

    Args:
        text: Text of the mention; used as the key into ``nouns``.
        anaphor: Mention annotation; ``getStart()``, ``getEnd()`` and
            ``getText()`` are called on it.
        doc: Document providing ``nps``, ``getName()``,
            ``closest_antecedent()`` and ``sentence_distance()``.
        nouns: Mapping from text to its ``Nominal`` record; an entry is
            created the first time a text is seen.
        head2text: Mapping from head to the list of texts seen with that
            head. ``head2text[head]`` is indexed directly, so this is
            presumably a ``defaultdict(list)`` -- confirm with the caller.
    """
    head = getHead(text)
    if not head:
        return  # nothing to index; also keeps head[-1] below from raising
    if head.endswith("%"):
        return  # skip percents
    if head[-1].isdigit():
        return  # skip numbers
    if utils.isConj(head):
        return  # just skip these too
    # NOTE: for some reason the filter doesn't catch the reflexives; it must
    # be happening after the head noun is created.
    if head in ("himself", "themselves"):
        return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    # update the head2text dict
    texts_for_head = head2text[head]
    if text not in texts_for_head:
        texts_for_head.append(text)

    # create the record on first sight, otherwise bump its count
    if text in nouns:
        nouns[text].updateCount()
    else:
        nouns[text] = Nominal(text)
    stats = nouns[text]
    stats.updateDocs(doc.getName())

    # grammatical role of the anaphor itself
    anaphor_role = anaphor_np["GRAMMAR"]
    if anaphor_role == "SUBJECT":
        stats.subj += 1
    elif anaphor_role == "OBJECT":
        stats.dobj += 1

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        # record stats about the closest antecedent
        stats.sentence_distance(doc.sentence_distance(antecedent, anaphor))
        antecedent_text = antecedent.getText().lower()
        stats.most_recent_antecedents.append(antecedent_text)

        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        antecedent_role = antecedent_np["GRAMMAR"]
        if antecedent_role == "SUBJECT":
            stats.subj_ante += 1
        elif antecedent_role == "OBJECT":
            stats.dobj_ante += 1

        # case-insensitive exact string match with the antecedent
        if antecedent_text == anaphor.getText().lower():
            stats.string_matches += 1

        # type of the antecedent; first matching category wins
        if specificity_utils.isProper(antecedent_np):
            stats.prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            stats.nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            stats.pro_ante += 1

    else:
        # no antecedent: this mention starts the chain
        stats.starts_chain += 1
    files = []
    with open(sys.argv[2], 'r') as inFile:
        files.extend([x for x in inFile.readlines() if not x.startswith("#")])
    prog = ProgressBar(len(files))

    heads2nps = defaultdict(list)
    i = 0
    for f in files:
        prog.update_time(i)
        i += 1
        sys.stderr.write("\r%s" % (str(prog)))
        sys.stderr.flush()
        f = f.strip()
        nps = reconcile.getNPs(f)
        for np in nps:
            if specificity_utils.isNominal(np) or \
                    specificity_utils.isPronoun(np) or \
                    np["DATE"] != "NONE":
                head = specificity_utils.getHead(
                    utils.textClean(np.getText()).lower())
                if head.find(" and ") > -1:
                    continue
                if head in heads:
                    heads2nps[head].append(utils.textClean(np.getText()))
    sys.stderr.write("\r \r\n")

    for head in list(heads2nps.keys()):
        counts = {}
        definite = False
        for np in heads2nps[head]:
            if np == head:
Esempio n. 3
0
def add_stats(text, head, anaphor, doc, nouns, head2text):
    """Accumulate statistics for one anaphoric noun phrase.

    Updates ``head2text`` and the record ``nouns[text]`` in place with:
    occurrence and document counts, the grammatical role of the anaphor,
    whether the text starts with "the <head>", statistics about the gold
    chain containing the anaphor, and statistics about its closest
    antecedent.

    Mentions are ignored when the head is empty, ends with "%", ends with
    a digit, is a conjunction according to ``utils.isConj``, or is one of
    a small set of unwanted words (see below).

    Args:
        text: Text of the mention; used as the key into ``nouns``.
        head: Head word of ``text``. If it is "the", the last word of
            ``text`` is used instead.
        anaphor: Mention annotation; ``getStart()``, ``getEnd()`` and
            ``getText()`` are called on it.
        doc: Document providing ``nps``, ``gold_chains``, ``sentences``,
            ``getName()``, ``closest_antecedent()`` and
            ``sentence_distance()``.
        nouns: Mapping from text to its ``VirtualPronoun`` record; an
            entry is created the first time a text is seen.
        head2text: Mapping from head to the list of texts seen with that
            head. ``head2text[head]`` is indexed directly, so this is
            presumably a ``defaultdict(list)`` -- confirm with the caller.

    Returns:
        None.
    """
    # Work around cases where the determiner was reported as the head:
    # fall back to the last word of the text.
    if head == 'the':
        words = text.split()
        if not words:
            return  # nothing usable; words[-1] would raise IndexError
        head = words[-1]

    if not head:
        return  # nothing to index; also keeps head[-1] below from raising
    if head.endswith("%"):
        return  # skip percents
    if head[-1].isdigit():
        return  # skip numbers
    if utils.isConj(head):
        return  # just skip these too
    # NOTE: for some reason the filter doesn't catch the reflexives; it must
    # be happening after the head noun is created.
    if head in ("himself", "themselves", "head", "where", "there", "here"):
        return

    anaphor_np = doc.nps.getAnnotBySpan(anaphor.getStart(), anaphor.getEnd())

    # update the head2text dict
    texts_for_head = head2text[head]
    if text not in texts_for_head:
        texts_for_head.append(text)

    # create the record on first sight, otherwise bump its count
    doc_name = doc.getName()
    if text in nouns:
        nouns[text].updateCount()
    else:
        nouns[text] = VirtualPronoun(text)
    stats = nouns[text]
    stats.updateDocs(doc_name)

    # grammatical role of the anaphor itself
    anaphor_role = anaphor_np["GRAMMAR"]
    if anaphor_role == "SUBJECT":
        stats.subj += 1
    elif anaphor_role == "OBJECT":
        stats.dobj += 1

    # Modifier statistics: only the "the <head>" prefix check is active.
    # The POS-based pre-modifier and the of/on/that/with/by/which
    # post-modifier statistics are disabled.
    if text.startswith("the {0}".format(head)):
        stats.bare_definite += 1

    # Find which gold chain the anaphor is from. Assumes a mention belongs
    # to a single chain, so scanning stops at the first hit.
    anaphor_chain = None
    for chain, mentions in doc.gold_chains.items():
        if any(anaphor == mention for mention in mentions):
            anaphor_chain = chain
            break

    # NOTE: the name is recorded even when no chain was found, in which
    # case it reads "<doc name>:None".
    chain_name = "{0}:{1}".format(doc_name, anaphor_chain)
    if chain_name not in stats.chains:
        stats.chains.append(chain_name)

    if anaphor_chain is not None:
        chain_mentions = doc.gold_chains[anaphor_chain]
        chain_length = len(chain_mentions)
        stats.chain_size[doc_name] = chain_length

        # coverage: number of sentences touched / number of sentences
        covered_sentences = sum(
            1 for sent in doc.sentences
            if any(sent.contains(mention) for mention in chain_mentions))
        stats.chain_coverage[doc_name] = (
            float(covered_sentences) / len(doc.sentences))

        # The chain counts as the largest when no other chain in the
        # document is strictly longer (ties count).
        if all(len(mentions) <= chain_length
               for chain, mentions in doc.gold_chains.items()
               if chain != anaphor_chain):
            stats.largest_chain += 1

        # Collect the heads of the other mentions in the chain and note
        # whether any of them is a proper name.
        common_only = True
        for mention in chain_mentions:
            if mention == anaphor:
                continue
            mention_head = getHead(utils.textClean(mention.getText()))
            if mention_head not in stats.all_entities:
                stats.all_entities.append(mention_head)

            mention_np = doc.nps.getAnnotBySpan(mention.getStart(),
                                                mention.getEnd())
            if specificity_utils.isProper(mention_np):
                common_only = False

        # only the first observation for a chain is kept
        if chain_name not in stats.nom_chain_only:
            stats.nom_chain_only[chain_name] = common_only
    else:
        sys.stderr.write("Anaphor chain not found?\n")

    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        # record stats about the closest antecedent
        stats.sentence_distance(doc.sentence_distance(antecedent, anaphor))
        antecedent_text = antecedent.getText().lower()
        stats.most_recent_antecedents.append(antecedent_text)

        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        antecedent_role = antecedent_np["GRAMMAR"]
        if antecedent_role == "SUBJECT":
            stats.subj_ante += 1
        elif antecedent_role == "OBJECT":
            stats.dobj_ante += 1

        # case-insensitive exact string match with the antecedent
        if antecedent_text == anaphor.getText().lower():
            stats.string_matches += 1

        # type of the antecedent; first matching category wins
        if specificity_utils.isProper(antecedent_np):
            stats.prp_ante += 1
        elif specificity_utils.isNominal(antecedent_np):
            stats.nom_ante += 1
        elif specificity_utils.isPronoun(antecedent_np):
            stats.pro_ante += 1
    else:
        # no antecedent: this mention starts the chain
        stats.starts_chain += 1
def collectFPs(gold_nps, this_files_common_nouns):
    """Add every nominal NP in ``gold_nps`` to ``this_files_common_nouns``.

    The target list is extended in place, in the order the NPs appear in
    ``gold_nps``; nothing is returned. An NP qualifies when
    ``specificity_utils.isNominal`` is truthy for it.
    """
    this_files_common_nouns.extend(
        candidate for candidate in gold_nps
        if specificity_utils.isNominal(candidate))
def add_stats(pronoun_class, doc, anaphor, text):
    """Update the statistics kept for the pronoun ``text``.

    The record ``pronoun_class[text]`` is created on first sight (as
    ``Pronoun(text)``) or has ``updateCount()`` called on it. If the
    document reports a closest antecedent for ``anaphor``, statistics
    about that antecedent are recorded; otherwise the record's
    ``starts_chain()`` is called.

    Args:
        pronoun_class: Mapping from pronoun text to its ``Pronoun``
            record; modified in place.
        doc: Document providing ``nps``, ``closest_antecedent()``,
            ``word_distance()`` and ``sentence_distance()``.
        anaphor: The pronoun mention being processed.
        text: Text of the pronoun; used as the key into
            ``pronoun_class``. It is compared against the cleaned,
            lower-cased antecedent text, so it is presumably lower-cased
            already -- confirm with the caller.

    Returns:
        None.
    """
    if text in pronoun_class:
        pronoun_class[text].updateCount()
    else:
        pronoun_class[text] = Pronoun(text)
    stats = pronoun_class[text]

    # find the closest antecedent
    antecedent = doc.closest_antecedent(anaphor)
    if antecedent is not None:
        ant_text = utils.textClean(antecedent.getText()).lower()

        # uniqueness -- what is the rate at which a pronoun is coreferent
        # with "new" words? (once called generality) -- captured with the
        # antecedent text
        stats.addAntecedent(ant_text)

        # string matches -- how often does this pronoun resolve to
        # instances of itself? he/him and they/them count as the same
        # pronoun.
        same_pronoun_forms = (("he", "him"), ("they", "them"))
        if ant_text == text or any(ant_text in forms and text in forms
                                   for forms in same_pronoun_forms):
            stats.string_matches += 1

        # distance of the closest antecedent
        # 1. in words
        stats.word_distance(antecedent,
                            doc.word_distance(antecedent, anaphor))
        # 2. in sentences
        stats.sent_distance(antecedent,
                            doc.sentence_distance(antecedent, anaphor))
        # 3. a PDTB discourse-relation distance (same argument / different
        #    arguments of one relation / no shared relation) is not
        #    implemented.

        # how often is this pronoun coreferent with a nominal?
        # NOTE(review): isNominal receives the antecedent mention itself,
        # while isProper below receives the NP annotation -- confirm this
        # is intended.
        if specificity_utils.isNominal(antecedent):
            stats.nominal_antecedent += 1

        # how often is this pronoun coreferent with a proper name?
        antecedent_np = doc.nps.getAnnotBySpan(antecedent.getStart(),
                                               antecedent.getEnd())
        contained_pn = antecedent_np.getATTR("contains_pn")
        if contained_pn is not None:
            # only counts when the contained proper name is the whole
            # antecedent text
            if contained_pn == antecedent.getText():
                stats.proper_antecedent += 1
        elif specificity_utils.isProper(antecedent_np):
            stats.proper_antecedent += 1

        # how often are antecedents of this pronoun in the subj or dobj
        # position of a verb?
        antecedent_role = antecedent_np["GRAMMAR"]
        if antecedent_role == "SUBJECT":
            stats.subj += 1
        elif antecedent_role == "OBJECT":
            stats.dobj += 1
    else:
        # no antecedent: this pronoun starts its chain
        stats.starts_chain()