Example #1
0
    # Create one Noun accumulator per tracked head word.
    # NOTE(review): `heads`, `head2nouns`, `Noun`, `files`, `reconcile`,
    # `utils`, and `specificity_utils` are all defined outside this visible
    # fragment — confirm against the enclosing function/file.
    for head in heads:
        head2nouns[head] = Noun(head)

    #cycle over all the files
    #sys.stdout.flush()
    #sys.stdout.write("\r")
    #prog = ProgressBar(len(files))
    i = 0
    for f in files:
        #prog.update_time(i)
        #sys.stdout.write("\r%s" % (str(prog)))
        #sys.stdout.flush()
        i += 1
        #read in the nps for this document
        nps = reconcile.getNPs(f)
        # NOTE(review): `sentences` is never used in this visible fragment;
        # it may be consumed further down in the enclosing function.
        sentences = reconcile.getSentences(f)

        #see which nps correspond to these heads
        for np in nps:
            # Normalize the NP text, then extract its syntactic head word.
            np_text = utils.textClean(np.getText())
            np_head = specificity_utils.getHead(np_text)

            # Accumulate per-head statistics for every NP whose head is
            # one of the tracked heads: document, surface text, raw count,
            # and definiteness.
            if np_head in heads:
                #print "{0:35} => {1}".format(np_text, np_head)
                head2nouns[np_head].addDoc(f)
                head2nouns[np_head].addText(np_text)
                head2nouns[np_head].count += 1
                head2nouns[np_head].addDefinite(np)

                # Track how often this head occurs in subject position.
                if np["GRAMMAR"] == "SUBJECT":
                    head2nouns[np_head].subj += 1
Example #2
0
    # --stats suppresses evaluation and verbose output.
    if options.stats:
        options.evaluate = False
        options.verbose = False
    print()

    # --all selects every heuristic.
    # NOTE(review): the meaning of the magic number 8 (presumably the count
    # of available heuristics) should be confirmed where --heuristics is
    # consumed; a named constant would be clearer.
    if options.all:
        options.heuristics = 8

    # Very-verbose implies verbose.
    if options.vverbose:
        options.verbose = True

    # Load annotated NPs, pronouns (all and possessive-only), sentences,
    # and PERSON named entities for the target directory.
    nps = reconcile.getNPs_annots(options.directory)
    pronouns = reconcile.getPronouns(options.directory, "ALL")
    possessive_pronouns = reconcile.getPronouns(options.directory,
                                                "POSSESSIVE")
    sents = reconcile.getSentences(options.directory)
    nes = reconcile.getNEsByClass(options.directory, "PERSON")

    all_pairs = []
    counts = {}

    #remove dates from every mention list
    nps = utils.remove_dates(nps)
    pronouns = utils.remove_dates(pronouns)
    possessive_pronouns = utils.remove_dates(possessive_pronouns)

    # Optionally strip hard-to-resolve pronouns (e.g. pleonastic "it").
    if options.remove_its:
        nps = utils.remove_hard_pronouns(nps)
        pronouns = utils.remove_hard_pronouns(pronouns)
        # NOTE(review): the line below is an exact duplicate of the line
        # above — the second remove_hard_pronouns(pronouns) call is
        # redundant (harmless if the filter is idempotent) and should be
        # deleted.
        pronouns = utils.remove_hard_pronouns(pronouns)
        possessive_pronouns = utils.remove_hard_pronouns(possessive_pronouns)
Example #3
0
def process_gold(f, np, head, text, head2qp, gold_chains):
    """Accumulate gold-chain statistics for a single NP into head2qp[head].

    Records, on the QuasiPronoun-like object ``head2qp[head]``:
      * antecedent presence and anaphor/antecedent sentence distance,
      * whether a bare definite ("the/that/this/those/these <head>") or a
        non-bare mention starts its chain (and indefinite "faux" starters),
      * the fraction of document sentences the NP's chain touches.

    f           -- document identifier, passed to reconcile.getSentences
    np          -- the mention being processed (supports == with chain
                   mentions and Sentence.contains)
    head        -- head word of the NP
    text        -- cleaned surface text of the NP
    head2qp     -- dict mapping head -> statistics accumulator
    gold_chains -- dict mapping chain id -> list of mentions, assumed to
                   be ordered by document position

    Raises ValueError if np is not found in any gold chain (the original
    code would have crashed with a TypeError on iterating None).
    """
    # Find the gold chain containing this np. A mention belongs to exactly
    # one chain, so we can stop at the first hit (the original inner
    # `break` only exited the inner loop and kept scanning all chains).
    np_chain = None
    for chain in gold_chains.values():
        if any(np == mention for mention in chain):
            np_chain = chain
            break
    if np_chain is None:
        raise ValueError("NP not found in any gold chain: %s" % (text,))

    # Closest antecedent = the mention immediately preceding np in the
    # chain (chains are in document order).
    prev = None
    for other in np_chain:
        if np == other:
            break
        prev = other

    sentences = reconcile.getSentences(f)
    if prev is not None:
        head2qp[head].has_antecedent += 1
        # Sentence distance between anaphor and its closest antecedent.
        anaphor_sent = 0
        antecedent_sent = 0
        for i, sent in enumerate(sentences):
            if sent.contains(np):
                anaphor_sent = i
            if sent.contains(prev):
                antecedent_sent = i
        head2qp[head].sent_distances.append(abs(anaphor_sent -
                                                antecedent_sent))

    # If it is a bare definite ("the/that/this/those/these <head>"), does
    # it start its chain? Otherwise track ordinary chain starters and how
    # many of those are indefinites ("a/an ..."), i.e. faux base
    # antecedents.
    bare_definite = text in ("the " + head, "that " + head, "this " + head,
                             "those " + head, "these " + head)
    starts_chain = np_chain[0] == np
    if bare_definite:
        if starts_chain:
            head2qp[head].bdef_starts_chain += 1
    elif starts_chain:
        head2qp[head].starts_chain += 1
        #is this an indef base antecedent? how well does this track with
        #the total % of indefs?
        if text.startswith(("a ", "an ")):
            head2qp[head].faux_ba += 1

    #TODO how many string matches?
    #TODO how many proper names?
    #TODO diversity of antecedents

    # Chain coverage in document: fraction of document sentences that
    # contain at least one mention of this chain. (The original variable
    # `covered_sentences` actually held the UNcovered indices; renamed.)
    uncovered = set(range(len(sentences)))
    for i, sent in enumerate(sentences):
        if i in uncovered and any(sent.contains(m) for m in np_chain):
            uncovered.discard(i)
    # Guard the empty-document case instead of dividing by zero.
    if sentences:
        chain_coverage = float(len(sentences) -
                               len(uncovered)) / len(sentences)
    else:
        chain_coverage = 0.0
    head2qp[head].chain_coverage[f] = chain_coverage
# Created By : Nathan Gilbert
#
import sys
import pydot

from pyconcile import reconcile

# Entry point: build a pydot graph of Reconcile coreference clusters for a
# single response file, then gather per-document gold-chain statistics.
# NOTE(review): this block appears to merge two different scripts — several
# names used below (`defaultdict`, `options`, `Document`, `HEADS_ONLY`,
# `string`) are not defined by the imports visible at the top of this file;
# confirm the missing imports/option parsing exist elsewhere.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <response-file>" % (sys.argv[0]))
        sys.exit(1)

    # argv[1] is "<dataDir>/<responseFile>": split at the first slash
    # (responseFile keeps its leading "/").
    dataDir = sys.argv[1][:sys.argv[1].find("/")]
    responseFile = sys.argv[1][sys.argv[1].find("/"):]
    clusterer="SingleLink"
    sentences = reconcile.getSentences(dataDir)
    gold_chains = reconcile.getGoldChains(dataDir)

    #get reconcile's edges (clustered response chains)
    response_chains = reconcile.getResponseChains(dataDir,
            responseFile+"/"+clusterer)

    # Pairwise decisions above a 0.5 threshold, labeled against gold.
    response_pairs = reconcile.getResponsePairs(dataDir, responseFile, 0.5)
    response_pairs = reconcile.labelCorrectPairs(gold_chains, response_pairs)

    #pydot graph of the clusters
    graph = pydot.Dot("reconcile_clusters", graph_type='digraph')

    #add in all the NPs
    #NOTE: as long as we are working with gold mentions, the response and gold
    #will match. otherwise, will need to switch over to gold nps to see proper
    # NOTE(review): defaultdict is used here but `from collections import
    # defaultdict` is not among the visible imports.
    noun2antecedents = defaultdict(dict)

    # NOTE(review): `options` is undefined in this fragment — no option
    # parser is visible above.
    fileList = open(options.filelist, 'r')
    #lines that start with # are ignored
    files = [x for x in fileList.readlines() if not x.startswith("#")]
    fileList.close()
    for f in files:
        f = f.strip()
        print("Working on document: %s" % f)

        #load in the per-document statistics container
        d = Document(f)

        #the total number of sentences in this text file
        #?double check with nltk?
        total_sentences_doc = len(reconcile.getSentences(f))

        #process a document, get all the nominal stats that are requested
        gold_chains = reconcile.getGoldChains(f, True)
        d.addGoldChains(gold_chains)

        # Walk each gold chain in document order, tracking state about the
        # previous mention as we go.
        for gc in list(gold_chains.keys()):
            base_antecedent = True
            previous_semantic_tag = ""
            prev_sent = -1
            prev_tile = -1
            prev_type = ""

            # NOTE(review): fragment is cut mid-statement here; also
            # `string.strip` does not exist as a function in Python 3 —
            # this line looks like unported Python 2 code.
            for mention in gold_chains[gc]:
                if HEADS_ONLY:
                    head_clean = ' '.join(map(string.strip, \