Example 1
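    #for each doc, tally stats for every NP whose head noun is tracked in
    #heads: document frequency, surface forms, definiteness, and grammatical
    #role (subject vs. direct object)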
    #sys.stdout.write("\r")
    #prog = ProgressBar(len(files))
    i = 0
    for f in files:
        #prog.update_time(i)
        #sys.stdout.write("\r%s" % (str(prog)))
        #sys.stdout.flush()
        i += 1
        #read in the nps
        nps = reconcile.getNPs(f)
        sentences = reconcile.getSentences(f)

        #see which nps correspond to these heads
        for np in nps:
            np_text = utils.textClean(np.getText())
            np_head = specificity_utils.getHead(np_text)

            if np_head in heads:
                #print "{0:35} => {1}".format(np_text, np_head)
                head2nouns[np_head].addDoc(f)
                head2nouns[np_head].addText(np_text)
                head2nouns[np_head].count += 1
                head2nouns[np_head].addDefinite(np)

                if np["GRAMMAR"] == "SUBJECT":
                    head2nouns[np_head].subj += 1
                elif np["GRAMMAR"] == "OBJECT":
                    head2nouns[np_head].dobj += 1

                np_sentence = getSentence(np, sentences)
                if np_sentence == 0:
Example 2
    prog = ProgressBar(len(files))

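    #heads2nps maps each tracked head noun to every NP string that shares it;
    #only nominal, pronominal, or date NPs count, and conjoined heads
    #("x and y") are skipped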
    heads2nps = defaultdict(list)
    i = 0
    for f in files:
        prog.update_time(i)
        i += 1
        sys.stderr.write("\r%s" % (str(prog)))
        sys.stderr.flush()
        f = f.strip()
        nps = reconcile.getNPs(f)
        for np in nps:
            if specificity_utils.isNominal(np) or \
                    specificity_utils.isPronoun(np) or \
                    np["DATE"] != "NONE":
                head = specificity_utils.getHead(
                    utils.textClean(np.getText()).lower())
                if head.find(" and ") > -1:
                    continue
                if head in heads:
                    heads2nps[head].append(utils.textClean(np.getText()))
    sys.stderr.write("\r \r\n")

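    #a head is flagged definite if any of its NPs is exactly the head
    #preceded by "the", "a", or "that"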
    for head in list(heads2nps.keys()):
        counts = {}
        definite = False
        for np in heads2nps[head]:
            if np == head:
                continue
            if np == "the " + head or np == "a " + head or np == "that " + head:
                definite = True
                continue
Example 3
    total_scores = {"vps_guessed": 0,
                    "vps_correct": 0}
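    #RESPONSE_TYPE records which system produced the pairs being scored:
    #the Hobbs baseline (-hobbs) or Reconcile's classifier (-rec)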

    RESPONSE_TYPE = ""
    for f in files:
        f=f.strip()
        print("Working on file: {0}".format(f))
        gold_chains = reconcile.getGoldChains(f)
        pairs = []
        if "-hobbs" in sys.argv:
            RESPONSE_TYPE = "Hobbs"
            #read in the hobbs annotations
            hobbs_pairs = reconcile.getProResPairs(f, "hobbs")
            for pair in hobbs_pairs:
                ana_head = specificity_utils.getHead(utils.textClean(pair[1].getText())).lower()
                if ana_head in VPs:
                    pairs.append(pair)
        elif "-rec" in sys.argv:
            #TODO: if we choose this route then some mods are needed,
            #since each vp can be resolved multiple times.
            # 1. only count the closest antecedent?
            # 2. don't count string matches?
            # 3. look at what is in the "pro_antes" property (that gives us
            #    the Cogniac decision).
            # 4. take the average accuracy for each noun.
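            # A possible sketch of option 1, kept commented out and meant to
            # run once `pairs` is filled below; it assumes each pair is an
            # (antecedent, anaphor) tuple whose elements expose getStart()
            # and getEnd(), as elsewhere in this code:
            #
            #   closest = {}
            #   for ante, ana in pairs:
            #       key = (ana.getStart(), ana.getEnd())
            #       if key not in closest or \
            #               ante.getStart() > closest[key][0].getStart():
            #           closest[key] = (ante, ana)
            #   pairs = list(closest.values())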
            RESPONSE_TYPE = "Reconcile"
            #predictions = "features.goldnps/predictions.DecisionTree.muc6_DecisionTree_goldnps"
            predictions = "features.goldnps-vps/predictions.DecisionTree.muc6_DecisionTree_goldnps-vps"
            pairs = reconcile.getResponsePairs(f, predictions)
            for pair in pairs:
Example 4
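        #for this doc: pull the faux-pronoun pairs, keep those whose anaphor
        #head is a tracked faux pronoun, then label each kept pair as
        #correct/incorrect against the gold chains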
        gold_nes = reconcile.getGoldNEs(f)

        #get faux pronouns
        try:
            faux_pronoun_pairs = reconcile.getFauxPairs(f, PREDICTIONS)
        except:
            #then one was not created for this document (ACE04-33 is one.)
            continue

        #get the sentences 
        sentences = reconcile.getSentences(f)

        #remove the pairs we don't care about
        tracked_pairs = []
        for pair in faux_pronoun_pairs:
            ana_head = specificity_utils.getHead(pair[1].getText()).lower()
            if ana_head in FAUX_PRONOUNS:
                if ana_head not in list(tracked_nouns.keys()):
                    tracked_nouns[ana_head] = Noun(ana_head)
                tracked_pairs.append(pair)

        #label the correct or incorrect pairs
        labeled_faux_pairs = reconcile.labelCorrectPairs(gold_chains,
                tracked_pairs)

        for lpair in labeled_faux_pairs:
            ana_head = specificity_utils.getHead(lpair[1].getText()).lower()
            key = "{0}:{1}:{2}".format(f, lpair[1].getStart(),
                    lpair[1].getEnd())

            tracked_nouns[ana_head].instances[key] = utils.textClean(lpair[1].getText())
Example 5
    head2correct = {}  #heads that have a correct antecedent
    head2wrong = {}  #heads with an incorrect antecedent: they appear in a
                     #chain with nothing they are actually coreferent with
    head2none = {}  #heads that were not given an antecedent -- focus on
                    #this one first
    head2counts = {}

    outputfile = "features.goldnps/predictions.StanfordSieve.stanfordsieve/SingleLink"
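    #response chains come from the StanfordSieve run named above; tracked
    #heads are then binned as correct, wrong, or left unresolved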
    for f in files:
        #gather all the chains that were generated by the system.
        gold_chains = reconcile.getGoldChains(f)
        response_chains = reconcile.getResponseChains(f, outputfile)
        nps = reconcile.getNPs(f)

        for np in nps:
            head = specificity_utils.getHead(utils.textClean(
                np.getText())).lower()
            if head in heads:
                #this is the number of times that a NP appeared in a doc
                head2counts[head] = head2counts.get(head, 0) + 1
            #print "{0} : {1}".format(np.pprint(), head)

        #for chain in response_chains:
        #    if len(response_chains[chain]) > 1:
        #        for mention in response_chains[chain]:
        #            print mention.pprint()
        #        print

        #find all the gold vps that were not assigned any cluster.
        for chain in list(response_chains.keys()):
            if len(response_chains[chain]) == 1:
                mention = response_chains[chain][0]
Example 6
    if len(sys.argv) < 2:
        print("Usage: %s <filelist>" % (sys.argv[0]))
        sys.exit(1)

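    #read the document list, skipping entries commented out with '#'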
    files = []
    with open(sys.argv[1], 'r') as fileList:
        files.extend(
            [x for x in fileList.readlines() if not x.startswith("#")])

    heads = []
    for f in files:
        f = f.strip()
        print("Working on {0}".format(f))
        common_nouns = reconcile.getFauxPronouns(f)

        for cn in common_nouns:
            text = utils.textClean(cn.getText().replace("\n", " ")).lower()
            head = specificity_utils.getHead(text).rstrip('\'\"-,.:;!?')

            if head in data.ALL_PRONOUNS:
                continue

            if head not in heads:
                heads.append(head)

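    #keep only single-word heads as faux-pronoun candidates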
    with open("{0}_faux_pronouns".format(sys.argv[1]), 'w') as outFile:
        for head in heads:
            if len(re.findall(r'\w+', head)) > 1:
                continue
            outFile.write("{0}\n".format(head))
Example 7
        labels = {}
        noun_class = {}
        sundance_nes = reconcile.getSundanceNEs(f)
        stanford_nes = reconcile.getNEs(f)

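        #assign a semantic class to each common noun's head span, comparing
        #the Sundance and Stanford NE taggers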
        for np in common_nouns:
            #print np.pprint()
            key = "{0}:{1}".format(np.getStart(), np.getEnd())

            #may need to get head for a lot of NPs.
            text = utils.textClean(np.getText().lower())
            if np["HEAD"] is not None:
                head = np["HEAD"]
                head_span = (np["HEAD_START"], np["HEAD_END"])
            else:
                head = specificity_utils.getHead(text)
                head_span = specificity_utils.getHeadSpan(np, head)

            if head_span is not None:
                #print "{0} : {1}".format(rawText[head_span[0]:head_span[1]],
                #        head_span)
                stanford_cls = stanford_semantic_type(head_span, stanford_nes)
                sundance_cls = sundance_semantic_type(head_span, sundance_nes)

            #is it a date or time?
            if np["DATE"] != "NONE" or utils.isDate(
                    utils.textClean(np.getText())):
                noun_class[key] = "DATE/TIME"

            head_synset = None
            if key not in list(labels.keys()):