Ejemplo n.º 1
0
def getACECommonNouns(f):
    """Collect the non-singleton nominal NPs of a document.

    Every NP returned by ``reconcile.getNPs(f)`` is matched, by its start
    and end offsets, against the gold annotations returned by
    ``reconcile.parseGoldAnnots(f)``.  An NP is kept when its gold
    annotation has a falsy ``"GOLD_SINGLETON"`` attribute and a truthy
    ``"is_nominal"`` attribute.

    Args:
        f: Document handle forwarded unchanged to the ``reconcile``
            loaders (presumably a document directory path -- confirm
            against callers).

    Returns:
        An ``AnnotationSet`` named ``"common_nouns"`` holding the selected
        NPs.
    """
    candidates = reconcile.getNPs(f)
    gold = reconcile.parseGoldAnnots(f)
    selected = AnnotationSet("common_nouns")
    for mention in candidates:
        # NOTE(review): assumes every NP has a gold annotation with the
        # exact same span; the lookup result is indexed without a check.
        annot = gold.getAnnotBySpan(mention.getStart(), mention.getEnd())
        if annot["GOLD_SINGLETON"]:
            continue
        if not annot["is_nominal"]:
            continue
        selected.add(mention)
    return selected
Ejemplo n.º 2
0
def processACE(f, head2qp):
    """Update per-head statistics from the gold (ACE) NPs of one document.

    For each NP of ``f`` the matching gold annotation is looked up by span.
    Depending on the module-level ``PRONOUNS`` switch, only pronouns (as
    judged by ``qp_utils.isPronoun``) or only NPs whose gold annotation is
    flagged ``"is_nominal"`` are processed; everything else is skipped.

    Args:
        f: Document handle forwarded unchanged to the ``reconcile``
            loaders, ``process_gold`` and ``process_syntax`` (presumably a
            document directory path -- confirm against callers).
        head2qp: Mapping from head string to ``QuasiPronoun``; it is
            mutated in place.

    Returns:
        None.
    """
    ace_annots = reconcile.parseGoldAnnots(f)
    nps = reconcile.getNPs(f)
    stanford_deps = reconcile.getStanfordDep(f)
    gold_chains = reconcile.getGoldChains(f)
    for np in nps:
        # NOTE(review): assumes every NP has a gold annotation with the
        # exact same span; the lookup result is indexed without a check.
        ace_np = ace_annots.getAnnotBySpan(np.getStart(), np.getEnd())
        if PRONOUNS:
            if not qp_utils.isPronoun(np):
                continue
            head = ace_np["HEAD"].lower()
            text = np.getText()
        else:
            if not ace_np["is_nominal"]:
                continue
            head = utils.textClean(ace_np["HEAD"].strip().lower())
            text = utils.textClean(np.getText())

        # bookkeeping
        # Membership is tested on the mapping itself rather than on a
        # freshly built key list, which cost O(len(head2qp)) per NP.
        if head not in head2qp:
            # NOTE(review): a newly seen head gets no updateDocs/updateCount
            # call here, unlike process(); presumably the QuasiPronoun
            # constructor accounts for the first occurrence -- confirm.
            head2qp[head] = QuasiPronoun(head)
        else:
            head2qp[head].updateDocs(f)
            head2qp[head].updateCount()

        qp = head2qp[head]
        if ace_np["GOLD_SINGLETON"]:
            qp.singleton += 1
            # Singleton whose text begins with an indefinite article.
            if text.startswith(("a ", "an ")):
                qp.faux_ba += 1
        elif USE_GOLD:
            # does it start the chain?
            process_gold(f, np, head, text, head2qp, gold_chains)
        process_syntax(f, np, head, text, head2qp, stanford_deps)
        heads.extend(list(map(string.strip, headFile.readlines())))

    files = []
    with open(sys.argv[2], 'r') as inFile:
        files.extend([x for x in inFile.readlines() if not x.startswith("#")])
    prog = ProgressBar(len(files))

    heads2nps = defaultdict(list)
    i = 0
    for f in files:
        prog.update_time(i)
        i += 1
        sys.stderr.write("\r%s" % (str(prog)))
        sys.stderr.flush()
        f = f.strip()
        nps = reconcile.getNPs(f)
        for np in nps:
            if specificity_utils.isNominal(np) or \
                    specificity_utils.isPronoun(np) or \
                    np["DATE"] != "NONE":
                head = specificity_utils.getHead(
                    utils.textClean(np.getText()).lower())
                if head.find(" and ") > -1:
                    continue
                if head in heads:
                    heads2nps[head].append(utils.textClean(np.getText()))
    sys.stderr.write("\r \r\n")

    for head in list(heads2nps.keys()):
        counts = {}
        definite = False
Ejemplo n.º 4
0
    elif d == 3:
        return "CCFFFF"
    elif d == 4:
        return "66FFFF"
    else:
        return "FFFF00"
    return "FFFF00"


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: %s <file-dir>" % (sys.argv[0]))
        sys.exit(1)

    #NOTE: assumes that the nps file will only have gold_nps in it.
    gold_nps = reconcile.getNPs(sys.argv[1])
    labels = {}
    buf = {}

    rawText = ""
    with open(sys.argv[1] + "/raw.txt", 'r') as txtFile:
        rawText = ''.join(txtFile.readlines())

    for np in gold_nps:
        #print np.ppprint()
        key = "{0}:{1}".format(np.getStart(), np.getEnd())
        #may need to get head for a lot of NPs.
        text = utils.textClean(np.getText().lower())
        if text in data.ALL_PRONOUNS:
            #if text in ("it","its","they","them","their"):
            #    labels[key] = 0
Ejemplo n.º 5
0
    print("Processing documents...")
    for f in files:
        d += 1
        #progress bar updates
        sys.stdout.write("\r")
        prog.update_time(d)
        sys.stdout.write("\r%s" % (str(prog)))
        sys.stdout.flush()
        #print "Processing document %d/%d" % (d, len(files))
        gold_chains = reconcile.getGoldChains(sys.argv[1] + "/" + f)
        total_coref_chains += len(list(gold_chains.keys()))

        with open(sys.argv[1] + "/" + f + "/raw.txt", 'r') as raw_txt_file:
            raw_txt = ''.join(raw_txt_file.readlines())

        nps = reconcile.getNPs(sys.argv[1] + "/" + f)
        nes = reconcile.getNEs(sys.argv[1] + "/" + f)
        sun_nes = reconcile.getSundanceNEs(sys.argv[1] + "/" + f)
        features = feature_utils.getFeatures(sys.argv[1] + "/" + f, fdir)
        total_feature_pairs += len(list(features.keys()))

        doc_stats = {}
        doc_stats = process_raw_txt(raw_txt, sent_tokenizer)
        doc_stats["common2common"] = 0
        doc_stats["common2proper"] = 0
        doc_stats["common2pronoun"] = 0
        doc_stats["proper2common"] = 0
        doc_stats["proper2proper"] = 0
        doc_stats["proper2pronoun"] = 0
        doc_stats["pronoun2common"] = 0
        doc_stats["pronoun2proper"] = 0
Ejemplo n.º 6
0
def process(f, head2qp, annotated_file):
    """Update per-head statistics from the NPs of one document.

    Two NP sources are supported:

    * ``annotated_file`` truthy: NPs come from ``reconcile.getNPs``; heads
      are derived from the NP text (pronouns) or from
      ``qp_utils.getHead2`` over the NP's POS tags (nominals).  When the
      module-level ``USE_GOLD`` switch is set, ``process_gold`` is also
      run for every processed NP.
    * ``annotated_file`` falsy: NPs come from ``reconcile.getStanfordNPs``
      and their ``"is_pronoun"`` / ``"is_nominal"`` / ``"HEAD"``
      attributes are used directly.

    In both cases the module-level ``PRONOUNS`` switch selects whether
    pronouns or nominals are processed; all other NPs are skipped.

    Args:
        f: Document handle forwarded unchanged to the ``reconcile``
            loaders, ``process_gold`` and ``process_syntax`` (presumably a
            document directory path -- confirm against callers).
        head2qp: Mapping from head string to ``QuasiPronoun``; it is
            mutated in place.
        annotated_file: Selects the NP source as described above; it is
            also the flag value passed to ``updateDocs``/``updateCount``.

    Returns:
        None.
    """
    stanford_deps = reconcile.getStanfordDep(f)
    pos = reconcile.getPOS(f)
    if annotated_file:
        nps = reconcile.getNPs(f)
        # Gold chains are loaded lazily and at most once per call instead
        # of being re-read for every NP; processACE likewise shares one
        # gold_chains object across all NPs of a document.
        gold_chains = None
        for np in nps:
            if PRONOUNS:
                if not qp_utils.isPronoun(np):
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                if not qp_utils.isNominal(np, pos):
                    continue
                text = utils.textClean(np.getText())
                np_tags = pos.getSubset(np.getStart(), np.getEnd())
                head = utils.textClean(
                    qp_utils.getHead2(text.lower(), np_tags))

            # bookkeeping
            # Membership is tested on the mapping itself rather than on a
            # freshly built key list.  The original call order of the two
            # update methods is kept in each branch.
            if head not in head2qp:
                qp = QuasiPronoun(head)
                head2qp[head] = qp
                qp.updateCount(True)
                qp.updateDocs(f, True)
            else:
                qp = head2qp[head]
                qp.updateDocs(f, True)
                qp.updateCount(True)

            if USE_GOLD:
                if gold_chains is None:
                    gold_chains = reconcile.getGoldChains(f)
                process_gold(f, np, head, text, head2qp, gold_chains)
            process_syntax(f, np, head, text, head2qp, stanford_deps)
    else:
        stanford_nps = reconcile.getStanfordNPs(f)
        for np in stanford_nps:
            if PRONOUNS:
                if not np["is_pronoun"]:
                    continue
                head = np.getText().lower()
                text = np.getText()
            else:
                np_head = np["HEAD"]
                # skip some problems with the parser or numbers
                if (np_head.startswith("$") or np_head.endswith("%")
                        or np_head == "."):
                    continue

                if not np["is_nominal"]:
                    continue
                text = utils.textClean(np.getText())
                head = np_head.lower()

            # bookkeeping
            # Both original branches performed the same two updates in the
            # same order, so only the creation step is conditional.
            if head not in head2qp:
                head2qp[head] = QuasiPronoun(head)
            qp = head2qp[head]
            qp.updateDocs(f, False)
            qp.updateCount(False)
            process_syntax(f, np, head, text, head2qp, stanford_deps)