Beispiel #1
0
    #sys.stdout.write("\r")
    #prog = ProgressBar(len(files))
    i = 0
    for f in files:
        # Entries beginning with "#" are skipped outright and do not
        # advance the counter.
        if f.startswith("#"):
            continue

        # (Per-file progress-bar output used to be emitted here; it is
        # currently disabled.)

        i += 1
        f = f.strip()

        # Load the document, fetch its gold NPs and gold chains, and attach
        # the chains to the document object.
        doc = Document(f)
        gold_nps = reconcile.getNPs(f)
        gold_chains = reconcile.getGoldChains(f)
        doc.addGoldChains(gold_chains)

        for np in gold_nps:
            text = utils.textClean(np.getText().lower()).strip()

            # Exactly one selection test applies per run: membership of the
            # cleaned text in TRUE when TRUE_PRONOUNS is set, otherwise the
            # nominal check on the NP itself. The extra head-based filters
            # (percents, trailing digits, conjunctions) are disabled.
            if TRUE_PRONOUNS:
                selected = text in TRUE
            else:
                selected = specificity_utils.isNominal(np)
            if not selected:
                continue

            add_stats(text, np, doc, nouns, head2text)

    #sys.stdout.write("\r \r\n")
    # Read every line of the list file, release the handle, then drop the
    # entries that are commented out with a leading "#".
    all_lines = fileList.readlines()
    fileList.close()
    files = [line for line in all_lines if not line.startswith("#")]
    for f in files:
        f = f.strip()
        print("Working on document: %s" % f)

        #load in the document statistics
        d = Document(f)

        #the total number of sentences in this text file. 
        #?double check with nltk?
        total_sentences_doc = len(reconcile.getSentences(f))

        #process a document, get all the nominal stats that are requested. 
        gold_chains = reconcile.getGoldChains(f, True)
        d.addGoldChains(gold_chains)

        for gc in list(gold_chains.keys()):
            base_antecedent = True
            previous_semantic_tag = ""
            prev_sent = -1
            prev_tile = -1
            prev_type = ""

            for mention in gold_chains[gc]:
                if HEADS_ONLY:
                    # Collapse whitespace runs in the head text to single
                    # spaces. str.split() with no arguments already drops
                    # leading/trailing whitespace and yields tokens that
                    # contain none, so no per-token strip is required (the
                    # old string.strip function does not exist in Python 3
                    # and made this branch raise AttributeError).
                    head_clean = ' '.join(mention.getATTR("HEAD_TEXT").split())
                    # The identifier is taken from the raw head text (only
                    # stripped at the ends and lower-cased), not from
                    # head_clean: an earlier assignment of
                    # head_clean.lower() was immediately overwritten by this
                    # one, so this is the value that was actually in effect.
                    # NOTE(review): confirm whether the whitespace-normalised
                    # form was the intended identifier.
                    text_ident = mention.getATTR("HEAD_TEXT").strip().lower()
                else: