Example #1
import codecs

# Assumed client: pycorenlp's StanfordCoreNLP, whose constructor takes the
# server URL. Corpus and Document come from the project's own modules.
from pycorenlp import StanfordCoreNLP


def process_documents(corpus_path):
    """Read a tab-separated file of abstracts (PMID<TAB>abstract text),
    process each one with a CoreNLP server, and pickle the corpus to disk
    in chunks of 1000 documents."""
    corpus = Corpus(corpus_path)
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for line in docfile:
            print(lcount)
            # The first 10 characters of the line serve as an approximate
            # duplicate key: repeated abstracts are skipped.
            if line[:10] in starts:
                print("repeated abstract:", line[:10])
                continue
            lcount += 1
            starts.add(line[:10])
            values = line.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            corpus.documents["PMID" + pmid] = newdoc
            # Every 1000 documents, save the current chunk and start a
            # fresh corpus to keep memory usage bounded.
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path, lcount // 1000))
                corpus = Corpus(corpus_path)
    # Save any remaining documents under the next chunk index, so the final
    # partial chunk does not overwrite the last full one.
    if corpus.documents:
        corpus.save("{}_{}.pickle".format(corpus_path, lcount // 1000 + 1))