Example #1
import codecs

# Assumed client: pycorenlp's StanfordCoreNLP, whose constructor takes the
# server URL. Corpus and Document come from the project's own modules.
from pycorenlp import StanfordCoreNLP


def process_documents(corpus_path):
    """Read a tab-separated file of abstracts (PMID<TAB>abstract text),
    process each one with a CoreNLP server, and pickle the corpus to disk
    in chunks of 1000 documents."""
    corpus = Corpus(corpus_path)
    corenlp_client = StanfordCoreNLP('http://localhost:9000')
    lcount = 0
    starts = set()
    with codecs.open(corpus_path, 'r', 'utf-8') as docfile:
        for line in docfile:
            print(lcount)
            # The first 10 characters of the line serve as an approximate
            # duplicate key: repeated abstracts are skipped.
            if line[:10] in starts:
                print("repeated abstract:", line[:10])
                continue
            lcount += 1
            starts.add(line[:10])
            values = line.strip().split("\t")
            pmid = values[0]
            abs_text = " ".join(values[1:])
            newdoc = Document(abs_text, did="PMID" + pmid)
            newdoc.process_document(corenlp_client)
            corpus.documents["PMID" + pmid] = newdoc
            # Every 1000 documents, save the current chunk and start a
            # fresh corpus to keep memory usage bounded.
            if lcount % 1000 == 0:
                corpus.save("{}_{}.pickle".format(corpus_path, lcount // 1000))
                corpus = Corpus(corpus_path)
    # Save any remaining documents under the next chunk index, so the final
    # partial chunk does not overwrite the last full one.
    if corpus.documents:
        corpus.save("{}_{}.pickle".format(corpus_path, lcount // 1000 + 1))