Python Corpus.Document Examples

Programming Language: Python

Class/Type: Corpus

Method/Function: Document

Examples at hotexamples.com: 2

Python Corpus.Document - 2 examples found. These are the top-rated real-world Python examples of Corpus.Document from the package differentiable-plasticity, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

Corpus(12)

TRECReader(8)

TRECWriter(5)

Word(5)

Sentence(4)

Document(2)

MonoCorpus(2)

Vocabulary(2)

ValCorpus(1)

list_of_documents(1)

sentences(1)

setname(1)

Example #1

Show file

 def process(self, title, text):
     import Corpus
     self.count += 1
     title = title.replace(' ', '_').encode('utf8')
     text = text.encode('utf8')
     if self.name_set.__contains__(title):
         self.writer.write(Corpus.Document(str(self.id), '<title>%s</title>\n%s' % (title, text)))
         print self.count, self.id, title
         self.id += 1

Example #2

Show file

File: crf.py Project: jinghe/hunter-gatherer

def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    pattern_set = set(
        map(lambda line: line.split()[0],
            open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print ' '.join(command)
    subprocess.call(command)

    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indecies = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        tagged_text = apply_tag(trec_path, tagged_text, model_dir, pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)

    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        doc = Corpus.Document(
            ids[i], tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    global prune_t, label_t
    print time.time() - t, prune_t, label_t