Code example #1
# Method excerpt; assumes gensim 3.x (where Phrases.learn_vocab is a static
# method and vocab keys are bytes) and that self.vocab / self.vocab_reverse
# are trie.Trie() instances supporting add().
from gensim.models.phrases import Phrases

def _data_init(self, words, **kwargs):
  # Count characters and, with an empty delimiter, concatenated character
  # bigrams across the input words.
  _, vocab_counts, _ = Phrases.learn_vocab(words, 2000, delimiter=b'')
  vocab_counts = {k.decode('utf-8'): v for k, v in vocab_counts.items()}
  unigram_scores = {k: v for k, v in vocab_counts.items() if len(k) == 1}
  ngram_scores = {k: v for k, v in vocab_counts.items() if len(k) > 1}
  # Rank both vocabularies by frequency, most common first.
  unigrams = sorted(unigram_scores, key=lambda k: unigram_scores[k], reverse=True)
  ngrams = sorted(ngram_scores, key=lambda k: ngram_scores[k], reverse=True)
  unigram_limit = kwargs.get('unigram_limit')
  if unigram_limit is None:
    unigram_limit = len(unigrams)
  #TODO: determine best parameter for controlling bigram vocabulary
  #TODO: allow for n-grams?
  all_phrases = unigrams[:unigram_limit] + ngrams[:unigram_limit]  #TODO: determine bigram number some other way?
  self.vocab.add('')  # optional?
  self.vocab_reverse.add('')
  for p in all_phrases:
    self.vocab.add(p)
    self.vocab_reverse.add(p[::-1])  # reversed copy for the reverse trie
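
For reference, a minimal sketch of what the learn_vocab call above produces, assuming gensim 3.x; the toy words are illustrative:

from gensim.models.phrases import Phrases

# Each word is consumed as a character sequence, so with an empty delimiter
# the counts mix single characters and concatenated character bigrams.
toy_words = [list(w) for w in ('banana', 'bandana')]
_, vocab, _ = Phrases.learn_vocab(toy_words, 2000, delimiter=b'')
vocab = {k.decode('utf-8'): v for k, v in vocab.items()}
print(sorted(vocab.items()))
# e.g. [('a', 6), ('an', 4), ('b', 2), ('ba', 2), ('d', 1), ...]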
Code example #2
                        backend="multiprocessing",
                        prefer="processes")
    do = delayed(partial(tokenize_sentence_corpus, corpus_out_path))
    tasks = (do(i, batch) for i, batch in enumerate(partitions))

    executor(tasks)


# process_texts(documents_path, year='2020', court='01', corpus_out_path=unigram_sentences_path, batch_size=8, n_jobs=2,
#               debug=True)

# Module-level imports assumed by the rest of the excerpt:
from gensim.models.phrases import Phrases
from gensim.models.word2vec import LineSentence
from gensim.utils import any2unicode

stop_words = get_custom_stop_words()  # user-defined stop-word list

# Count unigram and candidate-bigram frequencies over the sentence corpus.
# In gensim 3.x, common_terms lists connector words that may appear inside a
# phrase (renamed connector_words in gensim 4.x).
pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=LineSentence(unigram_sentences_path),
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=100)

# Sort the vocabulary counts, most frequent first.
counters = sorted(counters.items(),
                  key=lambda key_value: key_value[1],
                  reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)  # vocab keys are bytes in gensim 3.x
print(count)

# Train the bigram model itself over the same corpus.
bigram_model = Phrases(LineSentence(unigram_sentences_path),
                       max_vocab_size=800000000,
                       common_terms=stop_words)  # final arguments assumed (the original call is truncated)
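
As a note on the input format: LineSentence streams a plain-text file with one sentence per line and whitespace-separated tokens. A minimal sketch (the file name is illustrative):

from gensim.models.word2vec import LineSentence

with open('unigram_sentences.txt', 'w', encoding='utf-8') as f:
    f.write('recurso de revista\n')
    f.write('agravo de instrumento provido\n')

for tokens in LineSentence('unigram_sentences.txt'):
    print(tokens)
# ['recurso', 'de', 'revista']
# ['agravo', 'de', 'instrumento', 'provido']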
Code example #3
# Sample sentences from Brazilian labor-court (TST) decisions; the list
# opening is reconstructed here (earlier items may be truncated).
sentences = [
    'tst - recurso de revista rr 1473005620085030137 (tst).',
    'data de publicação: 14/08/2015',
    'ementa: i - agravo de instrumento em recurso de revista da reclamada.',
    'justiça gratuita.',
    '"demonstrada divergência jurisprudencial específica, impõe-se o provimento do agravo de instrumento para determinar o processamento do recurso de revista da reclamada."',
    'agravo de instrumento provido.',
    'ii - recurso de revista da reclamada 1 - sindicato.',
    'substituição processual.'
]

# tokenized_sentences = [[word for word in sentence.split()] for sentence in sentences]
tokenized_sentences = [get_relevant_tokens(sentence) for sentence in sentences]  # get_relevant_tokens: user-defined tokenizer

pruned_words, counters, total_words = Phrases.learn_vocab(
    sentences=tokenized_sentences,
    max_vocab_size=800000000,
    common_terms=stop_words,
    progress_per=1)

counters = sorted(counters.items(),
                  key=lambda key_value: key_value[1],
                  reverse=True)

count = 0
for key, value in counters:
    count += 1
    print(any2unicode(key), value)
print(count)

bigram_model = Phrases(tokenized_sentences,
                       max_vocab_size=800000000,
                       common_terms=stop_words)  # final arguments assumed (the original call is truncated)
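
A minimal sketch of how the trained model would be applied, assuming the gensim 3.x API (min_count, threshold and the tiny corpus are illustrative): wrapped in a Phraser, it joins detected collocations with the delimiter, and common_terms lets the connector 'de' sit inside a phrase.

from gensim.models.phrases import Phrases, Phraser

docs = [['recurso', 'de', 'revista'],
        ['recurso', 'de', 'revista', 'provido']]
model = Phrases(docs, min_count=1, threshold=0.1, common_terms={'de'})
print(Phraser(model)[['recurso', 'de', 'revista', 'provido']])
# e.g. ['recurso_de_revista', 'provido'] once the pair scores above threshold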