def main() -> None:
    """Application entry point: mine bigram collocations from a POS-tagged corpus.

    Pipeline: configure logging, fetch Russian stop words, load the corpus of
    CoNLL-style ``.tags`` files, build a bigram contingency table over all
    lower-cased tokens, report samples, then re-report after filtering out
    rare bigrams, punctuation and stop words.
    """
    corpus_dir = Path('corpus')

    # Route results to a log file stored next to the corpus.
    global _logger
    setup_logger(_logger, corpus_dir / 'collocations.log')

    # Fetch the Russian stop-word list into the project-local NLTK data dir.
    nltk.download('stopwords', '.env/share/nltk_data')
    ru_stopwords = set(stopwords.words('russian'))

    # Load the tagged corpus: word in column 0, POS tag in column 4.
    tagging_dir = corpus_dir / 'pos_tagging'
    corpus = ConllCorpusReader(
        str(tagging_dir),
        [path.name for path in tagging_dir.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    doc_ids = corpus.fileids()
    first_doc = doc_ids[0]
    _logger.info('Документов: %d', len(doc_ids))
    _logger.info('Токенов в первом документе (%s): %d',
                 first_doc, len(corpus.words(first_doc)))
    _logger.info('Загружаем предложения')
    sents = corpus.sents()

    # Build contingency tables for every word pair in the corpus.
    _logger.info('Считаем таблицу сопряжённости по всем словам')
    lowered = ([token.lower() for token in sent] for sent in tqdm(sents))
    finder = BigramCollocationFinder.from_documents(lowered)
    _logger.info('Всего биграм: %d', finder.N)
    print_samples(finder)

    # Now drop rare bigrams and strip punctuation / stop words.
    _logger.info(
        'Отфильтруем пунктуацию, стоп-слова и установим предел по частоте')

    def is_noise(word):
        # Tokens shorter than 3 chars cover punctuation and most particles.
        return len(word) < 3 or word in ru_stopwords

    finder.apply_freq_filter(5)
    finder.apply_word_filter(is_noise)
    _logger.info('Всего биграм: %d', finder.N)
    print_samples(finder)
def sents(self, fileids=None, categories=None):
    """Return the corpus sentences, resolving categories to file ids first.

    Follows NLTK's categorized-reader delegation pattern: *categories*
    (if given) is mapped to the corresponding file ids via ``_resolve``
    before handing off to the plain ``ConllCorpusReader`` implementation.
    """
    resolved_fileids = self._resolve(fileids, categories)
    return ConllCorpusReader.sents(self, resolved_fileids)
            # Tail of the Viterbi decoder — the function header and the
            # enclosing `for t ...:` / `for y in states:` loops begin before
            # this chunk, so the indentation here is reconstructed from a
            # flattened source line; NOTE(review): verify against the full file.
            # Best predecessor for tag y at step t: maximize previous path
            # log-probability plus the y0 -> y transition log-probability.
            (prob, state) = max((V[t-1][y0] + trans_p.logprob((y0, y)), y0)
                                for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        # Don't need to remember the old paths
        path = newpath

    # Return the most likely sequence over the given time frame
    n = len(obs) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return path[state]


# Tag every corpus sentence with four model variants. The file names written
# below ground Vit1 = unsmoothed and Vit2 = Laplace; the GT/MLE suffixes on
# Vit3/Vit4 presumably mean Good-Turing and MLE parameters — TODO confirm.
Vit1 = []
Vit2 = []
Vit3 = []
Vit4 = []
for sent in conllreader.sents():
    # Each entry pairs the words of a sentence with the predicted tag sequence.
    # NOTE(review): in Python 3 `zip` objects are single-use iterators, so each
    # VitN list can only be written out once.
    Vit1.append(zip(sent, viterbi(sent, states, Train.A0j, Train.Aij, Train.Biw)))
    Vit2.append(zip(sent, viterbi(sent, states, Train.A0jLap, Train.AijLap, Train.BiwLap)))
    Vit3.append(zip(sent, viterbi(sent, states, Train.A0jGT, Train.AijGT, Train.BiwGT)))
    Vit4.append(zip(sent, viterbi(sent, states, Train.A0jMLE, Train.AijMLE, Train.BiwMLE)))


# function for writing tagged corpora to files in CoNLL format
def write_conll(filename, tagged_corpus):
    """Write `tagged_corpus` to `filename`: one word<TAB>tag line per token,
    sentences separated by a blank line."""
    with open(filename, 'w') as out_file:
        for tagged_sent in tagged_corpus:
            # Join each (word, tag) pair with a tab.
            tagged_words = ('\t'.join(w_t) for w_t in tagged_sent)
            out_file.write('\n'.join(tagged_words) + '\n\n')


write_conll('unsmoothed.tt', Vit1)
write_conll('laplace.tt', Vit2)
            # Tail of the Viterbi decoder — the function header and the
            # enclosing loops begin before this chunk, so the indentation here
            # is reconstructed from a flattened source line; NOTE(review):
            # verify against the full file. (This chunk overlaps the previous
            # one — apparently two views of the same script.)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        # Don't need to remember the old paths
        path = newpath

    # Return the most likely sequence over the given time frame
    n = len(obs) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return path[state]


# Tag every corpus sentence with four model variants (A0j/Aij/Biw appear to be
# start/transition/emission parameters; Lap/GT/MLE presumably name the
# smoothing scheme — TODO confirm against Train's definition).
Vit1 = []
Vit2 = []
Vit3 = []
Vit4 = []
for sent in conllreader.sents():
    # Each entry pairs sentence words with the predicted tag sequence.
    # NOTE(review): `zip` objects are single-use in Python 3, so each VitN
    # list can only be consumed once.
    Vit1.append(
        zip(sent, viterbi(sent, states, Train.A0j, Train.Aij, Train.Biw)))
    Vit2.append(
        zip(sent, viterbi(sent, states, Train.A0jLap, Train.AijLap,
                          Train.BiwLap)))
    Vit3.append(
        zip(sent, viterbi(sent, states, Train.A0jGT, Train.AijGT,
                          Train.BiwGT)))
    Vit4.append(
        zip(sent, viterbi(sent, states, Train.A0jMLE, Train.AijMLE,
                          Train.BiwMLE)))


# function for writing tagged corpora to files in CoNLL format
# NOTE(review): the body of this function continues past the end of this chunk.
def write_conll(filename, tagged_corpus):