def parse_open_corpora(self, path_to_corpus='./annot.opcorpora.xml'):
    """
    Parse an OpenCorpora XML dump into a document-term matrix plus metadata.

    :param path_to_corpus: path to the corpus xml file
    :return: doc-term matrix: [num_docs, num_words]: csr sparse
    :return: vocab: dict of used words
    :return: year: list of year of documents (-1 when no year category)
    :return: topic: list of topics ('UNK' when no topic category)
    :return: close_word_pairs: pairs of co-occurring vocabulary words
    """
    # Compile the metadata patterns once instead of once per document.
    regex_year = re.compile("Год:.*")
    regex_topic = re.compile("Тема:.*")

    corpus = opencorpora.CorpusReader(path_to_corpus)
    documents = []
    year = []
    topic = []
    for document in corpus.iter_documents():
        document_words = self._get_document_words(document)
        # Skip documents that yield no usable words.
        if not document_words:
            continue
        documents.append(" ".join(document_words))

        doc_categories = document.categories()

        # First category matching "Год:..." (None if absent).
        year_match = next(
            (m.group(0) for m in map(regex_year.search, doc_categories) if m),
            None)
        year.append(int(year_match.split(':')[-1]) if year_match else -1)

        # First category matching "Тема:..." (None if absent).
        topic_match = next(
            (m.group(0) for m in map(regex_topic.search, doc_categories) if m),
            None)
        topic.append(topic_match.split(':')[-1].lower() if topic_match
                     else 'UNK')

    doc_term_matr = self._tf_vectorizer.fit_transform(documents)
    vocabulary = self._tf_vectorizer.vocabulary_
    close_word_pairs = self._get_close_word_pairs(corpus, vocabulary)
    return doc_term_matr, vocabulary, year, topic, close_word_pairs
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist subclass that forwards the 'condition'
        variable to probdist_factory.
        See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for cond in cfdist:
                self[cond] = probdist_factory(cfdist[cond], cond)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Materialize the disambiguated tokens with a progress indicator.
    disambig_words = list(
        with_progress(_disambiguated_words(reader),
                      "Reading disambiguated words from corpus"))

    disambig_words = with_progress(disambig_words,
                                   "Filtering out non-ambiguous words")

    # Keep lowercased words that are truly ambiguous and have known grammemes.
    ambiguous_words = [
        (word, grams)
        for (word, grams) in ((w.lower(), tag2grammemes(t))
                              for (w, t) in disambig_words
                              if len(morph.tag(w)) > 1)
        if grams != {'UNKN'}
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        # Bin count covers every tag the analyzer knows for this word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
def parse_open_corpora(self, path_to_corpus='./annot.opcorpora.xml'):
    """
    Parse corpora.

    :param path_to_corpus: path to xml file
    :return: doc-term matrix: [num_docs, num_words]: csr sparse
    :return: vocab: dict of used words
    :return: year: list of year of documents
    :return: topic: list of topics
    """
    corpus = opencorpora.CorpusReader(path_to_corpus)
    documents, year, topic = [], [], []

    for document in corpus.iter_documents():
        words = self._get_document_words(document)
        # Documents without usable words are dropped entirely.
        if not words:
            continue
        documents.append(" ".join(words))

        # Year/topic metadata extracted from the document categories.
        doc_year, doc_topic = self._get_metadata(document.categories())
        year.append(doc_year)
        topic.append(doc_topic)

    doc_term_matr = self._tf_vectorizer.fit_transform(documents)
    vocabulary = self._tf_vectorizer.vocabulary_
    close_word_pairs = self._get_close_word_pairs(corpus, vocabulary)
    return doc_term_matr, vocabulary, year, topic, close_word_pairs
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        ConditionalProbDist subclass that forwards the 'condition'
        variable to probdist_factory.
        See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            self._probdist_factory = probdist_factory
            for cond in cfdist:
                self[cond] = probdist_factory(cfdist[cond], cond)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Lazily lowercase and tag-convert only the truly ambiguous tokens.
    ambiguous_words = (
        (w.lower(), tag2grammemes(t))
        for (w, t) in _disambiguated_words(reader)
        if len(morph.tag(w)) > 1)
    # Drop tokens whose grammemes are entirely unknown.
    ambiguous_words = (
        (word, grams) for (word, grams) in ambiguous_words
        if grams != {'UNKN'})

    def probdist_factory(fd, condition):
        # Bin count covers every tag the analyzer knows for this word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
def get_corpus_revision(path):
    """Return the 'revision' field of the annotation info of the corpus at *path*."""
    reader = opencorpora.CorpusReader(path)
    info = reader.get_annotation_info()
    return info['revision']
import nltk
from nltk4russian.tagger import PMContextTagger
from nltk4russian.util import read_corpus_to_nltk
from nltk.tag.brill import *
import nltk.tag.brill_trainer as bt
import opencorpora

# Brill transformation templates; `Template` comes from the
# `nltk.tag.brill` wildcard import above.
Template._cleartemplates()
templates = nltk.tag.brill.fntbl37()

# Read the OpenCorpora (no-ambiguity) subcorpus and filter out empty sentences.
corpus = opencorpora.CorpusReader('annot.opcorpora.no_ambig.xml')
sents_OC = list(filter(lambda x: len(x), corpus.iter_tagged_sents()))

# Read the RNC subcorpus (media1.tab; other files from nltk4russian/data
# can be plugged in here as well):
with open('media1.tab', encoding='utf-8') as media:
    sents_RNC = list(read_corpus_to_nltk(media))

# Read the LENTA subcorpus.
with open('LENTA_RNC.txt', encoding='utf-8') as LENTA:
    sents1 = list(read_corpus_to_nltk(LENTA))

# Read the VK subcorpus.
with open('VK_RNC.txt', encoding='utf-8') as VK:
    sents2 = list(read_corpus_to_nltk(VK))

# Read the JZ subcorpus.
with open('JZ_RNC.txt', encoding='utf-8') as JZ:
    sents3 = list(read_corpus_to_nltk(JZ))
import opencorpora

# Print every document in the OpenCorpora dump, one per line.
corpus = opencorpora.CorpusReader('annot.opcorpora.xml')
for document in corpus.iter_documents():
    print(document)