def parse_open_corpora(self, path_to_corpus='./annot.opcorpora.xml'):
    """
    Parse an OpenCorpora XML dump into a doc-term matrix plus per-document metadata.

    :param path_to_corpus: path to the ``annot.opcorpora.xml`` file
    :return: tuple of
        - doc-term matrix [num_docs, num_words] (csr sparse),
        - vocabulary: dict of used words -> column index,
        - year: list with one year per document (``-1`` when missing),
        - topic: list with one lower-cased topic per document (``'UNK'`` when missing),
        - close_word_pairs: result of ``self._get_close_word_pairs``
    """
    corpus = opencorpora.CorpusReader(path_to_corpus)
    documents = []
    year = []
    topic = []

    # Hoisted out of the loop: the patterns are loop-invariant, so compiling
    # them once avoids a re.compile per document. Category lines look like
    # "Год:<year>" and "Тема:<topic>" (Russian for "Year:"/"Topic:").
    regex_year = re.compile("Год:.*")
    regex_topic = re.compile("Тема:.*")

    for document in corpus.iter_documents():

        document_words = self._get_document_words(document)

        # Skip documents that yielded no usable words.
        if not document_words:
            continue

        documents.append(" ".join(document_words))
        doc_categories = document.categories()

        # Process metadata: first category line matching each pattern wins.
        doc_year = [
            m.group(0) for l in doc_categories
            for m in [regex_year.search(l)] if m
        ]
        if doc_year:
            year.append(int(doc_year[0].split(':')[-1]))
        else:
            year.append(-1)  # sentinel for "year unknown"

        doc_topic = [
            m.group(0) for l in doc_categories
            for m in [regex_topic.search(l)] if m
        ]
        if doc_topic:
            topic.append(doc_topic[0].split(':')[-1].lower())
        else:
            topic.append('UNK')  # sentinel for "topic unknown"

    doc_term_matr = self._tf_vectorizer.fit_transform(documents)

    vocabulary = self._tf_vectorizer.vocabulary_

    close_word_pairs = self._get_close_word_pairs(corpus, vocabulary)

    return doc_term_matr, vocabulary, year, topic, close_word_pairs
# Example #2
# 0
def estimate_conditional_tag_probability(morph, corpus_filename, logger=None):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.

    :param morph: morphological analyzer exposing ``tag(word)``
    :param corpus_filename: path to the OpenCorpora xml dump
    :param logger: optional logger; defaults to this module's logger
    :return: ``(cpd, cfd)`` — the conditional probability distribution and
        the underlying conditional frequency distribution
    """
    import nltk
    import opencorpora

    if logger is None:
        logger = logging.getLogger(__name__)

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            # Deliberately does NOT call the parent __init__: the parent's
            # factory protocol has no 'condition' argument (see issue above).
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    disambig_words = list(
        with_progress(_disambiguated_words(reader),
                      "Reading disambiguated words from corpus"))

    disambig_words = with_progress(disambig_words,
                                   "Filtering out non-ambiguous words")

    # Hoisted loop-invariant: the original rebuilt set(['UNKN']) for every
    # single word in the filtering comprehension.
    unknown = {'UNKN'}
    ambiguous_words = [
        (w, gr)
        for (w, gr) in ((w.lower(), tag2grammemes(t))
                        for (w, t) in disambig_words
                        if len(morph.tag(w)) > 1)
        if gr != unknown
    ]

    logger.info("Computing P(t|w)")

    def probdist_factory(fd, condition):
        # Laplace smoothing over at least as many bins as the analyzer
        # proposes tags for this word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
# Example #3
# 0
    def parse_open_corpora(self, path_to_corpus='./annot.opcorpora.xml'):
        """
        Parse an OpenCorpora XML dump.

        :param path_to_corpus: path to xml file
        :return: doc-term matrix: [num_docs, num_words]: csr sparse
        :return: vocab: dict of used words
        :return: year: list of year of documents
        :return: topic: list of topics
        """
        corpus = opencorpora.CorpusReader(path_to_corpus)
        documents, year, topic = [], [], []

        for doc in corpus.iter_documents():
            words = self._get_document_words(doc)

            # Documents with no usable words contribute nothing.
            if not words:
                continue

            documents.append(" ".join(words))

            doc_year, doc_topic = self._get_metadata(doc.categories())
            year.append(doc_year)
            topic.append(doc_topic)

        doc_term_matr = self._tf_vectorizer.fit_transform(documents)
        vocabulary = self._tf_vectorizer.vocabulary_
        close_word_pairs = self._get_close_word_pairs(corpus, vocabulary)

        return doc_term_matr, vocabulary, year, topic, close_word_pairs
# Example #4
# 0
def estimate_conditional_tag_probability(morph, corpus_filename):
    """
    Estimate P(t|w) based on OpenCorpora xml dump.

    Probability is estimated based on counts of disambiguated
    ambiguous words, using simple Laplace smoothing.

    :param morph: morphological analyzer exposing ``tag(word)``
    :param corpus_filename: path to the OpenCorpora xml dump
    :return: ``(cpd, cfd)`` — the conditional probability distribution and
        the underlying conditional frequency distribution
    """
    import nltk
    import opencorpora

    class _ConditionalProbDist(nltk.ConditionalProbDist):
        """
        This ConditionalProbDist subclass passes 'condition' variable to
        probdist_factory. See https://github.com/nltk/nltk/issues/500
        """
        def __init__(self, cfdist, probdist_factory):
            # Deliberately does NOT call the parent __init__: the parent's
            # factory protocol has no 'condition' argument (see issue above).
            self._probdist_factory = probdist_factory
            for condition in cfdist:
                self[condition] = probdist_factory(cfdist[condition],
                                                   condition)

    reader = opencorpora.CorpusReader(corpus_filename)

    # Hoisted loop-invariant: the original rebuilt set(['UNKN']) for every
    # single word in the filtering generator.
    unknown = {'UNKN'}
    ambiguous_words = ((w.lower(), tag2grammemes(t))
                       for (w, t) in _disambiguated_words(reader)
                       if len(morph.tag(w)) > 1)
    ambiguous_words = ((w, gr) for (w, gr) in ambiguous_words
                       if gr != unknown)

    def probdist_factory(fd, condition):
        # Laplace smoothing over at least as many bins as the analyzer
        # proposes tags for this word.
        bins = max(len(morph.tag(condition)), fd.B())
        return nltk.LaplaceProbDist(fd, bins=bins)

    cfd = nltk.ConditionalFreqDist(ambiguous_words)
    cpd = _ConditionalProbDist(cfd, probdist_factory)
    return cpd, cfd
# Example #5
# 0
def get_corpus_revision(path):
    """Return the 'revision' field of the annotation info of the corpus at *path*."""
    reader = opencorpora.CorpusReader(path)
    info = reader.get_annotation_info()
    return info['revision']
import nltk 
from nltk4russian.tagger import PMContextTagger
from nltk4russian.util import read_corpus_to_nltk
from nltk.tag.brill import *
import nltk.tag.brill_trainer as bt
import opencorpora

# Brill templates: reset any previously registered templates, then load the
# standard fntbl37 template set.
Template._cleartemplates() 
templates = nltk.tag.brill.fntbl37()

# Read the OpenCorpora subcorpus and filter out empty sentences.
corpus = opencorpora.CorpusReader('annot.opcorpora.no_ambig.xml')
sents_OC = list(filter(lambda x: len(x), corpus.iter_tagged_sents()))

# Read the RNC (Russian National Corpus) subcorpus (media1.tab; other files
# from the nltk4russian/data folder can be plugged in here instead).
with open('media1.tab', encoding='utf-8') as media:
    sents_RNC = list(read_corpus_to_nltk(media))

# Read the LENTA subcorpus.
with open('LENTA_RNC.txt', encoding='utf-8') as LENTA:
    sents1 = list(read_corpus_to_nltk(LENTA))

# Read the VK subcorpus.
with open('VK_RNC.txt', encoding='utf-8') as VK:
    sents2 = list(read_corpus_to_nltk(VK))

# Read the JZ subcorpus.
with open('JZ_RNC.txt', encoding='utf-8') as JZ:
    sents3 = list(read_corpus_to_nltk(JZ))
    
# Example #7
# 0
import opencorpora

# Print every document in the annotated OpenCorpora dump.
corpus = opencorpora.CorpusReader('annot.opcorpora.xml')
for document in corpus.iter_documents():
    print(document)