Beispiel #1
0
class Loader(object):
    """
    Класс для построения GrammemeVectorizer и WordFormVocabulary по корпусу
    """
    def __init__(self, gram_dump_path, word_dump_path):
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_vocabulary = WordVocabulary(word_dump_path)
        self.morph = pymorphy2.MorphAnalyzer()

    def parse_corpora(
            self,
            filenames: List[str]) -> Tuple[GrammemeVectorizer, WordVocabulary]:
        """
        Построить WordFormVocabulary, GrammemeVectorizer по корпусу

        :param filenames: пути к файлам корпуса.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.grammeme_vectorizer.init_possible_vectors()
        return self.grammeme_vectorizer, self.word_vocabulary

    def __process_line(self, line: str) -> None:
        text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
        self.word_vocabulary.add_word(text)
        self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
        to_ud = converters.converter('opencorpora-int', 'ud14')
        for parse in self.morph.parse(text):
            ud_tag = to_ud(str(parse.tag), text)
            pos = ud_tag.split()[0]
            gram = ud_tag.split()[1].split("|")
            dropped = ["Animacy", "Aspect", "NumType"]
            gram = [
                grammem for grammem in gram
                if sum([drop in grammem for drop in dropped]) == 0
            ]
            gram = "|".join(gram)
            self.grammeme_vectorizer.add_grammemes(pos, gram)
class CorporaInformationLoader(object):
    """
    Класс для построения GrammemeVectorizer и WordFormVocabulary по корпусу
    """
    def __init__(self):
        self.grammeme_vectorizer = GrammemeVectorizer()
        self.word_form_vocabulary = WordFormVocabulary()
        self.lemma_to_word_forms = defaultdict(
            set)  # type: Dict[str, Set[WordForm]]
        self.lemma_case = {}
        self.lemma_counter = Counter()  # type: Counter

    def parse_corpora(
            self, filenames: List[str]
    ) -> Tuple[WordFormVocabulary, GrammemeVectorizer]:
        """
        Построить WordFormVocabulary, GrammemeVectorizer по корпусу

        :param filenames: пути к файлам корпуса.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.__add_seq_end()
        self.grammeme_vectorizer.init_possible_vectors()
        self.word_form_vocabulary.init_by_vocabulary(self.lemma_counter,
                                                     self.lemma_to_word_forms,
                                                     self.lemma_case)
        self.word_form_vocabulary.lemma_indices[SEQ_END_WF] = 1
        return self.word_form_vocabulary, self.grammeme_vectorizer

    def __add_seq_end(self):
        self.lemma_to_word_forms[SEQ_END].add(SEQ_END_WF)
        self.lemma_case[SEQ_END] = SEQ_END_WF.case
        self.lemma_counter[SEQ_END] = sys.maxsize

    def __process_line(self, line: str) -> None:
        try:
            text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
            lemma = lemma.lower() + '_' + pos_tag
            gram_vector_index = self.grammeme_vectorizer.add_grammemes(
                pos_tag, grammemes)
            self.lemma_to_word_forms[lemma].add(
                WordForm(lemma, gram_vector_index, text.lower()))
            self.lemma_counter[lemma] += 1
            self.__update_lemma_case(lemma, text)
        except ValueError:
            pass

    def __update_lemma_case(self, lemma: str, text: str) -> None:
        if lemma not in self.lemma_case:
            self.lemma_case[lemma] = LemmaCase.UPPER_CASE if text.isupper() else \
                              LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.UPPER_CASE:
            if not text.isupper():
                self.lemma_case[lemma] = LemmaCase.PROPER_CASE if text[
                    0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.PROPER_CASE:
            if not text[0].isupper():
                self.lemma_case[lemma] = LemmaCase.NORMAL_CASE