def update_dicts(
        self, par
):  # par - what to iterate over, e.g. 'paragraph' (see coll for details)
    for file in os.listdir(self.dir):
        d1, d2 = coll.iter_by_docs(file, self.dir, par, 0)
        self.num_to_text.update(d1)
        self.text_to_num.update(d2)
        self.num_to_name.update(
            coll.iter_by_docs(file, self.dir, 'art_name', 1))
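
A usage sketch under stated assumptions: the method appears to live on an indexer-like object exposing dir, num_to_text, text_to_num and num_to_name; the CodexIndex name and its constructor are hypothetical, only update_dicts and the attribute names come from the snippet above.

idx = CodexIndex(dir="codexes")   # hypothetical wrapper class, not from the source
idx.update_dicts('paragraph')     # fill id <-> paragraph text and id -> article name maps
some_id = next(iter(idx.num_to_text))
print(idx.num_to_name[some_id], idx.num_to_text[some_id][:60])
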
Example #2
def docs_parser(dir):
    cash = dict()                            # cache dict handed to the tokenizer
    morph = MorphAnalyzer()                  # morphological analyzer (pymorphy2)
    stop_words = stopwords.words("russian")  # NLTK Russian stop-word list
    for file in os.listdir(dir):
        for i_d in iter_by_docs(file, dir, 'chapter', 1):
            t = Tokenizer(i_d)
def all_articles_in_codexes(dir: str) -> tp.List[tp.Tuple[str, str]]:
    # Returns a list of all articles
    all_codex_name = [filename for filename in os.listdir(dir)]
    ans_codexes: tp.List[tp.Tuple[str, str]] = []
    for codex in all_codex_name:
        ans_codexes.extend(
            coll.iter_by_docs(codex, dir, 'art_name2', 1).keys())
    return ans_codexes
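
A quick usage sketch for all_articles_in_codexes; the "codexes" directory name is borrowed from the calls further down and the output is illustrative only.

articles = all_articles_in_codexes("codexes")
print(len(articles))   # total number of articles across all codexes
print(articles[:3])    # first few entries (typed as (str, str) tuples above)
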
    def build_inversed_index(
            self, par):  # par - what to iterate over, e.g. 'paragraph'

        t = tqdm(total=len(os.listdir(self.dir)))
        for file in os.listdir(self.dir):
            for i_d in coll.iter_by_docs(file, self.dir, par, 1):
                self.tokenizer.text = i_d
                tokens = self.tokenizer.tokenize(self.cash, self.morph,
                                                 self.stop_words)
                # Postings list: token -> [(doc_id, term frequency), ...]
                for token in tokens:
                    posting = (self.text_to_num[i_d], tokens.count(token))
                    if token in self.inv_ind:
                        if posting not in self.inv_ind[token]:
                            self.inv_ind[token].append(posting)
                    else:
                        self.inv_ind[token] = [posting]
                # Length of this fragment in tokens
                self.num_to_len[self.text_to_num[i_d]] = len(tokens)
            t.update(1)
        t.close()
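
The loop above builds inv_ind as a mapping from token to a postings list of (doc_id, term frequency) pairs, while num_to_len records each fragment's length in tokens. A small lookup sketch under that assumption; the index object and the query token are placeholders, not part of the source.

postings = index.inv_ind.get('договор', [])   # [(doc_id, tf), ...]
for doc_id, tf in postings:
    doc_len = index.num_to_len[doc_id]        # fragment length in tokens
    print(doc_id, tf, doc_len)
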
Example #5
def dict_for_art_names(self):
    codex_path = os.path.join(PATH_TO_ROOT, "codexes")
    for cod in os.listdir(codex_path):
        _, art_n = coll.iter_by_docs(cod, codex_path, 'art_name', 1)
        self.art_names.update(art_n)
        # Keep only (codex, norm) references present in set_numbers,
        # mapping codex ids to lower-cased codex names
        new_codnorm = set()
        codnorm = co.cod_norm
        for cn in codnorm:
            cod = cn[0]
            norm = cn[1]
            for c in cod:
                if (str(c), norm) in set_numbers:
                    new_codnorm.add(
                        (nc.name_codexes[c].lower(), 'ст ' + norm[:-1]))
        co.cod_norm = list(new_codnorm)
        ans_dict[j] = dict()
        ans_dict[j]["Question"] = co.question
        ans_dict[j]["Answer_Lawyer"] = co.answer
        for i in range(len(co.cod_norm)):
            co.cod_norm[i] = ' '.join(co.cod_norm[i])
        ans_dict[j]["Answer"] = ', '.join(co.cod_norm)
        #json.dump(ans_dict, pic, ensure_ascii=False, indent=2)
        #pic.write('\n\n')
    json.dump(ans_dict, pic, indent=2)


codexes_to_json("codexes")

#norms_codexes_to_normal("codexes")
'''
coll.iter_by_docs()
for co in codexes_out:
    print(co)
print(len(codexes_out))
'''
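
Judging by the assignments above, each entry written by json.dump presumably has the shape sketched below; the key, texts and codex references are made-up values for illustration.

{
  "3": {
    "Question": "...",
    "Answer_Lawyer": "...",
    "Answer": "уголовный кодекс ст 105, гражданский кодекс ст 15"
  }
}

Note that without ensure_ascii=False (as in the commented-out call) the Cyrillic text ends up \u-escaped in the output file.
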
Example #7
import os

from tqdm import tqdm

from tools.relative_paths_to_directories import path_to_directories

PATH_TO_ROOT, PATH_TO_TOOLS, PATH_TO_FILES, PATH_TO_TF_IDF, PATH_TO_INV_IND, PATH_TO_BM_25, \
    PATH_TO_LEARNING_TO_RANK = path_to_directories(os.getcwd())

# directory containing the codex files
codexes_dir = os.path.join(PATH_TO_ROOT, "codexes")

tokenizer = Tokenizer()
simple_corp = SimpleCorp()

simple_corp_art_names = SimpleCorp()

# Corpus of full article texts, keyed by document id
for filename in tqdm(os.listdir(codexes_dir)):
    d1, _ = coll.iter_by_docs(filename, codexes_dir, 'article', 0)
    for doc_id, doc_text in d1.items():
        simple_corp.add_doc(doc_id, doc_text)

# Corpus of article names
for filename in tqdm(os.listdir(codexes_dir)):
    names = coll.iter_by_docs(filename, codexes_dir, 'art_name', 1)
    for doc_id, doc_text in names.items():
        simple_corp_art_names.add_doc(doc_id, doc_text)

# Tokenized copy of the article corpus
tokenized_corp = SimpleCorp()
tokenized_corp.make_from(simple_corp, tokenizer)

simple_corp.save('codexes_corp_articles', os.path.join(PATH_TO_FILES, "corp"))
tokenized_corp.save('codexes_tokenized_corp_articles',
                    os.path.join(PATH_TO_FILES, "corp"))
simple_corp_art_names.save('codexes_corp_art_names',