# coding: utf8
# This file is one of the executable entry points.
# The script walks the Storage table, splits the texts found there into
# sentences and inserts the result into the `splitted` table.
import db_init
import db_splitter


def split(text):
    """Break *text* on ". " and then break every resulting chunk on " ".

    Returns one flat list holding all pieces in their original order.
    """
    pieces = []
    for chunk in text.split(". "):
        pieces.extend(chunk.split(" "))
    return pieces


if __name__ == "__main__":
    db_con = db_init.init_sync()
    unsplitted_pages = db_splitter.get_list_of_unsplitted_pages(db_con)
    for el in unsplitted_pages:
        # el[0] is handed to add_list_to_db and echoed in the progress
        # message; el[1] is the text that gets split.
        page_parts = split(el[1])
        db_splitter.add_list_to_db(el[0], page_parts, db_con)
        print(f"Статья {el[0]} обработана")
import sys

# Extend the module search path so that db_init can be imported from
# ../../parser/; this must happen before the import below.
sys.path.append("../../parser/")

from db_init import init_sync


def prepare_rules(raw_name):
    """Build one ``Place`` grammar rule line from a raw place name.

    Every whitespace-separated word of *raw_name* is stripped of double
    quotes, lower-cased and wrapped in double quotes; the quoted words are
    joined with single spaces. A name that reduces to the single word
    "память" is expanded to "музей" "память".

    Words that are empty after quote removal are dropped, so repeated,
    leading or trailing whitespace never produces an empty ``""`` terminal.

    Returns:
        The rule text ``Place -> <quoted words>;`` followed by a newline, or
        an empty string when *raw_name* is empty/None or holds no usable
        word (callers skip empty rules).
    """
    if not raw_name:
        return ""

    cleaned_words = (word.replace('"', '').lower() for word in raw_name.split())
    quoted_words = [f'"{word}"' for word in cleaned_words if word]
    if not quoted_words:
        return ""

    final_name = ' '.join(quoted_words)
    if final_name == '"память"':
        final_name = '"музей" "память"'
    return f'Place -> {final_name};\n'


if __name__ == '__main__':
    # NOTE(review): the object bound by `with init_sync()` is used as a cursor
    # (execute/fetchall) — confirm that init_sync's context manager yields one.
    with init_sync() as cur:
        with open("places.cxx", "w", encoding="utf-8") as f:
            cur.execute("SELECT name FROM places;")
            # Grammar header: encoding pragma, root declaration and root rule.
            f.write(
                '#encoding "utf-8"\n#GRAMMAR_ROOT PlaceName\n\nPlaceName -> Place interp (Place.Name);\n'
            )
            for row in cur.fetchall():
                rule = prepare_rules(row[0])
                # Names that yield no usable word produce no rule at all.
                if rule:
                    f.write(rule)
subprocess.call([path_to_tomita, config_file_name]) with open(output_file_name, "r", encoding="utf-8") as o_f: output = o_f.read() os.remove(input_file_name) os.remove(output_file_name) if rewrite_config: os.remove(config_file_name) return output if __name__ == '__main__': if len(sys.argv) == 1: PATH_TO_TOMITA = "./tomita-parser" else: PATH_TO_TOMITA = sys.argv[1] print("Путь к томите: " + PATH_TO_TOMITA) con = init_sync() print("Начинаю обрабатывать...") with con.cursor() as cur: cur.execute(f'SELECT count(*) FROM storage;') db_size = cur.fetchall()[0][0] batch_size = 50 for offset in range(0, db_size-db_size%batch_size, batch_size): cur.execute(f'SELECT link, text FROM storage LIMIT {batch_size} OFFSET {offset};') rows = cur.fetchall() text = '\n'.join(list(map(lambda x:x[1], rows))) res = find_facts(text, base_path=BASE_PATH, path_to_tomita=PATH_TO_TOMITA) if res != []: for el in res: el = str(el) el = (el.replace(" .", ".") .replace(" ,", ",")