Esempio n. 1
0
# coding: utf8
# этот файл — один из исполняемых
# скрипт проходит по таблице Storage, разбивает тексты оттуда на предложения и вставляет в таблицу splitted
import db_init
import db_splitter


def split(text):
    """Break ``text`` into a flat list of fragments.

    The text is first cut at every ". " (period followed by a space); each
    resulting piece is then cut again at every double space. The separators
    themselves are dropped. Empty fragments are kept, and a period that is
    not followed by a space stays attached to its fragment.
    """
    fragments = []
    for sentence in text.split(". "):
        # A "sentence" may still hold several chunks separated by two spaces.
        fragments.extend(sentence.split("  "))
    return fragments


if __name__ == "__main__":
    # Script entry point: take every row returned by
    # db_splitter.get_list_of_unsplitted_pages, split its text and hand the
    # fragments to db_splitter.add_list_to_db, reporting each row when done.
    db_con = db_init.init_sync()
    pending_pages = db_splitter.get_list_of_unsplitted_pages(db_con)
    for page in pending_pages:
        # Rows are read positionally: [0] is passed on as the page
        # identifier, [1] is the text given to split().
        page_key = page[0]
        fragments = split(page[1])
        db_splitter.add_list_to_db(page_key, fragments, db_con)
        print(f"Статья {page_key} обработана")
import sys

sys.path.append("../../parser/")
from db_init import init_sync


def prepare_rules(raw_name):
    """Build one ``Place -> ...;`` grammar line from a place name.

    Every whitespace-separated word of ``raw_name`` has its double quotes
    removed, is lower-cased and wrapped in double quotes; the quoted words
    are joined with single spaces.

    Args:
        raw_name: The place name as stored in the database. ``None`` and
            empty strings are tolerated.

    Returns:
        A line of the form ``Place -> "word1" "word2";\\n``, or ``""`` when
        the name contains no usable word, so the caller can skip it.
    """
    if not raw_name:
        return ""

    # split() without arguments collapses runs of whitespace and ignores
    # leading/trailing whitespace, so no empty words come out of it.
    cleaned_words = (word.replace('"', '').lower() for word in raw_name.split())
    # A word made only of quote characters becomes empty once cleaned; an
    # empty "" terminal must not end up in the rule.
    quoted_words = ['"' + word + '"' for word in cleaned_words if word]
    if not quoted_words:
        return ""

    final_name = ' '.join(quoted_words)
    # A name consisting of the single word "память" is emitted as the
    # two-word phrase instead (presumably the bare word is too ambiguous
    # to match on its own — confirm with the grammar author).
    if final_name == '"память"':
        final_name = '"музей" "память"'
    return f'Place -> {final_name};\n'


if __name__ == '__main__':
    # Script entry point: regenerate places.cxx with a fixed grammar header
    # followed by one rule line per row of the `places` table.
    GRAMMAR_HEADER = '#encoding "utf-8"\n#GRAMMAR_ROOT PlaceName\n\nPlaceName -> Place interp (Place.Name);\n'
    # NOTE(review): the object yielded by init_sync() is used as a cursor
    # here (execute/fetchall) — confirm that this is what it provides.
    with init_sync() as cur:
        with open("places.cxx", "w", encoding="utf-8") as grammar_file:
            cur.execute("SELECT name FROM places;")
            grammar_file.write(GRAMMAR_HEADER)
            for row in cur.fetchall():
                # Each row has a single column: the place name.
                place_rule = prepare_rules(row[0])
                if place_rule == "":
                    continue
                grammar_file.write(place_rule)
Esempio n. 3
0
    subprocess.call([path_to_tomita, config_file_name])
    with open(output_file_name, "r", encoding="utf-8") as o_f:
        output = o_f.read()
    os.remove(input_file_name)
    os.remove(output_file_name)
    if rewrite_config:
        os.remove(config_file_name)
    return output
    
if __name__ == '__main__':
    if len(sys.argv) == 1:
        PATH_TO_TOMITA = "./tomita-parser"
    else:
        PATH_TO_TOMITA = sys.argv[1]
    print("Путь к томите: " + PATH_TO_TOMITA)
    con = init_sync()
    print("Начинаю обрабатывать...")
    with con.cursor() as cur:
        cur.execute(f'SELECT count(*) FROM storage;')
        db_size = cur.fetchall()[0][0]
        batch_size = 50
        for offset in range(0, db_size-db_size%batch_size, batch_size):
            cur.execute(f'SELECT link, text FROM storage LIMIT {batch_size} OFFSET {offset};')
            rows = cur.fetchall()
            text = '\n'.join(list(map(lambda x:x[1], rows)))
            res = find_facts(text, base_path=BASE_PATH, path_to_tomita=PATH_TO_TOMITA)
            if res != []:
                for el in res:
                    el = str(el)
                    el = (el.replace(" .", ".")
                        .replace(" ,", ",")