Example #1
0
def clean_summarizes():
    """Filter the pickled content entries and save the cleaned result.

    Loads ``title_<title_table>.pkl`` and ``content_<content_table>.pkl``
    through ``handle_pickle``. ``None`` content entries are skipped. Each
    remaining entry is indexed as a pair of parallel lists: ``entry[0]``
    holds strings and ``entry[1]`` the values that belong to them.

    A string is kept (together with its counterpart from ``entry[1]``) when
    it is not blank and is strictly shorter than the longest title. An entry
    is written to the output only when more than two strings survive.

    The result is saved as ``clean_content_<content_table>.pkl``.
    """
    conf = ConfigHandler.loadFromFile()
    title_path = 'title_{}.pkl'.format(conf['title_table'])
    content_path = 'content_{}.pkl'.format(conf['content_table'])
    titles = handle_pickle(title_path)
    contents = handle_pickle(content_path)

    # ``default=0`` keeps an empty title list from raising ValueError; with a
    # limit of 0 nothing passes the length filter, so the output is empty.
    max_len = max((len(title) for title in titles), default=0)

    news = []
    for content in contents:
        if content is None:
            continue

        kept = [[], []]
        for i, text in enumerate(content[0]):
            if text.strip() != '' and len(text) < max_len:
                kept[0].append(text)
                kept[1].append(content[1][i])

        # Decide once per entry, after the whole entry has been filtered.
        # Checking inside the loop above would append the same list object
        # again on every later iteration, duplicating the entry.
        if len(kept[0]) > 2:
            news.append(kept)

    handle_pickle('clean_' + content_path, data=news, is_save=True)
Example #2
0
def get_titles():
    """Fetch the configured title column from MySQL and pickle its values.

    Builds a ``select <title_column> from <title_table>`` query from the
    loaded configuration, runs it through ``get_clean_df`` and flattens every
    column of the result into a single list, which is saved as
    ``title_<title_table>.pkl``.
    """
    conf = ConfigHandler.loadFromFile()
    # NOTE: column and table names are interpolated straight from the
    # configuration, so the configuration must be trusted.
    sql = f"select {conf['title_column']} from {conf['title_table']}"
    handler = MysqlHandler(
        host=conf['db_host'],
        user=conf['db_user'],
        password=conf['db_passwd'],
        port=int(conf['db_port']),
    )
    frame = get_clean_df(sql, handler)

    # Iterating the frame yields column keys (presumably a pandas DataFrame
    # -- confirm against get_clean_df); values are collected column by column.
    titles = [value for column in frame for value in frame[column]]

    handle_pickle(f"title_{conf['title_table']}.pkl", data=titles, is_save=True)
def extract_from_given_title():
    """Run ``extract_from_content`` over every cleaned entry and dump to JSON.

    Loads ``title_<title_table>.pkl`` and ``clean_content_<content_table>.pkl``
    through ``handle_pickle``, pairs each cleaned entry with the complete
    title collection, hands the pairs to ``apply_by_multiprocessor`` and
    writes whatever it returns to ``result.json`` (UTF-8, non-ASCII
    characters kept as-is, 4-space indent).
    """
    conf = ConfigHandler.loadFromFile()
    total_title = handle_pickle('title_{}.pkl'.format(conf['title_table']))
    total_sum = handle_pickle('clean_content_{}.pkl'.format(
        conf['content_table']))

    # Every work item carries the full title collection alongside one entry.
    data = [(summary, total_title) for summary in total_sum]
    result = apply_by_multiprocessor(data=data, func=extract_from_content)

    # The context manager guarantees the file is flushed and closed, even if
    # serialisation fails part-way through.
    with open('result.json', 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, ensure_ascii=False, indent=4)
Example #4
0
def clean_title_summarizes_pair():
    """Clean the rows pickled in ``<title_table>.pkl`` and save the survivors.

    ``None`` rows are dropped. Each remaining row is indexed as a mutable
    sequence in which ``row[0]`` supports ``len()`` (it appears to be the
    title -- confirm against the producer of the pickle) and ``row[2]`` is a
    pair of parallel lists: ``row[2][0]`` holds strings and ``row[2][1]`` the
    values that belong to them.

    A string is kept (together with its counterpart) when it is not empty and
    is strictly shorter than the longest ``row[0]`` across all rows. A row is
    kept, with ``row[2]`` replaced by the filtered pair, only when more than
    two strings survive.

    The result is saved as ``clean_<title_table>.pkl``.
    """
    conf = ConfigHandler.loadFromFile()
    path = '{}.pkl'.format(conf['title_table'])
    data = handle_pickle(path)

    rows = [row for row in data if row is not None]

    # The length limit depends on every row, so it has to be known before any
    # row is filtered. ``default=0`` covers the case of no rows at all.
    max_len = max((len(row[0]) for row in rows), default=0)

    news = []
    for row in rows:
        cur_sums = row[2]
        new_sums = [[], []]
        for i, text in enumerate(cur_sums[0]):
            if text != '' and len(text) < max_len:
                new_sums[0].append(text)
                new_sums[1].append(cur_sums[1][i])
        if len(new_sums[0]) > 2:
            # Rows are updated in place, so they must be mutable sequences.
            row[2] = new_sums
            news.append(row)

    handle_pickle('clean_' + path, data=news, is_save=True)
def get_data():
    """Load ``clean_<title_table>.pkl`` and split it into two parallel lists.

    ``None`` rows are skipped, and so is any row whose first element has
    already been collected, so only the first occurrence of each is kept.

    Returns:
        A ``(total_title, total_sum)`` tuple: the first element of every kept
        row, and the third element of the same rows, in matching order.
    """
    conf = ConfigHandler.loadFromFile()
    rows = handle_pickle('clean_' + '{}.pkl'.format(conf['title_table']))

    titles, sums = [], []
    for entry in rows:
        # Skip missing rows and repeats of an already collected first element.
        if entry is None or entry[0] in titles:
            continue
        titles.append(entry[0])
        sums.append(entry[2])

    return titles, sums