Example #1
def get_quality_phrase(twarr, threshold):
    quality_list = list()
    if len(twarr) == 0:
        return quality_list
    fu.write_lines(raw_train_file, [tw[tk.key_text] for tw in twarr])
    autophrase(raw_train_file, model_base)
    lines = fu.read_lines(output_file)
    for line in lines:
        confidence, phrase = line.strip().split(maxsplit=1)
        if float(confidence) > threshold:
            quality_list.append(phrase)
    return quality_list
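A minimal usage sketch for get_quality_phrase; the sample tweet dicts and the 0.5 threshold are illustrative assumptions, while raw_train_file, model_base and output_file are the module-level paths already used above.

# Hypothetical call site for the function above.
sample_twarr = [{tk.key_text: "Flooding reported in the downtown area"},
                {tk.key_text: "Earthquake shakes the coastal region overnight"}]
phrases = get_quality_phrase(sample_twarr, threshold=0.5)
print(phrases)  # phrases whose AutoPhrase confidence exceeds the threshold
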
def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''),
                                  fi.TYPE_FILE,
                                  concat=True,
                                  pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls),
          len(n_2016_files))

    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files

    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [
        prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks
    ]

    train_test = list()
    bad = len(n_bad_files)
    bad_blocks = n_prefix_txtarr_blocks[:bad]
    n_blocks = n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks

    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list), "len(train_txtarr)",
          len(train_txtarr), "len(test_txtarr)", len(test_txtarr))
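prefix_textarr is used above but not defined in this example; a minimal sketch of what it might look like, assuming it simply prepends the class label (label_t / label_f) to each line so the output can be consumed as fastText supervised training data:

def prefix_textarr(label, txtarr):
    # Assumed behavior: one "<label> <text>" line per text, skipping empty texts.
    return ['{} {}'.format(label, text.strip()) for text in txtarr if text.strip()]
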
def autophrase_wrapper(process_code, textarr):
    # process_code identifies the directory this process works in; textarr is a list of texts
    process_base = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(process_base)
    commander = fi.join(process_base, "auto_phrase.sh")
    input_text_file = fi.join(process_base, "raw_train.txt")
    output_keyword_file = fi.join(process_base, "AutoPhrase.txt")
    # Write the text list to a file, then run autophrase
    fu.write_lines(input_text_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_text_file, process_base, commander, process_base, min_sup)
    # Read the autophrase results
    lines = fu.read_lines(output_keyword_file)
    conf_word_list = list()
    for line in lines:
        conf, word = line.split(maxsplit=1)
        conf_word_list.append((float(conf), word))
    # fi.rmtree(os.path.join(process_base, 'tmp'))
    return conf_word_list
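Since process_code gives every call its own working directory, the wrapper can be fanned out over several processes. A hedged sketch using multiprocessing.Pool; the pool-based dispatch is an assumption, not part of the original module:

from multiprocessing import Pool

def run_autophrase_parallel(textarr_blocks):
    # One (process_code, textarr) pair per block; distinct codes keep the
    # per-process AutoPhrase directories from colliding.
    args = list(enumerate(textarr_blocks))
    with Pool(processes=len(args)) as pool:
        return pool.starmap(autophrase_wrapper, args)
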
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name,
                                  path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # NOTE: the code below is unreachable; the early return above disables
    # the positive/negative text dumps that follow.
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
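twarr2textarr is used above but not defined in this example; a plausible sketch that mirrors the normalization-and-filtering loop inside make_text_files (the 20-character cutoff is copied from that loop):

def twarr2textarr(twarr):
    txtarr = list()
    for tw in twarr:
        text = pu.text_normalization(tw[tk.key_text])
        # Skip empty or very short texts, as make_text_files does
        if pu.is_empty_string(text) or len(text) < 20:
            continue
        txtarr.append(text)
    return txtarr
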
Example #5
def write_cic_list(path, cic_list):
    """
    Call construct_od on each element of cic_list to obtain an OrderedDict,
    and persist each OrderedDict to a file under the given path.
    :param path: str, output directory
    :param cic_list: list, each element is a ClusterInfoCarrier
    :return:
    """
    fi.mkdir(path, remove_previous=True)
    cic_list = sorted(cic_list, key=lambda item: len(item.twarr), reverse=True)
    print('    bext: output cic list, len={}'.format(len(cic_list)))
    for idx, cic in enumerate(cic_list):
        cluid = cic.cluid
        cic.twarr = ClusterInfoGetter.group_similar_tweets(cic.twarr,
                                                           process_num=10)
        od = cic.construct_od()
        json_str = fu.dumps(od)
        cluster_file = fi.join(path, '{}_cluid:{}.json'.format(idx, cluid))
        fu.write_lines(cluster_file, [json_str])

        # textarr_file = fi.join(path, '{}_text.json'.format(idx))
        # textarr = [tw[tk.key_text] for tw in cic.twarr]
        # fu.write_lines(textarr_file, textarr)
    print('    bext: output into files over')
    _cic_list = bext.get_batch_output()
    print('get cic outputs, type:{}'.format(type(_cic_list)))
    for cic in _cic_list:
        twnum = len(cic.twarr)
        _geo_list = [
            geo['address'] for geo in cic.od['geo_infer']
            if geo['quality'] == 'locality'
        ]
        print('cluid:{}, twarr len:{}'.format(cic.cluid, twnum))
        print(cic.od['summary']['keywords'])
        print(_geo_list)
        print('\n')

        if len(_geo_list) == 0:
            _top_geo = 'NOGPE'
        else:
            _top_geo = '`'.join(_geo_list)
        _out_file = '/home/nfs/cdong/tw/src/calling/tmp/id{}_tw{}_{}.txt'.format(
            cic.cluid, twnum, _top_geo)
        _txtarr = [tw[tk.key_text] for tw in cic.twarr]
        _idx_g, _txt_g = au.group_similar_items(_txtarr,
                                                score_thres=0.3,
                                                process_num=20)
        _txt_g = [
            sorted(g, key=lambda t: len(t), reverse=True) for g in _txt_g
        ]
        _txtarr = au.merge_array(_txt_g)
        fu.write_lines(_out_file, _txtarr)

    tmu.check_time()
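The per-cluster dump files above encode the cluster id, tweet count and top geo in their names ("id{cluid}_tw{twnum}_{geo}.txt", with multiple localities joined by a backtick). A small sketch for parsing such a name back; the regex and the returned dict are assumptions for illustration only:

import os
import re

def parse_cluster_filename(path):
    # Matches names like "id12_tw345_London.txt" produced above.
    m = re.match(r'id(\d+)_tw(\d+)_(.+)\.txt$', os.path.basename(path))
    if not m:
        return None
    return {'cluid': int(m.group(1)), 'twnum': int(m.group(2)),
            'geo_list': m.group(3).split('`')}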