Code Example #1
import math
from collections import Counter


def train_spacy_model(nlp):
    p_textarr = fu.read_lines(pos_text_file)
    n_textarr = fu.read_lines(neg_text_file)[-10000:]
    p_train_x, p_test_x, p_train_y, p_test_y = split_array(
        p_textarr, {'cats': {
            true_label: True
        }})
    n_train_x, n_test_x, n_train_y, n_test_y = split_array(
        n_textarr, {'cats': {
            true_label: False
        }})

    p_train_data = list(zip(p_train_x, p_train_y))
    n_train_data = list(zip(n_train_x, n_train_y))
    train_data = p_train_data + n_train_data
    test_x, test_y = p_test_x + n_test_x, p_test_y + n_test_y
    """ prepare pipelines """
    vocab_size = len(nlp.vocab)
    # the pipe must be named 'textcat', otherwise spaCy cannot recognize it
    pipe_cat_name = 'textcat'
    if pipe_cat_name not in nlp.pipe_names:
        terror_cat = nlp.create_pipe(pipe_cat_name)
        nlp.add_pipe(terror_cat, last=True)
    else:
        terror_cat = nlp.get_pipe(pipe_cat_name)
    terror_cat.add_label(true_label)
    """ start training """
    n_iter = 10
    other_pipe_names = [
        pipe for pipe in nlp.pipe_names if pipe != pipe_cat_name
    ]
    with nlp.disable_pipes(*other_pipe_names):  # only train textcat
        optimizer = nlp.begin_training()
        for i in range(n_iter):
            print("iter:{}".format(i))
            losses = {}
            batch_size = 16
            batch_num = int(math.ceil(len(train_data) / batch_size))
            batches = [
                train_data[idx * batch_size:(idx + 1) * batch_size]
                for idx in range(batch_num)
            ]
            print(Counter([len(b) for b in batches]))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            print("losses:", losses)
            with terror_cat.model.use_params(optimizer.averages):
                evaluate(nlp.tokenizer, terror_cat, test_x, test_y)
    print("vocab size: {} -> {}".format(vocab_size, len(nlp.vocab)))
    return nlp
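A minimal usage sketch for train_spacy_model, assuming the spaCy 2.x API used above (create_pipe, begin_training); the output path is a placeholder.

import spacy

if __name__ == '__main__':
    # start from a blank English pipeline; a pretrained package could be
    # loaded instead with spacy.load("en_core_web_lg")
    nlp = spacy.blank('en')
    nlp = train_spacy_model(nlp)
    # persist the trained pipeline to a placeholder path
    nlp.to_disk('/tmp/terror_textcat_model')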
Code Example #2
def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''),
                                  fi.TYPE_FILE,
                                  concat=True,
                                  pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls),
          len(n_2016_files))

    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files

    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [
        prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks
    ]

    train_test = list()
    bad = len(n_bad_files)
    bad_blocks, n_blocks = (n_prefix_txtarr_blocks[:bad],
                            n_prefix_txtarr_blocks[bad:])
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks

    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list), "len(train_txtarr)",
          len(train_txtarr), "len(test_txtarr)", len(test_txtarr))
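For context, fastText's supervised mode expects each training line to start with a label token such as __label__pos followed by the text. The helper prefix_textarr used above is not shown in these examples; a plausible sketch under that assumption:

def prefix_textarr(label, textarr):
    # hypothetical sketch: prepend a fastText-style label token to each line
    return ['{} {}'.format(label, text.strip())
            for text in textarr if text.strip()]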
Code Example #3
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    lbl_txt_arr = fu.read_lines(lbl_txt_file)
    lbl_txt_blocks = mu.split_multi_format(lbl_txt_arr, len(mtx_lbl_file_list))
    args_list = [(ft_model_file, lbl_txt_blocks[idx], mtx_file, lbl_file)
                 for idx, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list)]
    print([len(b) for b in lbl_txt_blocks])
    mu.multi_process_batch(_generate_matrices, 10, args_list)
Code Example #4
def file2label_text_array(file):
    """
    转化文本-标记文件到内存
    :param file: str,文本-标记文件的路径
    :return: 见 text2label_text_array
    """
    lines = fu.read_lines(file)
    return text2label_text_array(lines)
Code Example #5
File: auto_phrase.py Project: leeyanghaha/my_merge
def get_quality_phrase(twarr, threshold):
    quality_list = list()
    if len(twarr) == 0:
        return quality_list
    fu.write_lines(raw_train_file, [tw[tk.key_text] for tw in twarr])
    autophrase(raw_train_file, model_base)
    lines = fu.read_lines(output_file)
    for line in lines:
        confidence, phrase = line.strip().split(maxsplit=1)
        if float(confidence) > threshold:
            quality_list.append(phrase)
    return quality_list
Code Example #6
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """
    给出fasttext模型文件的路径,读取文本-标记文件,将文件内容分块传递给多个子进程,
    各子进程将文本和标记分别转化为向量列表(即矩阵)输出到 mtx_lbl_file_list 中的每个文件中
    文本量较大的情况下避免每次训练分类器都要重新生成文本对应的向量列表
    :param ft_model_file: str,fasttext模型的文件路径
    :param lbl_txt_file: str,文本-标记文件的路径
    :param mtx_lbl_file_list: 每个元素为tuple,tuple的每个元素为str,
        第一个str标志存储矩阵的文件,第二个str表示存储该矩阵对应的标记列表的文件
    :return:
    """
    lbl_txt_arr = fu.read_lines(lbl_txt_file)
    lbl_txt_blocks = mu.split_multi_format(lbl_txt_arr, len(mtx_lbl_file_list))
    args_list = [(ft_model_file, lbl_txt_blocks[idx], mtx_file, lbl_file)
                 for idx, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list)]
    print([len(b) for b in lbl_txt_blocks])
    mu.multi_process_batch(_generate_matrices, 10, args_list)
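The docstring above describes a split-then-fan-out pattern: the labeled text is divided into blocks and each block is handled by a child process. mu.split_multi_format and mu.multi_process_batch are project helpers not shown in these examples; the following is a rough standard-library-only sketch of the same idea, with a placeholder worker in place of _generate_matrices.

import multiprocessing as mp


def split_into_blocks(items, n_blocks):
    # distribute items over n_blocks as evenly as possible
    k, m = divmod(len(items), n_blocks)
    return [items[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
            for i in range(n_blocks)]


def _placeholder_worker(args):
    block, out_file = args
    # a real worker would vectorize the block and save the matrix and labels
    with open(out_file, 'w') as fp:
        fp.write('\n'.join(block))


def fan_out(lines, out_files):
    blocks = split_into_blocks(lines, len(out_files))
    with mp.Pool(processes=len(out_files)) as pool:
        pool.map(_placeholder_worker, list(zip(blocks, out_files)))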
Code Example #7
def autophrase_wrapper(process_code, textarr):
    # process_code identifies the working directory used by this process;
    # textarr is a list of texts
    process_base = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(process_base)
    commander = fi.join(process_base, "auto_phrase.sh")
    input_text_file = fi.join(process_base, "raw_train.txt")
    output_keyword_file = fi.join(process_base, "AutoPhrase.txt")
    # write the text list to a file and run AutoPhrase
    fu.write_lines(input_text_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_text_file, process_base, commander, process_base, min_sup)
    # read the AutoPhrase results
    lines = fu.read_lines(output_keyword_file)
    conf_word_list = list()
    for line in lines:
        conf, word = line.split(maxsplit=1)
        conf_word_list.append((float(conf), word))
    # fi.rmtree(os.path.join(process_base, 'tmp'))
    return conf_word_list
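A brief usage sketch for autophrase_wrapper, assuming the AutoPhrase working directories referenced above are already in place; the 0.5 confidence cutoff is only an illustration.

if __name__ == '__main__':
    sample_texts = [
        "Heavy rain is expected across the region tomorrow.",
        "The city council approved the new transit budget.",
    ]
    conf_word_list = autophrase_wrapper(process_code=0, textarr=sample_texts)
    quality_phrases = [word for conf, word in conf_word_list if conf > 0.5]
    print(quality_phrases)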
Code Example #8
#     for pos_type in target_pos_types:
#         itemarr = postype2itemarr[pos_type]
#         if len(itemarr) == 0:
#             pos_vector = np.zeros([glovec_dim, ])
#         else:
#             pos_vector = np.mean([item[1] for item in itemarr], axis=0)
#         assert len(pos_vector) == glovec_dim
#         vecarr.append(pos_vector)
#     return vecarr

if __name__ == '__main__':
    import spacy

    import utils.function_utils as fu
    import utils.timer_utils as tmu

    pos_file = "/home/nfs/cdong/tw/seeding/Terrorist/data/fasttext/pos_2016.txt"
    txtarr = fu.read_lines(pos_file)

    nlp1 = spacy.load("en_core_web_lg")
    nlp2 = spacy.load("en_core_web_lg", disable=['tagger'])
    nlp3 = spacy.load("en_core_web_lg", disable=['parser'])
    nlp4 = spacy.load("en_core_web_lg", disable=['parser', 'tagger'])

    tmu.check_time()
    docarr1 = list(nlp1.pipe(txtarr, n_threads=10))
    tmu.check_time()
    docarr2 = list(nlp2.pipe(txtarr, n_threads=10))
    tmu.check_time()
    docarr3 = list(nlp3.pipe(txtarr, n_threads=10))
    tmu.check_time()
    docarr4 = list(nlp4.pipe(txtarr, n_threads=10))
    tmu.check_time()
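Example #8 relies on the project's tmu.check_time helper to report elapsed time between calls. A self-contained variant of the same comparison using only time.perf_counter (model name and sample texts are assumptions):

import time

import spacy


def time_pipe(model_name, texts, disable=()):
    # load the model with the given components disabled and time nlp.pipe
    nlp = spacy.load(model_name, disable=list(disable))
    start = time.perf_counter()
    docs = list(nlp.pipe(texts))
    return docs, time.perf_counter() - start


# usage sketch: compare the full pipeline against a slimmed-down one
# texts = ["A short example sentence.", "Another one."]
# _, t_full = time_pipe("en_core_web_lg", texts)
# _, t_slim = time_pipe("en_core_web_lg", texts, disable=("parser", "tagger"))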
Code Example #9
def get_quality_n_gram(textarr, n_range, len_thres):
    posttextarr = [text.lower().strip() for text in textarr]
    tokens_list = [valid_tokens_of_text(text) for text in posttextarr]
    keywords = get_quality_keywords(tokens_list, n_range, len_thres)
    return tokens_list, keywords


if __name__ == "__main__":
    # from utils.array_utils import group_textarr_similar_index
    # file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-03-26_suicide-bomb_Lahore.json"
    # twarr = fu.load_array(file)
    # textarr = [tw[tk.key_text] for tw in twarr]
    # tmu.check_time()
    file = "/home/nfs/cdong/tw/src/calling/tmp/0.txt"
    _textarr = fu.read_lines(file)
    _tokens_list = [
        valid_tokens_of_text(text.lower().strip()) for text in _textarr
    ]
    tmu.check_time()
    _keywords = get_quality_keywords(_tokens_list,
                                     n_range=4,
                                     len_thres=20,
                                     top_k=100)
    tmu.check_time()
    print(_keywords)
    # _ngrams = get_ngrams_from_textarr(_textarr, 4)
    # _reorder_list = reorder_grams(_ngrams, 100)
    # _keywords = [w for w, f in _reorder_list]
    # print(_keywords)
    # idx_g, word_g = group_textarr_similar_index(_keywords, 0.2)
Code Example #10
    print(len(labal), len(proba))
    au.precision_recall_threshold(labal, proba)


if __name__ == '__main__':
    from calling.back_filter import filter_twarr_text
    # from classifying.terror.classifier_terror import file2label_text_array
    # textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test")
    pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative"
    pos_files, neg_files = fi.listchildren(pos_base, concat=True), fi.listchildren(neg_base, concat=True, pattern='2012')
    
    base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant"
    pos_files = fi.listchildren(base, concat=True)
    print(len(pos_files))
    print(sum([len(fu.read_lines(f)) for f in pos_files]))
    exit()
    
    my_filter = EffectCheck()
    pos_probarr, neg_probarr = list(), list()
    tmu.check_time()
    for file in neg_files:
        twarr = filter_twarr_text(fu.load_array(file))
        probarr = my_filter.predict_proba(twarr)
        neg_probarr.extend(probarr)
    tmu.check_time()
    for file in pos_files:
        probarr = my_filter.predict_proba(fu.load_array(file))
        pos_probarr.extend(probarr)
        # post_twarr = list()
    
Code Example #11
def file2label_text_array(file):
    lines = fu.read_lines(file)
    return text2label_text_array(lines)