def train_spacy_model(nlp):
    """Train the 'textcat' pipe of *nlp* as a binary classifier on true_label.

    Positives come from pos_text_file; negatives are the last 10000 lines of
    neg_text_file. Runs 10 iterations with mini-batches of 16, evaluates the
    held-out split after every iteration, and returns the updated pipeline.

    :param nlp: a loaded spaCy Language object (spaCy v2 API)
    :return: the same nlp object, with its textcat pipe trained
    """
    # Load raw texts; cap negatives at the most recent 10000 lines.
    p_textarr = fu.read_lines(pos_text_file)
    n_textarr = fu.read_lines(neg_text_file)[-10000:]
    # split_array partitions texts into train/test and pairs each text with
    # its gold annotation dict ({'cats': {true_label: bool}}).
    p_train_x, p_test_x, p_train_y, p_test_y = split_array(
        p_textarr, {'cats': {true_label: True}})
    n_train_x, n_test_x, n_train_y, n_test_y = split_array(
        n_textarr, {'cats': {true_label: False}})
    p_train_data = list(zip(p_train_x, p_train_y))
    n_train_data = list(zip(n_train_x, n_train_y))
    # NOTE(review): train_data is never shuffled — every pass sees all
    # positives before all negatives; consider random.shuffle per iteration.
    train_data = p_train_data + n_train_data
    test_x, test_y = p_test_x + n_test_x, p_test_y + n_test_y
    """ prepare pipelines """
    vocab_size = len(nlp.vocab)
    pipe_cat_name = 'textcat'  # the pipe has to be named so. or spacy cannot recognize it
    if pipe_cat_name not in nlp.pipe_names:
        terror_cat = nlp.create_pipe(pipe_cat_name)
        nlp.add_pipe(terror_cat, last=True)
    else:
        terror_cat = nlp.get_pipe(pipe_cat_name)
    terror_cat.add_label(true_label)
    """ start training """
    n_iter = 10
    other_pipe_names = [
        pipe for pipe in nlp.pipe_names if pipe != pipe_cat_name
    ]
    with nlp.disable_pipes(*other_pipe_names):  # only train textcat
        optimizer = nlp.begin_training()
        for i in range(n_iter):
            print("iter:{}".format(i))
            losses = {}
            # Slice train_data into fixed-size sequential batches of 16.
            batch_size = 16
            batch_num = int(math.ceil(len(train_data) / batch_size))
            batches = [
                train_data[idx * batch_size:(idx + 1) * batch_size]
                for idx in range(batch_num)
            ]
            print(Counter([len(b) for b in batches]))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            print("losses:", losses)
            # Evaluate with the averaged parameters so reported metrics
            # reflect the smoothed weights rather than the last raw update.
            with terror_cat.model.use_params(optimizer.averages):
                evaluate(nlp.tokenizer, terror_cat, test_x, test_y)
    # Vocab may grow during training as unseen tokens are encountered.
    print("vocab size: {} -> {}".format(vocab_size, len(nlp.vocab)))
    return nlp
def make_train_test():
    """Assemble fastText train/test corpora and write them to
    fasttext_train / fasttext_test.

    The positive file and most negative file groups are split into
    train/test portions via split_train_test; every '2016_bad' negative
    block goes entirely into the training set.
    """
    pos_file = ft_data_pattern.format("pos_2016.txt")
    data_dir = ft_data_pattern.format('')
    bad_files = fi.listchildren(data_dir, fi.TYPE_FILE, concat=True, pattern='2016_bad')
    files_2017 = fi.listchildren(data_dir, fi.TYPE_FILE, concat=True, pattern='2017')
    # fulls_2012 = fi.listchildren(data_dir, fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    fulls_2012 = fi.listchildren(data_dir, fi.TYPE_FILE, concat=True, pattern='2012_full')
    files_2016 = fi.listchildren(data_dir, fi.TYPE_FILE, concat=True, pattern='2016_queried')
    print(len(bad_files), len(files_2017), len(fulls_2012), len(files_2016))
    neg_files = bad_files + files_2017 + fulls_2012 + files_2016
    # Prefix every line with its label marker.
    pos_prefixed = prefix_textarr(label_t, fu.read_lines(pos_file))
    neg_prefixed_blocks = [
        prefix_textarr(label_f, fu.read_lines(file)) for file in neg_files
    ]
    # The leading blocks correspond to the 'bad' files and skip the split.
    bad_cnt = len(bad_files)
    bad_blocks = neg_prefixed_blocks[:bad_cnt]
    rest_blocks = neg_prefixed_blocks[bad_cnt:]
    split_pairs = [split_train_test(pos_prefixed)]
    split_pairs.extend(split_train_test(block) for block in rest_blocks)
    print("len(train_test)", len(split_pairs))
    train_parts, test_parts = zip(*split_pairs)
    train_parts = list(train_parts) + bad_blocks
    train_txtarr = au.merge_array(train_parts)
    test_txtarr = au.merge_array(test_parts)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_parts), "len(train_txtarr)",
          len(train_txtarr), "len(test_txtarr)", len(test_txtarr))
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """Split a label-text file into blocks and have worker processes turn each
    block into a (matrix file, label file) pair using the fastText model.

    :param ft_model_file: str, path of the fastText model file
    :param lbl_txt_file: str, path of the label-text file
    :param mtx_lbl_file_list: list of (matrix file, label file) str tuples
    """
    labeled_lines = fu.read_lines(lbl_txt_file)
    line_blocks = mu.split_multi_format(labeled_lines, len(mtx_lbl_file_list))
    # One argument tuple per (block, output file pair).
    task_args = [
        (ft_model_file, block, mtx_file, lbl_file)
        for block, (mtx_file, lbl_file) in zip(line_blocks, mtx_lbl_file_list)
    ]
    print([len(block) for block in line_blocks])
    mu.multi_process_batch(_generate_matrices, 10, task_args)
def file2label_text_array(file):
    """Load a label-text file into memory.

    :param file: str, path of the label-text file
    :return: see text2label_text_array
    """
    return text2label_text_array(fu.read_lines(file))
def get_quality_phrase(twarr, threshold):
    """Run AutoPhrase over the texts of *twarr* and return the phrases whose
    confidence exceeds *threshold*; an empty twarr yields an empty list."""
    if not twarr:
        return []
    # Dump tweet texts to the training file and invoke AutoPhrase on them.
    fu.write_lines(raw_train_file, [tw[tk.key_text] for tw in twarr])
    autophrase(raw_train_file, model_base)
    # Each output line is "<confidence> <phrase>"; keep confident phrases.
    phrases = []
    for raw_line in fu.read_lines(output_file):
        conf_str, phrase = raw_line.strip().split(maxsplit=1)
        if float(conf_str) > threshold:
            phrases.append(phrase)
    return phrases
def generate_train_matrices(ft_model_file, lbl_txt_file, mtx_lbl_file_list):
    """
    Given the path of a fastText model file, read the label-text file and hand
    its content, in blocks, to multiple child processes; each child converts
    its texts and labels into vector lists (i.e. matrices) and writes them to
    one file pair of mtx_lbl_file_list. Pre-computing the matrices avoids
    regenerating the vectors for a large corpus every time a classifier is
    trained.
    :param ft_model_file: str, file path of the fastText model
    :param lbl_txt_file: str, path of the label-text file
    :param mtx_lbl_file_list: each element is a tuple of two str; the first
        str names the file storing a matrix, the second str names the file
        storing the label list that matches the matrix
    :return:
    """
    lbl_txt_arr = fu.read_lines(lbl_txt_file)
    # One block per output file pair.
    lbl_txt_blocks = mu.split_multi_format(lbl_txt_arr, len(mtx_lbl_file_list))
    args_list = [(ft_model_file, lbl_txt_blocks[idx], mtx_file, lbl_file)
                 for idx, (mtx_file, lbl_file) in enumerate(mtx_lbl_file_list)]
    print([len(b) for b in lbl_txt_blocks])
    mu.multi_process_batch(_generate_matrices, 10, args_list)
def autophrase_wrapper(process_code, textarr):
    """Run AutoPhrase on *textarr* inside a per-process working directory.

    :param process_code: identifies the directory this worker process owns
    :param textarr: list of raw text strings
    :return: list of (confidence, word) tuples parsed from AutoPhrase output
    """
    work_dir = fi.join(autophrase_output_base, str(process_code))
    copy_into_process_base(work_dir)
    script_file = fi.join(work_dir, "auto_phrase.sh")
    input_file = fi.join(work_dir, "raw_train.txt")
    result_file = fi.join(work_dir, "AutoPhrase.txt")
    # Write the text list to disk, then execute AutoPhrase over it.
    fu.write_lines(input_file, textarr)
    min_sup = determine_min_sup(len(textarr))
    autophrase(input_file, work_dir, script_file, work_dir, min_sup)
    # Parse the "<confidence> <word>" result lines.
    conf_word_list = [
        (float(conf), word)
        for conf, word in (line.split(maxsplit=1)
                           for line in fu.read_lines(result_file))
    ]
    # fi.rmtree(os.path.join(work_dir, 'tmp'))
    return conf_word_list
# for pos_type in target_pos_types: # itemarr = postype2itemarr[pos_type] # if len(itemarr) == 0: # pos_vector = np.zeros([glovec_dim, ]) # else: # pos_vector = np.mean([item[1] for item in itemarr], axis=0) # assert len(pos_vector) == glovec_dim # vecarr.append(pos_vector) # return vecarr if __name__ == '__main__': import utils.function_utils as fu import utils.timer_utils as tmu pos_file = "/home/nfs/cdong/tw/seeding/Terrorist/data/fasttext/pos_2016.txt" txtarr = fu.read_lines(pos_file) nlp1 = spacy.load("en_core_web_lg") nlp2 = spacy.load("en_core_web_lg", disable=['tagger']) nlp3 = spacy.load("en_core_web_lg", disable=['parser']) nlp4 = spacy.load("en_core_web_lg", disable=['parser', 'tagger']) tmu.check_time() docarr1 = list(nlp1.pipe(txtarr, n_threads=10)) tmu.check_time() docarr2 = list(nlp2.pipe(txtarr, n_threads=10)) tmu.check_time() docarr3 = list(nlp3.pipe(txtarr, n_threads=10)) tmu.check_time() docarr4 = list(nlp4.pipe(txtarr, n_threads=10)) tmu.check_time()
def get_quality_n_gram(textarr, n_range, len_thres):
    """Lower-case and strip each text, tokenize it, and mine quality keywords
    from the token lists.

    :return: (tokens_list, keywords) — per-text token lists and the keywords
        extracted from them via get_quality_keywords
    """
    normalized = [text.lower().strip() for text in textarr]
    tokens_list = [valid_tokens_of_text(text) for text in normalized]
    keywords = get_quality_keywords(tokens_list, n_range, len_thres)
    return tokens_list, keywords


if __name__ == "__main__":
    # from utils.array_utils import group_textarr_similar_index
    # file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-03-26_suicide-bomb_Lahore.json"
    # twarr = fu.load_array(file)
    # textarr = [tw[tk.key_text] for tw in twarr]
    # tmu.check_time()
    # Manual timing check of keyword extraction over a sample text file.
    file = "/home/nfs/cdong/tw/src/calling/tmp/0.txt"
    _textarr = fu.read_lines(file)
    _tokens_list = [
        valid_tokens_of_text(text.lower().strip()) for text in _textarr
    ]
    tmu.check_time()
    _keywords = get_quality_keywords(_tokens_list, n_range=4, len_thres=20, top_k=100)
    tmu.check_time()
    print(_keywords)
    # _ngrams = get_ngrams_from_textarr(_textarr, 4)
    # _reorder_list = reorder_grams(_ngrams, 100)
    # _keywords = [w for w, f in _reorder_list]
    # print(_keywords)
    # idx_g, word_g = group_textarr_similar_index(_keywords, 0.2)
    # NOTE(review): these two statements are the tail of a function whose
    # definition lies above this chunk; 'labal' (sic) and 'proba' are bound
    # there. Indentation reconstructed — confirm against the full file.
    print(len(labal), len(proba))
    au.precision_recall_threshold(labal, proba)


if __name__ == '__main__':
    from calling.back_filter import filter_twarr_text
    # from classifying.terror.classifier_terror import file2label_text_array
    # textarr, labelarr = file2label_text_array("/home/nfs/cdong/tw/seeding/Terrorist/data/test")
    pos_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive"
    neg_base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/negative"
    pos_files, neg_files = fi.listchildren(pos_base, concat=True), fi.listchildren(neg_base, concat=True, pattern='2012')
    # pos_files is immediately re-assigned below, so the listing above only
    # matters for neg_files.
    base = "/home/nfs/yangl/event_detection/testdata/event2012/relevant"
    pos_files = fi.listchildren(base, concat=True)
    print(len(pos_files))
    print(sum([len(fu.read_lines(f)) for f in pos_files]))
    exit()
    # Everything below is unreachable while the exit() above stays in place
    # (apparently left for a later re-run of the probability evaluation).
    my_filter = EffectCheck()
    pos_probarr, neg_probarr = list(), list()
    tmu.check_time()
    for file in neg_files:
        twarr = filter_twarr_text(fu.load_array(file))
        probarr = my_filter.predict_proba(twarr)
        neg_probarr.extend(probarr)
    tmu.check_time()
    for file in pos_files:
        probarr = my_filter.predict_proba(fu.load_array(file))
        pos_probarr.extend(probarr)
    # post_twarr = list()
def file2label_text_array(file):
    """Read *file* line by line and convert the lines via
    text2label_text_array."""
    file_lines = fu.read_lines(file)
    return text2label_text_array(file_lines)