def binary_predict(target, model, threshold):
    if type(target) is str:
        # single text classification
        text = target.strip()
        if pu.is_empty_string(text):
            return value_f
        else:
            pred, score = model.predict_proba(text, threshold=threshold)
            return binary_label2value[pred[0]], score[0]
    else:
        # multi-line text classification
        text_arr, ignore_idx_arr, pred_value_arr = list(), list(), list()
        for idx, text in enumerate(target):
            assert type(text) is str
            text = text.strip()
            if pu.is_empty_string(text):
                ignore_idx_arr.append(idx)
            else:
                text_arr.append(text)
        pred_label_arr, score_arr = model.predict_proba(text_arr, threshold=threshold)
        assert len(pred_label_arr) == len(text_arr) and len(score_arr) == len(text_arr)
        for idx in range(len(text_arr)):
            pred = pred_label_arr[idx]
            if len(pred) == 0:
                # this text received no label for some unidentifiable reason;
                # fall back to single-text prediction and keep only the value
                value, _ = binary_predict(text_arr[idx], model, threshold)
            else:
                value = binary_label2value[pred[0]]
            pred_value_arr.append(value)
        # texts skipped as empty get the default value back at their original positions
        for idx in ignore_idx_arr:
            pred_value_arr.insert(idx, value_f)
        score_arr = [s[0] for s in score_arr]
        return pred_value_arr, score_arr
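
# A minimal usage sketch for binary_predict, assuming a trained classifier wrapper whose
# predict_proba(text, threshold=...) returns (labels, scores) as used above; the sample
# texts and the 0.5 threshold below are illustrative assumptions, not values from this project.
def _demo_binary_predict(model):
    single_value, single_score = binary_predict('Explosion reported downtown.', model, threshold=0.5)
    batch_values, batch_scores = binary_predict(
        ['Explosion reported downtown.', '', 'Nice weather today.'], model, threshold=0.5)
    # the empty string in the batch is assigned value_f at its original position
    return single_value, single_score, batch_values, batch_scores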
def prefix_textarr(label, textarr):
    label_text_arr = list()
    for text in textarr:
        if pu.is_empty_string(text):
            continue
        label_text_arr.append('{} {}'.format(label, text.strip()))
    return label_text_arr
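
# Small check of prefix_textarr; the '__label__pos' fastText-style label string is only an
# illustrative assumption about the label format used when preparing training lines.
def _demo_prefix_textarr():
    demo_lines = prefix_textarr('__label__pos', ['first tweet ', '', 'second tweet'])
    # -> ['__label__pos first tweet', '__label__pos second tweet']
    return demo_lines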
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            # drop texts that are empty or too short after normalization
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # NOTE: the code below is unreachable due to the return above; kept as in the original source
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)
    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
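
# Illustration of the output-file naming used in make_text_files: the parent directory name
# and the file name are joined with '_' and the 'json' suffix becomes 'txt'. The input path
# below is hypothetical.
def _demo_out_file_name():
    from pathlib import Path
    path = Path('/data/neg_2012/block_01.json')
    out_file_name = '_'.join([path.parent.name, path.name]).replace('json', 'txt')
    # -> 'neg_2012_block_01.txt'
    return out_file_name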
def generate_candidate_keywords(sentence_list, stopword_pattern, stop_word_list, min_char_num=1,
                                max_words_num=5, min_words_length_adj=1, max_words_length_adj=1,
                                min_phrase_freq_adj=2):
    phrase_list = list()
    adjoined_candidates = list()
    for s in sentence_list:
        # split each sentence on stop words to obtain candidate phrases
        replace_stop = re.sub(stopword_pattern, "|", s.strip())
        phrases = replace_stop.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if not pu.is_empty_string(phrase) and is_acceptable_phrase(phrase, min_char_num, max_words_num):
                phrase_list.append(phrase)
        adjoined_candidates += adjoined_candidates_from_sentence(
            s, stop_word_list, min_words_length_adj, max_words_length_adj)
    # add adjoined candidates that occur frequently enough
    phrase_list += filter_adjoined_candidates(adjoined_candidates, min_phrase_freq_adj)
    phrase_list += extract_adjoined_candidates(
        sentence_list, stop_word_list, min_words_length_adj, max_words_length_adj, min_phrase_freq_adj)
    return phrase_list
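
# Hedged usage sketch for generate_candidate_keywords (RAKE-style candidate extraction);
# the stop-word list, the regex pattern built from it, and the sample sentence are all
# illustrative assumptions rather than the project's actual configuration.
def _demo_generate_candidate_keywords():
    import re
    stop_word_list = ['is', 'a', 'of', 'the', 'in']
    stopword_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(w) for w in stop_word_list) + r')\b', re.IGNORECASE)
    sentences = ['Keyword extraction is a task of natural language processing']
    return generate_candidate_keywords(sentences, stopword_pattern, stop_word_list)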
def twarr2textarr(twarr):
    textarr = list()
    for tw in twarr:
        text = tw.get(tk.key_text, '').strip()
        # normalize only if the tweet has not been processed yet (no original-text field)
        if tk.key_orgntext not in tw:
            text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr
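
# Minimal sketch of twarr2textarr on hand-built tweets; the field names come from tk as above,
# while the sample texts themselves are made up for illustration.
def _demo_twarr2textarr():
    twarr = [
        {tk.key_text: ' RT @user: some raw tweet text '},                       # will be normalized
        {tk.key_text: 'already normalized text', tk.key_orgntext: 'raw text'},  # kept as-is
        {tk.key_text: '   '},                                                   # dropped as empty
    ]
    return twarr2textarr(twarr)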
def filter_twarr_text(twarr):
    """ This function only suits tweets that have not been processed yet """
    flt_twarr = list()
    for tw in twarr:
        # prefer the original text if present, otherwise fall back to the text field
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, '')).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        # require the normalized text to be non-empty and mostly alphabetic
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
def twarr_ark(twarr, from_field=tk.key_text, to_field=tk.key_ark):
    empty_idxes, non_empty_idxes = list(), list()
    textarr = list()
    for idx, tw in enumerate(twarr):
        text = tw[from_field]
        if pu.is_empty_string(text):
            empty_idxes.append(idx)
        else:
            textarr.append(text)
            non_empty_idxes.append(idx)
    # POS-tag only the non-empty texts, then map the results back to their tweets
    posarr = runtagger_parse(textarr)
    for tw_idx in empty_idxes:
        twarr[tw_idx][to_field] = []
    for pos_idx, tw_idx in enumerate(non_empty_idxes):
        twarr[tw_idx][to_field] = posarr[pos_idx]
    return twarr
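
# Usage sketch for twarr_ark; it assumes runtagger_parse (the ARK TweetNLP tagger wrapper used
# above) is available in the environment, and the tweets below are illustrative.
def _demo_twarr_ark():
    twarr = [{tk.key_text: 'some tweet to be POS tagged'}, {tk.key_text: ''}]
    twarr = twarr_ark(twarr)
    # each tweet now carries its tagger output in the to_field (an empty list for empty text)
    return [tw[tk.key_ark] for tw in twarr]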
def filter_twarr_text(twarr):
    """
    Preprocess the text of every tweet in the input list and drop tweets whose preprocessed
    text is invalid; for each kept tweet, the tk.key_orgntext field keeps the original text
    and the tk.key_text field keeps the preprocessed result.
    :param twarr: list, a list of tweets
    :return: list, the tweets that passed text preprocessing and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        text_orgn = tw.get(tk.key_text, '').strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
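
# Usage sketch for filter_twarr_text; the filtering behaviour (non-empty text, at least 65%
# alphabetic characters after normalization) follows the code above, but the sample tweets
# themselves are made up.
def _demo_filter_twarr_text():
    twarr = [
        {tk.key_text: 'A readable tweet about an event happening now'},
        {tk.key_text: '!!! ??? 123 456'},   # too few alphabetic characters, dropped
        {tk.key_text: '   '},               # empty after strip, dropped
    ]
    flt = filter_twarr_text(twarr)
    # kept tweets carry both the original text and its normalized form
    return [(tw[tk.key_orgntext], tw[tk.key_text]) for tw in flt]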