def binary_predict(target, model, threshold):
    if type(target) is str:
        # single text classification
        text = target.strip()
        if pu.is_empty_string(text):
            # empty text: only the fallback value is returned (no score)
            return value_f
        else:
            pred, score = model.predict_proba(text, threshold=threshold)
            return binary_label2value[pred[0]], score[0]
    else:
        # batch classification over a list of texts
        text_arr, ignore_idx_arr, pred_value_arr = list(), list(), list()
        for idx, text in enumerate(target):
            assert type(text) is str
            text = text.strip()
            if pu.is_empty_string(text):
                ignore_idx_arr.append(idx)
            else:
                text_arr.append(text)
        pred_label_arr, score_arr = model.predict_proba(text_arr, threshold=threshold)
        assert len(pred_label_arr) == len(text_arr) and len(score_arr) == len(text_arr)
        for idx in range(len(text_arr)):
            pred = pred_label_arr[idx]
            if len(pred) == 0:
                # no label exceeded the threshold in the batch call; retry this text individually
                value, _ = binary_predict(text_arr[idx], model, threshold)
            else:
                value = binary_label2value[pred[0]]
            pred_value_arr.append(value)
        # re-insert the fallback value at the positions of the empty texts;
        # note that the returned scores still only cover the non-empty texts
        for idx in ignore_idx_arr:
            pred_value_arr.insert(idx, value_f)
        score_arr = [s[0] for s in score_arr]
        return pred_value_arr, score_arr
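# Usage sketch (hypothetical inputs; assumes `model` is the fastText-style
# classifier wrapper used elsewhere in this module, and that
# `binary_label2value` / `value_f` are the module-level label mapping and
# fallback value):
#   value, score = binary_predict('some tweet text', model, 0.4)
#   values, scores = binary_predict(['first tweet', '', 'second tweet'], model, 0.4)
# `values` has one entry per input text (empty texts fall back to value_f),
# while `scores` only covers the non-empty texts.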
def prefix_textarr(label, textarr):
    """ Prepend the given label to each non-empty, stripped text """
    label_text_arr = list()
    for text in textarr:
        if pu.is_empty_string(text):
            continue
        label_text_arr.append('{} {}'.format(label, text.strip()))
    return label_text_arr
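# Usage sketch (hypothetical label and texts; the prefix is typically a
# fastText-style '__label__X' tag, turning the texts into supervised training
# lines; assumes pu.is_empty_string treats the blank entry as empty):
#   prefix_textarr('__label__pos', ['first tweet ', '', 'second tweet'])
#   -> ['__label__pos first tweet', '__label__pos second tweet']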
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name,
                                  path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
    # NOTE: the code below is unreachable due to the early return above;
    # it converts the positive and negative tweet files into plain-text files.
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
def generate_candidate_keywords(sentence_list,
                                stopword_pattern,
                                stop_word_list,
                                min_char_num=1,
                                max_words_num=5,
                                min_words_length_adj=1,
                                max_words_length_adj=1,
                                min_phrase_freq_adj=2):
    phrase_list = list()
    adjoined_candidates = list()
    for s in sentence_list:
        # RAKE-style split: each stop word acts as a phrase delimiter
        replace_stop = re.sub(stopword_pattern, "|", s.strip())
        phrases = replace_stop.split("|")
        for phrase in phrases:
            phrase = phrase.strip().lower()
            if not pu.is_empty_string(phrase) and is_acceptable_phrase(
                    phrase, min_char_num, max_words_num):
                phrase_list.append(phrase)

        # also collect "adjoined" candidates, i.e. keyword phrases that span stop words
        adjoined_candidates += adjoined_candidates_from_sentence(
            s, stop_word_list, min_words_length_adj, max_words_length_adj)
    # keep only the adjoined candidates that occur frequently enough
    phrase_list += filter_adjoined_candidates(adjoined_candidates,
                                              min_phrase_freq_adj)

    phrase_list += extract_adjoined_candidates(sentence_list, stop_word_list,
                                               min_words_length_adj,
                                               max_words_length_adj,
                                               min_phrase_freq_adj)
    return phrase_list
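# Hypothetical demo, not part of the original module: it builds the kind of
# stop-word delimiter regex that generate_candidate_keywords expects (any
# regex whose matches split sentences into candidate phrases will do) and
# runs it on a toy sentence. The stop word list is an illustrative sample.
def _demo_candidate_keywords():
    stop_words = ['a', 'an', 'the', 'of', 'in', 'and', 'is', 'was']
    # (?i) makes the delimiter case-insensitive, \b keeps whole-word matches
    pattern = r'(?i)\b(?:' + '|'.join(re.escape(w) for w in stop_words) + r')\b'
    sentences = ['The earthquake was felt in the coastal city early in the morning.']
    return generate_candidate_keywords(sentences, pattern, stop_words)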
def twarr2textarr(twarr):
    textarr = list()
    for tw in twarr:
        text = tw.get(tk.key_text, '').strip()
        # tweets without the original-text field have not been normalized yet
        if tk.key_orgntext not in tw:
            text = pu.text_normalization(text)
        if pu.is_empty_string(text):
            continue
        textarr.append(text)
    return textarr
def filter_twarr_text(twarr):
    """ This function only suits for tweets that are not processed """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, None)).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr
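# Usage sketch (hypothetical tweet dicts; assumes tk.key_text / tk.key_orgntext
# name the normalized-text and original-text fields):
#   twarr = [{tk.key_text: 'Flooding reported downtown http://t.co/xyz'}, {tk.key_text: '!!!'}]
#   kept = filter_twarr_text(twarr)
# Each kept tweet carries its raw text under the original-text field and the
# normalized text under the text field; the second tweet is dropped because it
# has too few alphabetic characters (threshold 0.65).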
def twarr_ark(twarr, from_field=tk.key_text, to_field=tk.key_ark):
    # split tweets into empty and non-empty texts, remembering original indexes
    empty_idxes, non_empty_idxes = list(), list()
    textarr = list()
    for idx, tw in enumerate(twarr):
        text = tw[from_field]
        if pu.is_empty_string(text):
            empty_idxes.append(idx)
        else:
            textarr.append(text)
            non_empty_idxes.append(idx)

    # POS-tag all non-empty texts in one batch with the ARK tagger
    posarr = runtagger_parse(textarr)

    # empty texts get an empty tag list, the rest get their tag sequences back
    for tw_idx in empty_idxes:
        twarr[tw_idx][to_field] = []
    for pos_idx, tw_idx in enumerate(non_empty_idxes):
        twarr[tw_idx][to_field] = posarr[pos_idx]
    return twarr
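# Usage sketch (assumes runtagger_parse is the CMU ARK TweetNLP wrapper that
# returns one tag sequence per input text, e.g. (token, tag, confidence)
# triples; the tweets below are hypothetical):
#   twarr = [{tk.key_text: 'big earthquake near the coast'}, {tk.key_text: ''}]
#   twarr = twarr_ark(twarr)
#   twarr[0][tk.key_ark]  # tag sequence for the non-empty text
#   twarr[1][tk.key_ark]  # [] for the empty text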
def filter_twarr_text(twarr):
    """
    Preprocess the text of every tweet in the input list and drop the tweets
    whose preprocessed text fails the checks; for each kept tweet the
    tk.key_orgntext field keeps the original text and the tk.key_text field
    keeps the preprocessed result.
    :param twarr: list, list of tweets
    :return: list, tweets that passed text preprocessing and filtering
    """
    flt_twarr = list()
    for tw in twarr:
        # TODO text_orgn = tw.get(tk.key_text, '').strip()
        text_orgn = tw.get(tk.key_text, '').strip()
        # text_orgn = tw.get(tk.key_orgntext, tw.get(tk.key_text, None)).strip()
        if not text_orgn:
            continue
        text_norm = pu.text_normalization(text_orgn).strip()
        if pu.is_empty_string(text_norm) or not pu.has_enough_alpha(
                text_norm, 0.65):
            continue
        tw[tk.key_orgntext] = text_orgn
        tw[tk.key_text] = text_norm
        flt_twarr.append(tw)
    return flt_twarr