Example 1
def seg_long_phrases(origin_long_txt, seged_long_txt2):
    segs_list = file_utils.read_line(origin_long_txt,
                                     lambda line: segment.seg_text(line))
    seg_list = [
        seg for segs in segs_list for seg in segs.split(' ') if len(seg) > 1
    ]
    file_utils.save_list2file(list(set(seg_list)), seged_long_txt2)
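Every example on this page relies on two project-local helpers, file_utils.read_line and file_utils.save_list2file, whose implementations are not shown. The sketch below is inferred purely from how they are called here; the signatures and behavior are assumptions, not the project's actual code.

# Hypothetical sketch of the file_utils helpers assumed throughout these
# examples; inferred from usage only, the real implementations may differ.
def read_line(file_path, work_func=None, split=None):
    """Read a file and return one item per line: split the line when a
    separator is given, then apply work_func if one is provided."""
    results = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            contents = line.split(split) if split is not None else line
            results.append(work_func(contents) if work_func is not None else contents)
    return results


def save_list2file(items, file_path, work_func=None, filter_func=None):
    """Write one item per line, optionally filtering items and transforming
    each one before writing."""
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in items:
            if filter_func is not None and not filter_func(item):
                continue
            f.write(f'{work_func(item) if work_func is not None else item}\n')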
Example 2
def load_data(data_path):
    datas = file_utils.read_line(data_path,
                                 lambda line_contents: (line_contents[0], line_contents[1]),
                                 split='\t')

    # return zip(*datas)
    return datas
Example 3
def find_words_not_in_vec(word_index, vectors_file, inclue_file, exclude_file):
    vec_words = file_utils.read_line(vectors_file, lambda line: line.split()[0])
    print(f'{vec_words[0]}')
    exclude_words = [word for word in word_index if word not in vec_words]
    include_words = [word for word in word_index if word in vec_words]
    file_utils.save_list2file(exclude_words, exclude_file)
    file_utils.save_list2file(include_words, inclue_file)
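In this example vec_words is a plain list, so each membership test scans the whole vocabulary. If the vectors file is large, a minor variation (not part of the original code; the function name below is hypothetical) is to build a set once and test against that:

def find_words_not_in_vec_set(word_index, vectors_file, inclue_file, exclude_file):
    # Variation on the example above: build a set from the vector vocabulary
    # so each membership test is constant-time instead of a linear scan.
    vec_words = file_utils.read_line(vectors_file, lambda line: line.split()[0])
    vec_word_set = set(vec_words)
    exclude_words = [word for word in word_index if word not in vec_word_set]
    include_words = [word for word in word_index if word in vec_word_set]
    file_utils.save_list2file(exclude_words, exclude_file)
    file_utils.save_list2file(include_words, inclue_file)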
Example 4
def concat_all(clf_dir, dest_dir, portion):
    file_names = ['train', 'val', 'test']
    clf_name_file = os.path.join(dest_dir, 'clf_name.txt')
    clf_names = set()
    for clf_file in os.listdir(clf_dir):
        clf_name = clf_file[0:4]
        clf_count = int(clf_file[5:-4])
        clf_file_path = os.path.join(clf_dir, clf_file)
        texts = list(
            file_utils.read_line(clf_file_path,
                                 lambda line: json.loads(line)['abs']))
        random.shuffle(texts)
        count2read = int(clf_count * 0.05)

        for i in range(20):
            start = count2read * i
            end = min(count2read * (i + 1), len(texts) - 1)
            splits = split_list(texts[start:end], portion)
            if splits:
                clf_names.add(clf_name)
                print(f'write clf {clf_name}')
                for index, list2write in enumerate(splits):
                    dest_file = os.path.join(dest_dir,
                                             f'{file_names[index]}{i}.txt')
                    file_utils.save_list2file(
                        list2write,
                        dest_file,
                        work_func=lambda text: f'{clf_name}\t{text}',
                        filter_func=lambda item: len(item) > 1)
            else:
                print(f'not split')

    file_utils.save_list2file(list(clf_names), clf_name_file)
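split_list is not defined anywhere on this page. A hypothetical sketch, consistent with how it is used above (three slices written to the train/val/test files, an empty result when there is nothing to split); the (train, val, test) fraction format of portion is an assumption:

def split_list(items, portion):
    # Hypothetical sketch: cut items into train/val/test slices according to
    # 'portion', assumed to be three fractions such as (0.8, 0.1, 0.1).
    # Returns an empty list when there is nothing to split.
    if not items:
        return []
    total = len(items)
    train_end = int(total * portion[0])
    val_end = train_end + int(total * portion[1])
    return [items[:train_end], items[train_end:val_end], items[val_end:]]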
Example 5
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """将文件转换为id表示"""
    def judge(line_contents):
        if len(line_contents) > 1:
            return line_contents[0], line_contents[1].split()
        return '', ''

    data2train = file_utils.read_line(filename, judge, split='\t')

    data_id, label_id = [], []
    for label, content in data2train:
        if len(label) == 0 or len(content) == 0:
            continue
        data_id.append(
            [word_to_id[word] for word in content if word in word_to_id])
        label_id.append(cat_to_id[label])

    # use the pad_sequences provided by Keras to pad the texts to a fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,
                                                    max_length,
                                                    truncating='post')
    y_pad = kr.utils.to_categorical(
        label_id, num_classes=len(cat_to_id))  # convert labels to one-hot representation

    return x_pad, y_pad
Example 6
def seg_clf_file(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'seg file {raw_clf_file} to {seged_clf_file}')
    seged_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_raw_doc(json.loads(line)))
    file_utils.save_list2file(
        seged_lines, seged_clf_file,
        lambda doc_json: json.dumps(doc_json, ensure_ascii=False))
Example 7
def clean(raw_phrase_txt):
    reged_phrases = file_utils.read_line(raw_phrase_txt,
                                         lambda line: extract_chn(line))
    clearn_phrases = [
        words for reged_phrase in reged_phrases for words in reged_phrase
    ]
    # clearn_phrases.sort()
    return set(clearn_phrases)
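extract_chn is not shown either. A hypothetical version, mirroring how english_pattern.findall is used in the last example on this page: it returns the runs of Chinese characters found in a line, which clean then flattens and deduplicates.

import re

# Hypothetical sketch of extract_chn: pull out runs of Chinese characters,
# analogous to english_pattern.findall in the extract_eng example.
chinese_pattern = re.compile(r'[\u4e00-\u9fa5]+')


def extract_chn(line):
    return chinese_pattern.findall(line)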
Example 8
def ans_score(my_ans_dir, right_ans_dir):
    que_count = 0
    right_answer_count = 0
    for ans_file_name in os.listdir(my_ans_dir):
        my_ans_file = os.path.join(my_ans_dir, ans_file_name)
        right_ans_file = os.path.join(right_ans_dir, ans_file_name)
        my_ans_dict = dict(file_utils.read_line(my_ans_file, lambda split: (split[0], split[1]), split=':'))
        right_ans_dict = dict(file_utils.read_line(right_ans_file, lambda split: (split[0], split[1]), split=':'))
        for _q, my_ans in my_ans_dict.items():
            right_ans = right_ans_dict[_q]
            print(f'{_q}, my:{my_ans}, right:{right_ans}')
            que_count += 1
            if my_ans == right_ans:
                right_answer_count += 1

    total_score = right_answer_count / que_count
    print(f'total score is {total_score}')
Example 9
def get_clf_str_from_file(clf_names_file_path: str):
    """
    read classifications name string from file and return a generator whose
    item is a Classification obj
    :param clf_names_file_path:file path store classification infos
    :return:
    """
    return file_utils.read_line(clf_names_file_path,
                                lambda line: gen_from_clf_str(line))
Example 10
def get_clf_info_dict(clf_count_info_file: str) -> dict:
    """
    Read classification count info from file and return a dict mapping each
    classification name to its ClfInfo.

    :param clf_count_info_file: path of the file storing classification counts
    """
    info_list = file_utils.read_line(clf_count_info_file,
                                     lambda info: (info[0], ClfInfo(info[0], info[1])),
                                     ':')
    return dict(info_list)
Example 11
def create_corpus(seged_clf_dir, copus_file):
    for seged_clf_file in os.listdir(seged_clf_dir):
        print(f'add clf {seged_clf_file}')
        file2read = os.path.join(seged_clf_dir, seged_clf_file)
        texts = file_utils.read_line(file2read,
                                     lambda line: json.loads(line)['abs'])
        file_utils.save_list2file(
            texts, copus_file, filter_func=lambda text: text and len(text) > 0)
    print(f'create copus complete')
Example 12
def get_content_dict(answers_dir, spliter):
    total_content_dict = {}
    for content_file in file_utils.get_files(answers_dir):
        content_dict = dict(
            file_utils.read_line(content_file,
                                 lambda content: (content[0], content[1]),
                                 split=spliter))
        total_content_dict.update(content_dict)

    return total_content_dict
Example 13
def join_phrases(phrase_union_txt, *phrase_txts):
    print(f'start join...')
    phrase_set = set()
    for phrase_txt in phrase_txts:
        for phrase in file_utils.read_line(phrase_txt):
            if len(phrase) < 6:
                phrase_set.add(phrase)
        print(f'set phrases is:  {phrase_set}')
    l = list(phrase_set)
    l.sort()
    file_utils.save_list2file(l, phrase_union_txt)
Example 14
def build_vocab(train_txt_path, vocab_txt_path, vocab_size=5000):
    """根据训练集构建词汇表,存储"""
    contents = file_utils.read_line(train_txt_path,
                                    lambda line_contents: line_contents[1]
                                    if len(line_contents) > 1 else '',
                                    split='\t')

    counter = Counter(
        [word for content in contents for word in content.split()])
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # add a <PAD> token so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    file_utils.save_list2file(words, vocab_txt_path)
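build_vocab only writes the vocabulary to disk, while process_file (Example 5) expects a word_to_id dict. The helper below is a hypothetical sketch of how that dict could be rebuilt, mirroring the way read_category (Example 21) builds cat_to_id; the name read_vocab and the paths in the usage comment are assumptions.

def read_vocab(vocab_txt_path):
    # Hypothetical helper: rebuild word_to_id from the vocab file written by
    # build_vocab, the same way read_category builds cat_to_id.
    words = list(file_utils.read_line(vocab_txt_path))
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


# Example wiring (paths are placeholders):
# categories, cat_to_id = read_category('clf_name.txt')
# words, word_to_id = read_vocab('vocab.txt')
# x_train, y_train = process_file('train.txt', word_to_id, cat_to_id)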
Example 15
def process_question_file(filepath, word_to_id, max_length=600):
    data2train = file_utils.read_line(
        filepath,
        lambda line_contents: (line_contents[0], line_contents[1].split()),
        split='\t')
    data_id, y_pad = [], []
    for pub_id, content in data2train:
        data_id.append(
            [word_to_id[word] for word in content if word in word_to_id])
        y_pad.append(pub_id)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,
                                                    max_length,
                                                    truncating='post')

    return x_pad, y_pad
Example 16
def get_df(path, show_df_info=False):
    """
    get DataFrame; if show_df_info is True, log the DataFrame head, null value count and sample data.
    :param path:
    :param show_df_info:
    :return:
    """
    logger.info(f'get data frame from file {path}')

    contents = file_utils.read_line(path, lambda content: (content[0], content[1]), split='\t')

    df = pd.DataFrame(contents, columns=['clf', 'text'])
    if show_df_info:
        logger.info(f'df head is \n {df.head()}')
        logger.info(f'isnull count:\n {df.isnull().sum()}')
        logger.info(f'train sample clf: {df["clf"].iloc[0]}, text: {df["text"].iloc[0]}')
    return df
Example 17
def group_phrases(origin_file, short_file, median_file, long_file):
    phrases = file_utils.read_line(origin_file)
    short_phrases = []
    median_phrases = []
    long_phrases = []
    for phrase in phrases:
        print(f'{phrase}')
        if len(phrase) < 6:
            short_phrases.append(phrase)
        elif len(phrase) > 10:
            long_phrases.append(phrase)
        else:
            median_phrases.append(phrase)

    file_utils.save_list2file(short_phrases, short_file)
    file_utils.save_list2file(median_phrases, median_file)
    file_utils.save_list2file(long_phrases, long_file)
Example 18
def select_sample(seged_dir, select_dir):
    # seged_dir = 'E:/ip_data/clfs/new_seged/no_limit'
    # select_dir = 'E:/ip_data/clfs/new_seged/no_limit_t'
    clf_dict, total_count = get_clf_info(seged_dir)
    for clf_file in os.listdir(seged_dir):
        clf_name = clf_file[0:4]
        clf_count = clf_dict[clf_name]
        read_count = int(clf_count / total_count * 10000)
        if read_count > 20:
            file2read = os.path.join(seged_dir, clf_file)
            lines = list(file_utils.read_line(file2read))
            random.shuffle(lines)
            print(
                f'clf {clf_name}, clf count {clf_count}, write count {read_count}'
            )
            save_file = f'{clf_name}_{read_count}.txt'
            file_utils.save_list2file(lines[0:read_count],
                                      os.path.join(select_dir, save_file))
Example 19
def right_ans_distribution(right_ans_dir, clf_count_file):
    clf_info_dict = dict(file_utils.read_line(clf_count_file, lambda split: (split[0], split[1]), split=':'))
    total_doc_count = 0
    total_que_count = 0
    for k, v in clf_info_dict.items():
        # print(f'clf info k: {k}, v: {v}')
        total_doc_count += int(v)

    clf_que_count_dict = {clf: 0 for clf, count in clf_info_dict.items()}
    all_que_dict = get_all_ans_dict(right_ans_dir)
    for k, v in all_que_dict.items():
        # print(f'all ans k {k},v {v}')
        total_que_count += 1
        clf_que_count_dict[v] += 1

    clf_que_count_dict = {k: int(v) * 100 / total_que_count for (k, v) in clf_que_count_dict.items()}
    clf_info_list = [(k, int(v) * 100 / total_doc_count) for (k, v) in clf_info_dict.items()]
    clf_info_list.sort(key=lambda ele: ele[1], reverse=True)
    print(f'total_que_count {total_que_count}, total_doc_count {total_doc_count}')

    for k, v in clf_info_list:
        doc_portion = '{0:.3f}%'.format(v)
        que_portion = '{0:.3f}%'.format(clf_que_count_dict.get(k))
        print(f'clf : {k}, doc portion {doc_portion}, que portion: {que_portion}')
Example 20
def extract_abs(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'extract abs file {raw_clf_file} to {seged_clf_file}')
    abs_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_text((json.loads(line))['abs']))
    file_utils.save_list2file(abs_lines, seged_clf_file)
Example 21
def read_category(clf_name_file):
    """读取分类目录,固定"""
    categories = list(file_utils.read_line(clf_name_file, lambda line: line))
    cat_to_id = dict(zip(categories, range(len(categories))))

    return categories, cat_to_id
Example 22

def verify(word: str):
    # Collapse a word that is just the same half repeated twice
    # (e.g. 'abcabc' -> 'abc'); everything else is returned unchanged.
    word_len = len(word)
    if word_len % 2 == 0:
        half_word = word[0:int(word_len / 2)]
        return half_word if word.count(half_word) == 2 else word
    return word


def collect_new_dict(dict_dir: str, dest_dict_file: str):
    new_dict_files = file_utils.get_files(dict_dir)
    word_counter = Counter([verify(word) for dict_file in new_dict_files for word in file_utils.read_line(dict_file)])
    # list_utils.print_list(word_counter.keys())
    file_utils.save_list2file(word_counter.keys(), dest_dict_file)


if __name__ == '__main__':
    # collect('E:/dict/new_words','E:/dict/new_words.txt')
    # file_utils.remove_redundant('E:/dict/new_words.txt','E:/dict/new_words2.txt')
    clf_names_file_path = '/home/tqhy/ip_nlp/resources/clfs/class_needed.txt'
    clf_raw_dir = '/home/tqhy/ip_nlp/resources/clfs/raw/no_limit'
    lower_score_dir = '/home/tqhy/ip_nlp/resources/clfs/raw/lower_score'

    clf_to_collect = list(file_utils.read_line(clf_names_file_path))
    for file_names in os.listdir(clf_raw_dir):
        if file_names[0:4] in clf_to_collect:
            src_file = os.path.join(clf_raw_dir, file_names)
            dest_file = os.path.join(lower_score_dir, file_names)
            shutil.copyfile(src_file, dest_file)
Example 23
def read_ans(ans_file):
    return file_utils.read_line(ans_file, lambda split: (split[0], split[1]), split=':')
Example 24
def collect_new_dict(dict_dir: str, dest_dict_file: str):
    new_dict_files = file_utils.get_files(dict_dir)
    word_counter = Counter([verify(word) for dict_file in new_dict_files for word in file_utils.read_line(dict_file)])
    # list_utils.print_list(word_counter.keys())
    file_utils.save_list2file(word_counter.keys(), dest_dict_file)
Example 25
def extract_eng(raw_phrase_txt, eng_file):
    engs_lists = file_utils.read_line(
        raw_phrase_txt, lambda line: english_pattern.findall(line))
    file_utils.save_list2file(engs_lists, eng_file,
                              lambda engs: '\n'.join(engs))