Example #1
def find_new_words(root, file_pair):
    abs_file, new_words_file = file_pair[0], file_pair[1]
    if os.path.exists(new_words_file):
        print(f'clf {new_words_file} has already found new words ...')
        return
    print(f'start find new word in {abs_file}')
    datas = load_data(abs_file)
    model = root
    topN = 2
    if len(datas) > 0:
        tmp = []
        count = 0
        words2add = set()
        # feed the docs to the model in batches of 40 and collect the newly
        # discovered words after each batch
        for item in datas:
            tmp.append(item)
            count += 1
            if count % 40 == 0:
                load_data_2_root(tmp, model)
                result, add_word = model.find_word(topN)
                words2add.update(add_word.keys())
                print(f'words2add: {words2add}, {count}')
                tmp.clear()
        # process the remaining partial batch
        if len(tmp) > 0:
            print(f'{words2add}')
            load_data_2_root(tmp, model)
            result, add_word = model.find_word(topN)
            words2add.update(add_word.keys())
            print(f'words2add: {words2add}, {count}')
            tmp.clear()
        file_utils.save_list2file(words2add, new_words_file)
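Every example on this page funnels its output through file_utils.save_list2file, usually after reading input with file_utils.read_line. A minimal sketch of what these two helpers might look like, inferred purely from the call sites in these examples (the project's real implementations may differ):

def read_line(file_path, work_func=None, split=None):
    """Read a file line by line; optionally split each line on `split` and
    map it through `work_func`. Returns the processed lines."""
    results = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            item = line.rstrip('\n')
            if split is not None:
                item = item.split(split)
            results.append(work_func(item) if work_func else item)
    return results


def save_list2file(items, file_path, work_func=None, filter_func=None):
    """Write one item per line, applying `filter_func` and `work_func` first."""
    # note: whether the real helper overwrites or appends is unknown; 'w' is a guess
    with open(file_path, 'w', encoding='utf-8') as f:
        for item in items:
            if filter_func is not None and not filter_func(item):
                continue
            f.write(f'{work_func(item) if work_func else item}\n')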
Example #2
def concat_all(clf_dir, dest_dir, portion):
    file_names = ['train', 'val', 'test']
    clf_name_file = os.path.join(dest_dir, 'clf_name.txt')
    clf_names = set()
    for clf_file in os.listdir(clf_dir):
        clf_name = clf_file[0:4]
        clf_count = int(clf_file[5:-4])
        clf_file_path = os.path.join(clf_dir, clf_file)
        texts = list(
            file_utils.read_line(clf_file_path,
                                 lambda line: json.loads(line)['abs']))
        random.shuffle(texts)
        # read the shuffled texts in 20 chunks of roughly 5% of clf_count each
        count2read = int(clf_count * 0.05)

        for i in range(20):
            start = count2read * i
            # clamp the chunk end to the last available text
            end = count2read * (i + 1) if len(texts) - 1 > count2read * (
                i + 1) else len(texts) - 1
            splits = split_list(texts[start:end], portion)
            if splits:
                clf_names.add(clf_name)
                print(f'write clf {clf_name}')
                for index, list2write in enumerate(splits):
                    dest_file = os.path.join(dest_dir,
                                             f'{file_names[index]}{i}.txt')
                    file_utils.save_list2file(
                        list2write,
                        dest_file,
                        work_func=lambda text: f'{clf_name}\t{text}',
                        filter_func=lambda item: len(item) > 1)
            else:
                print('not split')

    file_utils.save_list2file(list(clf_names), clf_name_file)
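concat_all also relies on a split_list helper that is not shown on this page. One plausible shape for it, assuming portion is a sequence of fractions such as (0.8, 0.1, 0.1) that lines up with the train/val/test file names above (an assumption, not the project's actual code):

def split_list(items, portion):
    # split `items` into consecutive chunks sized by the fractions in
    # `portion`; an empty input yields [] so the caller can skip writing
    if not items:
        return []
    splits, start = [], 0
    for fraction in portion:
        end = start + int(len(items) * fraction)
        splits.append(items[start:end])
        start = end
    return splits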
Example #3
def find_words_not_in_vec(word_index, vectors_file, include_file, exclude_file):
    vec_words = file_utils.read_line(vectors_file, lambda line: line.split()[0])
    print(f'{vec_words[0]}')
    exclude_words = [word for word in word_index if word not in vec_words]
    include_words = [word for word in word_index if word in vec_words]
    file_utils.save_list2file(exclude_words, exclude_file)
    file_utils.save_list2file(include_words, include_file)
Example #4
def seg_long_phrases(origin_long_txt, seged_long_txt2):
    segs_list = file_utils.read_line(origin_long_txt,
                                     lambda line: segment.seg_text(line))
    seg_list = [
        seg for segs in segs_list for seg in segs.split(' ') if len(seg) > 1
    ]
    file_utils.save_list2file(list(set(seg_list)), seged_long_txt2)
Example #5
def seg_clf_file(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'seg file {raw_clf_file} to {seged_clf_file}')
    seged_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_raw_doc(json.loads(line)))
    file_utils.save_list2file(
        seged_lines, seged_clf_file,
        lambda doc_json: json.dumps(doc_json, ensure_ascii=False))
Example #6
def create_corpus(seged_clf_dir, corpus_file):
    for seged_clf_file in os.listdir(seged_clf_dir):
        print(f'add clf {seged_clf_file}')
        file2read = os.path.join(seged_clf_dir, seged_clf_file)
        texts = file_utils.read_line(file2read,
                                     lambda line: json.loads(line)['abs'])
        file_utils.save_list2file(
            texts, corpus_file, filter_func=lambda text: text and len(text) > 0)
    print('create corpus complete')
Example #7
def join_phrases(phrase_union_txt, *phrase_txts):
    print('start join...')
    phrase_set = set()
    for phrase_txt in phrase_txts:
        for phrase in file_utils.read_line(phrase_txt):
            if len(phrase) < 6:
                phrase_set.add(phrase)
        print(f'set phrases is: {phrase_set}')
    phrases = sorted(phrase_set)
    file_utils.save_list2file(phrases, phrase_union_txt)
Example #8
def gen_train_text(answers_dir, seged_texts_dir, train_file):
    total_answers = get_content_dict(answers_dir, ':')
    # list_utils.print_dict(total_answers)

    total_seged_texts = get_content_dict(seged_texts_dir, '\t')
    # list_utils.print_dict(total_seged_texts)

    train_list = [(clf, total_seged_texts.get(_id))
                  for _id, clf in total_answers.items()]
    # list_utils.print_list(train_list)

    file_utils.save_list2file(train_list, train_file,
                              lambda pair: f'{pair[0]}\t{pair[1]}')
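gen_train_text depends on a get_content_dict helper that is not included here. Judging from how it is used (answer files split on ':', segmented text files split on a tab, both keyed by document id), it might look roughly like this sketch (an assumption, not the project's implementation):

import os


def get_content_dict(content_dir, separator):
    # read every file in content_dir and split each line on `separator`
    # into an (id, content) pair
    content_dict = {}
    for file_name in os.listdir(content_dir):
        file_path = os.path.join(content_dir, file_name)
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split(separator, 1)
                if len(parts) == 2:
                    content_dict[parts[0]] = parts[1]
    return content_dict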
Example #9
def build_vocab(train_txt_path, vocab_txt_path, vocab_size=5000):
    """根据训练集构建词汇表,存储"""
    contents = file_utils.read_line(train_txt_path,
                                    lambda line_contents: line_contents[1]
                                    if len(line_contents) > 1 else '',
                                    split='\t')

    counter = Counter(
        [word for content in contents for word in content.split()])
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # add a <PAD> token so that all texts can be padded to the same length
    words = ['<PAD>'] + list(words)
    file_utils.save_list2file(words, vocab_txt_path)
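A natural counterpart to build_vocab is a loader that turns the stored vocabulary back into a word-to-id mapping; a common pattern, sketched here as an illustration rather than code from this project:

def read_vocab(vocab_txt_path):
    # load the vocabulary written by build_vocab and index each word by its
    # line position, so <PAD> gets id 0
    with open(vocab_txt_path, encoding='utf-8') as f:
        words = [line.strip() for line in f if line.strip()]
    word_to_id = {word: idx for idx, word in enumerate(words)}
    return words, word_to_id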
Example #10
def select_sample(seged_dir, select_dir):
    # seged_dir = 'E:/ip_data/clfs/new_seged/no_limit'
    # select_dir = 'E:/ip_data/clfs/new_seged/no_limit_t'
    clf_dict, total_count = get_clf_info(seged_dir)
    for clf_file in os.listdir(seged_dir):
        clf_name = clf_file[0:4]
        clf_count = clf_dict[clf_name]
        read_count = int(clf_count / total_count * 10000)
        if read_count > 20:
            file2read = os.path.join(seged_dir, clf_file)
            lines = list(file_utils.read_line(file2read))
            random.shuffle(lines)
            print(
                f'clf {clf_name}, clf count {clf_count}, write count {read_count}'
            )
            save_file = f'{clf_name}_{read_count}.txt'
            file_utils.save_list2file(lines[0:read_count],
                                      os.path.join(select_dir, save_file))
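select_sample calls a get_clf_info helper that is not shown. Assuming the segmented files keep the '<clf>_<count>.txt' naming used elsewhere on this page, it could be sketched as follows (an assumption; the real helper may count lines instead):

import os


def get_clf_info(seged_dir):
    # derive per-classification doc counts from '<clf>_<count>.txt' file
    # names and return them together with the overall total
    clf_dict = {}
    for clf_file in os.listdir(seged_dir):
        clf_name = clf_file[0:4]
        clf_count = int(clf_file[5:-4])
        clf_dict[clf_name] = clf_dict.get(clf_name, 0) + clf_count
    return clf_dict, sum(clf_dict.values())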
Example #11
def write_docs(store_dir: str, clf: Classification, docs, count):
    """
    write ip docs json string to a local file line by line, the file was named
    in format like 'A_01_B_300.txt'. The number in file name is the count of docs
    stored in the file.
    :param store_dir:
    :param clf:
    :param docs: get from mongo, each item is a Bson obj
    :return:
    """

    file_suffix = f'{count}.txt'
    logger.info(f'start write tasks {clf} with suffix {file_suffix}')

    file_name = f'{clf}_{file_suffix}'
    file_path = path.join(store_dir, file_name)
    logger.info(f'tasks docs store file path is {file_path}')

    file_utils.save_list2file(docs, file_path,
                              lambda doc: json_encoder.doc2json(doc))
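As the docstring notes, the file name encodes both the classification and the document count; with hypothetical values the naming scheme works out like this:

# hypothetical values only: a classification whose string form is 'A_01_B'
# and a batch of 300 docs produce the file name 'A_01_B_300.txt'
clf_str, count = 'A_01_B', 300
print(f'{clf_str}_{count}.txt')  # -> A_01_B_300.txt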
Example #12
def group_phrases(origin_file, short_file, median_file, long_file):
    phrases = file_utils.read_line(origin_file)
    short_phrases = []
    median_phrases = []
    long_phrases = []
    for phrase in phrases:
        print(f'{phrase}')
        if len(phrase) < 6:
            short_phrases.append(phrase)
        elif len(phrase) > 10:
            long_phrases.append(phrase)
        else:
            median_phrases.append(phrase)

    file_utils.save_list2file(short_phrases, short_file)
    file_utils.save_list2file(median_phrases, median_file)
    file_utils.save_list2file(long_phrases, long_file)
Example #13
def extract_abs(clf_file_pair):
    raw_clf_file, seged_clf_file = clf_file_pair
    print(f'extract abs file {raw_clf_file} to {seged_clf_file}')
    abs_lines = file_utils.read_line(
        raw_clf_file, lambda line: segment.seg_text((json.loads(line))['abs']))
    file_utils.save_list2file(abs_lines, seged_clf_file)
Example #14
def collect_new_dict(dict_dir: str, dest_dict_file: str):
    new_dict_files = file_utils.get_files(dict_dir)
    word_counter = Counter([
        verify(word) for dict_file in new_dict_files
        for word in file_utils.read_line(dict_file)
    ])
    # list_utils.print_list(word_counter.keys())
    file_utils.save_list2file(word_counter.keys(), dest_dict_file)
Example #15
                yield get_words_from_ctg_page(ctg_page)


def get_page_count(ctg_pg0):
    soup = BeautifulSoup(ctg_pg0, features='lxml')
    page_info = soup.find('span', text=re.compile('^共.*')).text
    print('page info is {}'.format(page_info))
    # sample page_info text: 共16页  共[306]词汇 ('16 pages in total, [306] words in total')
    matcher = re.search('[0-9]+', page_info)

    return int(matcher.group(0)) if matcher else 0


def get_words_from_ctg_page(ctg_pg):
    soup = BeautifulSoup(ctg_pg, features='lxml')
    tr_tags = soup.find(id='lblcon').find_all('tr')
    words = [tr_tag.find_all('td')[1].a.text for tr_tag in tr_tags if not tr_tag.has_attr('class')]
    return words


if __name__ == '__main__':
    html = get_html(catalogue_url)
    # print('text {}'.format(html))
    ctg_uris = get_categories_uri(html)
    words = get_words_from_ctg(ctg_uris)
    dict_cnki_path = 'F:/temp/ip_nlp/cnki_dict.txt'
    for word in words:
        # print(word)
        file_utils.save_list2file(word, dict_cnki_path)
    print('all task complete...')
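For reference, the regex in get_page_count above simply grabs the first number out of the page-info text; checking it against the sample string quoted in its comment (a standalone snippet, not part of the original script):

import re

# '共16页  共[306]词汇' reads '16 pages in total, [306] words in total';
# the first number is the page count
sample_page_info = '共16页  共[306]词汇'
matcher = re.search('[0-9]+', sample_page_info)
print(int(matcher.group(0)) if matcher else 0)  # -> 16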
Example #16
def extract_eng(raw_phrase_txt, eng_file):
    engs_lists = file_utils.read_line(
        raw_phrase_txt, lambda line: english_pattern.findall(line))
    file_utils.save_list2file(engs_lists, eng_file,
                              lambda engs: '\n'.join(engs))
Example #17
def process_raw_answers(raw_answers_dir, processed_answer_dir):
    for raw_answer in os.listdir(raw_answers_dir):
        raw_answer_file = os.path.join(raw_answers_dir, raw_answer)
        processed_answers = process_raw_answer(raw_answer_file)
        store_answer_file = os.path.join(processed_answer_dir, raw_answer)
        file_utils.save_list2file(processed_answers, store_answer_file)
Example #18
            yield word


def grab_failed_page():
    while len(fail_pages) > 0:
        for fail_page_url in fail_pages[:]:
            fail_page_html = get_html(fail_page_url)
            word = get_word(fail_page_html)
            if word:
                fail_pages.remove(fail_page_url)
                yield word


def test():
    # html_page = get_html(base_url + 'h_5286500000.html')
    html_page = get_html('http://dict.cnki.net/h_9999999000.html')
    print('html page is:')
    print(html_page)
    soup = BeautifulSoup(html_page, features='lxml')
    input_value = soup.find(id='txt2').attrs['value']
    print('input value is {}'.format(len(input_value)))


if __name__ == '__main__':
    words = grab_words(max_page_num)
    file_utils.save_list2file(words, 'F:/temp/ip_nlp/cnki_trans.txt')

    if len(fail_pages) > 0:
        # retry the pages that failed the first time and save the recovered words
        supply_words = grab_failed_page()
        file_utils.save_list2file(supply_words, 'F:/temp/ip_nlp/cnki_trans.txt')