Beispiel #1
0
def gen_bow_wiki():
    """Produce the word-counts file for the wiki bag-of-words data.

    Hands the wiki BOW file path and the destination word-counts path to
    ``textutils.gen_word_cnts_file_from_bow_file``.  The step that would
    build the BOW file itself is left disabled, so the BOW file presumably
    has to exist already -- TODO confirm.
    """
    # Inputs of the disabled BOW-building step; kept so it can be re-enabled.
    tokenized_docs_path = 'e:/dc/el/wiki/wiki_lines_tokenized_lc.txt'
    vocab_path = 'e:/dc/el/wiki/words_dict_proper.txt'

    bow_path = 'e:/dc/el/wiki/wiki_bow.bin'
    word_cnts_path = 'e:/dc/el/wiki/word_cnts.bin'

    # Disabled step:
    # text_process_common.line_docs_to_bow(tokenized_docs_path, vocab_path, bow_path)

    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
Beispiel #2
0
def gen_bow_wiki():
    """Produce the word-counts file for the wiki bag-of-words data.

    Hands the wiki BOW file path and the destination word-counts path to
    ``textutils.gen_word_cnts_file_from_bow_file``.  The step that would
    build the BOW file itself is left disabled, so the BOW file presumably
    has to exist already -- TODO confirm.
    """
    # Inputs of the disabled BOW-building step; kept so it can be re-enabled.
    tokenized_docs_path = 'e:/dc/el/wiki/wiki_lines_tokenized_lc.txt'
    vocab_path = 'e:/dc/el/wiki/words_dict_proper.txt'

    bow_path = 'e:/dc/el/wiki/wiki_bow.bin'
    word_cnts_path = 'e:/dc/el/wiki/word_cnts.bin'

    # Disabled step:
    # text_process_common.line_docs_to_bow(tokenized_docs_path, vocab_path, bow_path)

    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
Beispiel #3
0
def gen_dw_nyt():
    """Build the NYT doc-word BOW file and then its word-counts file.

    Calls ``textutils.line_docs_to_bow`` on the tokenized test documents
    with a minimum occurrence threshold of 30, then feeds the resulting
    BOW file to ``textutils.gen_word_cnts_file_from_bow_file``.  Both output
    file names embed the threshold.
    """
    threshold = 30

    tokenized_docs_path = 'e:/dc/nyt-world-full/processed/test/docs_tokenized_lc.txt'
    vocab_path = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'

    # Output locations, tagged with the threshold used to produce them.
    bow_path = 'e:/dc/nyt-world-full/processed/bin/dw-%d.bin' % threshold
    word_cnts_path = 'e:/dc/nyt-world-full/processed/bin/word-cnts-%d.bin' % threshold

    textutils.line_docs_to_bow(tokenized_docs_path, vocab_path, threshold,
                               bow_path)
    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
Beispiel #4
0
def gen_entity_net_20ng():
    """Produce the entity-counts file for the 20ng_bydate data.

    Hands the doc-entity list file and the destination entity-counts path
    to ``textutils.gen_word_cnts_file_from_bow_file``.  The step that would
    build the doc-entity list is left disabled, so that file presumably
    has to exist already -- TODO confirm.
    """
    # Inputs of the disabled doc-entity-list step; kept so it can be
    # re-enabled.
    entity_names_path = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_candidates_path = 'e:/dc/20ng_bydate/doc_entity_candidates.txt'

    doc_entities_path = 'e:/dc/20ng_bydate/doc_entities_short.bin'
    entity_cnts_path = 'e:/dc/20ng_bydate/entity_cnts.bin'

    # Disabled step:
    # gen_doc_entity_list(entity_names_path, entity_candidates_path, doc_entities_path)

    textutils.gen_word_cnts_file_from_bow_file(doc_entities_path, entity_cnts_path)
Beispiel #5
0
def all_line_docs_to_net():
    """Produce the word-counts file for the 20ng_bydate doc-word data.

    Hands the doc-word file and the destination word-counts path to
    ``textutils.gen_word_cnts_file_from_bow_file``.  The step that would
    build the doc-word file is left disabled, so that file presumably has
    to exist already -- TODO confirm.
    """
    # Inputs/outputs of the disabled conversion step; kept so it can be
    # re-enabled.
    docs_text_path = 'e:/dc/20ng_bydate/doc_text_data.txt'
    vocab_path = 'e:/dc/20ng_bydate/words_dict.txt'
    word_indices_docs_path = 'e:/dc/20ng_bydate/all_docs_wi.txt'

    # Alternative destination, currently unused:
    # doc_word_path = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    doc_word_path = 'e:/dc/20ng_bydate/all_docs_dw_net_short.bin'
    word_cnts_path = 'e:/dc/20ng_bydate/word_cnts.bin'

    # Disabled step:
    # line_docs_to_net(docs_text_path, vocab_path, doc_word_path, word_indices_docs_path)

    textutils.gen_word_cnts_file_from_bow_file(doc_word_path, word_cnts_path)
Beispiel #6
0
def gen_entity_net_wiki():
    """Run the three wiki entity-network generation steps in order.

    1. ``gen_doc_entity_pairs`` writes the doc-entity list file.
    2. ``textutils.gen_word_cnts_file_from_bow_file`` is given that file
       and the entity-counts destination.
    3. ``gen_entity_entity_pairs`` is given the entity dictionary, the
       candidate cliques file and the adjacency-list destination.
    """
    entity_names_path = 'e:/dc/el/wiki/entity_names.txt'

    # Step 1: document -> entity pairs.
    entity_candidates_path = 'e:/dc/el/wiki/doc_entity_candidates.txt'
    doc_entities_path = 'e:/dc/el/wiki/wiki_entities.bin'
    gen_doc_entity_pairs(entity_names_path, entity_candidates_path, doc_entities_path)

    # Step 2: counts derived from the doc-entity file written above.
    entity_cnts_path = 'e:/dc/el/wiki/entity_cnts.bin'
    textutils.gen_word_cnts_file_from_bow_file(doc_entities_path, entity_cnts_path)

    # Step 3: entity -> entity pairs.
    cliques_path = 'e:/dc/el/wiki/entity_candidate_cliques.txt'
    adj_list_path = 'e:/dc/el/wiki/entity_net_adj_list.bin'
    gen_entity_entity_pairs(entity_names_path, cliques_path, adj_list_path)
Beispiel #7
0
def __gen_dw_nyt():
    """Generate the doc-word BOW files and word counts for an NYT data set.

    Steps, in order:

    1. Load the word -> index dictionary via
       ``textutils.load_words_to_idx_dict`` with a minimum occurrence
       threshold of 10 and print its size.
    2. Convert the full tokenized document file to a BOW file with
       ``textutils.line_docs_to_bow``.
    3. Pass that BOW file to ``textutils.gen_word_cnts_file_from_bow_file``
       to write the word-counts file.
    4. Convert the train, validation and test document files to their own
       BOW files using the same dictionary.

    All paths are relative to the hard-coded ``data_dir``; output file
    names embed the threshold.
    """
    # data_dir = 'e:/data/emadr/nyt-world-full/processed/'
    data_dir = 'e:/data/emadr/nyt-less-docs/business'
    min_occurrence = 10

    def in_data_dir(rel_path):
        # Resolve a path relative to the data directory.
        return os.path.join(data_dir, rel_path)

    proper_word_cnts_dict_file = in_data_dir('words-dict-proper.txt')
    line_docs_file = in_data_dir('tokenizedlc/docs-tokenized-lc-2.txt')
    dst_bow_docs_file = in_data_dir('bindata/dw-%d.bin' % min_occurrence)
    dst_word_cnts_file = in_data_dir(
        'bindata/word-cnts-%d.bin' % min_occurrence)

    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('vocab size: %d' % len(words_dict))

    textutils.line_docs_to_bow(line_docs_file, words_dict, min_occurrence,
                               dst_bow_docs_file)
    textutils.gen_word_cnts_file_from_bow_file(dst_bow_docs_file,
                                               dst_word_cnts_file)

    # (source text file, destination BOW file name pattern) per split,
    # processed in train / val / test order.
    split_files = (
        ('tokenizedlc/docs-tokenized-lc-train-2.txt',
         'bindata/dw-train-%d.bin'),
        ('tokenizedlc/docs-tokenized-lc-val-2.txt',
         'bindata/dw-val-%d.bin'),
        ('tokenizedlc/docs-tokenized-lc-test-2.txt',
         'bindata/dw-test-%d.bin'),
    )
    for src_rel_path, dst_name_fmt in split_files:
        textutils.line_docs_to_bow(in_data_dir(src_rel_path), words_dict,
                                   min_occurrence,
                                   in_data_dir(dst_name_fmt % min_occurrence))