import os

# textutils is the project's text-processing helper module; the import path is
# assumed here.
import textutils


def __gen_tac_dw():
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\training\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\eval\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2009\eval\source_documents'
    # doc_list_file = 'e:/data/el/LDC2015E19/data/2010/eval/data/eng-docs-list-win.txt'
    doc_list_file = 'e:/data/el/LDC2015E20/data/eval/data/eng-docs-list-win.txt'
    emadr_data_dir = 'e:/data/emadr/el/tac/2014/eval'
    line_docs_file = os.path.join(emadr_data_dir, 'docs.txt')
    # __gen_line_docs_file_tac(doc_list_file, line_docs_file)

    tokenized_line_docs_file = os.path.join(emadr_data_dir, 'docs-tokenized.txt')
    proper_word_cnts_dict_file = 'e:/data/emadr/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = os.path.join(emadr_data_dir, 'docs-tokenized-lc.txt')
    bow_docs_file = os.path.join(emadr_data_dir, 'dw.bin')
    # Lowercase the tokenized docs; a min-occurrence count of 1 keeps all
    # dictionary words.
    textutils.gen_lowercase_token_file(tokenized_line_docs_file, proper_word_cnts_dict_file,
                                       max_word_len, 1, tokenized_line_docs_lc_file)
    # Build the word->index dict and convert the docs to bag-of-words
    # (a simplified sketch of this step follows this function).
    min_occurrence = 2
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file, min_occurrence)
    textutils.line_docs_to_bow(tokenized_line_docs_lc_file, words_dict, min_occurrence, bow_docs_file)
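

# A minimal sketch of the bag-of-words conversion above, assuming one
# whitespace-tokenized document per input line. The real
# textutils.line_docs_to_bow writes a binary file (dw.bin); the in-memory
# (word_index, count) representation here is only illustrative.
def _line_docs_to_bow_sketch(tokenized_lc_file, words_dict):
    bow_docs = []
    with open(tokenized_lc_file, encoding='utf-8') as f:
        for line in f:
            cnts = {}
            for word in line.split():
                idx = words_dict.get(word)
                if idx is not None:  # skip out-of-dictionary words
                    cnts[idx] = cnts.get(idx, 0) + 1
            bow_docs.append(sorted(cnts.items()))
    return bow_docs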


def __gen_lowercase_token_file_nyt():
    # data_dir = 'e:/data/emadr/nyt-world-full/processed/'
    data_dir = 'e:/data/emadr/nyt-less-docs/world'
    tokenized_line_docs_file_name = os.path.join(data_dir, 'docs-tokenized.txt')
    proper_word_cnts_dict_file = os.path.join(data_dir, 'words-dict-proper.txt')
    dataset_split_file = os.path.join(data_dir, 'bindata/dataset-split-labels.bin')
    max_word_len = 20
    min_occurrence = 100
    all_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-%d.txt' % min_occurrence)
    train_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-train-%d.txt' % min_occurrence)
    val_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-val-%d.txt' % min_occurrence)
    test_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-test-%d.txt' % min_occurrence)
    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name, proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence, all_doc_text_file)
    # textutils.split_docs_text_file_by_dataset_labels(all_doc_text_file, dataset_split_file,
    #                                                  train_doc_text_file, test_doc_text_file)
    # Route each doc to the train/val/test file given its dataset-split label
    # (see the sketch after this function).
    textutils.split_docs_text_file_by_dataset_labels_tvt(
        all_doc_text_file, dataset_split_file, train_doc_text_file,
        val_doc_text_file, test_doc_text_file)
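

# Sketch of the train/val/test routing performed by
# split_docs_text_file_by_dataset_labels_tvt. The label encoding is an
# assumption: one 32-bit little-endian integer per document with
# 0 = train, 1 = val, 2 = test; the actual bindata format may differ.
def _split_docs_tvt_sketch(all_docs_file, split_labels_file,
                           train_file, val_file, test_file):
    with open(split_labels_file, 'rb') as f:
        data = f.read()
    labels = [int.from_bytes(data[i:i + 4], 'little')
              for i in range(0, len(data), 4)]

    outs = {0: open(train_file, 'w', encoding='utf-8'),
            1: open(val_file, 'w', encoding='utf-8'),
            2: open(test_file, 'w', encoding='utf-8')}
    try:
        with open(all_docs_file, encoding='utf-8') as fin:
            for label, line in zip(labels, fin):
                outs[label].write(line)
    finally:
        for fout in outs.values():
            fout.close()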


def gen_lowercase_token_file_wiki():
    tokenized_line_docs_file_name = 'e:/dc/el/wiki/wiki_lines_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    dst_file_name = 'e:/dc/el/wiki/wiki_lines_tokenized_lc.txt'
    # The other call sites pass a min-occurrence count before the output path;
    # 1 (keep all dictionary words) is used here to match that signature.
    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name, proper_word_cnts_dict_file,
                                       max_word_len, 1, dst_file_name)


def gen_lowercase_token_file_nyt():
    tokenized_line_docs_file_name = 'e:/dc/nyt-world-full/processed/docs-tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'
    max_word_len = 20
    min_occurrence = 40
    dst_file_name = 'e:/dc/nyt-world-full/processed/docs-tokenized-lc-%d.txt' % min_occurrence
    # (a simplified sketch of this routine follows this function)
    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name, proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence, dst_file_name)
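

# Sketch of the lowercasing/filtering pass that gen_lowercase_token_file
# appears to perform, inferred only from its parameters: lowercase every
# token, drop tokens longer than max_word_len or rarer than min_occurrence
# in the word-count dict. The dict file format ("word<TAB>count" per line)
# is an assumption; the real textutils rules may differ.
def _gen_lowercase_token_file_sketch(src_file, words_dict_file, max_word_len,
                                     min_occurrence, dst_file):
    kept_words = set()
    with open(words_dict_file, encoding='utf-8') as f:
        for line in f:
            word, cnt = line.rstrip('\n').split('\t')
            if int(cnt) >= min_occurrence:
                kept_words.add(word.lower())

    with open(src_file, encoding='utf-8') as fin, \
            open(dst_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            tokens = [w.lower() for w in line.split() if len(w) <= max_word_len]
            fout.write(' '.join(w for w in tokens if w in kept_words) + '\n')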


def tac_el_job_14train():
    docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\training\source_documents'
    line_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_text.txt'
    docs_list_file = 'e:/dc/el/tac/tac_2014_train_docs_list.txt'
    # gen_line_docs_file_tac(docs_dir, line_docs_file, docs_list_file)

    tokenized_line_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_text_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = 'e:/dc/el/tac/tac_2014_train_docs_text_tokenized_lc.txt'
    # As in gen_lowercase_token_file_wiki, pass 1 as the min-occurrence count
    # to match the signature used at the other call sites.
    textutils.gen_lowercase_token_file(tokenized_line_docs_file, proper_word_cnts_dict_file,
                                       max_word_len, 1, tokenized_line_docs_lc_file)

    # Defined but not yet used: the bag-of-words step (cf. __gen_tac_dw) is
    # not invoked in this job.
    bow_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_bow.bin'
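

# Hypothetical entry point showing how these one-off preprocessing jobs are
# typically run: uncomment the job that matches the dataset being prepared.
if __name__ == '__main__':
    # __gen_tac_dw()
    # __gen_lowercase_token_file_nyt()
    # gen_lowercase_token_file_wiki()
    # gen_lowercase_token_file_nyt()
    tac_el_job_14train()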