def main():
    """Extract text from every file under ``./test`` and pickle it to disk.

    Each file path listed by ``Preprocessing.lookup_directory`` is passed to
    ``extract_text``; the returned object is pickled to a new file whose path
    is the source path with its last four characters removed.
    """
    preprocessing = Preprocessing()
    root_path = './test'

    # Hierarchy of the tree before any pickles are written; the empty dict is
    # the accumulator argument that lookup_directory expects.
    dir_hierarchy = preprocessing.lookup_directory(root_path, {})

    # Flatten {directory: [file, ...]} into a single list of file paths.
    file_list = [
        file_path
        for tar_dir in dir_hierarchy
        for file_path in dir_hierarchy[tar_dir]
    ]

    for file_path in tqdm(file_list):
        text = extract_text(file_path)
        # NOTE(review): the slice presumably strips a dot plus a three-letter
        # extension (e.g. ".pdf") - confirm that no other suffix lengths occur.
        new_path = file_path[:-4]
        with open(new_path, 'wb') as f:
            pickle.dump(text, f)
def prepare_env(root_path: str, cached_tokens=None, verbose=False):
    """Build a DTM, vocabulary and synonym dictionary per preprocessing strategy.

    Every strategy in the internal table looks up the files under
    *root_path*, tokenizes them (reusing cached tokens where available),
    builds a vocabulary and then a DTM from the tokenized documents.

    Args:
        root_path: Directory handed to each strategy's ``lookup_directory``.
        cached_tokens: Optional mapping ``{strategy name: {file path: tokens}}``
            from an earlier call. It is updated in place and returned as the
            last element of the result. A plain ``dict`` works as well as a
            ``defaultdict``, and the cache may be partially filled: files
            missing for a strategy are read and tokenized on demand.
        verbose: When true, print the time taken by each step. Also controls
            the ``DROPFILE_LOGLEVEL`` environment variable ("1" or "0").

    Returns:
        A tuple ``(DTM_dict, vocab_dict, synonym_dict_dict, tokens_dict)``.
        The first three are keyed by strategy name; ``tokens_dict`` is the
        (possibly newly created) token cache described above.
    """
    os.environ['DROPFILE_LOGLEVEL'] = "1" if verbose else "0"

    # Strategies are instantiated in the same order in which they are run.
    normalpreprocessing = Preprocessing()
    preprocessing_dict = {
        "Preprocessing": normalpreprocessing,
        "DependencyStructurePreprocessing": DependencyStructurePreprocessing(),
        "NounPhrasePreprocessing": NounPhrasePreprocessing(),
        "NounPreprocessing": NounPreprocessing(),
        "SpacyPreprocessing": SpacyPreprocessing(),
        "TargetWordChunkingPreprocessing": TargetWordChunkingPreprocessing(),
        "CFGPreprocessing": CFGPreprocessing(),
    }

    DTM_dict = dict()
    vocab_dict = dict()
    synonym_dict_dict = dict()

    # The first timing covers both the directory lookup and the text loading.
    start = time.time()
    dir_hierarchy = normalpreprocessing.lookup_directory(
        root_path, defaultdict(list))
    file_list = [
        path
        for tar_dir in dir_hierarchy
        for path in dir_hierarchy[tar_dir]
    ]

    tokens_dict = defaultdict(dict) if cached_tokens is None else cached_tokens

    # Read raw text up front only for files the base strategy has no tokens
    # for; setdefault keeps this working when the cache is a plain dict.
    doc_dict = dict()
    for path in file_list:
        if path not in tokens_dict.setdefault("Preprocessing", {}):
            doc_dict[path] = normalpreprocessing.file2text(path)
    if verbose:
        print(f"file2text takes {time.time() - start:.4f} s.")

    for name, preprocessing in preprocessing_dict.items():
        if verbose:
            print(f"{name} started")

        # preprocessing : lookup hierarchy of root path
        start = time.time()
        dir_hierarchy = preprocessing.lookup_directory(
            root_path, defaultdict(list))
        if verbose:
            print(f"{name}.lookup_directory takes {time.time()-start:.4f} s.")
        file_list = [
            path
            for tar_dir in dir_hierarchy
            for path in dir_hierarchy[tar_dir]
        ]

        # preprocessing : tokenize every file, reusing cached tokens
        doc_list = list()
        start = time.time()
        for path in file_list:
            strategy_cache = tokens_dict.setdefault(name, {})
            if path in strategy_cache:
                tokens = strategy_cache[path]
            else:
                # The text may not have been loaded above if the file was
                # cached for "Preprocessing" but not for this strategy.
                if path not in doc_dict:
                    doc_dict[path] = normalpreprocessing.file2text(path)
                tokens = preprocessing.text2tok(doc_dict[path])
                strategy_cache[path] = tokens
            doc_list.append(tokens)
        if verbose:
            print(f"{name}.text2tok takes {time.time()-start:.4f} s.")

        # preprocessing : build vocabulary from the tokenized documents
        start = time.time()
        vocab, synonym_dict = preprocessing.build_vocab(doc_list)
        if verbose:
            print(f"{name}.build_vocab takes {time.time()-start:.4f} s.")

        # preprocessing : build DTM of files under root_path
        start = time.time()
        DTM = preprocessing.build_DTM(doc_list, vocab, synonym_dict)
        if verbose:
            print(f"{name}.build_DTM takes {time.time()-start:.4f} s.")

        DTM_dict[name] = DTM
        vocab_dict[name] = vocab
        synonym_dict_dict[name] = synonym_dict

    return DTM_dict, vocab_dict, synonym_dict_dict, tokens_dict