Example #1
0
def main():
    """Extract the text of every file under ``./test`` and pickle it.

    The directory tree is listed through ``Preprocessing.lookup_directory``,
    the text of each listed file is obtained with ``extract_text``, and the
    result is pickled to a sibling path made by dropping the last four
    characters of the original path.
    """
    preprocessing = Preprocessing()
    root_path = './test'  # before pickle hierarchy
    directory_dict = {}
    dir_hierarchy = preprocessing.lookup_directory(root_path, directory_dict)

    # Flatten the per-directory listings into a single list of file paths.
    file_list = [
        file_path
        for tar_dir in dir_hierarchy
        for file_path in dir_hierarchy[tar_dir]
    ]

    for file_path in tqdm(file_list):
        text = extract_text(file_path)
        # NOTE(review): slicing off four characters presumably removes a
        # three-letter extension such as ".pdf" -- confirm the input files
        # always carry one, otherwise the output name is truncated.
        new_path = file_path[:-4]
        with open(new_path, 'wb') as f:
            pickle.dump(text, f)
Example #2
0
def prepare_env(root_path: str, cached_tokens=None, verbose=False):
    """Build vocabularies and document-term matrices for every preprocessor.

    For each preprocessing strategy, the files listed under ``root_path`` are
    tokenized (reusing ``cached_tokens`` where an entry exists), then a
    vocabulary, a synonym dictionary and a DTM are built from those tokens.

    Side effects: sets the ``DROPFILE_LOGLEVEL`` environment variable to
    ``"1"`` when ``verbose`` is true and ``"0"`` otherwise, and prints timing
    information when ``verbose`` is true.

    Args:
        root_path: Directory handed to each preprocessor's
            ``lookup_directory``.
        cached_tokens: Optional mapping ``{preprocessor name: {file: tokens}}``
            of previously computed tokens. It is updated in place with any
            newly computed tokens and returned as the fourth result. A plain
            ``dict`` is accepted as well as a ``defaultdict``.
        verbose: When true, print how long each stage takes.

    Returns:
        A tuple ``(DTM_dict, vocab_dict, synonym_dict_dict, tokens_dict)``.
        The first three are keyed by preprocessor name; ``tokens_dict`` is
        keyed by preprocessor name and then by file.
    """
    os.environ['DROPFILE_LOGLEVEL'] = "1" if verbose else "0"

    normalpreprocessing = Preprocessing()
    preprocessing_dict = {
        "Preprocessing": normalpreprocessing,
        "DependencyStructurePreprocessing":
            DependencyStructurePreprocessing(),
        "NounPhrasePreprocessing": NounPhrasePreprocessing(),
        "NounPreprocessing": NounPreprocessing(),
        "SpacyPreprocessing": SpacyPreprocessing(),
        "TargetWordChunkingPreprocessing": TargetWordChunkingPreprocessing(),
        "CFGPreprocessing": CFGPreprocessing(),
    }

    DTM_dict = dict()
    vocab_dict = dict()
    synonym_dict_dict = dict()

    start = time.time()
    # lookup_directory expects an (empty) dictionary to work with.
    dir_hierarchy = normalpreprocessing.lookup_directory(
        root_path, defaultdict(list))
    file_list = [
        file
        for tar_dir in dir_hierarchy
        for file in dir_hierarchy[tar_dir]
    ]

    if cached_tokens is None:
        tokens_dict = defaultdict(dict)
    else:
        tokens_dict = cached_tokens

    # Eagerly read the text of files that have no cached "Preprocessing"
    # tokens. `.get` keeps this working when the cache is a plain dict that
    # has no "Preprocessing" entry yet.
    doc_dict = dict()
    base_cache = tokens_dict.get("Preprocessing", {})
    for file in file_list:
        if file not in base_cache:
            doc_dict[file] = normalpreprocessing.file2text(file)
    if verbose:
        print(f"file2text takes {time.time() - start:.4f} s.")

    for name, preprocessing in preprocessing_dict.items():
        if verbose:
            print(f"{name} started")

        # preprocessing : lookup hierarchy of root path
        start = time.time()
        dir_hierarchy = preprocessing.lookup_directory(
            root_path, defaultdict(list))
        if verbose:
            print(f"{name}.lookup_directory takes {time.time()-start:.4f} s.")

        file_list = [
            file
            for tar_dir in dir_hierarchy
            for file in dir_hierarchy[tar_dir]
        ]

        # preprocessing : tokenize every file, reusing cached tokens
        doc_list = list()
        start = time.time()
        for file in file_list:
            # setdefault (rather than indexing) so a plain-dict cache that
            # lacks this preprocessor's entry does not raise KeyError.
            name_cache = tokens_dict.setdefault(name, {})
            if file in name_cache:
                tokens = name_cache[file]
            else:
                # The text may not have been read above: the file can be
                # cached for "Preprocessing" yet missing for this
                # preprocessor. Load it on demand instead of failing.
                if file not in doc_dict:
                    doc_dict[file] = normalpreprocessing.file2text(file)
                tokens = preprocessing.text2tok(doc_dict[file])
                name_cache[file] = tokens
            doc_list.append(tokens)

        if verbose:
            print(f"{name}.text2tok takes {time.time()-start:.4f} s.")

        # preprocessing : build vocabulary from the tokenized documents
        start = time.time()
        vocab, synonym_dict = preprocessing.build_vocab(doc_list)
        if verbose:
            print(f"{name}.build_vocab takes {time.time()-start:.4f} s.")

        # preprocessing : build DTM of files under root_path
        start = time.time()
        DTM = preprocessing.build_DTM(doc_list, vocab, synonym_dict)
        if verbose:
            print(f"{name}.build_DTM takes {time.time()-start:.4f} s.")

        DTM_dict[name] = DTM
        vocab_dict[name] = vocab
        synonym_dict_dict[name] = synonym_dict

    return DTM_dict, vocab_dict, synonym_dict_dict, tokens_dict