import csv
import pathlib
import shutil
import time
import uuid

import nltk
import progressbar as pb

# `u` is assumed to be the repo's utility module (assert_folder_is_readable,
# is_corpus_document, write_document, ...); its import lives elsewhere.


def nest_corpus(path_in: pathlib.Path, path_out: pathlib.Path, control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Re-nesting Corpus # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        with control.open('r', encoding='utf-8', newline='') as control_file:
            reader = csv.DictReader(control_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            for row in reader:
                bar.update(i)
                i += 1
                # restore each flat file to the nested location recorded in the control CSV
                source_path = path_in.joinpath(row['filename'])
                dest_path = path_out.joinpath(row['relative path'])
                dest_path.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy(source_path, dest_path)
def unnest_corpus(path_in: pathlib.Path, path_out: pathlib.Path, control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Unnesting Corpus # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        with control.open('w', encoding='utf-8', newline='') as control_file:
            writer = csv.writer(control_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(['filename', 'relative path'])
            for file_name in path_in.rglob('*'):
                if u.is_corpus_document(file_name):
                    bar.update(i)
                    i += 1
                    # flatten: copy the document out under a fresh UUID-based name
                    # and record where it came from so nest_corpus can restore it
                    dest_path = path_out.joinpath(f'{uuid.uuid4()}{file_name.suffix}')
                    shutil.copy(file_name, dest_path)
                    relative_path = file_name.relative_to(path_in)
                    writer.writerow([dest_path.name, str(relative_path)])
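# Usage sketch (hypothetical paths; not part of the pipeline): unnest_corpus
# flattens a nested corpus into one folder of UUID-named files and records the
# name-to-path mapping in a control CSV; nest_corpus replays that CSV to
# restore the original folder layout.
def _example_round_trip() -> None:
    control = pathlib.Path('./state/control.csv')
    unnest_corpus(pathlib.Path('./corpus/nested'), pathlib.Path('./corpus/flat'), control)
    # ... run flat-corpus processing steps here ...
    nest_corpus(pathlib.Path('./corpus/flat'), pathlib.Path('./corpus/restored'), control)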
def documents_to_corpus(path_in: pathlib.Path, path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Formatting Document # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i += 1
                sentences = __tokenize_document(file_name)
                u.write_document(path_out, file_name, sentences)
def corpus_to_reading_level(path_in: pathlib.Path, path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    start = time.time()
    __clean_grade_level_files(path_out)
    doc_cnt = 0
    for file_name in path_in.iterdir():
        if u.is_corpus_document(file_name):
            print(file_name.stem)
            sentences = __document_to_sentences(file_name)
            __save_new_documents(path_out, sentences)
            doc_cnt += 1
    seconds = time.time() - start
    if doc_cnt > 0:
        print(f'Processed {doc_cnt:,} documents in {int(seconds):,} seconds ({int(seconds / doc_cnt):,} seconds/document)')
    else:
        print(f'No text documents found in {path_in}')
def normalize_corpus_by_truncation(path_in: pathlib.Path, path_out: pathlib.Path, max_length: int) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    widgets = [
        'Normalize Document # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i += 1
                sentences = __normalize_document_by_truncation(file_name, max_length)
                u.write_document(path_out, file_name, sentences)
def remove_stopwords_from_corpus(path_in: pathlib.Path, path_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    i = 1
    stopwords = set(nltk.corpus.stopwords.words('english'))
    widgets = [
        'Pre-Processing Document # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i += 1
                sentences = __remove_stopwords_from_document(file_name, stopwords)
                u.write_document(path_out, file_name, sentences)
def vectorize_corpus(path_in: pathlib.Path, path_out: pathlib.Path, path_control: pathlib.Path) -> None:
    u.assert_folder_is_readable(path_in)
    u.assert_folder_is_writable(path_out)
    # first pass: collect the token vocabulary across the whole corpus
    i = 1
    widgets = [
        'Collecting Tokens # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    tokens = dict()
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i += 1
                __append_tokens(tokens, file_name)
    token_map = __map_tokens(tokens)
    __save_token_file(tokens, token_map, path_control)
    # second pass: encode each document against the saved token map
    i = 1
    widgets = [
        'Vectorizing Document # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')]
    with pb.ProgressBar(widgets=widgets) as bar:
        for file_name in path_in.iterdir():
            if u.is_corpus_document(file_name):
                bar.update(i)
                i += 1
                vector = __vectorise_document(file_name, token_map)
                u.write_document(path_out, file_name, vector)
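# Usage sketch (hypothetical paths and a hypothetical max_length; not part of
# the pipeline): one plausible preprocessing chain runs raw documents through
# sentence formatting, stopword removal, and truncation before vectorizing
# against the saved token map.
def _example_preprocess() -> None:
    documents_to_corpus(pathlib.Path('./raw'), pathlib.Path('./corpus'))
    remove_stopwords_from_corpus(pathlib.Path('./corpus'), pathlib.Path('./corpus_no_stop'))
    normalize_corpus_by_truncation(pathlib.Path('./corpus_no_stop'), pathlib.Path('./corpus_norm'), max_length=512)
    vectorize_corpus(pathlib.Path('./corpus_norm'), pathlib.Path('./vectors'), pathlib.Path('./state/tokens.csv'))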
def tokenize_corpus(corpus_in: pathlib.Path, corpus_out: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(corpus_out)
    pass  # TODO: not yet implemented


def extract_fold(corpus_in: pathlib.Path, corpus_out: pathlib.Path, subset: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.assert_file_is_readable(subset)
    u.ensure_folder_is_writable(corpus_out)
    pass  # TODO: not yet implemented


def validate_model(corpus_in: pathlib.Path, state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./results'))
    pass  # TODO: not yet implemented


def create_corpus_folds(corpus_in: pathlib.Path, state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./sub'))
    pass  # TODO: not yet implemented


def train_model(corpus_in: pathlib.Path, state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(corpus_in)
    u.ensure_folder_is_writable(state_folder.joinpath('./weights'))
    pass  # TODO: not yet implemented


def hyper_tune_model(train_in: pathlib.Path, test_in: pathlib.Path, state_folder: pathlib.Path) -> None:
    u.assert_folder_is_readable(train_in)
    u.assert_folder_is_readable(test_in)
    u.ensure_folder_is_writable(state_folder)
    pass  # TODO: not yet implemented
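# Usage sketch (hypothetical paths; the stubs above are unimplemented): the
# signatures suggest a fold-based workflow of splitting the corpus, extracting
# a train/test fold from a subset file, tuning and training, then validating
# against the state folder.
def _example_fold_workflow() -> None:
    corpus = pathlib.Path('./vectors')
    state = pathlib.Path('./state')
    create_corpus_folds(corpus, state)
    extract_fold(corpus, pathlib.Path('./fold_1/train'), state.joinpath('./sub/train_1.csv'))
    extract_fold(corpus, pathlib.Path('./fold_1/test'), state.joinpath('./sub/test_1.csv'))
    hyper_tune_model(pathlib.Path('./fold_1/train'), pathlib.Path('./fold_1/test'), state)
    train_model(pathlib.Path('./fold_1/train'), state)
    validate_model(pathlib.Path('./fold_1/test'), state)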