def build_coordinate_matrix_incidence(working_dir, file_extension) -> tuple:
    """
    Build the incidence, frequency and position matrices for the files found
    in the working directory, together with the tokens they are built against.

    :param working_dir: directory passed to the file-listing and corpora helpers
    :type working_dir: str
    :param file_extension: extension passed to the file-listing and corpora helpers
    :type file_extension: str
    :return: ``(incidence_matrix, frequency_matrix, position_matrix, tokens)``
    :rtype: tuple
    """
    paths = dir_files_by_extension(
        working_dir=working_dir, file_extension=file_extension
    )

    # Tokens come from the corpora extracted from the directory as a whole.
    directory_corpora = extract_corpora_from_dir(
        working_dir=working_dir, file_extension=file_extension
    )
    tokens = tokenize_corpora_into_dict(pipeline_input(input=directory_corpora))

    # One corpora entry per file, in the order the listing helper returned them.
    per_file_corporas = [
        extract_corpora_from_file(
            filepath=path, file_index=index, file_extension=file_extension
        )
        for index, path in enumerate(paths)
    ]

    # All three builders take exactly the same keyword arguments.
    builder_kwargs = {"files_corporas": per_file_corporas, "tokens": tokens}
    return (
        build_coordinate_matrix_incidence_ndarray(**builder_kwargs),
        build_coordinate_matrix_frequency_ndarray(**builder_kwargs),
        build_coordinate_matrix_positions_ndarray(**builder_kwargs),
        tokens,
    )
def build_singly_matrix_incidence_df(
    working_dir: str,
    file_extension: str,
) -> pd.DataFrame:
    """
    Build the incidence matrix for the files in the working directory and
    return it as a pandas data frame.

    The matrix returned by ``build_singly_matrix_incidence`` is transposed, so
    the resulting frame has one row per token and one column per file path.

    :param working_dir: directory passed to the file-listing and matrix helpers
    :type working_dir: str
    :param file_extension: extension passed to the file-listing and matrix helpers
    :type file_extension: str
    :return: data frame indexed by token, with the file paths as column labels
    :rtype: pd.DataFrame
    """
    filepath_list = dir_files_by_extension(
        working_dir=working_dir, file_extension=file_extension
    )
    incidence_matrix, tokens = build_singly_matrix_incidence(
        working_dir=working_dir, file_extension=file_extension
    )

    print("Converting the generated numpy matrix into pandas data frame")
    # Label the columns with the file paths directly; the matrix carries no
    # labels of its own, so no placeholder integer columns are needed first.
    df = pd.DataFrame(
        data=incidence_matrix.T,
        index=tokens,
        columns=filepath_list,
    )
    print("Data frame was generated successfully!")
    return df
def _coordinate_matrix_to_df(matrix, tokens, filepath_list, kind):
    """Transpose one matrix into a data frame with token rows and file-path columns."""
    print(f"Converting the generated numpy {kind} matrix into pandas data frame")
    # The matrix carries no labels of its own, so the file paths can be given
    # as column labels straight away.
    df = pd.DataFrame(data=matrix.T, index=tokens, columns=filepath_list)
    print(f"{kind.capitalize()} data frame was generated successfully!")
    return df


def build_coordinate_matrices_df(
    working_dir: str,
    file_extension: str,
) -> tuple:
    """
    Build the incidence, frequency and position matrices for the files in the
    working directory and return each one as a pandas data frame.

    Every frame has one row per token and one column per file path.

    :param working_dir: directory passed to the file-listing and matrix helpers
    :type working_dir: str
    :param file_extension: extension passed to the file-listing and matrix helpers
    :type file_extension: str
    :return: ``(df_incidence, df_frequency, df_position)``
    :rtype: tuple
    """
    filepath_list = dir_files_by_extension(
        working_dir=working_dir, file_extension=file_extension
    )
    (
        incidence_matrix,
        frequency_matrix,
        position_matrix,
        tokens,
    ) = build_coordinate_matrix_incidence(
        working_dir=working_dir, file_extension=file_extension
    )

    # Converted in this order so the progress messages appear as before.
    df_incidence = _coordinate_matrix_to_df(
        incidence_matrix, tokens, filepath_list, "incidence"
    )
    df_frequency = _coordinate_matrix_to_df(
        frequency_matrix, tokens, filepath_list, "frequency"
    )
    df_position = _coordinate_matrix_to_df(
        position_matrix, tokens, filepath_list, "position"
    )
    return df_incidence, df_frequency, df_position