コード例 #1
0
def build_coordinate_matrix_incidence(working_dir, file_extension) -> tuple:
    """
    Build the incidence, frequency and position matrices for a directory.

    The token collection is produced from the corpora of the whole directory
    (after it is passed through ``pipeline_input``), while every path returned
    by ``dir_files_by_extension`` is additionally extracted on its own, so the
    three matrix builders receive one corpora entry per file.

    :type working_dir: str
    :type file_extension: str
    :return: ``(incidence_matrix, frequency_matrix, position_matrix, tokens)``
    :rtype: tuple
    """
    paths = dir_files_by_extension(working_dir=working_dir,
                                   file_extension=file_extension)

    # Tokens are derived from the directory-wide corpora, not per file.
    directory_corpora = pipeline_input(
        input=extract_corpora_from_dir(working_dir=working_dir,
                                       file_extension=file_extension))
    tokens = tokenize_corpora_into_dict(directory_corpora)

    # One corpora entry per file, in the same order as ``paths``.
    per_file_corporas = [
        extract_corpora_from_file(filepath=path,
                                  file_index=index,
                                  file_extension=file_extension)
        for index, path in enumerate(paths)
    ]

    # All three builders take exactly the same inputs.
    builder_kwargs = {"files_corporas": per_file_corporas, "tokens": tokens}
    incidence = build_coordinate_matrix_incidence_ndarray(**builder_kwargs)
    frequency = build_coordinate_matrix_frequency_ndarray(**builder_kwargs)
    position = build_coordinate_matrix_positions_ndarray(**builder_kwargs)

    return incidence, frequency, position, tokens
コード例 #2
0
ファイル: __init__.py プロジェクト: danorel/CD-Inverted-Index
def build_singly_matrix_incidence_df(
        working_dir,
        file_extension,
) -> pd.DataFrame:
    """
    Build the incidence matrix of a directory as a pandas data frame.

    The matrix returned by ``build_singly_matrix_incidence`` is transposed
    before conversion, so the resulting frame has one row per token and one
    column per file path returned by ``dir_files_by_extension``.

    :type working_dir: str
    :type file_extension: str
    :return: data frame indexed by tokens with the file paths as columns
    :rtype: pd.DataFrame
    """
    filepath_list = dir_files_by_extension(
        working_dir=working_dir,
        file_extension=file_extension
    )

    incidence_matrix, tokens = build_singly_matrix_incidence(
        working_dir=working_dir,
        file_extension=file_extension
    )

    print("Converting the generated numpy matrix into pandas data frame")
    # Label the columns with the file paths straight away instead of creating
    # placeholder integer columns (via a direct ``__len__()`` call) and then
    # overwriting them.
    df = pd.DataFrame(
        data=incidence_matrix.T,
        index=tokens,
        columns=filepath_list
    )
    print("Data frame was generated successfully!")
    return df
コード例 #3
0
def _coordinate_matrix_to_df(matrix, tokens, filepath_list) -> pd.DataFrame:
    """
    Convert one matrix into a data frame with tokens as rows and files as columns.

    The matrix is transposed before conversion; the columns are labelled with
    the file paths directly.
    """
    return pd.DataFrame(data=matrix.T, index=tokens, columns=filepath_list)


def build_coordinate_matrices_df(
    working_dir,
    file_extension,
) -> tuple:
    """
    Build the incidence, frequency and position matrices as pandas data frames.

    The three matrices and the tokens come from
    ``build_coordinate_matrix_incidence``; each matrix is transposed and wrapped
    in a data frame whose rows are labelled by the tokens and whose columns are
    the file paths returned by ``dir_files_by_extension``.

    :type working_dir: str
    :type file_extension: str
    :return: ``(df_incidence, df_frequency, df_position)``
    :rtype: tuple
    """
    filepath_list = dir_files_by_extension(working_dir=working_dir,
                                           file_extension=file_extension)

    incidence_matrix, frequency_matrix, position_matrix, tokens = build_coordinate_matrix_incidence(
        working_dir=working_dir, file_extension=file_extension)

    # The conversion used to be copy-pasted three times; a single helper is
    # used instead, while the progress messages and their order are unchanged.
    print(
        "Converting the generated numpy incidence matrix into pandas data frame"
    )
    df_incidence = _coordinate_matrix_to_df(incidence_matrix, tokens,
                                            filepath_list)
    print("Incidence data frame was generated successfully!")

    print(
        "Converting the generated numpy frequency matrix into pandas data frame"
    )
    df_frequency = _coordinate_matrix_to_df(frequency_matrix, tokens,
                                            filepath_list)
    print("Frequency data frame was generated successfully!")

    print(
        "Converting the generated numpy position matrix into pandas data frame"
    )
    df_position = _coordinate_matrix_to_df(position_matrix, tokens,
                                           filepath_list)
    print("Position data frame was generated successfully!")

    return df_incidence, df_frequency, df_position