Example #1
0
def build_coordinate_matrix_incidence(working_dir, file_extension) -> tuple:
    """
    Build the incidence, frequency and position matrices relating the files
    of the working directory to their tokens.

    :type working_dir: str
    :type file_extension: str
    :return: ``(incidence_matrix, frequency_matrix, position_matrix, tokens)``
    :rtype: tuple
    """
    paths = dir_files_by_extension(working_dir=working_dir,
                                   file_extension=file_extension)

    # Token collection computed over the corpora of the whole directory.
    directory_corpora = extract_corpora_from_dir(
        working_dir=working_dir, file_extension=file_extension)
    tokens = tokenize_corpora_into_dict(
        pipeline_input(input=directory_corpora))

    # One corpora per file, kept in the same order as the listed paths.
    files_corporas = [
        extract_corpora_from_file(filepath=path,
                                  file_index=index,
                                  file_extension=file_extension)
        for index, path in enumerate(paths)
    ]

    # The three builders take exactly the same keyword arguments.
    builder_kwargs = {"files_corporas": files_corporas, "tokens": tokens}
    incidence = build_coordinate_matrix_incidence_ndarray(**builder_kwargs)
    frequency = build_coordinate_matrix_frequency_ndarray(**builder_kwargs)
    positions = build_coordinate_matrix_positions_ndarray(**builder_kwargs)
    return incidence, frequency, positions, tokens
Example #2
0
def build_coordinate_matrix_frequency_ndarray(files_corporas,
                                              tokens) -> numpy.ndarray:
    """
    Build the file-by-token frequency matrix.

    Row ``i`` corresponds to ``files_corporas[i]`` and column ``j`` to the
    ``j``-th item obtained when iterating ``tokens``. Each cell stores the
    value returned by ``calculate_token_freq`` for the tokens of that file
    and that token. Progress messages and the matrix (before and after
    filling) are printed to stdout.

    :type files_corporas: list
    :type tokens: list
    :return: matrix of shape ``(len(files_corporas), len(tokens))``
    :rtype: numpy.ndarray
    """
    print("Initialization of the frequency matrix...")
    frequency_matrix = numpy.zeros((len(files_corporas), len(tokens)))
    print(frequency_matrix)
    print(f"Matrix shape: {frequency_matrix.shape}")

    for file_index, file_corpora in enumerate(files_corporas):
        # Pre-process and tokenize each file once, outside the token loop.
        file_output = pipeline_input(file_corpora)
        file_tokens = tokenize_corpora(file_output)
        for token_index, token in enumerate(tokens):
            # A single 2-D index writes the cell directly instead of going
            # through a temporary row view.
            frequency_matrix[file_index, token_index] = calculate_token_freq(
                file_tokens=file_tokens, token=token)
    print("Processing ended successfully!")

    print(frequency_matrix)
    return frequency_matrix
Example #3
0
def build_coordinate_matrix_incidence_ndarray(files_corporas,
                                              tokens) -> numpy.ndarray:
    """
    Build the file-by-token incidence matrix.

    Row ``i`` corresponds to ``files_corporas[i]`` and column ``j`` to the
    ``j``-th item obtained when iterating ``tokens``. A cell is ``1.0`` when
    the token is contained in the result of ``tokenize_corpora_into_dict``
    for that file, and ``0.0`` otherwise. Progress messages and the matrix
    (before and after filling) are printed to stdout.

    :type files_corporas: list
    :type tokens: list
    :return: matrix of shape ``(len(files_corporas), len(tokens))``
    :rtype: numpy.ndarray
    """
    print("Initialization of the incidence matrix...")
    incidence_matrix = numpy.zeros((len(files_corporas), len(tokens)))
    print(incidence_matrix)
    print(f"Matrix shape: {incidence_matrix.shape}")

    print("Processing the matrix...")
    # Calculate the token-document existence
    for file_index, file_corpora in enumerate(files_corporas):
        # Pre-process and tokenize each file once, outside the token loop.
        file_output = pipeline_input(file_corpora)
        file_dict = tokenize_corpora_into_dict(file_output)
        for token_index, token in enumerate(tokens):
            # The boolean membership result is stored as 0.0 / 1.0 because
            # the matrix is a float array.
            incidence_matrix[file_index, token_index] = token in file_dict
    print("Processing ended successfully!")

    print(incidence_matrix)
    return incidence_matrix
Example #4
0
def build_doubly_matrix_incidence_ndarray(files_corporas,
                                          tokens) -> numpy.ndarray:
    """
    Build the file-by-token incidence matrix against each file's 2-grams.

    Row ``i`` corresponds to ``files_corporas[i]`` and column ``j`` to the
    ``j``-th item obtained when iterating ``tokens``. A cell is ``1.0`` when
    the token is contained in ``ngram_tokens(file_tokens, 2)`` for that file,
    and ``0.0`` otherwise. Progress messages and the matrix (before and
    after filling) are printed to stdout.

    :type files_corporas: list
    :type tokens: list
    :return: matrix of shape ``(len(files_corporas), len(tokens))``
    :rtype: numpy.ndarray
    """
    print("Initialization of the matrix...")
    incidence_matrix = numpy.zeros((len(files_corporas), len(tokens)))
    print(incidence_matrix)
    print(f"Matrix shape: {incidence_matrix.shape}")

    print("Processing the matrix...")
    for file_index, file_corpora in enumerate(files_corporas):
        file_output = pipeline_input(file_corpora)
        # NOTE(review): the n-grams are built from the output of
        # tokenize_corpora_into_dict; if that is a plain dict, token order
        # and duplicates may already be lost here - confirm this is intended.
        file_tokens = tokenize_corpora_into_dict(file_output)
        file_bigrams = ngram_tokens(file_tokens, 2)
        for token_index, token in enumerate(tokens):
            # presumably each entry of ``tokens`` has the same form as the
            # items produced by ngram_tokens - verify against the caller.
            incidence_matrix[file_index, token_index] = token in file_bigrams
    print("Processing ended successfully!")

    print(incidence_matrix)
    return incidence_matrix