def test_normalize_4():
    """Check that normalize raises when given 4 as its second argument.

    The fixture has only four columns (positions 0-3), a missing value in
    "col 0" and an empty string in "col 2".  Which of these triggers the
    error is not visible from this test -- presumably the out-of-range
    position; confirm against reformat_df.normalize.
    """
    # Four-column fixture built directly from a dict of column values
    frame = pd.DataFrame(
        data={
            "col 0": [1, 2, None],
            "col 1": [5, 20, 15],
            "col 2": ["", "tg", "ctatg"],
            "col 3": [100, 200, 300],
        }
    )

    # Any exception type satisfies the test
    with pytest.raises(Exception):
        reformat_df.normalize(frame, 4, [0, 1, 3])
def count_file(input_file,
               output_file,
               input_header=None,
               output_header=False,
               sep='\t'):
    """
    Count k-mers for a table of base pairs read from a file and save the
    normalized counts to another file.

    :param input_file: File containing data table. Columns in file should
    correspond to the variables chr_col, start_col, end_col, label_col,
    pairs_col (names defined outside this function).
    :param output_file: File to write table with updated kmer counts.
    :param input_header: Row of header in input_file (default=None)
    :param output_header: Whether to write header in output_file
    (default=False)
    :param sep: Separator for table in output_file (default is a tab)
    :return: None
    """
    # Load the table, then give the known columns readable names
    raw_df = pd.read_table(input_file, header=input_header)
    readable_names = {
        chr_col: "chr",
        start_col: "start",
        end_col: "end",
        label_col: "label",
        pairs_col: "pairs"
    }
    named_df = raw_df.rename(columns=readable_names)

    # Every k-mer gets its own column, appended after the existing ones and
    # initialised to zero
    all_kmers = generate_kmers()
    widened_columns = named_df.columns.tolist() + all_kmers
    zeroed_df = named_df.reindex(columns=widened_columns, fill_value=0)

    # Fill in the k-mer counts
    features_df = count_kmers(features_df=zeroed_df)

    # Integer positions of the k-mer columns, in the order of all_kmers
    kmer_positions = [
        features_df.columns.get_loc(kmer) for kmer in all_kmers
    ]

    # Normalize the counts by the length of the base pair sequences.  The
    # return value of normalize is not used; features_df is written out as
    # it stands after this call.
    normalize(features_df,
              lengths_col=pairs_col,
              normalize_cols=kmer_positions)

    # Write the result table
    features_df.to_csv(output_file,
                       header=output_header,
                       sep=sep,
                       index=False)
def test_normalize_3():
    """Check normalize on a single column of a frame with no missing values.

    The expected "col 1" values are the originals divided by the length of
    the string in "col 2" on the same row (5/1, 20/2, 15/5); every other
    column is expected to stay as it was.
    """
    # Column values shared by the input and the expected result
    columns = {
        "col 0": [1, 2, 3],
        "col 1": [5, 20, 15],
        "col 2": ["a", "tg", "ctatg"],
        "col 3": [100, 200, 300],
    }
    actual = pd.DataFrame(data=columns)

    # Same columns in the same order, with only "col 1" replaced
    expected = pd.DataFrame(data={**columns, "col 1": [5.0, 10.0, 3.0]})

    # normalize is expected to modify the frame it is given
    reformat_df.normalize(actual, 2, [1])
    assert actual.equals(expected)
def test_normalize_1():
    """Check normalize on several columns, one of which has a missing value.

    Columns 0, 1 and 3 are expected to end up divided by the length of the
    string in "col 2" on the same row, with the missing entry in "col 0"
    still missing afterwards.
    """
    # Input frame; "col 0" contains a missing value
    actual = pd.DataFrame(
        data={
            "col 0": [1, 2, None],
            "col 1": [5, 20, 15],
            "col 2": ["a", "tg", "ctatg"],
            "col 3": [100, 200, 300],
        }
    )

    # Expected frame after dividing by string lengths 1, 2 and 5
    expected = pd.DataFrame(
        data={
            "col 0": [1.0, 1.0, None],
            "col 1": [5.0, 10.0, 3.0],
            "col 2": ["a", "tg", "ctatg"],
            "col 3": [100.0, 100.0, 60.0],
        }
    )

    # normalize is expected to modify the frame it is given
    reformat_df.normalize(actual, 2, [0, 1, 3])
    assert actual.equals(expected)
Exemple #5
0
    # Combine data frames      
    return pd.concat(frames_list, axis = axis) 


# In[ ]:


if __name__ == "__main__":
    # Script entry point: combine the shuffled feature matrices, normalize
    # their count columns and write the result as one headerless TSV file.

    # Directory the files are in
    directory = "/dors/capra_lab/users/yand1/te_ml/data/2018_07_11_pca_te_enhancers/batch_output/"

    # Files with shuffled human genome data to combine, numbered 50 to 99
    file_list = [
        directory + "shuffle_{}_features_matrix.tsv".format(file_number)
        for file_number in range(50, 100)
    ]

    # Get combined data frame
    print("Combining files...")
    combined_df = combine(file_list)

    # Normalize counts: column 4 is passed as the lengths column and
    # columns 5 through 4100 as the columns to normalize
    print("Normalizing counts...")
    reformat_df.normalize(
        combined_df,
        lengths_col=4,
        normalize_cols=list(range(5, 4101)),
    )

    # Save to new file, tab-separated, without header or index
    output_path = "/dors/capra_lab/users/yand1/te_ml/data/2018_07_12_pca_te_enhancers/shuffled_features_matrix.tsv"
    combined_df.to_csv(output_path, header=False, index=False, sep='\t')