def test_normalize_4():
    # Create pandas dataframe for testing
    dict_4 = {
        "col 0": [1, 2, None],
        "col 1": [5, 20, 15],
        "col 2": ["", "tg", "ctatg"],
        "col 3": [100, 200, 300]
    }
    df_4 = pd.DataFrame(data=dict_4)

    # normalize should raise an exception on this invalid input
    with pytest.raises(Exception):
        reformat_df.normalize(df_4, 4, [0, 1, 3])
def count_file(input_file, output_file, input_header=None,
               output_header=False, sep='\t'):
    """
    Wrapper for counting all kmers in a table of base pairs stored in a file.

    :param input_file: File containing data table. Columns in file should
        correspond to the variables chr_col, start_col, end_col, label_col,
        pairs_col in this module.
    :param output_file: File to write table with updated kmer counts.
    :param input_header: Row of header in input_file (default=None)
    :param output_header: Whether to write header in output_file (default=False)
    :param sep: Separator for table in output_file (default='\t')
    :return: None
    """
    # Read in file as pandas dataframe
    input_df = pd.read_table(input_file, header=input_header)

    # Rename columns
    input_df = input_df.rename(
        columns={
            chr_col: "chr",
            start_col: "start",
            end_col: "end",
            label_col: "label",
            pairs_col: "pairs"
        })

    # Generate all possible k-mers
    kmers_list = generate_kmers()

    # Create features matrix with columns as the different kmers
    features_df = input_df.reindex(
        columns=(input_df.columns.tolist() + kmers_list), fill_value=0)

    # Update the features data frame.
    features_df = count_kmers(features_df=features_df)

    # Get list of integer column numbers to normalize
    kmer_cols_list = []
    for kmer in kmers_list:
        kmer_cols_list.append(features_df.columns.get_loc(kmer))

    # Normalize the features data frame by the length of sequences of base
    # pairs.
    normalize(features_df, lengths_col=pairs_col,
              normalize_cols=kmer_cols_list)

    # Save to output file
    features_df.to_csv(output_file, header=output_header, sep=sep, index=False)
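# Hypothetical usage sketch of count_file (the file paths below are
# placeholders; the module-level column indices chr_col, start_col, end_col,
# label_col and pairs_col must already describe the input table's layout):
#
#     count_file(input_file="regions.tsv",
#                output_file="regions_features_matrix.tsv",
#                input_header=None,    # input table has no header row
#                output_header=False,  # write output without a header
#                sep='\t')             # tab-separated output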
def test_normalize_3():
    # Create pandas dataframe for testing
    dict_3 = {
        "col 0": [1, 2, 3],
        "col 1": [5, 20, 15],
        "col 2": ["a", "tg", "ctatg"],
        "col 3": [100, 200, 300]
    }
    df_3 = pd.DataFrame(data=dict_3)

    dict_3_norm = {
        "col 0": [1, 2, 3],
        "col 1": [5.0, 10.0, 3.0],
        "col 2": ["a", "tg", "ctatg"],
        "col 3": [100, 200, 300]
    }
    df_3_normalized = pd.DataFrame(data=dict_3_norm)

    # Test normalize function
    reformat_df.normalize(df_3, 2, [1])
    assert df_3.equals(df_3_normalized)
def test_normalize_1():
    # Create pandas dataframe for testing
    dict_1 = {
        "col 0": [1, 2, None],
        "col 1": [5, 20, 15],
        "col 2": ["a", "tg", "ctatg"],
        "col 3": [100, 200, 300]
    }
    df_1 = pd.DataFrame(data=dict_1)

    dict_1_norm = {
        "col 0": [1.0, 1.0, None],
        "col 1": [5.0, 10.0, 3.0],
        "col 2": ["a", "tg", "ctatg"],
        "col 3": [100.0, 100.0, 60.0]
    }
    df_1_normalized = pd.DataFrame(data=dict_1_norm)

    # Test normalize function
    reformat_df.normalize(df_1, 2, [0, 1, 3])
    assert df_1.equals(df_1_normalized)
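# The tests above pin down the contract of reformat_df.normalize: each
# column listed in normalize_cols is divided, in place, by the length of the
# string in lengths_col for the same row, and invalid input raises. Below is
# a minimal sketch consistent with those tests; it is an assumption, not the
# actual reformat_df implementation, and it assumes unique column labels.

def normalize_sketch(df, lengths_col, normalize_cols):
    # Row-wise sequence lengths; an out-of-range lengths_col raises here,
    # which is the failure mode test_normalize_4 checks for.
    lengths = df.iloc[:, lengths_col].str.len()
    for col in normalize_cols:
        label = df.columns[col]
        # NaN values propagate through the division, matching the None
        # entries left untouched in test_normalize_1.
        df[label] = df[label] / lengths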
    # Combine data frames
    return pd.concat(frames_list, axis=axis)


# In[ ]:


if __name__ == "__main__":
    # List of files to combine
    file_list = []

    # Directory the files are in
    directory = "/dors/capra_lab/users/yand1/te_ml/data/2018_07_11_pca_te_enhancers/batch_output/"

    # Generate list of files with shuffled human genome data to combine
    for i in range(50):
        file_list.append(directory
                         + "shuffle_{}_features_matrix.tsv".format(i + 50))

    # Get combined data frame
    print("Combining files...")
    combined_df = combine(file_list)

    # Normalize counts
    print("Normalizing counts...")
    reformat_df.normalize(combined_df, lengths_col=4,
                          normalize_cols=list(range(5, 4101)))

    # Save to new file
    combined_df.to_csv("/dors/capra_lab/users/yand1/te_ml/data/2018_07_12_pca_te_enhancers/shuffled_features_matrix.tsv",
                       header=False, index=False, sep='\t')
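# Only the tail of combine appears above. Given the call combine(file_list)
# and the pd.concat(frames_list, axis=axis) return, the full helper likely
# reads roughly as follows (the signature, the axis default, and the
# read_table arguments are assumptions):
#
#     def combine(file_list, axis=0):
#         frames_list = []
#         for file in file_list:
#             frames_list.append(pd.read_table(file, header=None))
#         # Combine data frames
#         return pd.concat(frames_list, axis=axis)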