new_aln = f.flat_file_to_df([0, 1, 15])
orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
aln_df = pd.concat([orig_aln, new_aln])
aln_df.fillna(0, inplace=True)
#################
# Filter out subset viruses and those that have fewer than 10 peptides >= 80% positive.
aln_df = aln_df.reindex(peptide_lib).fillna(0)
print("filled NA", flush=True)
binary_b = aln_df[aln_df >= 80].fillna(0)
binary_b = pd.DataFrame(index=binary_b.index, columns=binary_b.columns,
                        data=binary_b.values, dtype=bool)
binary_b = flex_array.array(binary_b).filter_aln(ref_seq='../ref_seq/')
# Keep only the columns that survived filtering the binary matrix.
aln_df = aln_df.loc[:, binary_b.columns]
binary_b = pd.DataFrame(index=aln_df.index, columns=aln_df.columns,
                        data=aln_df.values, dtype=bool)
print("filtered", flush=True)
# Dictionary of alignments per virus. Keys: virus strings; values: Series
# containing only that virus's nonzero alignments.
virus_aln = {i: aln_df.loc[binary_b[i], i] for i in binary_b.columns}
# As above, but restricted to the cross-reactive (xr) peptides.
virus_xr = {i: virus_aln[i][virus_aln[i] < 80] for i in binary_b.columns}
# Only 'evidence' peptides (variable name reassigned).
virus_aln = {i: virus_aln[i][virus_aln[i] >= 80] for i in binary_b.columns}
dep_pep = dependent_peptides()
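# A minimal, self-contained sketch of the masking pattern used above: threshold a
# percent-identity table at 80 to get a boolean evidence matrix, then split each
# virus column into 'evidence' (>= 80) and cross-reactive (< 80) alignments. The
# virus names and values below are invented for illustration only.
import pandas as pd

toy_aln = pd.DataFrame({'virusA': [95.0, 60.0, 0.0],
                        'virusB': [0.0, 85.0, 70.0]},
                       index=['pep1', 'pep2', 'pep3'])
toy_bool = toy_aln >= 80                                        # boolean evidence mask
nonzero = {v: toy_aln.loc[toy_aln[v] > 0, v] for v in toy_aln}  # nonzero alignments per virus
evidence = {v: s[s >= 80] for v, s in nonzero.items()}          # 'evidence' peptides
xr = {v: s[s < 80] for v, s in nonzero.items()}                 # cross-reactive peptides
print(toy_bool.sum())         # peptides >= 80 per virus: virusA 1, virusB 1
print(evidence['virusA'])     # pep1    95.0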
def run_analysis(self):
    zdf = flex_array.standard_df(self.par['zscore_file'])
    binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
    binary_b = flex_array.array(binary_b).filter_aln()
    binary_b = binary_b.reindex(zdf.index).fillna(0)
    sum_df = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
    glob_unique = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
    pep_df = pd.DataFrame(np.nan, index=list(binary_b), columns=list(zdf))
    p_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    padjust_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    orig_p = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    hits_series = pd.Series(index=list(zdf), dtype=float)
    nonoverlap_hits_series = pd.Series(index=list(zdf), dtype=float)
    samples = list(zdf.columns)
    nonoverlap_dict = {}
    for sample_name, column in zdf.items():  # .iteritems() was removed in pandas 2.0
        hits = column[column >= self.par['Z_threshold']].copy()
        if self.par['use_filter']:
            nonoverlap_hits = flex_array.array(hits).gen_ind_hits(
                self.dependent_pep, self.par['graph_dir'],
                samples.index(sample_name))
        else:
            nonoverlap_hits = hits.copy()
        input_num = len(nonoverlap_hits)
        hits_series[sample_name] = len(hits)
        nonoverlap_hits_series[sample_name] = input_num
        nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
        print("%s:\thits=%s, nonoverlapped=%s" % (sample_name, len(hits), input_num))
        if input_num > 0:
            zb_df = binary_b.loc[nonoverlap_hits.index]
            collapse_zb, glob_array, sim_tag, p_series, orig_pseries = \
                flex_array.array(zb_df).binom_reassign(
                    input_num, self.par['dir_ref_seq'], self.par['p_threshold'],
                    self.par['x_threshold'], self.par['organism'])
            sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
            glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
            pep_df[sample_name] = collapse_zb.apply(
                lambda x: flex_array.array(x).names_string(0.001), axis=0)
            p_df[sample_name] = p_series
            orig_p[sample_name] = orig_pseries

    # Remove the file path and extension from the z-score file name.
    file_head = self.par['sub_dir'] + self.par['zscore_file'].split('/')[-1].split('.')[0]
    if self.par['organism']:
        file_head += '_organism_'
    else:
        file_head += '_species_'

    # Write log file
    params.file_IO(self.par['sub_dir'] + 'parameters.log', sep='=').dict_to_file(self.par)

    # Write analysis files
    sum_df.to_csv(file_head + 'total-counts.txt', sep='\t', header=True, index_label='Species')
    glob_unique.to_csv(file_head + 'unique-counts.txt', sep='\t', header=True, index_label='Species')
    pep_df.to_csv(file_head + 'peptides.txt', sep='\t', header=True, index_label='Species')
    p_df.to_csv(file_head + 'p-values.txt', sep='\t', header=True, index_label='Species')
    orig_p.to_csv(file_head + 'orig-p-values.txt', sep='\t', header=True, index_label='Species')

    # Benjamini-Hochberg correction, applied per sample over the finite p-values only.
    for i in p_df.columns:
        pvals = np.array(p_df[i].values, dtype=float)
        if not pd.isnull(pvals).all():
            mask = np.where(np.isfinite(pvals))[0]
            pval_corrected = np.full(pvals.shape, np.nan)
            pval_corrected[mask] = multipletests(pvals[mask], method='fdr_bh')[1]
            padjust_df[i] = pval_corrected
    padjust_df.to_csv(file_head + 'p-adjusted.txt', sep='\t', header=True, index_label='Species')

    # Write independent peptides file
    with open(self.par['sub_dir'] + 'independent_peptides.txt', 'w') as f:
        for i in samples:
            f.write(i)
            for j in nonoverlap_dict[i]:
                f.write('\t' + str(j))
            f.write('\n')

    # Write summary file
    with open(self.par['sub_dir'] + 'results_summary.txt', 'w') as f:
        f.write("Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\t")
        f.write("Assigned peptides\tTotal sample hits\tTotal filtered sample hits\n")
        for i in samples:
            BH = padjust_df[i]
            BH = BH[BH < self.par['bh_threshold']]
            p_value = p_df[i][BH.index]
            orig_pvalue = orig_p[i][BH.index]
            counts = sum_df[i][BH.index]
            peptides = pep_df[i][BH.index]
            for j in BH.index:
                if counts[j] > self.par['x_threshold']:
                    f.write(i + '\t')
                    f.write(j + '\t' + str(BH[j]) + '\t')
                    f.write(str(p_value[j]) + '\t' + str(orig_pvalue[j]) + '\t')
                    f.write(str(counts[j]) + '\t' + str(peptides[j]) + '\t')
                    f.write(str(hits_series[i]) + '\t' + str(nonoverlap_hits_series[i]) + '\n')
    print("End of run.")
    return None
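# Sketch of the NaN-aware Benjamini-Hochberg step used in run_analysis above:
# statsmodels' multipletests does not accept NaNs, so the correction is applied
# only to the finite entries and the rest stay NaN. The p-values are made up
# for illustration.
import numpy as np
from statsmodels.stats.multitest import multipletests

pvals = np.array([0.001, np.nan, 0.04, 0.2, np.nan])
mask = np.where(np.isfinite(pvals))[0]
adjusted = np.full(pvals.shape, np.nan)
adjusted[mask] = multipletests(pvals[mask], method='fdr_bh')[1]
print(adjusted)  # [0.003   nan 0.06  0.2    nan]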
def run_analysis(self):
    zdf = flex_array.standard_df(self.par['zscore_file'])
    f = params.file_IO(
        '../ref_seq/pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
        '\t')
    orig_aln = f.flat_file_to_df([0, 1, 15])
    f = params.file_IO(
        '../ref_seq/new_pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
        '\t')
    new_aln = f.flat_file_to_df([0, 1, 15])
    orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
    aln_df = pd.concat([orig_aln, new_aln])
    aln_df.fillna(0, inplace=True)
    binary_b = aln_df[aln_df >= 80].fillna(0)
    binary_b = pd.DataFrame(index=binary_b.index, columns=binary_b.columns,
                            data=binary_b.values, dtype=bool)
    binary_b = flex_array.array(binary_b).filter_aln(ref_seq=self.par['dir_ref_seq'])
    binary_b = binary_b.reindex(zdf.index).fillna(0)
    aln_df = aln_df.loc[:, binary_b.columns]
    #binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
    #binary_b = flex_array.array(binary_b).filter_aln(ref_seq=self.par['dir_ref_seq'])
    sum_df = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
    glob_unique = pd.DataFrame(0, index=list(binary_b), columns=list(zdf))
    pep_df = pd.DataFrame(np.nan, index=list(binary_b), columns=list(zdf))
    p_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    n_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    padjust_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    orig_p = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    filter_df = pd.DataFrame(index=list(binary_b), columns=list(zdf))
    hits_series = pd.Series(index=list(zdf), dtype=float)
    nonoverlap_hits_series = pd.Series(index=list(zdf), dtype=float)
    samples = list(zdf.columns)
    nonoverlap_dict = {}
    parallel_dict1 = {}
    parallel_dict2 = {}
    for sample_name, column in zdf.items():  # .iteritems() was removed in pandas 2.0
        hits = column[column >= self.par['Z_threshold']].copy()
        if self.par['use_filter']:
            nonoverlap_hits = flex_array.gen_ind_hits(
                hits, self.dependent_pep, self.par['graph_dir'],
                samples.index(sample_name))
            input_num = len(nonoverlap_hits)
        else:
            # Report the filtered count even when the filter is not applied.
            nonoverlap_hits = hits.copy()
            input_num = len(flex_array.gen_ind_hits(
                hits, self.dependent_pep, self.par['graph_dir'],
                samples.index(sample_name)))
        hits_series[sample_name] = len(hits)
        nonoverlap_hits_series[sample_name] = input_num
        nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
        print("%s:\thits=%s, nonoverlapped=%s" % (sample_name, len(hits), input_num))
        if input_num > 0:
            # Collect the per-sample inputs; binom_reassign (formerly called
            # serially here) now runs in parallel below.
            zb_df = aln_df.loc[nonoverlap_hits.index]
            parallel_dict1[sample_name] = zb_df
            parallel_dict2[sample_name] = nonoverlap_hits

    # Fan the per-sample work out to all cores, then stitch the results back together.
    list1 = list(parallel_dict1.keys())         # sample names
    list2 = list(parallel_dict1.values())       # zb_df per sample
    list3 = [parallel_dict2[i] for i in list1]  # hits Series per sample
    zipped = zip(list2, list3, list1)
    results = Parallel(n_jobs=-1)(
        delayed(flex_array.binom_reassign)(
            zb_df, nonoverlap_hits, sample_name, self.dependent_pep,
            self.par['dir_ref_seq'], self.par['p_threshold'],
            self.par['x_threshold'], self.par['organism'])
        for zb_df, nonoverlap_hits, sample_name in zipped)
    r1, r2, r3, r4, r5, r6, r7, r8 = zip(*results)
    for i in range(len(r7)):
        sample_name = r7[i]
        collapse_zb = r1[i]
        glob_array = r2[i]
        sim_tag = r3[i]
        p_series = r4[i]
        orig_pseries = r5[i]
        filter_series = r6[i]
        n_series = r8[i]
        sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
        glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
        pep_df[sample_name] = collapse_zb.apply(
            lambda x: flex_array.array(x).names_string(0.001), axis=0)
        n_df[sample_name] = n_series
        p_df[sample_name] = p_series
        orig_p[sample_name] = orig_pseries
        filter_df[sample_name] = filter_series

    # Remove the file path and extension from the z-score file name.
    file_head = self.par['sub_dir'] + self.par['zscore_file'].split('/')[-1].split('.')[0]
    if self.par['organism']:
        file_head += '_organism_'
    else:
        file_head += '_species_'

    # Write log file
    params.file_IO(self.par['sub_dir'] + 'parameters.log', sep='=').dict_to_file(self.par)

    # Write analysis files
    sum_df.to_csv(file_head + 'total-counts.txt', sep='\t', header=True, index_label='Species')
    glob_unique.to_csv(file_head + 'unique-counts.txt', sep='\t', header=True, index_label='Species')
    pep_df.to_csv(file_head + 'peptides.txt', sep='\t', header=True, index_label='Species')
    p_df.to_csv(file_head + 'p-values.txt', sep='\t', header=True, index_label='Species')
    orig_p.to_csv(file_head + 'orig-p-values.txt', sep='\t', header=True, index_label='Species')
    filter_df.to_csv(file_head + 'virus-filter.txt', sep='\t', header=True, index_label='Species')

    # Benjamini-Hochberg correction, applied per sample over the finite p-values only.
    for i in p_df.columns:
        pvals = np.array(p_df[i].values, dtype=float)
        if not pd.isnull(pvals).all():
            mask = np.where(np.isfinite(pvals))[0]
            pval_corrected = np.full(pvals.shape, np.nan)
            pval_corrected[mask] = multipletests(pvals[mask], method='fdr_bh')[1]
            padjust_df[i] = pval_corrected
    padjust_df.to_csv(file_head + 'p-adjusted.txt', sep='\t', header=True, index_label='Species')

    # Write independent peptides file
    with open(self.par['sub_dir'] + 'independent_peptides.txt', 'w') as f:
        for i in samples:
            f.write(i)
            for j in nonoverlap_dict[i]:
                f.write('\t' + str(j))
            f.write('\n')

    # Write summary file
    with open(file_head + 'results_summary.txt', 'w') as f:
        f.write("Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\tFiltered Assigned Counts\t")
        f.write("Assigned peptides\tTotal significant peptides\tRanking N\tTotal sample hits\tTotal filtered sample hits\n")
        for i in samples:
            BH = padjust_df[i]
            BH = BH[BH < self.par['bh_threshold']]
            p_value = p_df[i][BH.index]
            n_value = n_df[i][BH.index]
            filter_value = filter_df[i][BH.index]
            orig_pvalue = orig_p[i][BH.index]
            counts = sum_df[i][BH.index]
            peptides = pep_df[i][BH.index]
            # Total number of distinct significant peptides for this sample; the
            # union is the same for every row, so compute it once per sample.
            pep_set = set()
            for k in BH.index:
                pep_set = pep_set.union(set(peptides[k].split(';')))
            for j in BH.index:
                if filter_value[j] > self.par['x_threshold']:
                    f.write(i + '\t')
                    f.write(j + '\t' + str(BH[j]) + '\t')
                    f.write(str(p_value[j]) + '\t' + str(orig_pvalue[j]) + '\t')
                    f.write(str(counts[j]) + '\t' + str(filter_value[j]) + '\t' + str(peptides[j]) + '\t')
                    f.write(str(len(pep_set)) + '\t')
                    f.write(str(n_value[j]) + '\t')
                    f.write(str(hits_series[i]) + '\t' + str(nonoverlap_hits_series[i]) + '\n')
    print("End of run.")
    return None
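# Minimal sketch of the joblib fan-out/collect pattern used in run_analysis
# above: run one task per sample across all cores, return tuples tagged with the
# sample name, then zip the tuples back into per-field sequences. work() is a
# stand-in for flex_array.binom_reassign; the sample data are invented.
from joblib import Parallel, delayed

def work(sample_name, values):
    return sum(values), max(values), sample_name   # tag each result with its sample

jobs = {'s1': [1, 2, 3], 's2': [4, 5]}
results = Parallel(n_jobs=-1)(delayed(work)(name, vals) for name, vals in jobs.items())
totals, maxima, names = zip(*results)   # joblib preserves input order
print(dict(zip(names, totals)))          # {'s1': 6, 's2': 9}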
def probability_ref(self):
    # Writes to file the probability tables later used in the binomial assessments.
    viral_peptidome = open(self.par['file_annotation'], 'r')
    peptide_lib = []
    next(viral_peptidome)
    for line in viral_peptidome:
        items = line.split('\t')
        peptide_lib.append(str(items[0]))
    viral_peptidome.close()
    peptide_lib = peptide_lib[:-1]
    #peptide_lib.sort(key=int)
    binary_b = flex_array.sparse_aln_df(self.par['file_aln'])
    binary_b = binary_b.reindex(peptide_lib).fillna(0)
    binary_b = flex_array.array(binary_b).filter_aln(ref_seq=self.par['dir_ref_seq'])
    binary_b = pd.DataFrame(index=binary_b.index, columns=binary_b.columns,
                            data=binary_b.values, dtype=bool)
    # Alignment Series per virus, fed into the independence filter.
    virus_aln = {i: binary_b.loc[binary_b[i], i] for i in binary_b.columns}
    dep_pep = self.dependent_peptides()
    virus_tot_filter = [flex_array.gen_ind_hits(virus_aln[i], dep_pep)
                        for i in virus_aln]
    virus_sums = pd.Series(index=binary_b.columns,
                           data=[len(i) for i in virus_tot_filter])
    first_round_prob = pd.Series(index=binary_b.columns, dtype=float)
    viruses = list(binary_b.columns)
    for i in first_round_prob.index:
        print("Virus " + str(viruses.index(i)))
        first_round_prob[i] = virus_sums[i] / (
            len(peptide_lib) - (len(virus_aln[i]) - virus_sums[i]))
    first_round_prob.to_csv(self.par['dir_ref_seq'] + "total_probabilities_20180524.csv",
                            header=False, index=True)
    print("First probability file generated.")

    second_round_prob = pd.DataFrame(index=viruses, columns=viruses)
    third_round_prob = pd.DataFrame(index=viruses, columns=viruses)
    virus_pairs = list(combo(viruses, 2))

    def calc_pair(pair):
        # For one virus pair, return the unique-peptide probabilities (d1) and the
        # shared-peptide probabilities (d2), each keyed by (row, column).
        i, j = pair
        d1 = {}
        d2 = {}
        i_index = set(virus_aln[i].index)
        j_index = set(virus_aln[j].index)
        shared_index = i_index.intersection(j_index)
        shared = virus_aln[i].loc[list(shared_index)]
        if len(shared) == 0:
            d1[(i, j)] = first_round_prob[j]
            d1[(j, i)] = first_round_prob[i]
            d2[(i, j)] = 0.0
            d2[(j, i)] = 0.0
        else:
            unique_j = virus_aln[j].loc[list(j_index - shared_index)]
            filter_unique_j = flex_array.gen_ind_hits(unique_j, dep_pep)
            d1[(i, j)] = len(filter_unique_j) / (
                len(peptide_lib) - len(shared) - (len(unique_j) - len(filter_unique_j)))
            unique_i = virus_aln[i].loc[list(i_index - shared_index)]
            filter_unique_i = flex_array.gen_ind_hits(unique_i, dep_pep)
            d1[(j, i)] = len(filter_unique_i) / (
                len(peptide_lib) - len(shared) - (len(unique_i) - len(filter_unique_i)))
            filter_shared = flex_array.gen_ind_hits(shared, dep_pep)
            d2[(i, j)] = len(filter_shared) / (
                len(peptide_lib) - len(unique_i) - len(unique_j) - (len(shared) - len(filter_shared)))
            d2[(j, i)] = d2[(i, j)]
        return d1, d2

    results = Parallel(n_jobs=-1, verbose=100000)(
        delayed(calc_pair)(pair) for pair in virus_pairs)
    m1, m2 = zip(*results)
    # Merge the per-pair dictionaries back into the square tables; without this
    # step the tables written below would stay empty.
    for d1 in m1:
        for (a, b), v in d1.items():
            second_round_prob.loc[a, b] = v
    for d2 in m2:
        for (a, b), v in d2.items():
            third_round_prob.loc[a, b] = v

    second_round_prob.to_csv(self.par['dir_ref_seq'] + "unique_probabilities_20180524.csv",
                             header=True, index=True)
    print("Second probability file generated.")
    third_round_prob.to_csv(self.par['dir_ref_seq'] + "shared_probabilities_20180524.csv",
                            header=True, index=True)
    print("Third (and last) probability file generated.")
    return None
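# Toy version of the pairwise probability arithmetic in calc_pair above: for each
# virus pair, split the aligned peptides into shared and unique sets, then turn
# the counts into per-peptide hit probabilities. Library size and peptide sets
# are invented, and the real code additionally collapses dependent peptides first.
from itertools import combinations

library_size = 20
aligned = {'vA': {'p1', 'p2', 'p3'}, 'vB': {'p2', 'p3', 'p4', 'p5'}, 'vC': {'p6'}}

for i, j in combinations(aligned, 2):
    shared = aligned[i] & aligned[j]
    unique_i = aligned[i] - shared
    unique_j = aligned[j] - shared
    # Probability of hitting a peptide unique to j, with the shared ones excluded
    # from the library, and probability of hitting a shared peptide likewise.
    p_unique_j = len(unique_j) / (library_size - len(shared))
    p_shared = len(shared) / (library_size - len(unique_i) - len(unique_j))
    print(i, j, round(p_unique_j, 3), round(p_shared, 3))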
def probability_ref(self):
    # Writes to file the probability tables later used in the binomial assessments.
    viral_peptidome = open(self.par['file_annotation'], 'r')
    peptide_lib = []
    next(viral_peptidome)
    for line in viral_peptidome:
        items = line.split('\t')
        peptide_lib.append(str(items[0]))
    viral_peptidome.close()
    peptide_lib = peptide_lib[:-1]
    peptide_lib.sort(key=int)
    binary_b = flex_array.binary_aln_df(self.par['file_aln'])
    binary_b = flex_array.array(binary_b).filter_aln()
    binary_b = binary_b.reindex(peptide_lib).fillna(0)
    virus_sums = binary_b.apply(np.count_nonzero, axis=0)
    first_round_prob = virus_sums / len(peptide_lib)
    first_round_prob.to_csv(self.par['dir_ref_seq'] + "total_probabilities.csv",
                            header=False, index=True)
    print("First probability file generated.")
    viruses = list(binary_b.columns)
    # Pairwise counts of peptides aligning to both viruses of a pair.
    virus_intersections = pd.DataFrame(index=viruses, columns=viruses)
    for i in viruses:
        for j in viruses:
            a = binary_b[i]
            b = binary_b[j]
            virus_intersections.loc[i, j] = np.dot(a, b)
    # Counts of peptides unique to each virus of a pair.
    virus_unique = pd.DataFrame(index=viruses, columns=viruses)
    for i in virus_intersections.columns:
        virus_unique[i] = virus_sums[i] - virus_intersections[i]
    second_round_prob = pd.DataFrame(index=viruses, columns=viruses)
    for i in virus_intersections.index:
        for j in virus_intersections.columns:
            second_round_prob.loc[i, j] = virus_unique.loc[i, j] / (
                len(peptide_lib) - virus_intersections.loc[i, j])
    second_round_prob.to_csv(self.par['dir_ref_seq'] + "unique_probabilities.csv",
                             header=True, index=True)
    print("Second probability file generated.")
    third_round_prob = pd.DataFrame(index=viruses, columns=viruses)
    for i in virus_intersections.index:
        for j in virus_intersections.columns:
            third_round_prob.loc[i, j] = virus_intersections.loc[i, j] / (
                len(peptide_lib) - virus_unique.loc[i, j] - virus_unique.loc[j, i])
    third_round_prob.to_csv(self.par['dir_ref_seq'] + "shared_probabilities.csv",
                            header=True, index=True)
    print("Third (and last) probability file generated.")
    return None
# End
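# Aside: the double loop over np.dot(a, b) in the probability_ref version above
# computes the Gram matrix of the binary alignment table, so on a toy matrix the
# same pairwise intersection counts come from a single matrix product,
# binary_b.T @ binary_b. The values below are illustrative only.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'vA': [1, 1, 0], 'vB': [1, 0, 1]}, index=['p1', 'p2', 'p3'])
intersections = toy.T @ toy               # virus-by-virus shared-peptide counts
print(intersections.loc['vA', 'vB'])      # 1 shared peptide (p1)
print(intersections.loc['vA', 'vA'])      # diagonal recovers virus_sums: 2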