def calculate_pssm_thresholds(meme_path, cutoffs_path, faa_path, number_of_random_pssms,
                              output_path, done_path, argv='no_argv',
                              pssm_score_peptide='/groups/pupko/orenavr2/igomeProfilingPipeline/src/PSSM_score_Peptide/PSSM_score_Peptide'):
    if not os.path.exists(output_path):
        # TODO: any modules to load?
        cmd = f'{pssm_score_peptide} -pssm {meme_path} -pssm_cutoffs {cutoffs_path} -seq {faa_path} ' \
              f'-out {output_path} -NrandPSSM {number_of_random_pssms} -CalcPSSM_Pval'
        logger.info(f'{datetime.datetime.now()}: starting CalcPSSM_Pval. Executed command is:\n{cmd}')
        subprocess.run(cmd, shell=True)
    else:
        logger.info(f'{datetime.datetime.now()}: skipping scanning calculation as it already exists at:\n{output_path}')

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

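# A minimal usage sketch (all paths below are hypothetical, for illustration only):
#
#     calculate_pssm_thresholds(meme_path='analysis/memes/00.txt',
#                               cutoffs_path='analysis/cutoffs/00.txt',
#                               faa_path='analysis/sample_peptides.faa',
#                               number_of_random_pssms=100,
#                               output_path='analysis/hits_scores/sample_peptides_00.txt',
#                               done_path='analysis/done_scan_00.txt')
#
# which shells out to:
#     PSSM_score_Peptide -pssm <meme> -pssm_cutoffs <cutoffs> -seq <faa> -out <out> -NrandPSSM <N> -CalcPSSM_Pval
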
def remove_cysteine(fasta_file, out_fasta_file, done_file_path, argv='no_argv'):
    """
    :param fasta_file: a fasta file with sequences
    :param out_fasta_file: a fasta file with the same sequences, but with the flanking Cysteines removed
    :return:
    """
    logger.info(f'{datetime.datetime.now()}: removing Cysteine loop from {fasta_file}')

    verify_file_is_not_empty(fasta_file)

    with open(fasta_file) as f_in, open(out_fasta_file, 'w') as f_out:
        for header in f_in:
            # each record is assumed to span exactly two lines: a header line followed by a sequence line
            seq = f_in.readline().rstrip()
            if seq.startswith('C') and seq.endswith('C'):
                seq = seq[1:-1]  # remove Cys loop
            f_out.write(f'{header}{seq}\n')

    verify_file_is_not_empty(out_fasta_file)

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

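# Worked example (record names are hypothetical):
#
#   input                                 output
#   >seq_1_lib_C8C_len_10_counts_5        >seq_1_lib_C8C_len_10_counts_5
#   CTTACAPVNC                            TTACAPVN
#   >seq_2_lib_12_len_12_counts_3         >seq_2_lib_12_len_12_counts_3
#   NTDVAAPGNWKL                          NTDVAAPGNWKL   (no flanking Cys pair, left as is)
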
def aggregate_pvalues_results(meme_path, scanning_results_dir_path, bc,
                              samplename2biologicalcondition_path, aggregated_pvalues_path,
                              aggregated_hits_path, done_path, argv='no_argv'):
    samplename2biologicalcondition = load_table_to_dict(samplename2biologicalcondition_path,
                                                        'Barcode {} belongs to more than one sample_name!!')
    all_consensuses = get_consensus_sequences_from_meme(meme_path)

    pvalues_f = open(aggregated_pvalues_path, 'w')
    hits_f = open(aggregated_hits_path, 'w')

    # header
    pvalues_result = hits_result = f'sample_name,label,{",".join(all_consensuses)}'
    for file_name in sorted(os.listdir(scanning_results_dir_path)):
        if file_name.endswith('100.txt'):
            # a file ending with "100.txt" would also match the endswith('00.txt') check below
            # and be mistaken for the start of a new sample; more than 100 splits are not supported
            raise TypeError(f'Unsupported split file name: {file_name}')
        if file_name.startswith('.'):  # skip system files
            continue
        if file_name.endswith('00.txt'):  # the next sample is starting
            # flush the previous sample's results (on the first iteration, the header)
            pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
            hits_f.write(f'{hits_result.rstrip(",")}\n')
            sample_name = file_name.split('_peptides')[0]
            if bc in sample_name:
                label = samplename2biologicalcondition[sample_name]
            else:
                label = 'other'
            pvalues_result = hits_result = f'{sample_name},{label},'
        pvalues, hits = get_results(os.path.join(scanning_results_dir_path, file_name))
        pvalues_result += ','.join(pvalues) + ','
        hits_result += ','.join(hits) + ','

    # don't forget to flush the last sample's results!
    pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
    hits_f.write(f'{hits_result.rstrip(",")}\n')

    pvalues_f.close()
    hits_f.close()

    # remove insignificant features:
    df = pd.read_csv(aggregated_pvalues_path)
    # keep features with at least one significant score across the positive-labeled samples
    positive_class_df = df[df['label'] != 'other']
    significant_features = ((positive_class_df.drop(['sample_name', 'label'], axis=1) < 0.05).sum() > 0)
    mask = pd.concat([pd.Series([True, True], index=['sample_name', 'label']), significant_features])
    df = df.loc[:, mask]
    # df = pd.concat([df.loc[:, ['sample_name', 'label']], df.drop(['sample_name', 'label'], axis=1).loc[:, significant_features]], axis=1)
    df.to_csv(aggregated_pvalues_path.replace('_insignificant', ''), index=False)

    # make sure that there are results and the files are not empty
    verify_file_is_not_empty(aggregated_pvalues_path)
    verify_file_is_not_empty(aggregated_hits_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

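# Both aggregated files are CSVs with one row per sample and one column per motif
# consensus sequence, e.g. (sample names and values are hypothetical):
#
#   sample_name,label,CNTDVAC,CTTACAC
#   17b_01,17b,0.003,0.41
#   naive_01,other,0.88,0.76
#
# In the p-values file, cells below 0.05 are the ones that count as significant
# for the feature-filtering step above.
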
def aggregate_scores(scores_path, bc):
    # scores_path is a folder in which each file contains the scores of one of the scan splits, e.g.:
    # /groups/pupko/orenavr2/igomeProfilingPipeline/experiments/test/analysis/model_fitting/17b/hits_scores
    output_path = f'{os.path.split(scores_path)[0]}/hits.txt'
    call(f'cat {scores_path}/*{bc}_motifs_*.txt > {output_path}', shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_path)

def extract_clusters_sequences(fasta_file, clstr_file, output_dir, done_path,
                               max_number_of_members_per_cluster,
                               cluster_prefix_length_in_clstr_file,
                               file_prefix, argv='no_argv'):
    verify_file_is_not_empty(fasta_file)
    verify_file_is_not_empty(clstr_file)

    os.makedirs(output_dir, exist_ok=True)

    member_prefix_to_record = load_member_prefix_to_record_dict(fasta_file, cluster_prefix_length_in_clstr_file)
    cluster_to_members_records = load_clusters_to_members_dict(clstr_file, member_prefix_to_record,
                                                               cluster_prefix_length_in_clstr_file)

    logger.info(f'{datetime.datetime.now()}: Writing clusters sequences...')

    trimmed_clusters = set()
    # sort the records of each cluster by their size and keep only the first
    # $max_number_of_members_per_cluster records
    for cluster in cluster_to_members_records:
        # sort cluster members by their "strength", i.e., counts
        cluster_to_members_records[cluster].sort(key=extract_sequence_counts_from_record, reverse=True)
        if len(cluster_to_members_records[cluster]) > max_number_of_members_per_cluster:
            # discard (in place) all sequences above the maximal allowed number
            cluster_to_members_records[cluster][max_number_of_members_per_cluster:] = []
            trimmed_clusters.add(cluster)

    max_number_of_leading_zeros = len(str(len(cluster_to_members_records)))
    sorted_clusters_by_size = sorted(cluster_to_members_records, reverse=True,
                                     key=lambda cluster: extract_cluster_size_from_records(cluster_to_members_records[cluster]))
    if file_prefix != '':
        file_prefix += '_'
    for i, cluster in enumerate(sorted_clusters_by_size):
        cluster_rank = str(i).zfill(4)  # TODO: consider zfill(max_number_of_leading_zeros) instead
        number_of_unique_members = min(len(cluster_to_members_records[cluster]), max_number_of_members_per_cluster)
        cluster_counts = extract_cluster_size_from_records(cluster_to_members_records[cluster])
        filename = f'{file_prefix}clusterRank_' \
                   f'{cluster_rank}_uniqueMembers_' \
                   f'{"top" if cluster in trimmed_clusters else ""}' \
                   f'{number_of_unique_members}_' \
                   f'clusterSize_{cluster_counts:.2f}.faa'  # keep only 2 digits after the decimal point
        with open(os.path.join(output_dir, filename), 'w') as f:
            f.write(''.join(cluster_to_members_records[cluster]))

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

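# The emitted file names encode the cluster's rank, its (possibly trimmed) number of
# unique members, and its total copy number, e.g. (numbers are hypothetical):
#
#   clusterRank_0000_uniqueMembers_72_clusterSize_757849.92.faa
#   clusterRank_0007_uniqueMembers_top100_clusterSize_1234.56.faa
#
# where "top" marks a cluster that was trimmed down to max_number_of_members_per_cluster members.
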
def convert_sequences_to_upper(in_fasta_file, out_fasta_file, done_file_path, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: upper casing all sequences in {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            f.write(f'>{header}\n{header_to_sequence[header].upper()}\n')

    verify_file_is_not_empty(out_fasta_file)

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

def add_pssm_to_meme_file(msa_path, meme_path, add_header):
    if add_header:
        logger.info(f'Generating a new MEME file at {meme_path}')
    logger.info(f'Calculating PSSM of {msa_path}')

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    letters = sorted(set(letter.upper() for letter in nnk_table.values()))  # don't differentiate between Q and q...
    column_to_letters_frequency_counter = get_pssm(header_to_sequence, msa_length, letters)
    consensus_sequence = ''.join(
        max(column_to_letters_frequency_counter[column], key=column_to_letters_frequency_counter[column].get)
        for column in column_to_letters_frequency_counter)

    mode = 'a'  # append to an existing file
    meta_info = ''
    if add_header:
        # override any previous file!!
        mode = 'w'
        meta_info = f'MEME version 4\n\n' \
                    f'ALPHABET= {"".join(letters)}\n\n' \
                    f'Background letter frequencies\n' \
                    f'{get_background_letters_frequency_str(nnk_table)}\n'
    else:
        # the file already exists and contains at least one PSSM;
        # just add some new lines before the next PSSM
        meta_info += '\n\n'
        assert os.path.exists(meme_path), \
            f"add_header parameter wasn't set, implying that meme_path exists, but it does not!\n{meme_path}\n"

    msa_name = os.path.split(os.path.splitext(msa_path)[0])[1]
    meta_info += f'MOTIF {consensus_sequence}_{msa_name}\n'
    meta_info += f'letter-probability matrix: ' \
                 f'alength= {len(letters)} ' \
                 f'w= {msa_length} ' \
                 f'nsites= {number_of_sequences}\n'

    with open(meme_path, mode) as f:
        f.write(meta_info)
        for column in column_to_letters_frequency_counter:
            # gaps are not counted, so the total number of actually participating sequences
            # can be lower than $number_of_sequences
            number_of_participating_sequences = sum(column_to_letters_frequency_counter[column].values())
            column_distribution_str = ' '.join(
                f'{count/number_of_participating_sequences}'
                for count in column_to_letters_frequency_counter[column].values()) + '\n'
            f.write(column_distribution_str)

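# For reference, the motif block appended by this function follows the MEME version 4
# minimal format; a sketch (motif name, alphabet size and numbers are illustrative;
# the real alphabet is derived from nnk_table):
#
#   MOTIF CAC_17b_clusterRank_0000_uniqueMembers_72_clusterSize_757849.92
#   letter-probability matrix: alength= 20 w= 3 nsites= 72
#   0.05 0.01 ... 0.02
#   0.90 0.00 ... 0.01
#   0.12 0.30 ... 0.04
#
# one row per MSA column, each row a distribution over the alphabet letters.
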
def reconstruct_msa(sequences_file_path, output_file_path, done_path, argv='no_argv'):
    number_of_unique_members = get_unique_members_from(sequences_file_path)
    if number_of_unique_members > 1:
        import subprocess
        # TODO: module load mafft..
        # --auto  Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i and FFT-NS-2,
        #         according to data size.
        # --amino tells mafft that this is an amino acid msa. If you let it decide by itself, it might
        #         get it wrong on small data sets, as they might look like dna even though they are NOT! e.g.,
        # [orenavr2@powerlogin-be2 test]$ cat /groups/pupko/orenavr2/igomeProfilingPipeline/experiments/test/analysis/motif_inference/17b_03/unaligned_sequences/17b_03_clusterRank_215_uniqueMembers_2_clusterSize_252.81.faa
        # >seq_235_lib_12_len_12_counts_126.40626975097965
        # CNTDVACAAPGN
        # >seq_1112_lib_C8C_len_10_counts_126.40626975097965
        # CTTACAPVNC
        cmd = f'mafft --auto --amino {sequences_file_path} > {output_file_path}'
        logger.info(f'{datetime.datetime.now()}: Starting MAFFT. Executed command is:\n{cmd}')
        subprocess.run(cmd, shell=True)
    else:
        logger.info(f'{datetime.datetime.now()}: skipping alignment for a cluster with a single member. '
                    f'Writing the output file as is to\n{output_file_path}')
        with open(sequences_file_path) as unaligned_f:
            content = unaligned_f.read()
        with open(output_file_path, 'w') as aligned_f:
            aligned_f.write(content)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_file_path)

    # override the results with clean ones (no redundant new lines; for further details see the function's doc)
    remove_redundant_newlines_from_fasta(output_file_path, output_file_path)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_file_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

def create_meme_file(msas_path, meme_path, done_path, minimal_number_of_columns_required, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: generating a new MEME file at {meme_path}')

    letters = sorted(set(letter.upper() for letter in nnk_table.values()))  # don't differentiate between Q and q...

    meme_f = open(meme_path, 'w')
    # write meme file header
    meme_f.write(f'MEME version 4\n\n'
                 f'ALPHABET= {"".join(letters)}\n\n'
                 f'Background letter frequencies\n'
                 f'{get_background_letters_frequency_str(nnk_table)}\n')

    for msa_name in sorted(os.listdir(msas_path)):  # sorting pssms in the meme file by the cluster's rank
        # e.g., clusterRank_000_uniqueMembers_72_clusterSize_757849.92.faa
        msa_path = os.path.join(msas_path, msa_name)
        logger.info(f'{datetime.datetime.now()}: writing pssm of {msa_path}')

        # make sure that there are results and the msa file is not empty
        verify_file_is_not_empty(msa_path)

        header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
        if msa_length < minimal_number_of_columns_required:
            logger.warning(f'{datetime.datetime.now()}: skipping pssm for {msa_path} with only {msa_length} columns '
                           f'(at least {minimal_number_of_columns_required} are required).')
            continue
        column_to_letters_frequency_counter = get_pssm(header_to_sequence, msa_length, letters)
        write_pssm(meme_f, letters, msa_name, column_to_letters_frequency_counter, msa_length, number_of_sequences)

    meme_f.close()

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

def cluster_sequences(fasta_file, output_prefix, done_file_path, threshold,
                      word_length, throw_sequences_shorter_than, argv='no_argv'):
    verify_file_is_not_empty(fasta_file)

    logger.info(f'{datetime.datetime.now()}: clustering sequences in {fasta_file}')
    # TODO: module load cd-hit
    cmd = f'cd-hit -i {fasta_file} ' \
          f'-o {output_prefix} ' \
          f'-c {threshold} ' \
          f'-n {word_length} ' \
          f'-l {throw_sequences_shorter_than}'
    logger.info(f'Starting CD-HIT. Executed command is:\n{cmd}')
    subprocess.call(cmd, shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(f'{output_prefix}.clstr')

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

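# Note: per the CD-HIT documentation, the word length (-n) must be chosen to match the
# identity threshold (-c), e.g. -n 5 for protein thresholds in the 0.7-1.0 range, and
# -l discards short sequences. A hypothetical invocation built by this function:
#
#   cd-hit -i peptides.faa -o peptides_clustered -c 0.9 -n 5 -l 6
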
def remove_configurations(in_fasta_file, out_fasta_file, allowed_configurations, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: removing all configurations that are not one of these:\n'
                f'{allowed_configurations}\n'
                f'from {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            for conf in allowed_configurations:
                if f'lib_{conf}_' in header or f'Type_{conf}' in header:
                    f.write(f'>{header}\n{header_to_sequence[header].upper()}\n')
                    break

    verify_file_is_not_empty(out_fasta_file)

def remove_sparse_columns(msa_path, out_path, done_path, maximal_gap_frequency_allowed_per_column, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: Removing sparse columns from {msa_path} '
                f'(allowing only columns with gap frequency of at most {maximal_gap_frequency_allowed_per_column})')

    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    cleaned_header_to_sequence = dict.fromkeys(header_to_sequence, '')
    for j in range(msa_length):
        column_j = [header_to_sequence[header][j] for header in header_to_sequence]
        gap_frequency = column_j.count('-') / number_of_sequences
        if gap_frequency <= maximal_gap_frequency_allowed_per_column:
            # not a sparse column; add the j'th column to each cleaned sequence
            for header in header_to_sequence:
                cleaned_header_to_sequence[header] += header_to_sequence[header][j]
        else:
            logger.debug(f'{datetime.datetime.now()}: Removing column #{j}: {column_j}')

    with open(out_path, 'w') as f:
        for header in cleaned_header_to_sequence:
            f.write(f'>{header}\n{cleaned_header_to_sequence[header]}\n')
    logger.info(f'{datetime.datetime.now()}: Shortened from {msa_length} to {len(cleaned_header_to_sequence[header])} columns')

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

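# Worked example with maximal_gap_frequency_allowed_per_column=0.5 (sequences are
# hypothetical): column #1 below has gaps in 2 of the 3 sequences (frequency ~0.67 > 0.5)
# and is therefore removed:
#
#   >s1  A-CD        >s1  ACD
#   >s2  A-CD   =>   >s2  ACD
#   >s3  AGCE        >s3  ACE
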
def unite_clusters(motif_inference_output_path, meme_file, biological_condition, sample_names,
                   max_number_of_members_per_cluster, output_path, done_path, aln_cutoff, pcc_cutoff,
                   unite_pssm_script_path='/groups/pupko/orenavr2/gershoni/src/UnitePSSMs/UnitePSSMs',
                   argv='no_argv'):
    clusters_to_combine_path = os.path.join(output_path, 'cluster_to_combine.csv')
    if not os.path.exists(clusters_to_combine_path):
        # TODO: any modules to load?
        cmd = f'{unite_pssm_script_path} -pssm {meme_file} -out {clusters_to_combine_path} ' \
              f'-aln_cutoff {aln_cutoff} -pcc_cutoff {pcc_cutoff}'
        logger.info(f'{datetime.datetime.now()}: starting UnitePSSMs. Executed command is:\n{cmd}')
        subprocess.run(cmd, shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(clusters_to_combine_path)
    logger.info(f'Result file is at {clusters_to_combine_path}')

    clusters_to_combine = []
    with open(clusters_to_combine_path) as f:
        for line in f:
            cluster_names = line.rstrip().split(',')
            # remove the consensus-sequence prefix so we are left with the exact cluster (file) name
            clusters_without_prefix = [cluster[cluster.index('_') + 1:] for cluster in cluster_names]
            clusters_to_combine.append(clusters_without_prefix)

    logger.info('Sorting clusters by rank...')
    # sort the sublists such that the first one contains the highest copy number, etc.
    clusters_to_combine.sort(key=lambda clusters: sum(get_cluster_size_from_name(cluster) for cluster in clusters),
                             reverse=True)

    sorted_clusters_to_combine_path = clusters_to_combine_path.replace('cluster_to_combine',
                                                                       'sorted_cluster_to_combine')
    with open(sorted_clusters_to_combine_path, 'w') as f:
        for cluster_names in clusters_to_combine:
            f.write(','.join(cluster_names) + '\n')

    unaligned_sequences_path = os.path.join(output_path, 'unaligned_sequences')
    os.makedirs(unaligned_sequences_path, exist_ok=True)
    for cluster_rank in range(len(clusters_to_combine)):
        if cluster_rank % 25 == 0:
            logger.info(f'Merging sequences of the cluster ranked {cluster_rank}')
        clusters_sequences, cluster_file_name = get_clusters_sequences(motif_inference_output_path,
                                                                       biological_condition, sample_names,
                                                                       clusters_to_combine[cluster_rank],
                                                                       cluster_rank,
                                                                       max_number_of_members_per_cluster)
        with open(os.path.join(unaligned_sequences_path, cluster_file_name), 'w') as f:
            f.write(clusters_sequences)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

def split_meme_and_cutoff_files(meme_file_path, cutoffs_file_path, motifs_per_file, done_path, argv='no_argv'):
    verify_file_is_not_empty(meme_file_path)
    verify_file_is_not_empty(cutoffs_file_path)

    splitted_meme_dir = os.path.join(os.path.split(meme_file_path)[0], 'memes')
    os.makedirs(splitted_meme_dir, exist_ok=True)
    splitted_cutoffs_dir = os.path.join(os.path.split(cutoffs_file_path)[0], 'cutoffs')
    os.makedirs(splitted_cutoffs_dir, exist_ok=True)

    logger.info(f'{datetime.datetime.now()}: splitting pssms and cutoffs to:\n'
                f'{splitted_meme_dir}\n'
                f'{splitted_cutoffs_dir}')

    with open(meme_file_path) as meme_f:
        meta_info = ''
        data = ''
        motif_number = 0
        split_number = 0
        add_meta_info = True
        for line in meme_f:
            if add_meta_info:
                if 'MOTIF' not in line:
                    meta_info += line
                    continue
                else:
                    add_meta_info = False
            if line.startswith('MOTIF'):
                if motif_number == motifs_per_file:
                    with open(f'{splitted_meme_dir}/{str(split_number).zfill(2)}.txt', 'w') as f:
                        f.write(meta_info + data)
                    data = ''
                    motif_number = 0
                    split_number += 1
                motif_number += 1
            data += line
        # don't forget the last batch!!
        with open(f'{splitted_meme_dir}/{str(split_number).zfill(2)}.txt', 'w') as f:
            f.write(meta_info + data)

    with open(cutoffs_file_path) as cutoffs_f:
        data = ''
        motif_number = 0
        split_number = 0
        for line in cutoffs_f:
            if line.startswith('###'):
                if motif_number == motifs_per_file:
                    with open(f'{splitted_cutoffs_dir}/{str(split_number).zfill(2)}.txt', 'w') as f:
                        f.write(data)
                    data = ''
                    motif_number = 0
                    split_number += 1
                motif_number += 1
            data += line
        # don't forget the last batch!!
        with open(f'{splitted_cutoffs_dir}/{str(split_number).zfill(2)}.txt', 'w') as f:
            f.write(data)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')

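# Resulting layout, assuming motifs_per_file=100 and, say, 230 motifs in the input
# (file names follow the zfill(2) pattern above):
#
#   <dir of meme_file_path>/memes/00.txt     motifs   1-100, each file prefixed with the shared meta info
#   <dir of meme_file_path>/memes/01.txt     motifs 101-200
#   <dir of meme_file_path>/memes/02.txt     motifs 201-230
#
# and an analogous cutoffs/ directory, split on the '###' record separators.
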