def get_clusters_sequences(motif_inference_output_path, biological_condition,
                           sample_names, cluster_names, cluster_rank,
                           max_number_of_sequences_to_use):
    sample_paths = []
    for sample_name in sample_names:
        sample_paths.append(os.path.join(motif_inference_output_path,
                                         sample_name, 'unaligned_sequences'))

    unified_dict_sequence2header = {}  # unified_cluster
    for cluster_file_name in cluster_names:
        for sample_folder in sample_paths:
            if cluster_file_name in os.listdir(sample_folder):
                cluster_file_path = os.path.join(sample_folder, cluster_file_name)
                break
        else:
            raise ValueError(f'No cluster named {cluster_file_name} was found '
                             f'in the following dirs:\n' + '\n'.join(sample_paths))

        # don't need the other returned objects
        sequence2header = load_fasta_to_dict(cluster_file_path, reverse=True)[0]
        for sequence, header in sequence2header.items():
            if sequence in unified_dict_sequence2header:
                unified_header = unified_dict_sequence2header.pop(sequence)
                total_counts = get_count_from(header) + get_count_from(unified_header)
                header = unified_header[:unified_header.rindex('_')] + f'_{total_counts}'
            unified_dict_sequence2header[sequence] = header

    # TODO: should the counts be normalized by the number of samples involved?
    # TODO: each sample contributes a million reads, so 6 samples will contribute
    # TODO: more than just 2 samples... if so, we should divide HERE the total
    # TODO: count (last token of the header) by the number of samples...

    unified_dict_header2sequence = {header: sequence for sequence, header
                                    in unified_dict_sequence2header.items()}

    result = ''
    for i, header in enumerate(sorted(unified_dict_header2sequence,
                                      key=get_count_from, reverse=True)):
        if i == max_number_of_sequences_to_use:
            break
        result += f'>{header}\n{unified_dict_header2sequence[header]}\n'

    number_of_unique_members = len(unified_dict_header2sequence)
    cluster_size = sum(get_count_from(header)
                       for header in unified_dict_header2sequence)
    result_file_name = f'{biological_condition}_clusterRank_' \
                       f'{str(cluster_rank).zfill(4)}_uniqueMembers_' \
                       f'{"top" if number_of_unique_members >= max_number_of_sequences_to_use else ""}' \
                       f'{min(number_of_unique_members, max_number_of_sequences_to_use)}_' \
                       f'clusterSize_{cluster_size:.2f}.faa'
    return result, result_file_name
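
# A minimal sketch (an assumption, not this repo's implementation) of how
# get_count_from is expected to behave, inferred from the header surgery above:
# the copy count is assumed to be the last '_'-separated token of a FASTA
# header. The real helper is defined elsewhere in the repo and may differ.
def _get_count_from_sketch(header):
    # e.g. 'seq_17_lib_C8_42.5' -> 42.5
    return float(header[header.rindex('_') + 1:])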
def convert_sequences_to_upper(in_fasta_file, out_fasta_file, done_file_path, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: upper casing all sequences in {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            f.write(f'>{header}\n{header_to_sequence[header].upper()}\n')

    verify_file_is_not_empty(out_fasta_file)

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
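
# A minimal sketch (an assumption) of the load_fasta_to_dict contract inferred
# from its call sites in this module: it returns
# (header_to_sequence, number_of_sequences, msa_length), and with reverse=True
# the first item maps sequence -> header instead. The repo's actual
# implementation may differ.
def _load_fasta_to_dict_sketch(fasta_path, reverse=False):
    header_to_sequence = {}
    header = None
    with open(fasta_path) as f:
        for line in f:
            line = line.rstrip()
            if line.startswith('>'):
                header = line[1:]
                header_to_sequence[header] = ''
            elif header is not None:
                header_to_sequence[header] += line
    number_of_sequences = len(header_to_sequence)
    msa_length = len(next(iter(header_to_sequence.values()), ''))
    if reverse:
        sequence_to_header = {seq: hdr for hdr, seq in header_to_sequence.items()}
        return sequence_to_header, number_of_sequences, msa_length
    return header_to_sequence, number_of_sequences, msa_length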
def add_pssm_to_meme_file(msa_path, meme_path, add_header):
    if add_header:
        logger.info(f'Generating a new MEME file at {meme_path}')
    logger.info(f'Calculating PSSM of {msa_path}')

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    letters = sorted(set(letter.upper() for letter in nnk_table.values()))  # don't differentiate between Q and q...
    column_to_letters_frequency_counter = get_pssm(header_to_sequence, msa_length, letters)
    consensus_sequence = ''.join(max(column_to_letters_frequency_counter[column],
                                     key=column_to_letters_frequency_counter[column].get)
                                 for column in column_to_letters_frequency_counter)

    mode = 'a'  # append to an existing file
    meta_info = ''
    if add_header:
        # override previous file!!
        mode = 'w'
        meta_info = f'MEME version 4\n\n' \
                    f'ALPHABET= {"".join(letters)}\n\n' \
                    f'Background letter frequencies\n' \
                    f'{get_background_letters_frequency_str(nnk_table)}\n'
    else:
        # the file already exists and contains at least one PSSM;
        # just add some new lines before the next PSSM
        meta_info += '\n\n'
        assert os.path.exists(meme_path), \
            f"add_header wasn't set, so meme_path should already exist, but it does not!\n{meme_path}\n"

    msa_name = os.path.split(os.path.splitext(msa_path)[0])[1]
    meta_info += f'MOTIF {consensus_sequence}_{msa_name}\n'
    meta_info += f'letter-probability matrix: ' \
                 f'alength= {len(letters)} ' \
                 f'w= {msa_length} ' \
                 f'nsites= {number_of_sequences}\n'

    with open(meme_path, mode) as f:
        f.write(meta_info)
        for column in column_to_letters_frequency_counter:
            # gaps are not counted, so the total number of actually participating
            # sequences can be lower than $number_of_sequences
            number_of_participating_sequences = sum(column_to_letters_frequency_counter[column].values())
            column_distribution_str = ' '.join(f'{count/number_of_participating_sequences}'
                                               for count in column_to_letters_frequency_counter[column].values()) + '\n'
            f.write(column_distribution_str)
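
from collections import Counter

# A minimal sketch (an assumption) of get_pssm as used above: per MSA column,
# count how many sequences carry each allowed letter. Gaps ('-') are skipped,
# which is why the caller recomputes the number of participating sequences per
# column. The repo's actual implementation may differ.
def _get_pssm_sketch(header_to_sequence, msa_length, letters):
    column_to_letters_frequency_counter = {}
    for j in range(msa_length):
        counter = Counter({letter: 0 for letter in letters})
        for sequence in header_to_sequence.values():
            letter = sequence[j].upper()
            if letter in counter:  # ignore gaps and unexpected characters
                counter[letter] += 1
        column_to_letters_frequency_counter[j] = counter
    return column_to_letters_frequency_counter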
def create_meme_file(msas_path, meme_path, done_path, minimal_number_of_columns_required, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: generating a new MEME file at {meme_path}')

    letters = sorted(set(letter.upper() for letter in nnk_table.values()))  # don't differentiate between Q and q...

    with open(meme_path, 'w') as meme_f:
        # write meme file header
        meme_f.write(f'MEME version 4\n\n'
                     f'ALPHABET= {"".join(letters)}\n\n'
                     f'Background letter frequencies\n'
                     f'{get_background_letters_frequency_str(nnk_table)}\n')

        # Sorting the pssms in the meme file by cluster's rank, e.g.:
        # clusterRank_000_uniqueMembers_72_clusterSize_757849.92.faa
        for msa_name in sorted(os.listdir(msas_path)):
            msa_path = os.path.join(msas_path, msa_name)
            logger.info(f'{datetime.datetime.now()}: writing pssm of {msa_path}')

            # make sure that there are results and the msa file is not empty
            verify_file_is_not_empty(msa_path)

            header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
            if msa_length < minimal_number_of_columns_required:
                logger.warning(f'{datetime.datetime.now()}: skipping pssm for {msa_path} with only '
                               f'{msa_length} columns (at least {minimal_number_of_columns_required} are required).')
                continue

            column_to_letters_frequency_counter = get_pssm(header_to_sequence, msa_length, letters)
            write_pssm(meme_f, letters, msa_name, column_to_letters_frequency_counter,
                       msa_length, number_of_sequences)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
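
# A minimal sketch (an assumption) of write_pssm, mirroring the inline motif
# block written in add_pssm_to_meme_file above; the repo's actual version may
# also prepend a consensus sequence to the motif name.
def _write_pssm_sketch(meme_f, letters, msa_name, column_to_letters_frequency_counter,
                       msa_length, number_of_sequences):
    meme_f.write(f'\nMOTIF {msa_name}\n')
    meme_f.write(f'letter-probability matrix: alength= {len(letters)} '
                 f'w= {msa_length} nsites= {number_of_sequences}\n')
    for column in column_to_letters_frequency_counter:
        # gaps are not counted, so the per-column total can be lower than number_of_sequences
        participating = sum(column_to_letters_frequency_counter[column].values())
        meme_f.write(' '.join(f'{count / participating}'
                              for count in column_to_letters_frequency_counter[column].values()) + '\n')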
def remove_configurations(in_fasta_file, out_fasta_file, allowed_configurations, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: removing all configurations that are not one of these:\n'
                f'{allowed_configurations}\n'
                f'From {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            for conf in allowed_configurations:
                if f'lib_{conf}_' in header or f'Type_{conf}' in header:
                    f.write(f'>{header}\n{header_to_sequence[header].upper()}\n')
                    break

    verify_file_is_not_empty(out_fasta_file)
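
# A minimal sketch (an assumption) of verify_file_is_not_empty as used
# throughout this module: fail fast when an input or output file is missing or
# empty. The repo's actual implementation may differ.
def _verify_file_is_not_empty_sketch(file_path):
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        raise ValueError(f'{file_path} is missing or empty!')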
def remove_sparse_columns(msa_path, out_path, done_path, maximal_gap_frequency_allowed_per_column, argv='no_argv'):
    logger.info(f'{datetime.datetime.now()}: Removing sparse columns from {msa_path} '
                f'(allowing columns with gap frequency of at most {maximal_gap_frequency_allowed_per_column})')

    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    cleaned_header_to_sequence = dict.fromkeys(header_to_sequence, '')
    for j in range(msa_length):
        column_j = [header_to_sequence[header][j] for header in header_to_sequence]
        gap_frequency = column_j.count('-') / number_of_sequences
        if gap_frequency <= maximal_gap_frequency_allowed_per_column:
            # not a sparse column; append the j'th column to each cleaned sequence
            for header in header_to_sequence:
                cleaned_header_to_sequence[header] += header_to_sequence[header][j]
        else:
            logger.debug(f'{datetime.datetime.now()}: Removing column #{j}: {column_j}')

    with open(out_path, 'w') as f:
        for header in cleaned_header_to_sequence:
            f.write(f'>{header}\n{cleaned_header_to_sequence[header]}\n')

    new_msa_length = len(next(iter(cleaned_header_to_sequence.values())))
    logger.info(f'{datetime.datetime.now()}: Shortened from {msa_length} to {new_msa_length} columns')

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
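
# A hedged usage sketch with hypothetical file names (for illustration only):
# keep only the columns in which at most 10% of the sequences have a gap.
if __name__ == '__main__':
    remove_sparse_columns('example_msa.faa', 'example_msa_dense.faa',
                          'remove_sparse_columns.done', 0.1,
                          argv=['remove_sparse_columns.py', 'example_msa.faa'])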