def get_clusters_sequences(motif_inference_output_path, biological_condition,
                           sample_names, cluster_names, cluster_rank,
                           max_number_of_sequences_to_use):
    """Merge several cluster files into one FASTA string and name the result.

    Every name in ``cluster_names`` is searched for in the
    ``unaligned_sequences`` folder of each sample, in the order of
    ``sample_names``, and the first match is loaded. A sequence that shows up
    in more than one cluster file is kept once, with a header whose last
    ``_``-separated token is replaced by the summed counts.

    Args:
        motif_inference_output_path: folder holding one sub folder per sample.
        biological_condition: prefix of the returned file name.
        sample_names: names of the sample sub folders to search in.
        cluster_names: file names of the clusters to unify.
        cluster_rank: rank embedded (zero padded to 4 digits) in the file name.
        max_number_of_sequences_to_use: maximal number of records written to
            the returned FASTA string.

    Returns:
        A ``(result, result_file_name)`` tuple. ``result`` is the FASTA text
        of the highest-count records; ``result_file_name`` encodes the
        condition, rank, number of members and total cluster size.

    Raises:
        ValueError: if a cluster file is not found in any sample folder.
    """
    sample_paths = [
        os.path.join(motif_inference_output_path, sample_name,
                     'unaligned_sequences')
        for sample_name in sample_names
    ]

    # Each folder is listed at most once (and only when it is actually
    # reached) instead of once per cluster.
    sample_path_to_file_names = {}

    unified_dict_sequence2header = {}  # unified_cluster
    for cluster_file_name in cluster_names:
        for sample_folder in sample_paths:
            if sample_folder not in sample_path_to_file_names:
                sample_path_to_file_names[sample_folder] = set(
                    os.listdir(sample_folder))
            if cluster_file_name in sample_path_to_file_names[sample_folder]:
                cluster_file_path = os.path.join(sample_folder,
                                                 cluster_file_name)
                break
        else:
            raise ValueError(
                f'No cluster named {cluster_file_name} was found in the following dirs:\n'
                + '\n'.join(sample_paths))
        # presumably a sequence -> header mapping when reverse=True;
        # the other returned objects are not needed
        sequence2header = load_fasta_to_dict(cluster_file_path,
                                             reverse=True)[0]
        for sequence, header in sequence2header.items():
            if sequence in unified_dict_sequence2header:
                # pop + re-insert so a merged record moves to the end of the
                # insertion order, which decides ties in the ranking below
                unified_header = unified_dict_sequence2header.pop(sequence)
                total_counts = get_count_from(header) + get_count_from(
                    unified_header)
                header = unified_header[:unified_header.
                                        rindex('_')] + f'_{total_counts}'
            unified_dict_sequence2header[sequence] = header

    # TODO: should the counts be normalized by the number of samples involved? each sample contributes million reads
    # TODO: so 6 samples will contribute more than just 2 samples...
    # TODO: if so, we should divide HERE the total count (last token of the header) by the number of samples...

    # Rank the records straight from the sequence -> header mapping. Inverting
    # it into header -> sequence would silently drop sequences whenever two
    # different sequences end up with the same header.
    ranked_members = sorted(unified_dict_sequence2header.items(),
                            key=lambda member: get_count_from(member[1]),
                            reverse=True)

    records = []
    for i, (sequence, header) in enumerate(ranked_members):
        if i == max_number_of_sequences_to_use:
            break
        records.append(f'>{header}\n{sequence}\n')
    result = ''.join(records)

    number_of_unique_members = len(unified_dict_sequence2header)
    cluster_size = sum(
        get_count_from(header)
        for header in unified_dict_sequence2header.values())
    result_file_name = f'{biological_condition}_clusterRank_' \
                       f'{str(cluster_rank).zfill(4)}_uniqueMembers_' \
                       f'{"top" if number_of_unique_members >= max_number_of_sequences_to_use else ""}' \
                       f'{min(number_of_unique_members, max_number_of_sequences_to_use)}_' \
                       f'clusterSize_{cluster_size:.2f}.faa'
    return result, result_file_name
def convert_sequences_to_upper(in_fasta_file, out_fasta_file, done_file_path, argv='no_argv'):
    """Write an upper-cased copy of a FASTA file and then a done file.

    Args:
        in_fasta_file: path of the FASTA file to read; must not be empty.
        out_fasta_file: path the upper-cased records are written to.
        done_file_path: path of the file written once the output is verified.
        argv: the arguments to record in the done file, either a sequence of
            strings (joined by spaces) or a single string (written as is).
    """

    logger.info(f'{datetime.datetime.now()}: upper casing all sequences in {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    # only the header -> sequence mapping (first returned object) is needed
    header_to_sequence = load_fasta_to_dict(in_fasta_file)[0]
    with open(out_fasta_file, 'w') as f:
        for header, sequence in header_to_sequence.items():
            f.write(f'>{header}\n{sequence.upper()}\n')

    verify_file_is_not_empty(out_fasta_file)

    # A plain string (e.g. the 'no_argv' default) must not be joined:
    # ' '.join('no_argv') would produce 'n o _ a r g v'.
    done_message = argv if isinstance(argv, str) else ' '.join(argv)
    with open(done_file_path, 'w') as f:
        f.write(done_message + '\n')
Exemple #3
0
def add_pssm_to_meme_file(msa_path, meme_path, add_header):
    """Compute the PSSM of an MSA and write it as a motif into a MEME file.

    Args:
        msa_path: path of the MSA (FASTA) file; must not be empty.
        meme_path: path of the MEME file to write to.
        add_header: when truthy, the MEME file is (re)created and starts with
            the MEME header; otherwise the motif is appended to the existing
            file, which is asserted to exist.
    """
    if add_header:
        logger.info(f'Generating a new MEME file at {meme_path}')

    logger.info(f'Calculating PSSM of {msa_path}')
    # there is nothing to compute a PSSM from if the MSA file is empty
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    # upper casing so that e.g. Q and q are treated as the same letter
    letters = sorted({letter.upper() for letter in nnk_table.values()})
    pssm = get_pssm(header_to_sequence, msa_length, letters)

    # the consensus is built from the most frequent letter of every column
    consensus_letters = []
    for column in pssm:
        letter_to_count = pssm[column]
        consensus_letters.append(max(letter_to_count, key=letter_to_count.get))
    consensus_sequence = ''.join(consensus_letters)

    if add_header:
        # 'w' overrides a previous file (if any) and starts with the header
        mode = 'w'
        meta_info = (f'MEME version 4\n\n'
                     f'ALPHABET= {"".join(letters)}\n\n'
                     f'Background letter frequencies\n'
                     f'{get_background_letters_frequency_str(nnk_table)}\n')
    else:
        # appending: the file should already hold at least one PSSM, so only
        # a couple of new lines are needed before the next one
        assert os.path.exists(meme_path), \
            f"add_header parameter wasn't set but as if meme_path exists but it does not!\n{meme_path}\n"
        mode = 'a'
        meta_info = '\n\n'

    path_without_extension = os.path.splitext(msa_path)[0]
    msa_name = os.path.split(path_without_extension)[1]
    meta_info += f'MOTIF {consensus_sequence}_{msa_name}\n'
    meta_info += (f'letter-probability matrix: '
                  f'alength= {len(letters)} '
                  f'w= {msa_length} '
                  f'nsites= {number_of_sequences}\n')

    with open(meme_path, mode) as f:
        f.write(meta_info)
        for column in pssm:
            counts = pssm[column].values()
            # gaps are not counted, so the number of participating sequences
            # in a column can be lower than number_of_sequences
            number_of_participating_sequences = sum(counts)
            frequencies = [f'{count/number_of_participating_sequences}'
                           for count in counts]
            f.write(' '.join(frequencies) + '\n')
def create_meme_file(msas_path,
                     meme_path,
                     done_path,
                     minimal_number_of_columns_required,
                     argv='no_argv'):
    """Write one MEME file holding the PSSMs of all MSAs in a folder.

    Args:
        msas_path: folder whose files are all treated as MSA (FASTA) files.
        meme_path: path of the MEME file to create (overridden if it exists).
        done_path: path of the file written once the MEME file is complete.
        minimal_number_of_columns_required: MSAs with fewer columns are
            skipped (with a warning).
        argv: the arguments to record in the done file, either a sequence of
            strings (joined by spaces) or a single string (written as is).
    """

    logger.info(
        f'{datetime.datetime.now()}: generating a new MEME file at {meme_path}'
    )
    letters = sorted(
        {letter.upper() for letter in
         nnk_table.values()})  # don't differentiate between Q and q...

    # The context manager guarantees the MEME file is flushed and closed
    # (also on errors) before the done file announces that it is ready.
    with open(meme_path, 'w') as meme_f:
        # write meme file header
        meme_f.write(f'MEME version 4\n\n'
                     f'ALPHABET= {"".join(letters)}\n\n'
                     f'Background letter frequencies\n'
                     f'{get_background_letters_frequency_str(nnk_table)}\n')

        for msa_name in sorted(os.listdir(
                msas_path)):  # Sorting pssm in meme files by cluster's rank
            # clusterRank_000_uniqueMembers_72_clusterSize_757849.92.faa
            msa_path = os.path.join(msas_path, msa_name)
            logger.info(
                f'{datetime.datetime.now()}: writing pssm of {msa_path}')
            # make sure that there are results and the msa file is not empty
            verify_file_is_not_empty(msa_path)
            header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(
                msa_path)
            if msa_length < minimal_number_of_columns_required:
                logger.warning(
                    f'{datetime.datetime.now()}: skipping pssm for {msa_path} with only {msa_length} columns '
                    f'(at least {minimal_number_of_columns_required} is required.')
                continue
            column_to_letters_frequency_counter = get_pssm(
                header_to_sequence, msa_length, letters)
            write_pssm(meme_f, letters, msa_name,
                       column_to_letters_frequency_counter, msa_length,
                       number_of_sequences)

    # A plain string (e.g. the 'no_argv' default) must not be joined:
    # ' '.join('no_argv') would produce 'n o _ a r g v'.
    done_message = argv if isinstance(argv, str) else ' '.join(argv)
    with open(done_path, 'w') as f:
        f.write(done_message + '\n')
def remove_configurations(in_fasta_file,
                          out_fasta_file,
                          allowed_configurations,
                          argv='no_argv'):
    """Copy only the records of allowed configurations, upper-cased.

    A record is kept when its header contains ``lib_<conf>_`` or
    ``Type_<conf>`` for at least one ``conf`` in ``allowed_configurations``.

    Args:
        in_fasta_file: path of the FASTA file to filter; must not be empty.
        out_fasta_file: path the kept records are written to; verified to be
            non-empty at the end.
        allowed_configurations: the configurations whose records are kept.
        argv: not used by this function.
    """

    logger.info(
        f'{datetime.datetime.now()}: removing all configurations that are not one of these:\n'
        f'{allowed_configurations}\n'
        f'From {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, _, _ = load_fasta_to_dict(in_fasta_file)

    def is_allowed(header):
        # NOTE(review): 'Type_<conf>' has no trailing delimiter, so one
        # configuration may also match a longer one it is a prefix of -
        # confirm against the actual header format.
        return any(f'lib_{conf}_' in header or f'Type_{conf}' in header
                   for conf in allowed_configurations)

    with open(out_fasta_file, 'w') as f:
        for header, sequence in header_to_sequence.items():
            if is_allowed(header):
                f.write(f'>{header}\n{sequence.upper()}\n')

    verify_file_is_not_empty(out_fasta_file)
Exemple #6
0
def remove_sparse_columns(msa_path,
                          out_path,
                          done_path,
                          maximal_gap_frequency_allowed_per_column,
                          argv='no_argv'):
    """Write a copy of an MSA without its gap-rich columns, then a done file.

    A column is kept when the fraction of ``-`` characters in it is at most
    ``maximal_gap_frequency_allowed_per_column``.

    Args:
        msa_path: path of the MSA (FASTA) file to clean; must not be empty.
        out_path: path the cleaned MSA is written to.
        done_path: path of the file written once the cleaned MSA is saved.
        maximal_gap_frequency_allowed_per_column: highest gap frequency a
            column may have and still be kept.
        argv: the arguments to record in the done file, either a sequence of
            strings (joined by spaces) or a single string (written as is).
    """
    logger.info(
        f'{datetime.datetime.now()}: Removing sparse columns from {msa_path} (allowing columns with gap frequency lower than {maximal_gap_frequency_allowed_per_column})'
    )
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(
        msa_path)

    # First decide which columns survive, then build every cleaned sequence
    # with a single join instead of growing strings column by column.
    kept_columns = []
    for j in range(msa_length):
        column_j = [sequence[j] for sequence in header_to_sequence.values()]
        gap_frequency = column_j.count('-') / number_of_sequences
        if gap_frequency <= maximal_gap_frequency_allowed_per_column:
            # not a sparse column
            kept_columns.append(j)
        else:
            logger.debug(
                f'{datetime.datetime.now()}: Removing column #{j}: {column_j}')

    with open(out_path, 'w') as f:
        for header, sequence in header_to_sequence.items():
            cleaned_sequence = ''.join(sequence[j] for j in kept_columns)
            f.write(f'>{header}\n{cleaned_sequence}\n')

    # The new length is the number of kept columns; it does not rely on a
    # loop variable that is undefined when there are no records.
    logger.info(
        f'{datetime.datetime.now()}: Shortened from {msa_length} to {len(kept_columns)} columns'
    )

    # A plain string (e.g. the 'no_argv' default) must not be joined:
    # ' '.join('no_argv') would produce 'n o _ a r g v'.
    done_message = argv if isinstance(argv, str) else ' '.join(argv)
    with open(done_path, 'w') as f:
        f.write(done_message + '\n')