Example #1
def calculate_pssm_thresholds(
    meme_path,
    cutoffs_path,
    faa_path,
    number_of_random_pssms,
    output_path,
    done_path,
    argv='no_argv',
    pssm_score_peptide='/groups/pupko/orenavr2/igomeProfilingPipeline/src/PSSM_score_Peptide/PSSM_score_Peptide'
):

    if not os.path.exists(output_path):
        # TODO: any modules to load?
        cmd = f'{pssm_score_peptide} -pssm {meme_path} -pssm_cutoffs {cutoffs_path} -seq {faa_path} ' \
              f'-out {output_path} -NrandPSSM {number_of_random_pssms} -CalcPSSM_Pval'
        logger.info(
            f'{datetime.datetime.now()}: starting CalcPSSM_Pval. Executed command is:\n{cmd}'
        )
        subprocess.run(cmd, shell=True)
    else:
        logger.info(
            f'{datetime.datetime.now()}: skipping scanning calculation as it already exists at:\n{output_path}'
        )

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
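
# A minimal usage sketch (added for illustration): every path below is a placeholder, and the
# module-level logger, verify_file_is_not_empty, and os/datetime/subprocess imports are assumed
# to exist as in the rest of the pipeline.
calculate_pssm_thresholds(
    meme_path='analysis/memes/00.txt',                   # one split of the MEME file
    cutoffs_path='analysis/cutoffs/00.txt',              # the matching cutoffs split
    faa_path='analysis/sample_peptides.faa',             # peptides to scan
    number_of_random_pssms=100,
    output_path='analysis/scanning/sample_00.txt',
    done_path='analysis/done/scan_sample_00.done',
    argv=['calculate_pssm_thresholds.py', '--example'],  # recorded in the done file
)
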
def remove_cysteine(fasta_file,
                    out_fasta_file,
                    done_file_path,
                    argv='no_argv'):
    """
    :param fasta_file: a fasta file with sequences
    :param out_fasta_file: a fasta file with the same sequences but flanking Cysteine is removed
    :return:
    """
    logger.info(
        f'{datetime.datetime.now()}: removing Cysteine loop from {fasta_file}')

    verify_file_is_not_empty(fasta_file)

    with open(fasta_file) as f_in, open(out_fasta_file, 'w') as f_out:
        # the FASTA is assumed to have a two-line layout: a header line followed by a single sequence line
        for header in f_in:
            seq = f_in.readline().rstrip()
            if seq.startswith('C') and seq.endswith('C'):
                seq = seq[1:-1]  # remove the Cys loop
            f_out.write(f'{header}{seq}\n')

    verify_file_is_not_empty(out_fasta_file)

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
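
# A tiny self-contained illustration of the trimming rule applied above, using made-up toy
# sequences (only sequences that both start and end with 'C' lose their flanking residues):
for seq in ('CAAPGNC', 'AAPGN', 'CTTACAPVNC'):
    trimmed = seq[1:-1] if seq.startswith('C') and seq.endswith('C') else seq
    print(seq, '->', trimmed)
# CAAPGNC -> AAPGN
# AAPGN -> AAPGN
# CTTACAPVNC -> TTACAPVN
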
Example #3
def aggregate_pvalues_results(meme_path, scanning_results_dir_path, bc, samplename2biologicalcondition_path,
                              aggregated_pvalues_path, aggregated_hits_path, done_path, argv='no_argv'):

    samplename2biologicalcondition = load_table_to_dict(samplename2biologicalcondition_path,
                                                'Barcode {} belongs to more than one sample_name!!')

    all_consensuses = get_consensus_sequences_from_meme(meme_path)
    pvalues_f = open(aggregated_pvalues_path, 'w')
    hits_f = open(aggregated_hits_path, 'w')

    #header
    pvalues_result = hits_result = f'sample_name,label,{",".join(all_consensuses)}'
    for file_name in sorted(os.listdir(scanning_results_dir_path)):
        if file_name.endswith('100.txt'):
            raise TypeError  # why?
        if file_name.startswith('.'):
            # system file...
            continue

        if file_name.endswith('00.txt'):
            # next sample is starting
            pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
            hits_f.write(f'{hits_result.rstrip(",")}\n')
            sample_name = file_name.split('_peptides')[0]
            if bc in sample_name:
                label = samplename2biologicalcondition[sample_name]
            else:
                label = 'other'
            pvalues_result = hits_result = f'{sample_name},{label},'

        pvalues, hits = get_results(os.path.join(scanning_results_dir_path, file_name))
        pvalues_result += ','.join(pvalues) + ','
        hits_result += ','.join(hits) + ','

    pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
    hits_f.write(f'{hits_result.rstrip(",")}\n')

    pvalues_f.close()
    hits_f.close()

    # remove insignificant features:
    df = pd.read_csv(aggregated_pvalues_path)
    # features with at least one significant score, across positive-labeled samples
    positive_class_df = df[df['label'] != 'other']
    significant_features = (positive_class_df.drop(['sample_name', 'label'], axis=1) < 0.05).sum() > 0
    mask = pd.concat([pd.Series([True, True], index=['sample_name', 'label']),
                      significant_features])
    df = df.loc[:, mask]
    # df = pd.concat([df.loc[:, ['sample_name', 'label']], df.drop(['sample_name', 'label'], axis=1).loc[:, significant_features]], axis=1)
    df.to_csv(aggregated_pvalues_path.replace('_insignificant', ''), index=False)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(aggregated_pvalues_path)
    verify_file_is_not_empty(aggregated_hits_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
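
# The column-filtering step above keeps sample_name, label, and every motif column with at least
# one p-value below 0.05 among the positively-labeled samples. A small self-contained sketch of
# that masking pattern on toy data (column names are made up):
import pandas as pd

toy_df = pd.DataFrame({
    'sample_name': ['s1', 's2', 's3'],
    'label': ['17b', '17b', 'other'],
    'motif_A': [0.01, 0.20, 0.90],   # significant at least once for the positive class -> kept
    'motif_B': [0.30, 0.40, 0.02],   # "significant" only for the 'other' sample -> dropped
})
positive_class_df = toy_df[toy_df['label'] != 'other']
significant_features = (positive_class_df.drop(['sample_name', 'label'], axis=1) < 0.05).sum() > 0
mask = pd.concat([pd.Series([True, True], index=['sample_name', 'label']), significant_features])
print(toy_df.loc[:, mask])  # motif_B is gone, everything else is kept
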
Example #4
def aggregate_scores(scores_path, bc):

    # scores_path is a folder in which each file contains the scores of one of the scan splits, e.g.:
    # /groups/pupko/orenavr2/igomeProfilingPipeline/experiments/test/analysis/model_fitting/17b/hits_scores
    output_path = f'{os.path.split(scores_path)[0]}/hits.txt'

    call(f'cat {scores_path}/*{bc}_motifs_*.txt > {output_path}', shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_path)
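
# An equivalent concatenation without going through the shell (a sketch, not part of the original
# pipeline), assuming the same '*{bc}_motifs_*.txt' naming convention and a module-level import of
# os as above; sorted() mirrors the alphabetical ordering a shell glob produces.
import glob

def aggregate_scores_pure_python(scores_path, bc):
    output_path = f'{os.path.split(scores_path)[0]}/hits.txt'
    with open(output_path, 'w') as out_f:
        for path in sorted(glob.glob(f'{scores_path}/*{bc}_motifs_*.txt')):
            with open(path) as in_f:
                out_f.write(in_f.read())
    return output_path
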
Example #5
def aggregate_pvalues_results(meme_path,
                              scanning_results_dir_path,
                              bc,
                              samplename2biologicalcondition_path,
                              aggregated_pvalues_path,
                              aggregated_hits_path,
                              done_path,
                              argv='no_argv'):

    samplename2biologicalcondition = load_table_to_dict(
        samplename2biologicalcondition_path,
        'Barcode {} belongs to more than one sample_name!!')

    all_consensuses = get_consensus_sequences_from_meme(meme_path)
    pvalues_f = open(aggregated_pvalues_path, 'w')
    hits_f = open(aggregated_hits_path, 'w')

    #header
    pvalues_result = hits_result = f'sample_name,label,{",".join(all_consensuses)}'
    for file_name in sorted(os.listdir(scanning_results_dir_path)):
        if file_name.endswith('100.txt'):
            raise TypeError  # why?

        if file_name.endswith('00.txt'):
            # next sample is starting
            pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
            hits_f.write(f'{hits_result.rstrip(",")}\n')
            sample_name = file_name.split('_peptides')[0]
            if bc in sample_name:
                label = samplename2biologicalcondition[sample_name]
            else:
                label = 'other'
            pvalues_result = hits_result = f'{sample_name},{label},'

        pvalues, hits = get_results(
            os.path.join(scanning_results_dir_path, file_name))
        pvalues_result += ','.join(pvalues) + ','
        hits_result += ','.join(hits) + ','

    pvalues_f.write(f'{pvalues_result.rstrip(",")}\n')
    hits_f.write(f'{hits_result.rstrip(",")}\n')

    pvalues_f.close()
    hits_f.close()

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(aggregated_pvalues_path)
    verify_file_is_not_empty(aggregated_hits_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
Example #6
def extract_clusters_sequences(fasta_file, clstr_file, output_dir, done_path,
                               max_number_of_members_per_cluster, cluster_prefix_length_in_clstr_file,
                               file_prefix, argv='no_argv'):

    verify_file_is_not_empty(fasta_file)
    verify_file_is_not_empty(clstr_file)

    os.makedirs(output_dir, exist_ok=True)

    member_prefix_to_record = load_member_prefix_to_record_dict(fasta_file, cluster_prefix_length_in_clstr_file)

    cluster_to_members_records = load_clusters_to_members_dict(clstr_file, member_prefix_to_record, cluster_prefix_length_in_clstr_file)

    logger.info(f'{datetime.datetime.now()}: Writing clusters sequences...')

    trimmed_clusters = set()
    # sort the records of each cluster by their size and keep only the first $max_number_of_members_per_cluster records
    for cluster in cluster_to_members_records:
        # sort cluster members by their "strength", i.e., counts
        cluster_to_members_records[cluster].sort(key=extract_sequence_counts_from_record, reverse=True)
        if len(cluster_to_members_records[cluster])>max_number_of_members_per_cluster:
            # discard (in-place) all sequences above the maximum required number
            cluster_to_members_records[cluster][max_number_of_members_per_cluster:] = []
            trimmed_clusters.add(cluster)

    max_number_of_leading_zeros = len(str(len(cluster_to_members_records)))

    sorted_clusters_by_size = sorted(cluster_to_members_records, reverse=True,
                                     key=lambda cluster: extract_cluster_size_from_records(cluster_to_members_records[cluster]))

    if file_prefix != '':
        file_prefix += '_'

    for i, cluster in enumerate(sorted_clusters_by_size):

        cluster_rank = str(i).zfill(4) #max_number_of_leading_zeros)
        number_of_unique_members = min(len(cluster_to_members_records[cluster]), max_number_of_members_per_cluster)
        cluster_counts = extract_cluster_size_from_records(cluster_to_members_records[cluster])

        filename = f'{file_prefix}clusterRank_' \
                   f'{cluster_rank}_uniqueMembers_' \
                   f'{"top" if cluster in trimmed_clusters else ""}' \
                   f'{number_of_unique_members}_' \
                   f'clusterSize_{cluster_counts:.2f}.faa'  # keep only 2 digits after the decimal point

        with open(os.path.join(output_dir, filename), 'w') as f:
            f.write(''.join(record for i, record in enumerate(cluster_to_members_records[cluster])))# if i<100))

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
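
# The sorting above relies on extract_sequence_counts_from_record, which is not shown here.
# Based on headers like '>seq_235_lib_12_len_12_counts_126.40626975097965' (seen in a comment in
# a later example), a plausible sketch of such a helper could be the following. This is an
# assumption for illustration, not the pipeline's actual implementation:
def extract_sequence_counts_from_record_sketch(record):
    # hypothetical helper: parse the trailing counts_<float> field of the record's header line
    header_line = record.splitlines()[0]
    return float(header_line.rsplit('counts_', 1)[-1])

assert extract_sequence_counts_from_record_sketch(
    '>seq_235_lib_12_len_12_counts_126.40626975097965\nCNTDVACAAPGN\n') == 126.40626975097965
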
def convert_sequences_to_upper(in_fasta_file, out_fasta_file, done_file_path, argv='no_argv'):

    logger.info(f'{datetime.datetime.now()}: upper casing all sequences in {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            f.write(f'>{header}\n{header_to_sequence[header].upper()}\n')

    verify_file_is_not_empty(out_fasta_file)

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
Example #8
def add_pssm_to_meme_file(msa_path, meme_path, add_header):
    if add_header:
        logger.info(f'Generating a new MEME file at {meme_path}')

    logger.info(f'Calculating PSSM of {msa_path}')
    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(msa_path)
    letters = sorted(set(letter.upper() for letter in nnk_table.values()))  # don't differentiate between Q and q...
    column_to_letters_frequency_counter = get_pssm(header_to_sequence, msa_length, letters)

    consensus_sequence = ''.join(max(column_to_letters_frequency_counter[column], key=column_to_letters_frequency_counter[column].get)
                                 for column in column_to_letters_frequency_counter)

    mode = 'a'  # append to an existing file
    meta_info = ''
    if add_header:
        # override previous file!!
        mode = 'w'
        meta_info = f'MEME version 4\n\n' \
                    f'ALPHABET= {"".join(letters)}\n\n' \
                    f'Background letter frequencies\n' \
                    f'{get_background_letters_frequency_str(nnk_table)}\n'
    else:
        # the file already exists and contains at least one PSSM
        # just add some new lines before the next PSSM
        meta_info += '\n\n'
        assert os.path.exists(meme_path), \
            f"add_header parameter wasn't set but as if meme_path exists but it does not!\n{meme_path}\n"

    msa_name = os.path.split(os.path.splitext(msa_path)[0])[1]
    meta_info += f'MOTIF {consensus_sequence}_{msa_name}\n'
    meta_info += f'letter-probability matrix: ' \
                 f'alength= {len(letters)} ' \
                 f'w= {msa_length} ' \
                 f'nsites= {number_of_sequences}\n'

    with open(meme_path, mode) as f:
        f.write(meta_info)
        for column in column_to_letters_frequency_counter:
            # gaps are not counted so the total number of actual participating sequences can
            # be lower than $number_of_sequences
            number_of_participating_sequences = sum(column_to_letters_frequency_counter[column].values())
            column_distribution_str = ' '.join(f'{count/number_of_participating_sequences}'
                                               for count in column_to_letters_frequency_counter[column].values()) + '\n'
            f.write(column_distribution_str)
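
# nnk_table and get_background_letters_frequency_str are defined elsewhere in the module; from the
# way nnk_table.values() is used above, it appears to map NNK codons to amino-acid letters. Under
# that assumption, the background-frequency helper could be sketched roughly as follows
# (illustrative only, shown with a truncated toy table rather than the real nnk_table):
from collections import Counter

def get_background_letters_frequency_str_sketch(codon_to_letter):
    # frequency of each letter = (#codons encoding it) / (total #codons), as 'A 0.05 C 0.03 ...'
    counts = Counter(letter.upper() for letter in codon_to_letter.values())
    total = sum(counts.values())
    return ' '.join(f'{letter} {counts[letter] / total:.5f}' for letter in sorted(counts))

print(get_background_letters_frequency_str_sketch({'GCG': 'A', 'GCT': 'A', 'TGT': 'C'}))
# A 0.66667 C 0.33333
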
Example #9
def reconstruct_msa(sequences_file_path,
                    output_file_path,
                    done_path,
                    argv='no_argv'):
    number_of_unique_members = get_unique_members_from(sequences_file_path)
    if number_of_unique_members > 1:
        import subprocess
        # TODO: module load mafft..
        # --auto Automatically selects an appropriate strategy from L-INS-i, FFT-NS-i and FFT-NS-2, according to data size.
        # --amino tells mafft that it's an amino acid msa. If you let it decide by itself, it might guess wrong on small data sets
        # as they might look like dna but they are NOT! e.g.,
        # [orenavr2@powerlogin-be2 test]$ cat /groups/pupko/orenavr2/igomeProfilingPipeline/experiments/test/analysis/motif_inference/17b_03/unaligned_sequences/17b_03_clusterRank_215_uniqueMembers_2_clusterSize_252.81.faa
        # >seq_235_lib_12_len_12_counts_126.40626975097965
        # CNTDVACAAPGN
        # >seq_1112_lib_C8C_len_10_counts_126.40626975097965
        # CTTACAPVNC
        cmd = f'mafft --auto --amino {sequences_file_path} > {output_file_path}'
        logger.info(
            f'{datetime.datetime.now()}: Starting MAFFT. Executed command is:\n{cmd}'
        )
        subprocess.run(cmd, shell=True)
    else:
        logger.info(
            f'{datetime.datetime.now()}: skipping alignment for a cluster with a single member. '
            f'Writing the output file as is to\n'
            f'{output_file_path}')
        with open(sequences_file_path) as unaligned_f:
            content = unaligned_f.read()
        with open(output_file_path, 'w') as aligned_f:
            aligned_f.write(content)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_file_path)

    # override the results with clean ones (no redundant new lines. For further details see function's doc)
    remove_redundant_newlines_from_fasta(output_file_path, output_file_path)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(output_file_path)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
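
# remove_redundant_newlines_from_fasta is not shown; judging by its name and by the strict
# two-line-per-record layout assumed elsewhere in the pipeline, it presumably collapses MAFFT's
# wrapped sequence lines into a single line per record. A rough sketch under that assumption
# (not the actual implementation):
def remove_redundant_newlines_from_fasta_sketch(in_path, out_path):
    records, header, seq_parts = [], None, []
    with open(in_path) as f:
        for line in f:
            line = line.rstrip('\n')
            if line.startswith('>'):
                if header is not None:
                    records.append((header, ''.join(seq_parts)))
                header, seq_parts = line, []
            elif line:
                seq_parts.append(line)
    if header is not None:
        records.append((header, ''.join(seq_parts)))
    # writing starts only after the input was fully read, so in_path and out_path may be the same file
    with open(out_path, 'w') as f:
        for record_header, seq in records:
            f.write(f'{record_header}\n{seq}\n')
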
Example #10
def create_meme_file(msas_path,
                     meme_path,
                     done_path,
                     minimal_number_of_columns_required,
                     argv='no_argv'):

    logger.info(
        f'{datetime.datetime.now()}: generating a new MEME file at {meme_path}'
    )
    letters = sorted(
        set(letter.upper() for letter in
            nnk_table.values()))  # don't differentiate between Q and q...

    meme_f = open(meme_path, 'w')
    # write meme file header
    meme_f.write(f'MEME version 4\n\n'
                 f'ALPHABET= {"".join(letters)}\n\n'
                 f'Background letter frequencies\n'
                 f'{get_background_letters_frequency_str(nnk_table)}\n')

    for msa_name in sorted(os.listdir(
            msas_path)):  # Sorting pssm in meme files by cluster's rank
        # clusterRank_000_uniqueMembers_72_clusterSize_757849.92.faa
        msa_path = os.path.join(msas_path, msa_name)
        logger.info(f'{datetime.datetime.now()}: writing pssm of {msa_path}')
        # make sure that there are results and the msa file is not empty
        verify_file_is_not_empty(msa_path)
        header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(
            msa_path)
        if msa_length < minimal_number_of_columns_required:
            logger.warning(
                f'{datetime.datetime.now()}: skipping pssm for {msa_path} with only {msa_length} columns '
                f'(at least {minimal_number_of_columns_required} are required).')
            continue
        column_to_letters_frequency_counter = get_pssm(header_to_sequence,
                                                       msa_length, letters)
        write_pssm(meme_f, letters, msa_name,
                   column_to_letters_frequency_counter, msa_length,
                   number_of_sequences)

    meme_f.close()

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
Example #11
def cluster_sequences(fasta_file, output_prefix, done_file_path, threshold, word_length,
                      throw_sequences_shorter_than, argv='no_argv'):

    verify_file_is_not_empty(fasta_file)

    logger.info(f'{datetime.datetime.now()}: clustering sequences in {fasta_file}')

    # TODO: module load CD hit
    cmd = f'cd-hit -i {fasta_file} ' \
          f'-o {output_prefix} ' \
          f'-c {threshold} ' \
          f'-n {word_length} ' \
          f'-l {throw_sequences_shorter_than}'
    logger.info(f'Starting CD-hit. Executed command is:\n{cmd}')
    subprocess.call(cmd, shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(f'{output_prefix}.clstr')

    with open(done_file_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
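
# For CD-HIT, the word length passed with -n has to match the identity threshold (per the CD-HIT
# user guide, e.g. -n 5 for thresholds in the 0.7-1.0 range). A hypothetical invocation with
# placeholder paths (CD-HIT itself must be available on the PATH, see the TODO above):
cluster_sequences(
    fasta_file='analysis/17b_03_peptides.faa',
    output_prefix='analysis/17b_03_clustered',        # cd-hit writes <prefix> and <prefix>.clstr
    done_file_path='analysis/done/cluster_17b_03.done',
    threshold=0.7,                                     # -c: sequence identity cutoff
    word_length=5,                                     # -n: word size matching -c 0.7
    throw_sequences_shorter_than=6,                    # -l: discard shorter sequences
    argv=['cluster_sequences.py', '--example'],
)
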
def remove_configurations(in_fasta_file,
                          out_fasta_file,
                          allowed_configurations,
                          argv='no_argv'):

    logger.info(
        f'{datetime.datetime.now()}: removing all configurations that are not one of these:\n'
        f'{allowed_configurations}\n'
        f'From {in_fasta_file}')

    verify_file_is_not_empty(in_fasta_file)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(
        in_fasta_file)
    with open(out_fasta_file, 'w') as f:
        for header in header_to_sequence:
            for conf in allowed_configurations:
                if f'lib_{conf}_' in header or f'Type_{conf}' in header:
                    f.write(
                        f'>{header}\n{header_to_sequence[header].upper()}\n')
                    break

    verify_file_is_not_empty(out_fasta_file)
Example #13
def remove_sparse_columns(msa_path,
                          out_path,
                          done_path,
                          maximal_gap_frequency_allowed_per_column,
                          argv='no_argv'):
    logger.info(
        f'{datetime.datetime.now()}: Removing sparse columns from {msa_path} (keeping columns whose gap frequency is at most {maximal_gap_frequency_allowed_per_column})'
    )
    verify_file_is_not_empty(msa_path)

    header_to_sequence, number_of_sequences, msa_length = load_fasta_to_dict(
        msa_path)
    cleaned_header_to_sequence = dict.fromkeys(header_to_sequence, '')
    for j in range(msa_length):
        column_j = [
            header_to_sequence[header][j] for header in header_to_sequence
        ]
        gap_frequency = column_j.count('-') / number_of_sequences
        if gap_frequency <= maximal_gap_frequency_allowed_per_column:
            # not a sparse column
            for header in header_to_sequence:  # add j'th column
                cleaned_header_to_sequence[header] += header_to_sequence[
                    header][j]
        else:
            logger.debug(
                f'{datetime.datetime.now()}: Removing column #{j}: {column_j}')

    with open(out_path, 'w') as f:
        for header in cleaned_header_to_sequence:
            f.write(f'>{header}\n{cleaned_header_to_sequence[header]}\n')

    logger.info(
        f'{datetime.datetime.now()}: Shortened from {msa_length} to {len(cleaned_header_to_sequence[header])} columns'
    )

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
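
# A tiny worked example of the column filter above: with three sequences and a 0.5 gap-frequency
# cap, the column that is gapped in two of the three sequences (frequency ~0.67) is dropped and
# the other columns are kept (toy data, not a real MSA):
toy_msa = {'seq1': 'AC-G', 'seq2': 'AC-G', 'seq3': 'ACTG'}
max_gap_freq = 0.5
kept_columns = [j for j in range(4)
                if [toy_msa[h][j] for h in toy_msa].count('-') / len(toy_msa) <= max_gap_freq]
print(kept_columns)  # [0, 1, 3]
print({h: ''.join(toy_msa[h][j] for j in kept_columns) for h in toy_msa})
# {'seq1': 'ACG', 'seq2': 'ACG', 'seq3': 'ACG'}
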
def unite_clusters(
        motif_inference_output_path,
        meme_file,
        biological_condition,
        sample_names,
        max_number_of_members_per_cluster,
        output_path,
        done_path,
        aln_cutoff,
        pcc_cutoff,
        unite_pssm_script_path='/groups/pupko/orenavr2/gershoni/src/UnitePSSMs/UnitePSSMs',
        argv='no_argv'):

    clusters_to_combine_path = os.path.join(output_path,
                                            'cluster_to_combine.csv')
    if not os.path.exists(clusters_to_combine_path):
        # TODO: any modules to load?
        cmd = f'{unite_pssm_script_path} -pssm {meme_file} -out {clusters_to_combine_path} ' \
              f'-aln_cutoff {aln_cutoff} -pcc_cutoff {pcc_cutoff}'
        logger.info(
            f'{datetime.datetime.now()}: starting UnitePSSMs. Executed command is:\n{cmd}'
        )
        subprocess.run(cmd, shell=True)

    # make sure that there are results and the file is not empty
    verify_file_is_not_empty(clusters_to_combine_path)

    logger.info(f'Result file is at {clusters_to_combine_path}')
    clusters_to_combine = []
    with open(clusters_to_combine_path) as f:
        for line in f:
            cluster_names = line.rstrip().split(',')
            # remove consensus sequence so we have the exact cluster (file) name
            cluster_without_prefix = [
                cluster[cluster.index('_') + 1:] for cluster in cluster_names
            ]
            clusters_to_combine.append(cluster_without_prefix)

    logger.info(f'Sorting clusters by rank...')
    # sort the sublist such that the first one will contain the highest copy number, etc...
    clusters_to_combine.sort(key=lambda clusters: sum(
        get_cluster_size_from_name(cluster) for cluster in clusters),
                             reverse=True)
    sorted_clusters_to_combine_path = clusters_to_combine_path.replace(
        'cluster_to_combine', 'sorted_cluster_to_combine')
    with open(sorted_clusters_to_combine_path, 'w') as f:
        for cluster_names in clusters_to_combine:
            f.write(','.join(cluster_names) + '\n')

    unaligned_sequences_path = os.path.join(output_path, 'unaligned_sequences')
    os.makedirs(unaligned_sequences_path, exist_ok=True)

    for cluster_rank in range(len(clusters_to_combine)):
        if cluster_rank % 25 == 0:
            logger.info(
                f'Merging sequences of the cluster ranked {cluster_rank}')

        clusters_sequences, cluster_file_name = get_clusters_sequences(
            motif_inference_output_path, biological_condition, sample_names,
            clusters_to_combine[cluster_rank], cluster_rank,
            max_number_of_members_per_cluster)
        with open(os.path.join(unaligned_sequences_path, cluster_file_name),
                  'w') as f:
            f.write(clusters_sequences)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')
def split_meme_and_cutoff_files(meme_file_path,
                                cutoffs_file_path,
                                motifs_per_file,
                                done_path,
                                argv='no_argv'):

    verify_file_is_not_empty(meme_file_path)
    verify_file_is_not_empty(cutoffs_file_path)

    splitted_meme_dir = os.path.join(os.path.split(meme_file_path)[0], 'memes')
    os.makedirs(splitted_meme_dir, exist_ok=True)

    splitted_cutoffs_dir = os.path.join(
        os.path.split(cutoffs_file_path)[0], 'cutoffs')
    os.makedirs(splitted_cutoffs_dir, exist_ok=True)

    logger.info(
        f'{datetime.datetime.now()}: splitting pssms and cutoffs to:\n'
        f'{splitted_meme_dir}\n'
        f'{splitted_cutoffs_dir}')

    with open(meme_file_path) as meme_f:
        meta_info = ''
        data = ''
        motif_number = 0
        split_number = 0
        add_meta_info = True
        for line in meme_f:
            if add_meta_info:
                if "MOTIF" not in line:
                    meta_info += line
                    continue
                else:
                    add_meta_info = False
            if line.startswith("MOTIF"):
                if motif_number == motifs_per_file:
                    with open(
                            f'{splitted_meme_dir}/{str(split_number).zfill(2)}.txt',
                            'w') as f:
                        f.write(meta_info + data)
                    data = ''
                    motif_number = 0
                    split_number += 1
                motif_number += 1
            data += line
        # don't forget last batch!!
        with open(f'{splitted_meme_dir}/{str(split_number).zfill(2)}.txt',
                  'w') as f:
            f.write(meta_info + data)

    with open(cutoffs_file_path) as cutoffs_f:
        data = ''
        motif_number = 0
        split_number = 0
        for line in cutoffs_f:
            if line.startswith("###"):
                if motif_number == motifs_per_file:
                    with open(
                            f'{splitted_cutoffs_dir}/{str(split_number).zfill(2)}.txt',
                            'w') as f:
                        f.write(data)
                    data = ''
                    motif_number = 0
                    split_number += 1
                motif_number += 1
            data += line
        # don't forget last batch!!
        with open(f'{splitted_cutoffs_dir}/{str(split_number).zfill(2)}.txt',
                  'w') as f:
            f.write(data)

    with open(done_path, 'w') as f:
        f.write(' '.join(argv) + '\n')