Beispiel #1
0
def run_pullseq(db_path, names_path, VERBOSE_FLAG) -> str:
    '''
    Run the external command `pullseq` from within the pHMM pipeline.
    The command's standard output is redirected into a fasta file located under the module-level out_path.
    :param db_path: path handed to pullseq's -i option (the sequences database to pull from).
    :param names_path: path handed to pullseq's -n option (the file containing the names of the genes that
    are to be pulled).
    :param VERBOSE_FLAG: value stored in the module-level VERBOSE flag; when truthy FUNC_START / FUNC_END are called.
    :return: a path (string) to the fasta file of all the pulled genes. file name format is:
    unknown_genes_{date}.faa.
    '''
    global VERBOSE
    VERBOSE = VERBOSE_FLAG

    if VERBOSE:
        FUNC_START()
    print("Running pullseq.\n"
          f"Pulling sequences from names path: {names_path}\n"
          f"Against db path: {db_path}\n"
          f"Output path is: {out_path}")

    output_filename = out_path + f'/unknown_genes_{date}.faa'

    cmd_line = ['pullseq', '-i', f'{db_path}', '-n', f'{names_path}']
    # Open the output inside a context manager so the handle is flushed and closed as soon as
    # pullseq finishes, instead of leaking it until garbage collection.
    # NOTE(review): stderr is merged into stdout, so anything pullseq writes to stderr ends up
    # inside the fasta file as well - kept as in the original, confirm this is intended.
    with open(output_filename, 'w') as output_file:
        completed = subprocess.run(cmd_line,
                                   stdout=output_file,
                                   stderr=subprocess.STDOUT)
    if completed.returncode != 0:
        # Do not raise (callers never had to handle an exception here), but make the failure visible.
        print(f"Warning: pullseq exited with status {completed.returncode}, "
              f"the output file may be incomplete: {output_filename}")
    print("Finshed running pullseq, unknown genes file:\n"
          f"{output_filename}")

    if VERBOSE:
        FUNC_END()
    return output_filename
Beispiel #2
0
def write_unknown_genes(unknown_genes_df: pd.DataFrame) -> str:
    '''
    Write the unknown genes dataframe to two csv files located under the module-level out_path:
    1. unknown_genes_for_pullseq_{date}.csv - only the 'unknown_gene' column, without header and index.
    2. unknown_genes_for_analysis_{date}.csv - the complete dataframe.
    :param unknown_genes_df: dataframe of the unknown genes, must contain an 'unknown_gene' column.
    :return:
    path (string) of the first file, the one that holds only the 'unknown_gene' column.
    :raises ValueError: if the dataframe is empty.
    '''

    if VERBOSE:
        FUNC_START()

    if unknown_genes_df.empty:
        raise ValueError("unknown_genes_df is empty, there are no unknown genes to write.")

    # Names-only file: a single column, no header and no index.
    out_path1 = out_path + f"/unknown_genes_for_pullseq_{date}.csv"
    unknown_genes_df.to_csv(path_or_buf=out_path1,
                            columns=['unknown_gene'],
                            header=False,
                            index=False)

    # Complete dataframe, written with the default to_csv settings.
    out_path2 = out_path + f"/unknown_genes_for_analysis_{date}.csv"
    unknown_genes_df.to_csv(out_path2)

    if VERBOSE:
        FUNC_END()
    return out_path1
Beispiel #3
0
def extract(structured_df: pd.DataFrame,
            VERBOSE_FLAG: bool = False) -> tuple:
    '''
    Build the dataframe of unknown genes from the cassettes dataframe and write it to csv files.
    The work is delegated to fetch_unknown_genes (builds the dataframe) and write_unknown_genes (writes the files).
    :param structured_df: the dataframe of the cassettes, it is passed as is to fetch_unknown_genes.
    :param VERBOSE_FLAG: value stored in the module-level VERBOSE flag; when truthy FUNC_START / FUNC_END are called.
    :return: a tuple (unknown_genes_df, out_path_for_pullseq) - the dataframe returned by fetch_unknown_genes and
    the csv path returned by write_unknown_genes.
    '''

    global VERBOSE
    VERBOSE = VERBOSE_FLAG

    if VERBOSE:
        FUNC_START()

    print("Extracing uknonwn genes from dataframe.\n"
          f"Output path is: {out_path}\n")

    unknown_genes_df = fetch_unknown_genes(structured_df)
    out_path_for_pullseq = write_unknown_genes(unknown_genes_df)

    print(
        "Finished extracting unknonwm genes.\n"
        f"Writing result to output path: {out_path}, returning a tuple: (unknown_genes_df, output_path of unknown_genes_csv)."
    )

    if VERBOSE:
        FUNC_END()
    return (unknown_genes_df, out_path_for_pullseq)
Beispiel #4
0
def run_mmseqs_easy_clust(coverage,
                          min_seq_id,
                          input_path,
                          results_prefix,
                          temporary_output_dump: str = 'tmp'):
    '''
    Python wrapper function for running cmd package mmseqs2:
    cmd is : $mmseqs easy-cluster -c {coverage} --min-seq-id {min_seq_id} {input_path} {results_prefix} {temporary_output_dump}
    Assumes the module mmseqs2 is already loaded to the terminal session.
    :param coverage: value for the -c flag.
    :param min_seq_id: value for the --min-seq-id option.
    :param input_path: path to the sequence database to be clustered.
    :param results_prefix: prefix handed to mmseqs for its output files.
    :param temporary_output_dump: directory handed to mmseqs for its temporary runtime files. It is deleted by this
    function once the mmseqs command returns.
    :return: 0. The exit status of mmseqs is not inspected.
    '''
    if VERBOSE:
        FUNC_START()
    cmd_line = [
        'mmseqs', 'easy-cluster', "-c", f"{coverage}", "--min-seq-id",
        f"{min_seq_id}", input_path, results_prefix, temporary_output_dump
    ]

    subprocess.run(cmd_line, stderr=subprocess.STDOUT)
    # Remove the directory that was actually handed to mmseqs. The previous hard-coded 'tmp'
    # left a custom directory behind and could delete an unrelated ./tmp.
    shutil.rmtree(temporary_output_dump)
    if VERBOSE:
        FUNC_END()
    return 0
def add_ResFam_members_counts(clusts_by_reps, all_clusters):
    '''
    Add to every cluster a summary of the values found in the 'members' column of its genes.
    For each row of clusts_by_reps, the rows of all_clusters with the same rep_id are selected, their comma separated
    'members' strings are split, and every distinct value is counted.
    :param clusts_by_reps: A dataframe with one row per cluster, must have the columns 'rep_id' and 'cluster_size'.
    :param all_clusters: A dataframe of all genes, must have the columns 'rep_id' and 'members'.
    :return: A new dataframe, a copy of clusts_by_reps with the extra column 'ResFam_neighbours'. Each value is a
    string of 'name; count; count / cluster_size' entries (ratio rounded to 3 digits), joined by ', ' and ordered
    from the highest count to the lowest.
    '''
    if VERBOSE:
        FUNC_START()

    def _summarise(cluster_row):
        # Genes that belong to the cluster represented by this row.
        cluster_genes = all_clusters[all_clusters.rep_id == cluster_row.rep_id]
        size = cluster_row.cluster_size

        # One indicator column per distinct member name; the column sums are the occurrence counts.
        indicators = cluster_genes.members.str.get_dummies(',')
        occurrences = indicators.apply(lambda column: column.sum(), axis=0).to_dict()

        ranked = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)
        return ', '.join(f'{name}; {hits}; {round(hits / size, 3)}' for name, hits in ranked)

    summaries = [_summarise(cluster_row) for _, cluster_row in clusts_by_reps.iterrows()]
    result = clusts_by_reps.assign(ResFam_neighbours=summaries)

    if VERBOSE:
        FUNC_END()

    return result
def add_ResFam_members_from_cassete(all_clusters: pd.DataFrame, csts_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Attach to every gene in all_clusters the 'members' and 'architecture' values of the cassette it comes from.
    The cassettes dataframe is reduced to the columns 'full_id', 'members' and 'architecture', its 'full_id' column is
    renamed to 'cassette_representing_gene' and the result is inner-merged into all_clusters on that column.
    :param all_clusters: A dataframe of all genes, must have a 'cassette_representing_gene' column.
    :param csts_df: A dataframe of the cassettes, must have the columns 'full_id', 'members' and 'architecture'.
    :return: A new dataframe, all_clusters with the additional columns 'members' and 'architecture'. Because the merge
    is an inner merge, genes whose cassette is not found in csts_df are dropped.
    '''
    if VERBOSE:
        FUNC_START()

    # Keep only what is needed from the cassettes table and align the key name with all_clusters.
    cassette_info = csts_df[['full_id', 'members', 'architecture']]
    cassette_info = cassette_info.rename(columns={'full_id': 'cassette_representing_gene'})

    merged = pd.merge(all_clusters, cassette_info, on='cassette_representing_gene', how='inner')

    if VERBOSE:
        FUNC_END()
    return merged
def add_annotation_frequencies_to_clusters(clusts_by_reps: pd.DataFrame,
                                           all_clusters: pd.DataFrame)-> pd.DataFrame:
    '''
    Add to the cluster representatives dataframe a column describing how the values of the 'description' column are
    distributed among the members of each cluster.
    For example, a cluster in which all the members are described as 'hypothetical protein' gets the value
    'hypothetical protein (1.0)'.
    The new column format is 'annotation_1 (frequency_1), annotation_2 (frequency_2), ...', frequencies are rounded to
    3 digits.
    :param clusts_by_reps: A dataframe with an entry for each cluster, must have a 'rep_id' column.
    :param all_clusters: A dataframe containing all the genes, must have the columns 'rep_id' and 'description'.
    :return: A new dataframe, a copy of clusts_by_reps with the extra column 'annotations_freqs'.
    '''
    if VERBOSE:
        FUNC_START()

    def _frequencies_of(rep_id):
        # Select the members of the cluster, then compute the relative frequency of each description.
        cluster_genes = all_clusters[all_clusters.rep_id == rep_id]
        frequencies = dict(cluster_genes.description.value_counts(normalize=True))
        return ', '.join(f'{anno} ({round(freq, 3)})' for anno, freq in frequencies.items())

    per_cluster = [_frequencies_of(cluster_row.rep_id) for _, cluster_row in clusts_by_reps.iterrows()]

    # Attach the frequency strings, one per cluster, as a new column.
    result = clusts_by_reps.assign(annotations_freqs=per_cluster)

    if VERBOSE:
        FUNC_END()
    return result
def add_cluster_members_info(all_clusters: pd.DataFrame, unknown_genes_faa_path:str)-> pd.DataFrame:
    '''
    Enrich the all_clusters dataframe with the data parsed from the unknown genes fasta file.
    The fasta file is parsed by fasta_to_df, the 'id' column of the parsed dataframe is renamed to 'member_id' and the
    result is merged into all_clusters on 'member_id'.
    :param all_clusters: a pandas dataframe containing all the genes. must have the columns 'member_id' (to merge by)
    and 'rep_id' (to sort by).
    :param unknown_genes_faa_path: path to the unknown genes fasta file, passed as is to fasta_to_df.
    :return: the merged dataframe, sorted by 'rep_id'. Each entry holds the columns of all_clusters together with the
    columns produced by fasta_to_df.
    '''
    if VERBOSE:
        FUNC_START()

    # Parse the fasta file into a dataframe and align its key column with the key of all_clusters.
    sequences_df = fasta_to_df(unknown_genes_faa_path).rename(columns={'id': 'member_id'})

    # After the merge every cluster member carries the data parsed from its fasta record.
    enriched = all_clusters.merge(sequences_df, on='member_id').sort_values('rep_id')

    if VERBOSE:
        FUNC_END()
    return enriched
Beispiel #9
0
def double_cluster(input_path, cov_rate1, cov_rate2, min_id1, min_id2, pref1,
                   pref2, VERBOSE_FLAG):
    '''
    Run run_mmseqs_easy_clust twice in a row.
    The first run clusters input_path with (cov_rate1, min_id1) and the results prefix pref1. The second run clusters
    the file {out_path}/{pref1}_rep_seq.fasta with (cov_rate2, min_id2) and the results prefix pref2.
    Both runs use 'tmp' as the temporary directory.
    :param input_path: path of the sequences file for the first run.
    :param cov_rate1: coverage value of the first run.
    :param cov_rate2: coverage value of the second run.
    :param min_id1: minimal sequence identity value of the first run.
    :param min_id2: minimal sequence identity value of the second run.
    :param pref1: results prefix of the first run.
    :param pref2: results prefix of the second run.
    :param VERBOSE_FLAG: value stored in the module-level VERBOSE flag; when truthy FUNC_START / FUNC_END are called.
    :return: 0
    '''
    global VERBOSE
    VERBOSE = VERBOSE_FLAG

    if VERBOSE:
        FUNC_START()

    scratch_dir = 'tmp'

    # First pass over the original input.
    run_mmseqs_easy_clust(coverage=cov_rate1,
                          min_seq_id=min_id1,
                          input_path=input_path,
                          results_prefix=pref1,
                          temporary_output_dump=scratch_dir)

    # Second pass over the representatives file named after the first prefix.
    first_pass_reps = out_path + f'/{pref1}_rep_seq.fasta'
    run_mmseqs_easy_clust(coverage=cov_rate2,
                          min_seq_id=min_id2,
                          input_path=first_pass_reps,
                          results_prefix=pref2,
                          temporary_output_dump=scratch_dir)

    if VERBOSE:
        FUNC_END()
    return 0
Beispiel #10
0
def get_cluster_sizes(all_clusts_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Receive a dataframe of all sequences and the representatives of their cluster (marking which cluster they belong
    to) and infer the cluster sizes by grouping the dataframe by the representatives id.
    The size of a cluster is the number of rows that share its 'rep_id', so the function works for any number of
    additional columns, and rows with missing values in other columns are counted as well.
    :param all_clusts_df: a dataframe containing all the clusters and their members. cluster representatives id column
    must be named 'rep_id'.
    :return: a new dataframe, each entry has a representative id ('rep_id') and the rep's cluster size
    ('cluster_size'). Entries are sorted by 'rep_id'.
    '''
    if VERBOSE:
        FUNC_START()

    # size() yields one number per group regardless of how many other columns exist. The previous
    # count() produced one column per non-key column and failed on the fixed two-name column
    # assignment unless the input had exactly two columns.
    sizes = all_clusts_df.groupby('rep_id').size().reset_index(name='cluster_size')

    if VERBOSE:
        FUNC_END()
    return sizes
Beispiel #11
0
def fasta_to_df(fasta_input_path) -> pd.DataFrame:
    '''
    Use the SeqIO module to extract the data from the fasta formatted file and create a pandas dataframe out of it.
    :param fasta_input_path: path of the fasta file.
    :return: a dataframe with one row per fasta record, columns are: 'id', 'description', 'seq_length', 'sequence'.
    'description' is derived from the record description: the text after its last underscore, without the first
    space separated token.
    '''

    # For debugging purposes
    if VERBOSE:
        FUNC_START()

    # Lists for holding the data, one entry per record.
    ids = []
    descriptions = []
    lens = []
    seqs = []

    # The context manager closes the file once all the records were read.
    with open(fasta_input_path) as fasta_handle:
        for gene in SeqIO.parse(fasta_handle, 'fasta'):
            # Keep what follows the last '_' of the description and drop its first token.
            # NOTE(review): an underscore inside the annotation text itself would cut the annotation - confirm
            # the headers never contain one there.
            after_last_underscore = gene.description.split('_')[-1]
            descriptions.append(' '.join(after_last_underscore.split(' ')[1:]))
            ids.append(gene.id)
            seqs.append(str(gene.seq))
            lens.append(len(gene.seq))

    # Build the dataframe directly from the lists, keeping the documented column order.
    reps_df = pd.DataFrame({
        'id': ids,
        'description': descriptions,
        'seq_length': lens,
        'sequence': seqs,
    })

    if VERBOSE:
        FUNC_END()

    return reps_df
Beispiel #12
0
def add_cassete_source(all_clusters: pd.DataFrame, unknown_genes_df: pd.DataFrame) -> pd.DataFrame:
    '''
    Add new data to the all_clusters df. For each gene, add to its entry the columns of unknown_genes_df, matched by
    the gene id ('member_id' in all_clusters, 'unknown_gene' in unknown_genes_df).
    The caller's unknown_genes_df is left untouched, the renaming needed for the merge is done on a copy.
    :param all_clusters: A dataframe of all genes and their cluster representatives, must have a 'member_id' column.
    :param unknown_genes_df: A dataframe containing all unknown genes, must have an 'unknown_gene' column.
    :return: A new dataframe, the inner merge of all_clusters and unknown_genes_df on the gene id. Genes that do not
    appear in both dataframes are dropped.
    '''
    if VERBOSE:
        FUNC_START()
    # Rename on a copy (no inplace) so the dataframe owned by the caller keeps its 'unknown_gene' column.
    members_df = unknown_genes_df.rename(columns={'unknown_gene': 'member_id'})

    # Add this data to the members table
    all_clusters = pd.merge(all_clusters, members_df, on='member_id', how='inner')
    if VERBOSE:
        FUNC_END()
    return all_clusters
Beispiel #13
0
def fetch_unknown_genes(df: pd.DataFrame) -> pd.DataFrame:
    '''
    TODO - Improve this so it runs faster, maybe create another column in the original final_df with the genes names.
    Receive a dataframe with the columns 'full_id', 'known_gene_ids' and 'unknown_gene_ids'.
    For each row, build the full name of every unknown gene by replacing the trailing gene number of 'full_id' with
    the unknown gene id. The number of replaced characters is the longer of: the length of the smallest known gene id,
    the length of the unknown gene id. A shorter unknown id is left-padded with zeros to that width.
    The function does not change the original df.
    :param df:
    Original dataframe containing the data.
    :return:
    A dataframe with the columns 'unknown_gene' (the name built for the gene) and 'cassette_representing_gene'
    (the 'full_id' of the row it came from). Repeated 'unknown_gene' values are dropped, the first one is kept.
    '''
    if VERBOSE:
        FUNC_START()

    unknown_genes_list = []
    original_cassetes_list = []
    rows = zip(df['full_id'], df['known_gene_ids'], df['unknown_gene_ids'])
    for full_id, known_gene_ids, unknown_gene_ids in rows:
        # Length of the smallest known gene id, the minimal number of trailing characters to replace.
        min_gene_len = len(str(min(known_gene_ids)))

        for unknown in unknown_gene_ids:
            unknown_str = str(unknown)
            # Same length (e.g. 005 and 006): replace as is.
            # Unknown id is longer (e.g. known 9, unknown 11): replace one more character.
            # Unknown id is shorter (e.g. known 10, unknown 9): pad it with zeros.
            width = max(min_gene_len, len(unknown_str))
            unknown_genes_list.append(full_id[0:-width] + unknown_str.rjust(width, '0'))
            original_cassetes_list.append(full_id)

    res_df = pd.DataFrame({
        'unknown_gene': unknown_genes_list,
        'cassette_representing_gene': original_cassetes_list
    })
    res_df = res_df.drop_duplicates(subset='unknown_gene', keep="first")

    if VERBOSE:
        FUNC_END()
    return res_df