Ejemplo n.º 1
0
def make_etc_coverage_df(etc_module_df, annotations, groupby_column='fasta'):
    """Build a long-form table of ETC module coverage for every annotation group.

    For each row of ``etc_module_df`` a module network is built from the
    row's ``definition`` (with optional subunits removed), and that network
    is scored against the ids found in each group of ``annotations``.

    :param etc_module_df: DataFrame; the columns read here are
        ``definition``, ``complex``, ``module_id`` and ``module_name``.
    :param annotations: annotations DataFrame, grouped by ``groupby_column``.
    :param groupby_column: column of ``annotations`` that defines the groups
        (one output row per module per group).
    :return: DataFrame with one row per (module, group) pair and columns
        ``ETC_COVERAGE_COLUMNS``.
    """
    coverage_records = []
    for _, module in etc_module_df.iterrows():
        # Optional subunits are written as "-Kxxxxx"; drop them so that only
        # the required parts of the definition end up in the network.
        required_definition = re.sub(r'-K\d\d\d\d\d', '', module['definition'])
        network, _ = make_module_network(required_definition)

        # Connect every node without outgoing edges to a shared 'end' node.
        # The sinks are collected first so the network is not modified while
        # its nodes are being inspected.
        sinks = [n for n in network.nodes() if network.out_degree(n) == 0]
        for sink in sinks:
            network.add_edge(sink, 'end')

        # Score the module against each group's annotated ids.
        for group_name, group_annotations in annotations.groupby(groupby_column):
            ids_in_group = set(get_ids_from_annotation(group_annotations).keys())
            coverage = get_module_coverage(network, ids_in_group)
            path_len, hit_count, hit_percent, found, missing = coverage

            complex_label = module['complex'].replace('Complex ', '')
            module_name = module['module_name']
            coverage_records.append([
                module['module_id'],
                module_name,
                complex_label,
                group_name,
                path_len,
                hit_count,
                hit_percent,
                ','.join(sorted(found)),
                ','.join(sorted(missing)),
                'Complex %s: %s' % (complex_label, module_name),
            ])
    return pd.DataFrame(coverage_records, columns=ETC_COVERAGE_COLUMNS)
Ejemplo n.º 2
0
def make_functional_df(annotations, function_heatmap_form, groupby_column='fasta'):
    """Build a long-form table saying which functions are present in which genome.

    A function (all form rows sharing one ``function_name``) is marked
    present in a genome only when *every* one of its form rows has at least
    one of its comma-separated ``function_ids`` among the genome's ids.

    :param annotations: annotations DataFrame, grouped by ``groupby_column``.
    :param function_heatmap_form: DataFrame describing the functions. The
        output reuses its column names, so it is assumed to have exactly the
        columns category, subcategory, function_name, function_ids,
        long_function_name, gene_symbol in that order -- TODO confirm
        against the form file.
    :param groupby_column: column of ``annotations`` identifying a genome.
    :return: DataFrame with one row per (function, genome) pair; columns are
        the form's columns plus ``genome``, ``present`` and
        ``category_function_name``. The ``function_ids`` column holds the
        ids actually found in the genome, sorted and joined with ``', '``.
    """
    # clean up function heatmap form
    function_heatmap_form = function_heatmap_form.apply(
        lambda x: x.str.strip() if x.dtype == "object" else x)
    function_heatmap_form = function_heatmap_form.fillna('')
    # build dict of ids per genome
    genome_to_id_dict = {
        genome: set(get_ids_from_annotation(frame).keys())
        for genome, frame in annotations.groupby(groupby_column, sort=False)
    }
    # build long form data frame
    rows = []
    for _, frame in function_heatmap_form.groupby('function_name', sort=False):
        # Everything below depends only on the function, not on the genome,
        # so it is computed once here instead of once per genome.
        function_id_sets = [
            {i.strip() for i in function_ids.strip().split(',')}
            for function_ids in frame['function_ids']
        ]
        first_row = frame.iloc[0]
        long_function_names = '; '.join(get_ordered_uniques(frame.long_function_name))
        gene_symbols = '; '.join(get_ordered_uniques(frame.gene_symbol))
        category_function_name = '%s: %s' % (first_row.category, first_row.function_name)

        for bin_name, id_set in genome_to_id_dict.items():
            hits_per_row = [id_set & function_id_set for function_id_set in function_id_sets]
            # present only if every form row of the function has a hit
            function_in_bin = np.all([len(hits) > 0 for hits in hits_per_row])
            functions_present = set().union(*hits_per_row)
            rows.append([
                first_row.category,
                first_row.subcategory,
                first_row.function_name,
                # sorted: set iteration order of strings varies between
                # interpreter runs, which made this column non-reproducible
                ', '.join(sorted(functions_present)),
                long_function_names,
                gene_symbols,
                bin_name,
                function_in_bin,
                category_function_name,
            ])
    return pd.DataFrame(rows, columns=list(function_heatmap_form.columns) + ['genome', 'present',
                                                                             'category_function_name'])
Ejemplo n.º 3
0
def fill_genome_summary_frame(annotations, genome_summary_frame, groupby_column):
    """Add one count column per genome to ``genome_summary_frame``.

    Each row of ``genome_summary_frame`` lists one or more comma-separated
    identifiers in its ``gene_id`` column. For every group of
    ``annotations`` a new column, named after the group, receives the summed
    counts of those identifiers as reported by ``get_ids_from_annotation``.

    :param annotations: annotations DataFrame, grouped by ``groupby_column``.
    :param genome_summary_frame: DataFrame with a ``gene_id`` column of
        comma-separated identifier strings. It is modified in place.
    :param groupby_column: column of ``annotations`` identifying a genome.
    :return: the same ``genome_summary_frame`` object, with the added columns.
    """
    # The identifier sets depend only on the summary frame, so parse them
    # once instead of once per genome.
    genome_summary_id_sets = [
        {k.strip() for k in j.split(',')} for j in genome_summary_frame['gene_id']
    ]
    for genome, frame in annotations.groupby(groupby_column, sort=False):
        id_dict = get_ids_from_annotation(frame)
        # identifiers missing from this genome contribute nothing to the sum
        genome_summary_frame[genome] = [
            sum(id_dict[identifier] for identifier in id_set if identifier in id_dict)
            for id_set in genome_summary_id_sets
        ]
    return genome_summary_frame
Ejemplo n.º 4
0
def add_custom_ms(annotations, distillate_form):
    """Return updated ``amg_flags`` values with 'M' added for custom metabolic genes.

    A gene whose flags lack 'M' gets an 'M' appended when any of its
    annotation ids appears in the index of ``distillate_form``. Genes that
    already carry 'M' are passed through unchanged.

    :param annotations: annotations DataFrame with an ``amg_flags`` column
        of flag strings, one row per gene.
    :param distillate_form: DataFrame whose index holds the ids treated as
        metabolic.
    :return: list of flag strings, one per row of ``annotations`` in row order.
    """
    custom_metabolic_ids = set(distillate_form.index)

    updated_flags = []
    for _, gene_row in annotations.iterrows():
        current_flags = gene_row['amg_flags']
        if 'M' not in current_flags:
            # get_ids_from_annotation works on a frame, so wrap the single
            # row back into a one-row DataFrame before extracting its ids
            single_gene_frame = pd.DataFrame(gene_row).transpose()
            gene_ids = set(get_ids_from_annotation(single_gene_frame).keys())
            if custom_metabolic_ids & gene_ids:
                current_flags = current_flags + 'M'
        updated_flags.append(current_flags)
    return updated_flags
Ejemplo n.º 5
0
def get_metabolic_flags(annotations,
                        metabolic_genes,
                        amgs,
                        verified_amgs,
                        scaffold_length_dict,
                        length_from_end=5000):
    """Compute a string of single-letter flags for every gene in ``annotations``.

    Flags are appended in this order when their condition holds:

    * ``V`` -- the gene's ``vogdb_categories`` (``;``-separated) include
      'Xr' or 'Xs' (viral)
    * ``M`` -- the gene has an id in ``metabolic_genes`` or in ``amgs``
    * ``K`` -- the gene has an id in ``amgs`` (reported AMG)
    * ``E`` -- the gene has an id in ``verified_amgs`` (experimentally
      verified AMG)
    * ``A`` -- the gene has an id in ``CELL_ENTRY_CAZYS`` (viral cell entry)
    * ``P`` -- the gene has an id in ``VIRAL_PEPTIDASES_MEROPS`` (viral
      peptidase)
    * ``T`` -- any gene on the same scaffold has ``is_transposon`` set
    * ``F`` -- the gene starts or ends within ``length_from_end`` of a
      scaffold end
    * ``B`` -- the gene is part of a run of three consecutive 'M' genes on
      its scaffold (all three genes of the run are flagged)

    :param annotations: annotations DataFrame indexed by gene, with columns
        ``scaffold``, ``vogdb_categories``, ``is_transposon``,
        ``start_position`` and ``end_position``.
    :param metabolic_genes: iterable of ids considered metabolic.
    :param amgs: iterable of ids of reported AMGs.
    :param verified_amgs: iterable of ids of experimentally verified AMGs.
    :param scaffold_length_dict: mapping of scaffold name to scaffold length.
    :param length_from_end: distance from either scaffold end inside which
        a gene receives the ``F`` flag.
    :return: dict mapping each gene (index label of ``annotations``) to its
        flag string.
    """
    flag_dict = dict()
    # Build the lookup sets once; they are intersected with every gene's ids.
    metabolic_genes = set(metabolic_genes)
    amgs = set(amgs)
    verified_amgs = set(verified_amgs)
    viral_categories = {'Xr', 'Xs'}
    # NOTE: a scaffold-level 'J' flag (fraction of 'Xh' vogdb categories
    # >= 0.18) was sketched here at one point but is not computed.
    for _, scaffold_annotations in annotations.groupby('scaffold'):
        # The transposon check is a property of the whole scaffold, so it is
        # evaluated once per scaffold rather than once per gene.
        scaffold_has_transposon = scaffold_annotations['is_transposon'].any()
        for gene, row in scaffold_annotations.iterrows():
            # set up
            flags = ''
            gene_annotations = set(
                get_ids_from_annotation(pd.DataFrame(row).transpose()).keys())
            # is viral
            if not pd.isna(row['vogdb_categories']):
                if viral_categories & set(row['vogdb_categories'].split(';')):
                    flags += 'V'
            # is metabolic
            if metabolic_genes & gene_annotations:
                flags += 'M'
            # is this a reported AMG; a reported AMG is always metabolic too
            if gene_annotations & amgs:
                if 'M' not in flags:
                    flags += 'M'
                flags += 'K'
            # is this an experimentally verified AMG
            if gene_annotations & verified_amgs:
                flags += 'E'
            # is this gene a normal viral cell host entry gene
            if gene_annotations & CELL_ENTRY_CAZYS:
                flags += 'A'
            # is this gene a normal virus peptidase
            if gene_annotations & VIRAL_PEPTIDASES_MEROPS:
                flags += 'P'
            # is there a transposon in the contig
            if scaffold_has_transposon:
                flags += 'T'
            # within length_from_end of either end of the contig
            if (int(row['start_position']) < length_from_end) or \
               (int(row['end_position']) > (scaffold_length_dict[row['scaffold']] - length_from_end)):
                flags += 'F'
            flag_dict[gene] = flags
        # Flag runs of three metabolic genes in a row: slide a window of
        # three neighbouring genes along the scaffold and mark all three
        # members whenever each of them carries 'M'.
        scaffold_genes = list(scaffold_annotations.index)
        for window in zip(scaffold_genes, scaffold_genes[1:], scaffold_genes[2:]):
            if all('M' in flag_dict[window_gene] for window_gene in window):
                for window_gene in window:
                    # overlapping windows must not add 'B' twice
                    if 'B' not in flag_dict[window_gene]:
                        flag_dict[window_gene] += 'B'
    return flag_dict