Beispiel #1
0
def test_get_ids_from_row():
    """Check the ids get_ids_from_row returns for four annotation columns."""
    # Each case: (column name, raw cell value, set of ids expected back).
    cases = [
        ('kegg_id', 'K00001,K00003', {'K00001', 'K00003'}),
        ('kegg_hit',
         'Some text and then [EC:0.0.0.0]; also [EC:1.1.1.1]',
         {'EC:0.0.0.0', 'EC:1.1.1.1'}),
        ('peptidase_family', 'ABC1;BCD2', {'ABC1', 'BCD2'}),
        ('cazy_hits', 'GH4 some things;GT6 other things', {'GH4', 'GT6'}),
    ]
    for column, cell, expected_ids in cases:
        # Every case is a single-column row, so only one extractor is hit.
        annotation_row = pd.Series({column: cell})
        assert get_ids_from_row(annotation_row) == expected_ids
Beispiel #2
0
def make_viral_distillate(potential_amgs, genome_summary_frame):
    """Build the viral distillate table for a set of potential AMG genes.

    For every row of ``potential_amgs`` the ids returned by
    ``get_ids_from_row`` are intersected with the index of
    ``genome_summary_frame``.  One output row is produced per matching
    summary row; a gene with no match yields a single row with empty
    summary fields and triggers a warning.

    Args:
        potential_amgs: DataFrame indexed by gene; the columns 'scaffold',
            'auxiliary_score' and 'amg_flags' are read from each row, and
            the whole row is passed to ``get_ids_from_row``.
        genome_summary_frame: DataFrame indexed by id (the index may hold
            duplicates) with the columns 'gene_description', 'sheet',
            'header', 'subheader' and 'module'.

    Returns:
        A DataFrame with columns ``VIRAL_DISTILLATE_COLUMNS``.  Rows follow
        the order of ``potential_amgs``; within a gene they are ordered by
        sorted id so the output is reproducible between runs.
    """
    # The summary index does not change while we loop, so build the lookup
    # set once instead of once per gene.
    known_ids = set(genome_summary_frame.index)
    rows = []
    for gene, row in potential_amgs.iterrows():
        # Sort the matches: plain set iteration order of strings depends on
        # hash randomisation and would make the row order vary run to run.
        gene_ids = sorted(get_ids_from_row(row) & known_ids)
        if not gene_ids:
            warnings.warn("No distillate information found for gene %s" % gene)
            rows.append([
                gene, row['scaffold'], '', '', '', '', '', '',
                row['auxiliary_score'], row['amg_flags']
            ])
            continue
        for gene_id in gene_ids:
            # Indexing with a list always returns a DataFrame, so an id that
            # occurs once and an id that occurs several times in the summary
            # index are handled by the same loop.
            matches = genome_summary_frame.loc[[gene_id]]
            for _, gene_summary in matches.iterrows():
                rows.append([
                    gene, row['scaffold'], gene_id,
                    gene_summary['gene_description'],
                    gene_summary['sheet'], gene_summary['header'],
                    gene_summary['subheader'], gene_summary['module'],
                    row['auxiliary_score'], row['amg_flags']
                ])
    return pd.DataFrame(rows, columns=VIRAL_DISTILLATE_COLUMNS)
Beispiel #3
0
def make_viral_functional_df(annotations,
                             genome_summary_frame,
                             groupby_column='scaffold'):
    """Build a long-format table of module presence per annotation group.

    Annotations are grouped by ``groupby_column``; for each group the ids
    returned by ``get_ids_from_row`` are mapped to the genes carrying them.
    ``genome_summary_frame`` is then walked by 'sheet' and, within each
    sheet, by 'module'.  For every (sheet, module) pair one row per group is
    built, holding the matching genes, the matching ids, the group name and
    a presence flag.  A (sheet, module) pair contributes its rows only if at
    least one group has a match.

    Returns a DataFrame with columns ``VIRAL_LIQUOR_HEADERS``.
    """
    # group name -> id -> genes of that group annotated with the id
    genes_by_group_and_id = defaultdict(defaultdict_list)
    grouped_annotations = annotations.groupby(groupby_column, sort=False)
    for group_name, group_annotations in grouped_annotations:
        for gene, annotation in group_annotations.iterrows():
            for identifier in get_ids_from_row(annotation):
                genes_by_group_and_id[group_name][identifier].append(gene)

    # assemble the long-format rows, one (sheet, module) pair at a time
    long_rows = []
    for sheet, sheet_frame in genome_summary_frame.groupby('sheet'):
        for module, module_frame in sheet_frame.groupby('module'):
            module_ids = set(module_frame.index.to_list())
            module_rows = []
            for group_name, genes_by_id in genes_by_group_and_id.items():
                # (id, genes) pairs of this group that belong to the module
                hits = [(identifier, genes)
                        for identifier, genes in genes_by_id.items()
                        if identifier in module_ids]
                matched_ids = [identifier for identifier, _ in hits]
                matched_genes = [gene for _, genes in hits for gene in genes]
                module_rows.append([
                    sheet, module, ', '.join(matched_genes),
                    ', '.join(matched_ids), group_name, bool(hits)
                ])
            # skip modules that no group has any hit for
            if any(module_row[-1] for module_row in module_rows):
                long_rows.extend(module_rows)
    return pd.DataFrame(long_rows, columns=VIRAL_LIQUOR_HEADERS)
Beispiel #4
0
def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, groupby_column):
    """Add one column of matching gene names per annotation group.

    Each cell of ``genome_summary_frame['gene_id']`` is split on commas into
    ids.  For every group of ``annotations`` (grouped by ``groupby_column``,
    in order of first appearance) a column named after the group is added
    to ``genome_summary_frame``; each cell holds the comma-joined genes of
    that group whose ids (from ``get_ids_from_row``) match the row's ids,
    or an empty string when nothing matches.

    Args:
        annotations: DataFrame indexed by gene, containing ``groupby_column``.
        genome_summary_frame: DataFrame with a 'gene_id' column of
            comma-separated id strings.  It is modified in place.
        groupby_column: name of the annotations column to group by.

    Returns:
        The same ``genome_summary_frame`` object, with the new columns.
    """
    # Deduplicate the ids of each summary row and sort them: iterating a
    # plain set of strings would make the order of the joined gene names
    # depend on hash randomisation and differ between runs.
    summary_row_ids = [
        sorted({part.strip() for part in cell.split(',')})
        for cell in genome_summary_frame['gene_id']
    ]
    for genome, frame in annotations.groupby(groupby_column, sort=False):
        # map each id seen in this group to the genes carrying it
        genes_by_id = defaultdict(list)
        for gene, row in frame.iterrows():
            for id_ in get_ids_from_row(row):
                genes_by_id[id_].append(gene)
        # .get avoids inserting an empty list into the defaultdict for
        # every id that this group does not contain.
        genome_summary_frame[genome] = [
            ','.join(gene
                     for id_ in row_ids
                     for gene in genes_by_id.get(id_, ()))
            for row_ids in summary_row_ids
        ]
    return genome_summary_frame