Ejemplo n.º 1
0
def retrieve_info(indir, suffix='.tab', test_or_not=False):
    """Read hmmsearch tabular output files and collect hits per genome.

    Parameters
    ----------
    indir : str
        Directory containing ``*.{suffix}`` result files, or a single file.
    suffix : str
        File suffix to glob for (leading dot optional).
    test_or_not : bool
        If True, only the first three files are parsed (quick test mode).

    Returns
    -------
    defaultdict(list)
        genome id -> list of ``(gene_id, ko, evalue)`` tuples.

    Raises
    ------
    FileNotFoundError
        If *indir* is neither a directory nor a file.
    SystemExit
        If no matching files are found (via ``exit``).
    """
    # deal with --tblout instead of --domtblout
    suffix = suffix.strip('.')
    gid2locus2ko = defaultdict(list)
    if isdir(indir):
        files_list = glob(join(indir, f'*.{suffix}'))
    elif isfile(indir):
        files_list = [indir]
    else:
        # was a bare `raise Exception()` with no diagnostic at all
        raise FileNotFoundError(
            f"input {indir!r} is neither a directory nor a file")

    if test_or_not:
        files_list = files_list[:3]

    if not files_list:
        exit(
            f"no files could be found with input {join(indir, f'*.{suffix}')},please check the parameters. "
        )
    tqdm.write("reading all annotated result")
    for hf in tqdm(files_list):
        for row in open(hf):
            if row.startswith('#'):
                continue
            # columns are space-delimited; drop empties from repeated spaces
            r = [field for field in row.split(' ') if field]
            gene_id = r[0]
            ko = r[2]
            evalue = float(r[4])
            gid2locus2ko[convert_genome_ID_rev(gene_id)].append(
                (gene_id, ko, evalue))
    return gid2locus2ko
Ejemplo n.º 2
0
def filtration_part(gid2locus2ko, evalue=1e-50):
    """Filter annotation hits by e-value and keep the best KO per locus.

    Parameters
    ----------
    gid2locus2ko : dict
        genome id -> list of ``(gene_id, ko, evalue)`` tuples, as produced
        by ``retrieve_info``.
    evalue : float
        Hard e-value threshold; hits above it are discarded.

    Returns
    -------
    dict
        genome id -> {ko: comma-joined locus ids}.
    """
    # filter out with hard threshold of evalue
    post_filtered = {
        k: [(hit[1], hit[0], hit[2]) for hit in v if hit[2] <= evalue]
        for k, v in tqdm(gid2locus2ko.items())
    }
    # select minimum evalue among all matched KO for each locus
    # TODO: it may be corrected at following version
    ## it could considerate the position overlapping situations
    used_locus = {}
    locus2ko = {}
    tqdm.write("choose best ko for each locus")
    for gid, inlist in tqdm(post_filtered.items()):
        # renamed from (key, v, evalue): the old names shadowed the `evalue`
        # parameter and obscured which element was the locus
        for ko, locus, ev in inlist:
            # float('inf') instead of the old magic sentinel 100, which would
            # have broken the selection for any user threshold >= 100
            if ev <= used_locus.get(locus, float('inf')):
                used_locus[locus] = ev
                locus2ko[locus] = ko
    post_filtered = defaultdict(lambda: defaultdict(list))
    for locus, ko in locus2ko.items():
        gid = convert_genome_ID_rev(locus)
        post_filtered[gid][ko].append(locus)

    post_filtered = {
        g: {ko: ','.join(v)
            for ko, v in d.items()}
        for g, d in post_filtered.items()
    }
    return post_filtered
Ejemplo n.º 3
0
def retrieve_info(indir, test=False):
    """Read interproscan-style TSV annotation files under *indir*.

    Parameters
    ----------
    indir : str
        Directory searched as ``<indir>/*/*.tsv``.
    test : bool
        If True, only the first ten files are parsed (quick test mode).

    Returns
    -------
    tuple
        ``(gid2locus2ko, exists_db)`` where gid2locus2ko maps genome id to
        ``(gene_id, db, sig_id, interpro_id, evalue, Status)`` tuples and
        exists_db is the set of databases seen.

    Raises
    ------
    SystemExit
        If no matching files are found (via ``exit``).

    Note: relies on a module-level ``header`` list for the TSV column names.
    """
    gid2locus2ko = defaultdict(list)
    exists_db = set()
    files_list = glob(join(indir, '*', f'*.tsv'))

    if not files_list:
        exit(
            f"no files could be found with input {join(indir, '*', f'*.tsv')},please check the parameters. "
        )
    tqdm.write("reading all annotated result")
    if test:
        files_list = files_list[:10]
    for hf in tqdm(files_list):
        for row in open(hf):
            # was `if not row:`, which never fires: lines read from a file
            # always contain at least '\n', so blank lines slipped through
            if not row.strip():
                continue
            r = row.split('\t')
            info_dict = dict(zip(header, r))
            gene_id = info_dict['Protein Accession']
            db = info_dict['Analysis']
            sig_id = info_dict['Signature Accession']

            interpro_id = info_dict.get("InterPro accession", '')
            evalue = float(info_dict['Score'])
            Status = info_dict['Status']
            gid2locus2ko[convert_genome_ID_rev(gene_id)].append(
                (gene_id, db, sig_id, interpro_id, evalue, Status))
            exists_db.add(db)
    return gid2locus2ko, exists_db
Ejemplo n.º 4
0
def main(indir, suffix, num_genes, not_add_prefix_ids):
    """Return the set of genome ids whose gene count reaches *num_genes*.

    Every ``*.{suffix}`` fasta under *indir* is scanned; record ids are
    mapped to genome ids with ``convert_genome_ID_rev`` and counted.
    """
    fasta_files = glob(join(indir, f'*.{suffix}'))
    counts = defaultdict(int)

    tqdm.write('reading all genes...')
    for fasta in tqdm(fasta_files):
        for rec in SeqIO.parse(fasta, format='fasta'):
            genome = convert_genome_ID_rev(
                rec.id, not_add_prefix_ids=not_add_prefix_ids)
            counts[genome] += 1
    genomes = {gid for gid, n in counts.items() if n >= num_genes}
    tqdm.write(f"detect {len(genomes)} match given params...")
    return genomes
Ejemplo n.º 5
0
def main(infiles):
    """Build a genome-by-file presence/absence (0/1) matrix.

    Each fasta in *infiles* becomes one column (named after the file minus
    its extension); rows are the genome ids of the records it contains.
    Columns are ordered by descending totals, then rows are sorted.
    """
    presence = defaultdict(lambda: defaultdict(int))
    for path in infiles:
        column = basename(path).rpartition('.')[0]
        for rec in SeqIO.parse(path, format='fasta'):
            presence[column][convert_genome_ID_rev(rec.id)] = 1
    df = pd.DataFrame.from_dict(presence).fillna(0)
    ordered = sorted(df.columns, key=lambda c: df[c].sum(), reverse=True)
    df = df.reindex(columns=ordered)
    return df.sort_values(list(df.columns))
Ejemplo n.º 6
0
def outut_for(l2ko, odir, name='mixed', transpose=False):
    """Write locus->annotation tables plus per-genome info/binary/count tables.

    Parameters
    ----------
    l2ko : dict
        locus id -> (annotated ID, database, interpro ID) triple.
    odir : str
        Output directory (created if missing).
    name : str
        Basename prefix for all generated files.
    transpose : bool
        If True, transpose the final matrices before writing.

    Writes ``{name}_l2ID.tab`` plus, for both the gene and interpro groupings,
    ``*_info.tab``, ``*_binary.tab`` and ``*_num.tab``.
    """
    if not exists(odir):
        os.makedirs(odir)
    tqdm.write('converting into locus2gene side by side table...no progress')
    l2ko_df = pd.DataFrame.from_dict(l2ko).T
    if l2ko_df.shape[1] != 3:
        # malformed input: each locus must map to exactly three fields
        print(f"it might be something wrong for {name}")
        return
    l2ko_df.columns = ["annotated ID", "database", 'interpro ID']
    l2ko_df.loc[:,
                'genome'] = [convert_genome_ID_rev(_) for _ in l2ko_df.index]
    l2ko_df.to_csv(join(odir, f"{name}_l2ID.tab"),
                   sep='\t',
                   index=True,
                   index_label='locus')

    tqdm.write(f"start to output {name} locus2gene")
    genome2interpro2locus = defaultdict(lambda: defaultdict(set))
    genome2gene2locus = defaultdict(lambda: defaultdict(set))
    for locus, row in tqdm(l2ko_df.iterrows(), total=l2ko_df.shape[0]):
        genome = row['genome']
        gene = row['annotated ID']
        interpro = row['interpro ID']
        genome2gene2locus[genome][gene].add(locus)
        # NOTE(review): a float NaN here would be truthy and create a NaN
        # key; presumably missing interpro ids are '' — confirm with caller.
        if interpro:
            genome2interpro2locus[genome][interpro].add(locus)

    tqdm.write("packing......")
    for idx, mapping in enumerate([genome2gene2locus, genome2interpro2locus]):
        # the second mapping gets the "_interpro" file-name suffix
        fname = f"{name}_interpro" if idx == 1 else name
        ofile_info = join(odir, f"{fname}_info.tab")
        ofile_binary = join(odir, f"{fname}_binary.tab")
        ofile_num = join(odir, f"{fname}_num.tab")
        final_df = pd.DataFrame.from_dict(mapping, orient='index')
        # binary presence/absence and comma-separated-locus counts
        bin_df = final_df.applymap(lambda x: 0 if pd.isna(x) else 1)
        num_df = final_df.applymap(lambda x: 0
                                   if pd.isna(x) else len(str(x).split(',')))
        if transpose:
            final_df = final_df.T
            bin_df = bin_df.T
            num_df = num_df.T
        final_df.to_csv(ofile_info, sep='\t', index=True, index_label='gene')
        bin_df.to_csv(ofile_binary, sep='\t', index=True, index_label='gene')
        num_df.to_csv(ofile_num, sep='\t', index=True, index_label='gene')
Ejemplo n.º 7
0
def main(in_dir,
         odir,
         num_parellel,
         suffix='',
         new_suffix='',
         gids=None,
         force=False,
         mode=default_mode,
         fix_refseq=False,
         removed_gene_list=None,
         not_add_prefix_ids=None,
         **kwarg):
    """Copy/filter fasta files from *in_dir* and process them in parallel.

    For each ``*{suffix}`` file, optionally keep only records belonging to
    the genome ids in *gids* (filtered copies are written to ``<odir>/tmp``),
    then dispatch ``run((in_file, ofile, mode))`` over *num_parellel* worker
    processes for every file whose output is missing (or always, if *force*).

    Note: ``not_add_prefix_ids`` previously defaulted to a shared mutable
    ``[]``; ``None`` is the sentinel now, with identical effective behavior.
    """
    if not_add_prefix_ids is None:
        not_add_prefix_ids = []
    suffix = suffix.strip('.')
    new_suffix = new_suffix.strip('.')
    if not exists(odir):
        os.makedirs(odir)
    if suffix:
        suffix = '.' + suffix
    file_list = glob(join(in_dir, f'*{suffix}'))
    if gids is not None:
        gids = set(gids)
        os.makedirs(join(odir, 'tmp'), exist_ok=True)
        new_file_list = []
        tqdm.write('iterating files to collect with giving genome ids')
        for f in tqdm(file_list):
            records = SeqIO.parse(f, format='fasta')
            records = [_ for _ in records if _.id in gids]
            if not records:
                # no direct record-id hit: re-read and fall back to matching
                # by genome accession derived from the locus-tag prefix
                records = SeqIO.parse(f, format='fasta')
                if not fix_refseq:
                    records = [
                        _ for _ in records if convert_genome_ID_rev(
                            _.id.split('_')[0],
                            not_add_prefix_ids=not_add_prefix_ids) in gids
                    ]
                else:
                    # strip the accession prefix; a set keeps the repeated
                    # membership tests O(1) (was rebuilt as a list each file)
                    gids = {_.split('_')[-1] for _ in gids}
                    records = [
                        _ for _ in records if convert_genome_ID_rev(
                            _.id.split('_')[0],
                            prefix='',
                            not_add_prefix_ids=not_add_prefix_ids) in gids
                    ]
            n_f = join(odir, 'tmp', basename(f))
            if not records or len(records) == 1:
                print(f'failed records,for {f}, pass it')
                continue
            if removed_gene_list is not None:
                records = [_ for _ in records if _.id not in removed_gene_list]
            with open(n_f, 'w') as f1:
                SeqIO.write(records, f1, format='fasta-2line')
            new_file_list.append(n_f)
        file_list = new_file_list[::]
    tqdm.write("start to process %s file with '%s' as suffix" %
               (len(file_list), suffix))
    params = []
    for in_file in tqdm(file_list):
        if new_suffix and suffix:
            # NOTE(review): str.replace also hits mid-name occurrences of the
            # suffix — assumes the suffix appears only at the end; confirm.
            ofile = join(odir,
                         basename(in_file).replace(suffix, '.' + new_suffix))
        else:
            ofile = join(odir, basename(in_file))
        if not exists(ofile) or force:
            params.append((in_file, ofile, mode))
    with mp.Pool(processes=num_parellel) as tp:
        r = list(tqdm(tp.imap(run, params), total=len(params)))
Ejemplo n.º 8
0
infile = "./protein_annotations/kegg_merged.tab"
# Collect up to 10 KEGG annotations per locus that pass the e-value cutoff;
# loci without a mapping are recorded as None so they can be counted later.
for row in tqdm(open(infile)):
    rows = row.split('\t')
    row_dict = dict(zip(header, rows))
    if len(used_dict[rows[0]]) > 10:
        continue
    if float(row_dict['evalue']) <= 1e-50:
        used_dict[rows[0]].append(locusID2kegg_dict.get(rows[1], None))

all_None_seqs = []
g2ko2tags = defaultdict(lambda: defaultdict(list))
for locus_tag, annotation in tqdm(used_dict.items()):
    gid = convert_genome_ID_rev(locus_tag)
    valid_annotations = list(set([_ for _ in annotation if _ is not None]))
    if len(valid_annotations) == 1:
        ko = valid_annotations[0].split(':')[-1]
        g2ko2tags[gid][ko].append(locus_tag)
    elif len(valid_annotations) > 1:
        for ko in set(valid_annotations):
            # was split(':')[0], which kept the 'ko' prefix instead of the
            # KO id; use [-1] for consistency with the single-hit branch
            ko = ko.split(':')[-1]
            g2ko2tags[gid][ko].append(locus_tag)
    elif len(valid_annotations) == 0:
        # was `!= 0`, which was unreachable after the two branches above,
        # so loci with only-None annotations were never recorded
        all_None_seqs.append(locus_tag)
g2ko2tags = {k: {_k: ','.join(_v) for _k, _v in v.items()}
Ejemplo n.º 9
0
def main(indir,
         outfile,
         genome_list,
         gene_list,
         remove_identical,
         seed,
         concat_type,
         graph,
         fill_gaps,
         suffix='aln',
         fix_refseq=False,
         not_add_prefix=None,
         partition_method='genes',
         simple_concat=False):
    """Concatenate per-gene alignments into one super-matrix fasta.

    For every genome id listed in *genome_list*, the sequence from each
    ``*.{suffix}`` alignment in *indir* is appended; genomes missing from an
    alignment are padded with gaps.  Depending on *concat_type* a partition
    file and/or a phylip file is also written, and *graph* draws a per-gene
    missing-genome statistics figure.

    Parameters (semantics of project helpers are not visible here):
      indir: directory holding the per-gene alignment files.
      outfile: output fasta path; defaults to ``<indir>/concat_aln.aln``.
      genome_list: text file with one genome id per line; defaults to
        ``<indir>/selected_genomes.txt``.
      gene_list: optional file path or comma-separated string restricting
        which alignments are concatenated.
      remove_identical: if truthy, run ``remove_identical_seqs`` afterwards.
      seed: forwarded to ``remove_identical_seqs``.
      concat_type: 'both', 'partition' or 'phy' — which extras to write.
      graph: whether to draw the stats graph.
      fill_gaps: forwarded to ``generate_phy_file``.
      suffix: alignment file suffix (default 'aln').
      fix_refseq: use the 'GCF_' accession prefix instead of 'GCA_'.
      not_add_prefix: optional file listing ids to leave unprefixed.
      partition_method: forwarded to ``generate_phy_file``.
      simple_concat: treat genome ids as literal record ids instead of
        converting assembly accessions to locus-tag prefixes.
    """
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    gids = open(genome_list, 'r').read().split('\n')
    if simple_concat:
        # ids are matched verbatim against alignment record ids
        gids = set(gids)
    else:
        # convert assembly accessions into the locus-tag prefixes used
        # inside the alignment record ids
        gids = [convert_genome_ID(_) for _ in gids if _]
        if fix_refseq:
            prefix = 'GCF_'
        else:
            prefix = 'GCA_'
        if not_add_prefix is not None:
            not_add_prefix_ids = [
                _ for _ in open(not_add_prefix).read().split('\n') if _
            ]
        else:
            not_add_prefix_ids = []
    # from GCA become locus_tag
    record_pos_info = []
    # accumulated concatenated sequence per genome id
    gid2record = {gid: '' for gid in gids}

    las_pos = 0
    order_seqs = sorted(glob(join(indir, f'*.{suffix}')))
    if gene_list is not None:
        if exists(str(gene_list)):
            # gene_list is a file: one gene name per line
            gene_list = [
                _.strip() for _ in open(gene_list).read().split('\n') if _
            ]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
        elif isinstance(gene_list, str):
            # gene_list is a comma-separated string of gene names
            gene_list = [_.strip() for _ in gene_list.split(',') if _]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
    # per-gene count of genomes missing from that alignment
    g2num_miss = {basename(_).replace(f'.{suffix}', ''): 0 for _ in order_seqs}
    tqdm.write('itering all requested files ')
    for idx, aln_file in tqdm(enumerate(order_seqs), total=len(order_seqs)):
        aln_file_name = basename(aln_file).replace(f'.{suffix}', '')
        aln_record = AlignIO.read(aln_file, format='fasta')
        length_this_aln = aln_record.get_alignment_length()
        # record the partition (1-based inclusive column range)
        name = "part%s" % int(idx + 1)
        start, end = las_pos + 1, length_this_aln + las_pos
        las_pos = end
        record_pos_info.append((name, start, end, aln_record))
        # done record
        for gid in gid2record:
            if simple_concat:
                records = [_ for _ in aln_record if _.id == gid]
            else:
                # match by locus-tag prefix (text before the first '_')
                records = [_ for _ in aln_record if _.id.split('_')[0] == gid]
            if records:
                gid2record[gid] += str(records[0].seq)
            else:
                # genome absent from this alignment: pad with gaps
                gid2record[gid] += '-' * length_this_aln

                g2num_miss[aln_file_name] += 1

    if outfile is None:
        # default output names all live next to the input alignments
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        outfile = process_path(outfile)
        if not exists(dirname(outfile)):
            os.makedirs(dirname(outfile))
        outpartition = outfile.rpartition('.')[0] + '.partition'
        outphy = outfile.rpartition('.')[0] + '.phy'
        ograph = join(dirname(outfile), 'aln_stats.png')

    with open(outfile, 'w') as f1:
        for gid, seq in gid2record.items():
            if set(str(seq)) == {'-'}:
                # skip genomes that ended up with nothing but gaps
                print(f"{gid} contains only gaps or missing data ")
                continue
            if simple_concat:
                f1.write(f">{gid}\n")
            else:
                # convert locus-tag prefix back to an accession-style header
                f1.write(
                    f'>{convert_genome_ID_rev(gid, prefix=prefix,not_add_prefix_ids=not_add_prefix_ids)}\n'
                )
            f1.write(f'{seq}\n')

    if remove_identical:
        remove_identical_seqs(outfile, seed=seed)
    if concat_type.lower() in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if concat_type.lower() in ['both', 'phy']:
        gids = open(genome_list, 'r').read().split('\n')

        if not simple_concat:
            # convert locus-tag prefixes back to accession-style names
            name_convertor = lambda x: convert_genome_ID_rev(
                x, not_add_prefix_ids=not_add_prefix_ids)
        else:
            name_convertor = lambda x: x
        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)
Ejemplo n.º 10
0
# Build an itol branch-color annotation from the genome->taxon mapping.
id2info = gid2taxon
id2info, info2color = get_colors_general(id2info)
text = to_color_branch(id2info, info2color, dataset_name='phylum/class', no_legend=True)

with open('./itol_txt/phylum_annotate_branch.txt', 'w') as f1:
    f1.write(text)

# annotate 27 genes
from Bio import SeqIO

rrna_dir = './rrna'
gid2genes = {k: [_k for _k, _v in v.items() if _v] for k, v in _subgenome2cdd.items()}

# Append the rRNA genes found in the 16S/23S fasta files to each genome's
# gene list (one loop instead of two duplicated blocks).
for rrna_gene in ('16S', '23S'):
    for record in SeqIO.parse(join(rrna_dir, rrna_gene + '.fasta'), format='fasta'):
        gname = 'GCA_' + convert_genome_ID_rev(record.id.split('_')[0])
        if gname in gid2genes:
            gid2genes[gname].append(rrna_gene)

all_genes = {gene for gene_list in gid2genes.values() for gene in gene_list}
text = to_binary_shape(gid2genes,
                       {g: {'color': '#007acc'} for g in all_genes})

with open('./itol_txt/27genes.txt', 'w') as f1:
    f1.write(text)
# annotate cog25
from dating_workflow.step_script.extract_cog25 import parse_annotation