def retrieve_info(indir, suffix='.tab', test_or_not=False):
    """Collect annotation hits from hmmsearch-style tabular output files.

    Parses whitespace-delimited rows (``--tblout`` style output; comment
    lines starting with '#' are skipped) and groups the hits per genome.

    :param indir: directory containing ``*.{suffix}`` files, or a single file
    :param suffix: file suffix to glob for (leading/trailing dots stripped)
    :param test_or_not: if True, only read the first 3 files (quick test run)
    :return: dict mapping genome id -> list of (gene_id, ko, evalue) tuples
    :raises IOError: if `indir` is neither a directory nor a file
    """
    # deal with --tblout instead of --domtblout
    suffix = suffix.strip('.')
    gid2locus2ko = defaultdict(list)
    if isdir(indir):
        files_list = glob(join(indir, f'*.{suffix}'))
    elif isfile(indir):
        files_list = [indir]
    else:
        # fix: the original raised a bare `Exception()` with no message
        raise IOError(f"{indir} is neither a directory nor a file")
    if test_or_not:
        files_list = files_list[:3]
    if not files_list:
        exit(
            f"no files could be found with input {join(indir, f'*.{suffix}')},please check the parameters. "
        )
    tqdm.write("reading all annotated result")
    for hf in tqdm(files_list):
        # fix: close each file handle instead of leaking it
        with open(hf) as fh:
            for row in fh:
                if row.startswith('#'):
                    continue
                # collapse runs of spaces into single field separators
                r = [_ for _ in row.split(' ') if _]
                gene_id = r[0]
                # tblout layout: col 3 = query (KO) name, col 5 = full-seq
                # e-value — presumably KOfam-style output; TODO confirm
                ko = r[2]
                evalue = float(r[4])
                gid2locus2ko[convert_genome_ID_rev(gene_id)].append(
                    (gene_id, ko, evalue))
    return gid2locus2ko
def filtration_part(gid2locus2ko, evalue=1e-50):
    """Apply a hard e-value cutoff, then keep the single best KO per locus.

    :param gid2locus2ko: genome id -> list of (locus, ko, evalue) hits
    :param evalue: hard threshold; hits above it are discarded
    :return: dict of genome id -> {ko: "locus1,locus2,..."}
    """
    # step 1: hard e-value threshold, reordering hits to (ko, locus, evalue)
    passed_hits = {}
    for genome, hits in tqdm(gid2locus2ko.items()):
        passed_hits[genome] = [(ko, locus, ev)
                               for locus, ko, ev in hits
                               if ev <= evalue]
    # step 2: per locus, remember the KO with the smallest e-value
    # TODO: it may be corrected at following version
    ## it could considerate the position overlapping situations
    best_evalue = {}
    locus2ko = {}
    tqdm.write("choose best ko for each locus")
    for genome, hits in tqdm(passed_hits.items()):
        for ko, locus, ev in hits:
            # `<=` means a tie is won by the hit seen later
            if ev <= best_evalue.get(locus, 100):
                best_evalue[locus] = ev
                locus2ko[locus] = ko
    # step 3: regroup the winners as genome -> ko -> comma-joined loci
    grouped = defaultdict(lambda: defaultdict(list))
    for locus, ko in locus2ko.items():
        grouped[convert_genome_ID_rev(locus)][ko].append(locus)
    return {genome: {ko: ','.join(loci) for ko, loci in ko2loci.items()}
            for genome, ko2loci in grouped.items()}
def retrieve_info(indir, test=False):
    """Collect InterProScan TSV annotation rows grouped per genome.

    Reads every ``<indir>/*/*.tsv`` file; columns are mapped through the
    module-level ``header`` list (InterProScan tabular output layout —
    TODO confirm `header` matches the files being parsed).

    :param indir: directory containing per-genome sub-directories of .tsv files
    :param test: if True, only read the first 10 files (quick test run)
    :return: tuple (gid2locus2ko, exists_db) where gid2locus2ko maps
             genome id -> list of (gene_id, db, sig_id, interpro_id,
             evalue, status) and exists_db is the set of analysis names seen
    """
    gid2locus2ko = defaultdict(list)
    exists_db = set()
    files_list = glob(join(indir, '*', '*.tsv'))
    if not files_list:
        exit(
            f"no files could be found with input {join(indir, '*', f'*.tsv')},please check the parameters. "
        )
    tqdm.write("reading all annotated result")
    if test:
        files_list = files_list[:10]
    for hf in tqdm(files_list):
        # fix: close each file handle instead of leaking it
        with open(hf) as fh:
            for row in fh:
                # fix: strip the trailing newline so the last TSV column is
                # clean; the original `if not row` never fired because lines
                # from a file iterator always contain at least '\n'
                row = row.rstrip('\n')
                if not row:
                    continue
                info_dict = dict(zip(header, row.split('\t')))
                gene_id = info_dict['Protein Accession']
                db = info_dict['Analysis']
                sig_id = info_dict['Signature Accession']
                # InterPro accession may be absent for unintegrated signatures
                interpro_id = info_dict.get("InterPro accession", '')
                evalue = float(info_dict['Score'])
                Status = info_dict['Status']
                gid2locus2ko[convert_genome_ID_rev(gene_id)].append(
                    (gene_id, db, sig_id, interpro_id, evalue, Status))
                exists_db.add(db)
    return gid2locus2ko, exists_db
def main(indir, suffix, num_genes, not_add_prefix_ids):
    """Return genome ids whose fasta files under `indir` contribute at
    least `num_genes` sequences in total.

    :param indir: directory with ``*.{suffix}`` fasta files
    :param suffix: fasta file suffix to glob for
    :param num_genes: minimum sequence count a genome must reach
    :param not_add_prefix_ids: forwarded to convert_genome_ID_rev
    :return: set of genome ids meeting the threshold
    """
    counts = defaultdict(int)
    tqdm.write('reading all genes...')
    for fasta_path in tqdm(glob(join(indir, f'*.{suffix}'))):
        for record in SeqIO.parse(fasta_path, format='fasta'):
            genome_id = convert_genome_ID_rev(
                record.id, not_add_prefix_ids=not_add_prefix_ids)
            counts[genome_id] += 1
    passed = {gid for gid, n in counts.items() if n >= num_genes}
    tqdm.write(f"detect {len(passed)} match given params...")
    return passed
def main(infiles):
    """Build a genome x file presence/absence (1/0) DataFrame from fastas.

    :param infiles: iterable of fasta file paths; each file becomes a column
                    named after its basename without the final suffix
    :return: DataFrame with genomes as rows; columns ordered by descending
             hit count, rows sorted by those columns
    """
    presence = defaultdict(lambda: defaultdict(int))
    for fasta in infiles:
        column = basename(fasta).rpartition('.')[0]
        for record in SeqIO.parse(fasta, format='fasta'):
            presence[column][convert_genome_ID_rev(record.id)] = 1
    matrix = pd.DataFrame.from_dict(presence).fillna(0)
    # columns with the most genomes first, then rows sorted by those columns
    ordered_cols = sorted(matrix.columns,
                          key=lambda c: matrix[c].sum(),
                          reverse=True)
    matrix = matrix.reindex(columns=ordered_cols)
    return matrix.sort_values(list(matrix.columns))
def outut_for(l2ko, odir, name='mixed', transpose=False):
    """Write locus->annotation tables plus per-genome binary/count matrices.

    NOTE(review): "outut" looks like a typo for "output" — kept as-is to
    avoid breaking callers.

    :param l2ko: locus -> 3-field annotation (annotated ID, database,
                 interpro ID); anything with a different width is rejected
    :param odir: output directory (created if missing)
    :param name: file-name stem for every emitted table
    :param transpose: if True, transpose the info/binary/num matrices
    """
    if not exists(odir):
        os.makedirs(odir)
    tqdm.write('converting into locus2gene side by side table...no progress')
    l2ko_df = pd.DataFrame.from_dict(l2ko).T
    # exactly 3 annotation fields expected; bail out otherwise
    if l2ko_df.shape[1] != 3:
        print(f"it might be something wrong for {name}")
        return
    l2ko_df.columns = ["annotated ID", "database", 'interpro ID']
    l2ko_df.loc[:, 'genome'] = [convert_genome_ID_rev(_) for _ in l2ko_df.index]
    l2ko_df.to_csv(join(odir, f"{name}_l2ID.tab"),
                   sep='\t',
                   index=1,
                   index_label='locus')
    tqdm.write(f"start to output {name} locus2gene")
    # genome -> annotation -> set of loci, once keyed by gene/signature and
    # once keyed by interpro accession
    genome2interpro2locus = defaultdict(lambda: defaultdict(set))
    genome2gene2locus = defaultdict(lambda: defaultdict(set))
    for locus, row in tqdm(l2ko_df.iterrows(), total=l2ko_df.shape[0]):
        genome = row['genome']
        gene = row['annotated ID']
        interpro = row['interpro ID']
        genome2gene2locus[genome][gene].add(locus)
        if interpro:  # skip loci whose interpro accession is empty
            genome2interpro2locus[genome][interpro].add(locus)
    tqdm.write(f"packing......")
    # emit one table set per grouping; index 1 is the interpro-level one
    for _, r in enumerate([genome2gene2locus, genome2interpro2locus]):
        if _ == 1:
            fname = f"{name}_interpro"
        else:
            fname = f"{name}"
        ofile_info = join(odir, f"{fname}_info.tab")
        ofile_binary = join(odir, f"{fname}_binary.tab")
        ofile_num = join(odir, f"{fname}_num.tab")
        final_df = pd.DataFrame.from_dict(r, orient='index')
        # presence/absence matrix (NaN means the genome lacks the annotation)
        bin_df = final_df.applymap(lambda x: 0 if pd.isna(x) else 1)
        # cell values are sets of loci; counting comma-separated pieces of
        # their string form approximates the set size — TODO confirm this is
        # the intended counting rule
        num_df = final_df.applymap(
            lambda x: 0 if pd.isna(x) else len(str(x).split(',')))
        if transpose:
            final_df = final_df.T
            bin_df = bin_df.T
            num_df = num_df.T
        final_df.to_csv(ofile_info, sep='\t', index=1, index_label='gene')
        bin_df.to_csv(ofile_binary, sep='\t', index=1, index_label='gene')
        num_df.to_csv(ofile_num, sep='\t', index=1, index_label='gene')
def main(in_dir, odir, num_parellel, suffix='', new_suffix='', gids=None,
         force=False, mode=default_mode, fix_refseq=False,
         removed_gene_list=None, not_add_prefix_ids=[], **kwarg):
    """Filter fasta files down to requested genome ids, then run `run` on
    each file in a multiprocessing pool.

    NOTE(review): mutable default `not_add_prefix_ids=[]` — only read here,
    but a `None` default would be the safer idiom.

    :param in_dir: directory with the input fasta files
    :param odir: output directory (created if missing; filtered fasta copies
                 go into an odir/tmp/ sub-directory when `gids` is given)
    :param num_parellel: worker count for the multiprocessing pool
    :param suffix: input suffix to glob for (leading dots stripped)
    :param new_suffix: with `suffix`, replaces the suffix on output names
    :param gids: optional genome ids to keep; None keeps every file as-is
    :param force: recreate outputs even if they already exist
    :param mode: passed to `run` with each (infile, ofile) pair
    :param fix_refseq: match gids via the part after their last '_' instead
    :param removed_gene_list: optional record ids to drop from kept files
    :param kwarg: extra keyword arguments (ignored)
    """
    suffix = suffix.strip('.')
    new_suffix = new_suffix.strip('.')
    if not exists(odir):
        os.makedirs(odir)
    if suffix:
        suffix = '.' + suffix
    file_list = glob(join(in_dir, f'*{suffix}'))
    if gids is not None:
        gids = set(gids)
        os.makedirs(join(odir, 'tmp'), exist_ok=1)
        new_file_list = []
        tqdm.write('iterating files to collect with giving genome ids')
        for f in tqdm(file_list):
            # first pass: keep records whose full id is requested directly
            records = SeqIO.parse(f, format='fasta')
            records = [_ for _ in records if _.id in gids]
            if not records:
                # fall back: match on the genome part of "<genome>_<locus>"
                # style ids — assumed id layout; TODO confirm
                records = SeqIO.parse(f, format='fasta')
                if not fix_refseq:
                    records = [
                        _ for _ in records if convert_genome_ID_rev(
                            _.id.split('_')[0],
                            not_add_prefix_ids=not_add_prefix_ids) in gids
                    ]
                else:
                    # NOTE(review): this rebinds `gids` inside the per-file
                    # loop; idempotent once prefixes are stripped, but fragile
                    gids = [_.split('_')[-1] for _ in gids]
                    records = [
                        _ for _ in records if convert_genome_ID_rev(
                            _.id.split('_')[0],
                            prefix='',
                            not_add_prefix_ids=not_add_prefix_ids) in gids
                    ]
            n_f = join(odir, 'tmp', basename(f))
            # skip empty or single-record files (presumably unusable
            # downstream — confirm)
            if not records or len(records) == 1:
                print(f'failed records,for {f}, pass it')
                continue
            if removed_gene_list is not None:
                records = [_ for _ in records if _.id not in removed_gene_list]
            with open(n_f, 'w') as f1:
                SeqIO.write(records, f1, format='fasta-2line')
            new_file_list.append(n_f)
        # work only on the filtered copies from here on
        file_list = new_file_list[::]
    tqdm.write("start to process %s file with '%s' as suffix" %
               (len(file_list), suffix))
    params = []
    for in_file in tqdm(file_list):
        if new_suffix and suffix:
            ofile = join(odir,
                         basename(in_file).replace(suffix, '.' + new_suffix))
        else:
            ofile = join(odir, basename(in_file))
        # skip already-existing outputs unless `force` is set
        if not exists(ofile) or force:
            params.append((in_file, ofile, mode))
    with mp.Pool(processes=num_parellel) as tp:
        r = list(tqdm(tp.imap(run, params), total=len(params)))
infile = "./protein_annotations/kegg_merged.tab" for row in tqdm(open(infile)): rows = row.split('\t') row_dict = dict(zip(header, rows)) if len(used_dict[rows[0]]) > 10: continue if float(row_dict['evalue']) <= 1e-50: used_dict[rows[0]].append(locusID2kegg_dict.get(rows[1], None)) # all_ko = [_.split(':')[-1] for v in used_dict.values() for _ in v if _ is not None] # all_ko = list(set(all_ko)) all_None_seqs = [] g2ko2tags = defaultdict(lambda: defaultdict(list)) for locus_tag, annotation in tqdm(used_dict.items()): gid = convert_genome_ID_rev(locus_tag) valid_annotations = list(set([_ for _ in annotation if _ is not None])) if len(valid_annotations) == 1: ko = valid_annotations[0].split(':')[-1] g2ko2tags[gid][ko].append(locus_tag) elif len(valid_annotations) > 1: for ko in set(valid_annotations): ko = ko.split(':')[0] g2ko2tags[gid][ko].append(locus_tag) # multi_match.append(locus_tag) elif len(valid_annotations) != 0: all_None_seqs.append(locus_tag) else: pass g2ko2tags = {k: {_k: ','.join(_v) for _k, _v in v.items()}
def main(indir, outfile, genome_list, gene_list, remove_identical, seed,
         concat_type, graph, fill_gaps, suffix='aln', fix_refseq=False,
         not_add_prefix=None, partition_method='genes', simple_concat=False):
    """Concatenate per-gene alignments under `indir` into one supermatrix.

    Genomes listed in `genome_list` become the rows; each ``*.{suffix}``
    alignment contributes one partition, and genomes absent from an
    alignment are padded with gaps. Depending on `concat_type`, also writes
    a partition file and/or a phylip file, plus an optional stats graph.

    :param indir: directory holding the per-gene alignment files
    :param outfile: output alignment path; None -> <indir>/concat_aln.aln
    :param genome_list: file with one genome id per line; None -> default
                        <indir>/selected_genomes.txt
    :param gene_list: optional restriction — a file path (one gene name per
                      line) or a comma-separated string of gene names
    :param remove_identical: if truthy, run remove_identical_seqs on the result
    :param seed: forwarded to remove_identical_seqs
    :param concat_type: 'both', 'partition' or 'phy' (case-insensitive)
    :param graph: if True, draw the per-gene missing-genome stats graph
    :param fill_gaps: forwarded to generate_phy_file
    :param suffix: alignment file suffix (default 'aln')
    :param fix_refseq: use 'GCF_' instead of 'GCA_' prefix on output names
    :param not_add_prefix: file listing ids that must not get a prefix
    :param partition_method: forwarded to generate_phy_file
    :param simple_concat: if True, match sequence ids to genome ids verbatim
                          instead of via converted accessions
    """
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    gids = open(genome_list, 'r').read().split('\n')
    if simple_concat:
        gids = set(gids)
    else:
        # convert assembly accessions into internal genome ids
        gids = [convert_genome_ID(_) for _ in gids if _]
    if fix_refseq:
        prefix = 'GCF_'
    else:
        prefix = 'GCA_'
    if not_add_prefix is not None:
        not_add_prefix_ids = [
            _ for _ in open(not_add_prefix).read().split('\n') if _
        ]
    else:
        not_add_prefix_ids = []
    # from GCA become locus_tag
    record_pos_info = []
    # per-genome growing concatenated sequence string
    gid2record = {gid: '' for gid in gids}
    # running end position (1-based) of the previously appended partition
    las_pos = 0
    order_seqs = sorted(glob(join(indir, f'*.{suffix}')))
    if gene_list is not None:
        # gene_list is either a file of names or a comma-separated string
        if exists(str(gene_list)):
            gene_list = [
                _.strip() for _ in open(gene_list).read().split('\n') if _
            ]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
        elif isinstance(gene_list, str):
            gene_list = [_.strip() for _ in gene_list.split(',') if _]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
    # per-gene counter of genomes missing from that alignment
    g2num_miss = {basename(_).replace(f'.{suffix}', ''): 0
                  for _ in order_seqs}
    tqdm.write('itering all requested files ')
    for idx, aln_file in tqdm(enumerate(order_seqs), total=len(order_seqs)):
        aln_file_name = basename(aln_file).replace(f'.{suffix}', '')
        aln_record = AlignIO.read(aln_file, format='fasta')
        length_this_aln = aln_record.get_alignment_length()
        # record the partition
        name = "part%s" % int(idx + 1)
        start, end = las_pos + 1, length_this_aln + las_pos
        las_pos = end
        record_pos_info.append((name, start, end, aln_record))
        # done record
        for gid in gid2record:
            if simple_concat:
                records = [_ for _ in aln_record if _.id == gid]
            else:
                # ids look like "<genomeid>_<locus>"; match on the prefix
                # before the first '_' — TODO confirm id layout
                records = [_ for _ in aln_record if _.id.split('_')[0] == gid]
            if records:
                gid2record[gid] += str(records[0].seq)
            else:
                # genome absent from this gene: pad with gaps, count the miss
                gid2record[gid] += '-' * length_this_aln
                g2num_miss[aln_file_name] += 1
    if outfile is None:
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        outfile = process_path(outfile)
        if not exists(dirname(outfile)):
            os.makedirs(dirname(outfile))
        outpartition = outfile.rpartition('.')[0] + '.partition'
        outphy = outfile.rpartition('.')[0] + '.phy'
        ograph = join(dirname(outfile), 'aln_stats.png')
    with open(outfile, 'w') as f1:
        for gid, seq in gid2record.items():
            # skip genomes whose concatenated sequence is all gaps
            if set(str(seq)) == {'-'}:
                print(f"{gid} contains only gaps or missing data ")
                continue
            if simple_concat:
                f1.write(f">{gid}\n")
            else:
                f1.write(
                    f'>{convert_genome_ID_rev(gid, prefix=prefix,not_add_prefix_ids=not_add_prefix_ids)}\n'
                )
            f1.write(f'{seq}\n')
    if remove_identical:
        remove_identical_seqs(outfile, seed=seed)
    if concat_type.lower() in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if concat_type.lower() in ['both', 'phy']:
        # re-read the raw genome list for the phylip writer
        gids = open(genome_list, 'r').read().split('\n')
        if not simple_concat:
            name_convertor = lambda x: convert_genome_ID_rev(
                x, not_add_prefix_ids=not_add_prefix_ids)
        else:
            name_convertor = lambda x: x
        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)
# iTOL branch-color annotation dataset keyed by phylum/class taxonomy
id2info = gid2taxon
id2info, info2color = get_colors_general(id2info)
text = to_color_branch(id2info,
                       info2color,
                       dataset_name='phylum/class',
                       no_legend=True)
with open('./itol_txt/phylum_annotate_branch.txt', 'w') as f1:
    f1.write(text)

# annotate 27 genes
from Bio import SeqIO

rrna_dir = './rrna'
# genome -> list of genes flagged truthy in _subgenome2cdd
gid2genes = {k: [_k for _k, _v in v.items() if _v]
             for k, v in _subgenome2cdd.items()}
# add 16S/23S presence from the rRNA fasta files; sequence ids are assumed
# to start with "<genomeid>_" — TODO confirm against the fasta headers
for record in SeqIO.parse(join(rrna_dir, '16S.fasta'), format='fasta'):
    gname = 'GCA_' + convert_genome_ID_rev(record.id.split('_')[0])
    if gname in gid2genes:
        gid2genes[gname].append('16S')
for record in SeqIO.parse(join(rrna_dir, '23S.fasta'), format='fasta'):
    gname = 'GCA_' + convert_genome_ID_rev(record.id.split('_')[0])
    if gname in gid2genes:
        gid2genes[gname].append('23S')
all_genes = set([_ for vl in gid2genes.values() for _ in vl])
# binary-shape dataset: every gene gets the same marker color
text = to_binary_shape(gid2genes,
                       {g: {'color': '#007acc'} for g in all_genes})
with open('./itol_txt/27genes.txt', 'w') as f1:
    f1.write(text)

# annotate cog25
from dating_workflow.step_script.extract_cog25 import parse_annotation