import ast
import os
from glob import glob
from os.path import basename, dirname, exists, join

import pandas as pd
from Bio import AlignIO
from tqdm import tqdm

# Package-level helpers such as process_path, retrieve_info, filtration_part,
# outut_for, convert_genome_ID, convert_genome_ID_rev, remove_identical_seqs,
# generate_partition_file, generate_phy_file, generate_stats_graph,
# get_genomes, get_genes, concat_records and remaining_columns are assumed to
# be imported from this package's own modules.


def main(indir, odir, suffix, evalue, transpose, prefix, test):
    indir = process_path(indir)
    odir = process_path(odir)
    gid2locus2ko = retrieve_info(indir, suffix, test_or_not=test)
    post_filtered = filtration_part(gid2locus2ko, evalue)
    if not exists(odir):
        os.makedirs(odir)
    if prefix is not None:
        ofile_info = join(odir, f"{prefix}_info.tab")
        ofile_binary = join(odir, f"{prefix}_binary.tab")
        ofile_num = join(odir, f"{prefix}_num.tab")
    else:
        ofile_info = join(odir, "merged_hmm_info.tab")
        ofile_binary = join(odir, "merged_hmm_binary.tab")
        ofile_num = join(odir, "merged_hmm_num.tab")
    tqdm.write("Filtration complete...")
    tqdm.write("Converting the resulting dict into a DataFrame takes time. Be patient...")
    final_df = pd.DataFrame.from_dict(post_filtered, orient='index')
    bin_df = final_df.applymap(lambda x: 0 if pd.isna(x) else 1)
    num_df = final_df.applymap(
        lambda x: 0 if pd.isna(x) else len(str(x).split(',')))
    if transpose:
        final_df = final_df.T
        bin_df = bin_df.T
        num_df = num_df.T
    final_df.to_csv(ofile_info, sep='\t', index=True)
    bin_df.to_csv(ofile_binary, sep='\t', index=True)
    num_df.to_csv(ofile_num, sep='\t', index=True)
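# Every entry point here normalises user-supplied paths through process_path
# before touching the filesystem. Its real implementation lives elsewhere in
# the package; a minimal sketch of what it presumably does (a hypothetical
# reconstruction, not the package's actual code):
def process_path(path):
    from os.path import abspath, expanduser
    # Expand '~' and relative segments so the downstream dirname()/makedirs()
    # calls behave predictably.
    return abspath(expanduser(path))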
def main(indir, odir, evalue, transpose, test):
    indir = process_path(indir)
    odir = process_path(odir)
    gid2locus2ko, exists_db = retrieve_info(indir, test=test)
    locus2ko, sep_l2ko = filtration_part(gid2locus2ko, exists_db, evalue)
    tqdm.write("Filtration complete...")
    if not exists(odir):
        os.makedirs(odir)
    outut_for(locus2ko, odir, name='mixed', transpose=transpose)
    for db, l2ko in sep_l2ko.items():
        outut_for(l2ko,
                  join(odir, f'annotated_with_{db}'),
                  name=db,
                  transpose=transpose)
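# outut_for (sic) is the per-database writer this refactor delegates to; its
# body is not shown here. Judging from the single-output main() above, a
# hedged sketch of what it likely does:
def outut_for(l2ko, odir, name, transpose=False):
    # Hypothetical sketch: dump a locus->KO mapping as info/binary/count
    # tables under odir, mirroring the earlier single-output version.
    if not exists(odir):
        os.makedirs(odir)
    df = pd.DataFrame.from_dict(l2ko, orient='index')
    bin_df = df.applymap(lambda x: 0 if pd.isna(x) else 1)
    num_df = df.applymap(lambda x: 0 if pd.isna(x) else len(str(x).split(',')))
    if transpose:
        df, bin_df, num_df = df.T, bin_df.T, num_df.T
    df.to_csv(join(odir, f"{name}_info.tab"), sep='\t', index=True)
    bin_df.to_csv(join(odir, f"{name}_binary.tab"), sep='\t', index=True)
    num_df.to_csv(join(odir, f"{name}_num.tab"), sep='\t', index=True)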
def main(indir, ofile):
    indir = process_path(indir)
    ofile = process_path(ofile)
    if not exists(dirname(ofile)):
        os.makedirs(dirname(ofile))
    all_tsv = glob(join(indir, '*', 'storage', 'bin_stats_ext.tsv'))
    result = {}
    for each_f in tqdm(all_tsv):
        text = open(each_f).read().strip('\n')
        gid, v = text.split('\t')
        # The value column is a Python dict literal; parse it safely instead
        # of calling eval() on file contents.
        v = ast.literal_eval(v)
        v = {k: _v for k, _v in v.items() if not k.startswith('GCN')}
        result[gid] = v
    new_df = pd.DataFrame.from_dict(result, orient='index')
    new_df.to_csv(ofile, sep='\t')
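# Each bin_stats_ext.tsv holds a single genome: an identifier, a tab, and a
# serialised Python dict of CheckM statistics. A toy parser mirroring the
# loop above (the field names are illustrative, not CheckM's full output):
def _parse_bin_stats_line(line):
    gid, raw = line.strip('\n').split('\t')
    stats = ast.literal_eval(raw)
    # drop the gene-copy-number histogram keys, as main() does
    return gid, {k: v for k, v in stats.items() if not k.startswith('GCN')}

# _parse_bin_stats_line("bin_1\t{'Completeness': 98.2, 'GCN0': 12}")
# -> ('bin_1', {'Completeness': 98.2})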
def process_IO(infile, out):
    if out is None:
        out = infile.rpartition('.')[0] + '.newick'
    else:
        out = process_path(out)
    # guard against dirname(out) == '' when a bare relative filename is given
    if dirname(out) and not exists(dirname(out)):
        os.makedirs(dirname(out))
    return out
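# Usage note: with out=None the output name is derived from the input by
# swapping the final extension for '.newick'; otherwise the user-supplied
# path is normalised. Paths below are illustrative:
#
#   process_IO('/data/trees/species.nwk', None)
#   # -> '/data/trees/species.newick'
#   process_IO('species.nwk', '~/results/dated.newick')
#   # -> absolute path under ~/results, directory created if missing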
def main(indir, ofile, clean):
    indir = process_path(indir)
    ofile = process_path(ofile)
    if not exists(dirname(ofile)):
        os.makedirs(dirname(ofile))
    all_tsv = glob(join(indir, "*", "storage", "bin_stats_ext.tsv"))
    result = {}
    for each_f in tqdm(all_tsv):
        text = open(each_f).read().strip("\n")
        gid, v = text.split("\t")
        # The value column is a Python dict literal; parse it safely.
        v = ast.literal_eval(v)
        v = {k: _v for k, _v in v.items() if not k.startswith("GCN")}
        result[gid] = v
    new_df = pd.DataFrame.from_dict(result, orient="index")
    if clean:
        # remaining_columns is expected at module level: the subset of
        # CheckM columns worth keeping.
        new_df = new_df.loc[:, remaining_columns]
    new_df.to_csv(ofile, sep="\t")
def cli(intree_ori, mcmc_out_tree, output_dating_result_tree,
        itol_annotate, root_with, tree_format):
    output_dating_result_tree = process_path(output_dating_result_tree)
    if itol_annotate is None:
        itol_annotate = dirname(output_dating_result_tree)
    itol_annotate = process_path(itol_annotate)
    # Normalise root_with into a list of taxon names (or leave it None).
    if root_with is None:
        pass
    elif ',' in str(root_with):
        root_with = [_.strip() for _ in root_with.split(',')]
    else:
        root_with = [root_with.strip()]
    if not os.path.exists(dirname(output_dating_result_tree)):
        os.makedirs(dirname(output_dating_result_tree))
    main(intree_ori,
         mcmc_out_tree,
         output_dating_result_tree,
         itol_annotate=itol_annotate,
         root_with=root_with,
         tree_format=tree_format)
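# The normalisation above leaves root_with in one of three shapes
# (the taxon names below are made up):
#
#   root_with="OutgroupA,OutgroupB" -> ["OutgroupA", "OutgroupB"]
#   root_with="OutgroupA"           -> ["OutgroupA"]
#   root_with=None                  -> None (no re-rooting requested)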
def cli(indir, ofile, suffix, num_percentage, num_genomes, genome_list):
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    with open(genome_list, 'r') as f1:
        gids = f1.read().split('\n')
    # skip blank lines before converting the ids
    gids = [convert_genome_ID(_) for _ in gids if _]
    if num_genomes is None and num_percentage is None:
        num_percentage = 100
        num_genomes = len(gids)
    elif num_genomes is None:
        num_percentage = int(num_percentage)
        num_genomes = len(gids) * num_percentage / 100
    else:
        num_genomes = int(num_genomes)
    genes = main(indir, suffix, num_genomes)
    ofile = process_path(ofile)
    if not exists(dirname(ofile)):
        os.makedirs(dirname(ofile))
    print(f"Found {len(genes)} genes meeting the requirement.")
    with open(ofile, 'w') as f1:
        f1.write('\n'.join(genes))
def cli(indir, ofile, suffix, num_percentage, num_genes, num_total_genes,
        not_add_prefix):
    if num_genes is None and num_percentage is None:
        num_percentage = 100
        num_genes = num_total_genes
    elif num_genes is None:
        num_percentage = int(num_percentage)
        num_genes = num_total_genes * num_percentage / 100
    else:
        num_genes = int(num_genes)
    tqdm.write(f"Filtering genomes against a threshold of {num_genes} genes")
    if not_add_prefix is not None:
        not_add_prefix_ids = [
            _ for _ in open(not_add_prefix).read().split('\n') if _
        ]
    else:
        not_add_prefix_ids = []
    genomes = main(indir, suffix, num_genes, not_add_prefix_ids)
    ofile = process_path(ofile)
    if not exists(dirname(ofile)):
        os.makedirs(dirname(ofile))
    with open(ofile, 'w') as f1:
        f1.write('\n'.join(list(genomes)))
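# Both threshold CLIs above share the same percentage-to-count arithmetic.
# A worked example with made-up numbers:
#
#   num_total_genes = 120, num_percentage = 75
#   -> num_genes = 120 * 75 / 100 = 90.0   (stays a float unless cast)
#   neither given  -> num_percentage = 100, so every gene/genome is required
#   explicit count -> used as-is after int()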
def main(indir, outfile, genome_list, gene_list, remove_identical, seed,
         concat_type, graph, fill_gaps, suffix='aln', fix_refseq=False,
         not_add_prefix=None, partition_method='genes', simple_concat=False):
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    gids = open(genome_list, 'r').read().split('\n')
    if simple_concat:
        gids = set(gids)
    else:
        gids = [convert_genome_ID(_) for _ in gids if _]
    if fix_refseq:
        prefix = 'GCF_'
    else:
        prefix = 'GCA_'
    if not_add_prefix is not None:
        not_add_prefix_ids = [
            _ for _ in open(not_add_prefix).read().split('\n') if _
        ]
    else:
        not_add_prefix_ids = []
    # from GCA accession to locus_tag prefix
    record_pos_info = []
    gid2record = {gid: '' for gid in gids}
    las_pos = 0
    order_seqs = sorted(glob(join(indir, f'*.{suffix}')))
    if gene_list is not None:
        if exists(str(gene_list)):
            # a file listing one gene name per line
            gene_list = [
                _.strip() for _ in open(gene_list).read().split('\n') if _
            ]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
        elif isinstance(gene_list, str):
            # a comma-separated string of gene names
            gene_list = [_.strip() for _ in gene_list.split(',') if _]
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(f'.{suffix}', '') in gene_list
            ]
    g2num_miss = {
        basename(_).replace(f'.{suffix}', ''): 0 for _ in order_seqs
    }
    tqdm.write('Iterating over all requested alignment files')
    for idx, aln_file in tqdm(enumerate(order_seqs), total=len(order_seqs)):
        aln_file_name = basename(aln_file).replace(f'.{suffix}', '')
        aln_record = AlignIO.read(aln_file, format='fasta')
        length_this_aln = aln_record.get_alignment_length()
        # record the partition
        name = "part%s" % int(idx + 1)
        start, end = las_pos + 1, length_this_aln + las_pos
        las_pos = end
        record_pos_info.append((name, start, end, aln_record))
        # append each genome's sequence, padding with gaps when absent
        for gid in gid2record:
            if simple_concat:
                records = [_ for _ in aln_record if _.id == gid]
            else:
                records = [_ for _ in aln_record if _.id.split('_')[0] == gid]
            if records:
                gid2record[gid] += str(records[0].seq)
            else:
                gid2record[gid] += '-' * length_this_aln
                g2num_miss[aln_file_name] += 1
    if outfile is None:
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        outfile = process_path(outfile)
        if not exists(dirname(outfile)):
            os.makedirs(dirname(outfile))
        outpartition = outfile.rpartition('.')[0] + '.partition'
        outphy = outfile.rpartition('.')[0] + '.phy'
        ograph = join(dirname(outfile), 'aln_stats.png')
    with open(outfile, 'w') as f1:
        for gid, seq in gid2record.items():
            if set(str(seq)) == {'-'}:
                print(f"{gid} contains only gaps or missing data; skipped")
                continue
            if simple_concat:
                f1.write(f">{gid}\n")
            else:
                f1.write(
                    f">{convert_genome_ID_rev(gid, prefix=prefix, not_add_prefix_ids=not_add_prefix_ids)}\n"
                )
            f1.write(f'{seq}\n')
    if remove_identical:
        remove_identical_seqs(outfile, seed=seed)
    if concat_type.lower() in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if concat_type.lower() in ['both', 'phy']:
        gids = open(genome_list, 'r').read().split('\n')
        if not simple_concat:
            def name_convertor(x):
                return convert_genome_ID_rev(
                    x, not_add_prefix_ids=not_add_prefix_ids)
        else:
            def name_convertor(x):
                return x
        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)
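# record_pos_info carries (name, start, end, alignment) tuples, so the
# partition writer only needs the coordinates. Its real implementation lives
# elsewhere; a minimal sketch assuming a RAxML-style partition format
# (the 'DNA' datatype is an assumption, not taken from the source):
def generate_partition_file(outpartition, record_pos_info):
    with open(outpartition, 'w') as f1:
        for name, start, end, _aln in record_pos_info:
            # e.g. 'DNA, part1 = 1-300'
            f1.write(f"DNA, {name} = {start}-{end}\n")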
def main(indir, outfile, genome_list, gene_list, concat_type, graph,
         fill_gaps, suffix='aln', fix_refseq=False, remove_identical=False,
         partition_method='genes', simple_concat=True):
    """
    If simple_concat is True, the names in `genome_list` are plain genome
    names. If False, they are converted/formatted genome names, e.g. the
    prefix of a locus tag.
    """
    if fix_refseq:
        prefix = 'GCF_'
    else:
        prefix = 'GCA_'
    # sample the genomes
    name2prefix = get_genomes(genome_list, simple_concat)
    # sample the genes
    order_seqs = get_genes(indir, suffix, gene_list)
    # initialise the per-gene missing-genome counter
    g2num_miss = {
        basename(_).replace(f'.{suffix}', ''): 0 for _ in order_seqs
    }
    # concatenate the sequences
    record_pos_info, name2record = concat_records(order_seqs, name2prefix,
                                                  g2num_miss, suffix,
                                                  simple_concat)
    print(f"Found {len([k for k, v in g2num_miss.items() if v == 0])} backbone genes")
    if outfile is None and ',' not in indir:
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        outfile = process_path(outfile)
        if not exists(dirname(outfile)):
            os.makedirs(dirname(outfile))
        outpartition = outfile.rpartition('.')[0] + '.partition'
        outphy = outfile.rpartition('.')[0] + '.phy'
        ograph = join(dirname(outfile), 'aln_stats.png')
    with open(outfile, 'w') as f1:
        for final_name, seq in name2record.items():
            if set(str(seq)) == {'-'}:
                print(f"{final_name} contains only gaps or missing data; skipped")
                continue
            if simple_concat:
                f1.write(f">{final_name}\n")
            else:
                f1.write(f">{convert_genome_ID_rev(final_name, prefix=prefix)}\n")
            f1.write(f'{seq}\n')
    if remove_identical:
        remove_identical_seqs(outfile)
    if concat_type.lower() in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if concat_type.lower() in ['both', 'phy']:
        gids = list(name2prefix)

        def name_convertor(x):
            # map a locus-style id back to the requested genome name
            tmp = [
                k for k, v in name2prefix.items()
                if x.split('_')[0] in v or x in v
            ]
            if not tmp:
                return None
            return tmp[0]

        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)
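# get_genomes replaces the inline genome-list parsing of the earlier version.
# Reconstructed from how name2prefix is consumed above (a mapping from the
# requested genome name to the set of record ids accepted for it); a
# hypothetical sketch, not the package's actual code:
def get_genomes(genome_list, simple_concat):
    names = [_ for _ in open(genome_list).read().split('\n') if _]
    if simple_concat:
        # record ids are matched verbatim against the requested names
        return {name: {name} for name in names}
    # otherwise records are matched via the converted locus-tag prefix
    return {name: {convert_genome_ID(name)} for name in names}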