Ejemplo n.º 1
0
def main(indir, odir, suffix, evalue, transpose, prefix, test):
    """Filter annotation hits and write info/binary/count tables into ``odir``.

    Hits are loaded with ``retrieve_info`` and filtered with
    ``filtration_part`` (both defined elsewhere; their exact semantics are
    not visible from this function). The filtered dict is converted into a
    DataFrame with one row per top-level key, and three tab-separated files
    are written:

    * ``<stem>_info.tab``   - the raw cell values,
    * ``<stem>_binary.tab`` - 1 where a cell holds a value, 0 where it is NA,
    * ``<stem>_num.tab``    - number of comma-separated items per cell
      (0 where the cell is NA).

    ``<stem>`` is ``prefix`` when given, otherwise ``merged_hmm``.

    Args:
        indir: Input directory, normalised through ``process_path``.
        odir: Output directory; created if it does not exist.
        suffix: Forwarded to ``retrieve_info``.
        evalue: Forwarded to ``filtration_part``.
        transpose: When truthy, all three tables are transposed before
            being written.
        prefix: Optional stem for the output file names.
        test: Forwarded to ``retrieve_info`` as ``test_or_not``.
    """
    indir = process_path(indir)
    odir = process_path(odir)
    gid2locus2ko = retrieve_info(indir, suffix, test_or_not=test)
    post_filtered = filtration_part(gid2locus2ko, evalue)
    # exist_ok avoids the check-then-create race of `if not exists(...)`.
    os.makedirs(odir, exist_ok=True)

    stem = prefix if prefix is not None else "merged_hmm"
    ofile_info = join(odir, f"{stem}_info.tab")
    ofile_binary = join(odir, f"{stem}_binary.tab")
    ofile_num = join(odir, f"{stem}_num.tab")

    tqdm.write("Complete filterations...")
    tqdm.write(
        "It need time to convert the generated dict into DataFrame. Be patient..."
    )
    final_df = pd.DataFrame.from_dict(post_filtered, orient='index')

    # DataFrame.applymap was renamed to DataFrame.map in newer pandas and the
    # old name is deprecated; use whichever the installed version provides.
    # Looked up on the class so a column called "map" cannot shadow it.
    cellwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    bin_df = cellwise(final_df, lambda x: 0 if pd.isna(x) else 1)
    num_df = cellwise(
        final_df,
        lambda x: 0 if pd.isna(x) else len(str(x).split(',')))

    if transpose:
        final_df, bin_df, num_df = final_df.T, bin_df.T, num_df.T

    for table, path in ((final_df, ofile_info),
                        (bin_df, ofile_binary),
                        (num_df, ofile_num)):
        table.to_csv(path, sep='\t', index=True)
Ejemplo n.º 2
0
def main(indir, odir, evalue, transpose, test):
    """Filter annotations and write one merged table plus one per database.

    The annotations and the list of databases come from ``retrieve_info``;
    ``filtration_part`` returns the merged mapping and a per-database
    mapping (both helpers are defined elsewhere). The merged result is
    written straight into ``odir`` under the name ``mixed``; each database
    gets its own ``annotated_with_<db>`` sub-directory.

    Args:
        indir: Input directory, normalised through ``process_path``.
        odir: Output directory; created when missing.
        evalue: Forwarded to ``filtration_part``.
        transpose: Forwarded to ``outut_for``.
        test: Forwarded to ``retrieve_info``.
    """
    src_dir = process_path(indir)
    dest_dir = process_path(odir)

    annotations, known_dbs = retrieve_info(src_dir, test=test)
    merged, per_db = filtration_part(annotations, known_dbs, evalue)
    tqdm.write("Complete filterations...")

    if not exists(dest_dir):
        os.makedirs(dest_dir)

    # Merged table first, then one output per annotation database.
    outut_for(merged, dest_dir, name='mixed', transpose=transpose)
    for db_name in per_db:
        db_dir = join(dest_dir, f'annotated_with_{db_name}')
        outut_for(per_db[db_name], db_dir, name=db_name, transpose=transpose)
Ejemplo n.º 3
0
def main(indir, ofile):
    """Merge every ``*/storage/bin_stats_ext.tsv`` under ``indir`` into one table.

    Each input file is expected to hold exactly two tab-separated fields: an
    identifier and a Python dict literal of statistics (presumably CheckM
    output - confirm against the producer). Keys starting with ``GCN`` are
    dropped. The result has one row per identifier and is written to
    ``ofile`` as a tab-separated file.

    Args:
        indir: Directory whose immediate sub-directories contain
            ``storage/bin_stats_ext.tsv``.
        ofile: Output path; its parent directory is created when missing.

    Raises:
        ValueError: If a file does not split into exactly two fields, or
            its second field is not a plain Python literal.
    """
    # Local import so the block does not depend on a file-level import.
    import ast

    indir = process_path(indir)
    ofile = process_path(ofile)

    out_dir = dirname(ofile)
    if out_dir:
        # exist_ok avoids the check-then-create race of `if not exists(...)`.
        os.makedirs(out_dir, exist_ok=True)

    all_tsv = glob(join(indir, '*', 'storage', 'bin_stats_ext.tsv'))
    result = {}
    for each_f in tqdm(all_tsv):
        # Context manager: the original leaked one open handle per file.
        with open(each_f) as fh:
            text = fh.read().strip('\n')
        gid, v = text.split('\t')
        # NOTE: this used to be eval(v). literal_eval parses the same dict
        # literal but cannot execute arbitrary code read from disk.
        v = ast.literal_eval(v)
        result[gid] = {
            k: _v for k, _v in v.items() if not k.startswith('GCN')
        }
    new_df = pd.DataFrame.from_dict(result, orient='index')
    new_df.to_csv(ofile, sep='\t')
Ejemplo n.º 4
0
def process_IO(infile, out):
    """Resolve the output path for a converted tree file.

    Args:
        infile: Path of the input file.
        out: Requested output path, or ``None`` to derive it from
            ``infile`` by swapping the file extension for ``.newick``.

    Returns:
        The output path. When ``out`` is given it is normalised through
        ``process_path`` and its parent directory is created if missing.
    """
    if out is None:
        # splitext only strips a real extension of the final path component.
        # The previous rpartition('.') returned '' for names without a dot
        # and cut at dots inside directory names.
        return os.path.splitext(infile)[0] + '.newick'

    out = process_path(out)
    out_dir = dirname(out)
    if out_dir:
        # exist_ok avoids the check-then-create race of `if not exists(...)`.
        os.makedirs(out_dir, exist_ok=True)
    return out
Ejemplo n.º 5
0
def main(indir, ofile, clean):
    """Merge every ``*/storage/bin_stats_ext.tsv`` under ``indir`` into one table.

    Each input file is expected to hold exactly two tab-separated fields: an
    identifier and a Python dict literal of statistics (presumably CheckM
    output - confirm against the producer). Keys starting with ``GCN`` are
    dropped. The result has one row per identifier and is written to
    ``ofile`` as a tab-separated file.

    Args:
        indir: Directory whose immediate sub-directories contain
            ``storage/bin_stats_ext.tsv``.
        ofile: Output path; its parent directory is created when missing.
        clean: When truthy, only the columns listed in the module-level
            ``remaining_columns`` are kept (defined outside this function).

    Raises:
        ValueError: If a file does not split into exactly two fields, or
            its second field is not a plain Python literal.
    """
    # Local import so the block does not depend on a file-level import.
    import ast

    indir = process_path(indir)
    ofile = process_path(ofile)

    out_dir = dirname(ofile)
    if out_dir:
        # exist_ok avoids the check-then-create race of `if not exists(...)`.
        os.makedirs(out_dir, exist_ok=True)

    all_tsv = glob(join(indir, "*", "storage", "bin_stats_ext.tsv"))
    result = {}
    for each_f in tqdm(all_tsv):
        # Context manager: the original leaked one open handle per file.
        with open(each_f) as fh:
            text = fh.read().strip("\n")
        gid, v = text.split("\t")
        # NOTE: this used to be eval(v). literal_eval parses the same dict
        # literal but cannot execute arbitrary code read from disk.
        v = ast.literal_eval(v)
        result[gid] = {
            k: _v for k, _v in v.items() if not k.startswith("GCN")
        }
    new_df = pd.DataFrame.from_dict(result, orient="index")
    if clean:
        new_df = new_df.loc[:, remaining_columns]
    new_df.to_csv(ofile, sep="\t")
Ejemplo n.º 6
0
def cli(intree_ori, mcmc_out_tree, output_dating_result_tree, itol_annotate,
        root_with, tree_format):
    """Normalise the command-line arguments and hand them to ``main``.

    Args:
        intree_ori: Passed to ``main`` unchanged.
        mcmc_out_tree: Passed to ``main`` unchanged.
        output_dating_result_tree: Output tree path; normalised through
            ``process_path`` and its parent directory is created if missing.
        itol_annotate: Annotation location; defaults to the directory of
            the output tree when ``None``.
        root_with: ``None``, a single name, or a comma-separated string of
            names. Non-``None`` values are turned into a list of stripped
            names.
        tree_format: Passed to ``main`` unchanged.
    """
    output_dating_result_tree = process_path(output_dating_result_tree)
    tree_dir = dirname(output_dating_result_tree)

    itol_annotate = process_path(
        tree_dir if itol_annotate is None else itol_annotate)

    if root_with is not None:
        if ',' in str(root_with):
            root_with = [name.strip() for name in root_with.split(',')]
        else:
            root_with = [root_with.strip()]

    if not os.path.exists(tree_dir):
        os.makedirs(tree_dir)

    main(intree_ori,
         mcmc_out_tree,
         output_dating_result_tree,
         itol_annotate=itol_annotate,
         root_with=root_with,
         tree_format=tree_format)
def cli(indir, ofile, suffix, num_percentage, num_genomes, genome_list):
    """Select genes that meet a genome-count threshold and write them to ``ofile``.

    The threshold passed to ``main`` is, in order of precedence:
    ``num_genomes`` when given; otherwise ``num_percentage`` percent of the
    listed genomes; otherwise the number of listed genomes.

    Args:
        indir: Input directory, forwarded to ``main``.
        ofile: Output path; one selected gene per line.
        suffix: Forwarded to ``main``.
        num_percentage: Optional percentage of the listed genomes.
        num_genomes: Optional absolute threshold.
        genome_list: File with one genome ID per line; defaults to
            ``<indir>/selected_genomes.txt``.
    """
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    with open(genome_list, 'r') as f1:
        # Skip blank lines: a trailing newline used to count as an extra
        # genome and inflated the threshold derived from len(gids).
        gids = [
            convert_genome_ID(_) for _ in f1.read().split('\n') if _.strip()
        ]

    if num_genomes is not None:
        num_genomes = int(num_genomes)
    elif num_percentage is not None:
        # May be fractional; main() receives the value as computed.
        num_genomes = len(gids) * int(num_percentage) / 100
    else:
        num_genomes = len(gids)
    genes = main(indir, suffix, num_genomes)

    ofile = process_path(ofile)
    out_dir = dirname(ofile)
    if out_dir:
        # exist_ok avoids the check-then-create race of `if not exists(...)`.
        os.makedirs(out_dir, exist_ok=True)
    print(f"found {len(genes)} meet requirement.")
    with open(ofile, 'w') as f1:
        f1.write('\n'.join(genes))
Ejemplo n.º 8
0
def cli(indir, ofile, suffix, num_percentage, num_genes, num_total_genes,
        not_add_prefix):
    """Select genomes by gene-count threshold and write them to ``ofile``.

    The threshold passed to ``main`` is, in order of precedence:
    ``num_genes`` when given; otherwise ``num_percentage`` percent of
    ``num_total_genes``; otherwise ``num_total_genes`` itself.

    Args:
        indir: Input directory, forwarded to ``main``.
        ofile: Output path; one selected genome per line.
        suffix: Forwarded to ``main``.
        num_percentage: Optional percentage of ``num_total_genes``.
        num_genes: Optional absolute threshold.
        num_total_genes: Total number of genes the percentage refers to.
        not_add_prefix: Optional file with one ID per line; its non-empty
            lines are forwarded to ``main``.
    """
    if num_genes is not None:
        num_genes = int(num_genes)
    elif num_percentage is not None:
        # May be fractional; main() receives the value as computed.
        num_genes = num_total_genes * int(num_percentage) / 100
    else:
        num_genes = num_total_genes
    tqdm.write(f"Filter out genomes which only contain {num_genes} ")

    not_add_prefix_ids = []
    if not_add_prefix is not None:
        # Context manager: the original left this handle open.
        with open(not_add_prefix) as fh:
            not_add_prefix_ids = [_ for _ in fh.read().split('\n') if _]
    genomes = main(indir, suffix, num_genes, not_add_prefix_ids)

    ofile = process_path(ofile)
    out_dir = dirname(ofile)
    if out_dir:
        # exist_ok avoids the check-then-create race of `if not exists(...)`.
        os.makedirs(out_dir, exist_ok=True)
    with open(ofile, 'w') as f1:
        f1.write('\n'.join(genomes))
Ejemplo n.º 9
0
def main(indir,
         outfile,
         genome_list,
         gene_list,
         remove_identical,
         seed,
         concat_type,
         graph,
         fill_gaps,
         suffix='aln',
         fix_refseq=False,
         not_add_prefix=None,
         partition_method='genes',
         simple_concat=False):
    """Concatenate per-gene FASTA alignments into one alignment per genome.

    Every ``*.<suffix>`` file in ``indir`` (optionally restricted by
    ``gene_list``) is read in sorted order. For each listed genome the
    first matching record of each alignment is appended; a genome missing
    from an alignment gets a run of ``-`` of that alignment's length.
    Genomes whose concatenated sequence consists only of gaps are reported
    and left out of the FASTA output.

    Args:
        indir: Directory holding the alignment files.
        outfile: Output FASTA path, or ``None`` to write
            ``concat_aln.*`` / ``aln_stats.png`` into ``indir``.
        genome_list: File with one genome name per line; defaults to
            ``<indir>/selected_genomes.txt``. Blank lines are ignored.
        gene_list: ``None``, a path to a file with one gene name per line,
            or a comma-separated string of gene names.
        remove_identical: When truthy, ``remove_identical_seqs`` is run on
            the FASTA output and the flag is forwarded to
            ``generate_phy_file``.
        seed: Forwarded to ``remove_identical_seqs``.
        concat_type: ``'partition'``, ``'phy'`` or ``'both'``
            (case-insensitive) selects the extra outputs.
        graph: When truthy, a missing-data graph is produced.
        fill_gaps: Forwarded to ``generate_phy_file``.
        suffix: Extension of the alignment files (without the dot).
        fix_refseq: Use the ``GCF_`` prefix instead of ``GCA_`` when
            converting IDs back for the FASTA headers.
        not_add_prefix: Optional file listing IDs, forwarded to
            ``convert_genome_ID_rev``.
        partition_method: Forwarded to ``generate_phy_file``.
        simple_concat: When truthy, names in ``genome_list`` are matched
            against full record IDs; otherwise they are converted with
            ``convert_genome_ID`` and matched against the part of the
            record ID before the first ``_``.
    """
    if genome_list is None:
        genome_list = join(indir, 'selected_genomes.txt')
    with open(genome_list, 'r') as fh:
        # Blank lines (typically the trailing newline) are not genome names;
        # they used to produce an empty all-gap genome in simple mode.
        listed_names = [_ for _ in fh.read().split('\n') if _]

    # Bound unconditionally so every later branch can rely on them.
    prefix = 'GCF_' if fix_refseq else 'GCA_'
    not_add_prefix_ids = []
    if simple_concat:
        # De-duplicate while keeping file order; a set made the order of
        # the written sequences vary between runs.
        gids = list(dict.fromkeys(listed_names))
    else:
        gids = [convert_genome_ID(_) for _ in listed_names]
        if not_add_prefix is not None:
            with open(not_add_prefix) as fh:
                not_add_prefix_ids = [_ for _ in fh.read().split('\n') if _]

    ext = f'.{suffix}'
    order_seqs = sorted(glob(join(indir, f'*{ext}')))
    if gene_list is not None:
        wanted = None
        if exists(str(gene_list)):
            with open(gene_list) as fh:
                wanted = {_.strip() for _ in fh.read().split('\n') if _}
        elif isinstance(gene_list, str):
            wanted = {_.strip() for _ in gene_list.split(',') if _}
        if wanted is not None:
            order_seqs = [
                _ for _ in order_seqs
                if basename(_).replace(ext, '') in wanted
            ]

    # gene name -> number of listed genomes absent from that alignment
    g2num_miss = {basename(_).replace(ext, ''): 0 for _ in order_seqs}
    # Collect pieces per genome and join once at the end.
    gid2chunks = {gid: [] for gid in gids}
    record_pos_info = []
    las_pos = 0
    tqdm.write('itering all requested files ')
    for idx, aln_file in tqdm(enumerate(order_seqs), total=len(order_seqs)):
        aln_file_name = basename(aln_file).replace(ext, '')
        aln_record = AlignIO.read(aln_file, format='fasta')
        length_this_aln = aln_record.get_alignment_length()
        # Record the 1-based, inclusive column range of this gene.
        name = "part%s" % int(idx + 1)
        start, end = las_pos + 1, length_this_aln + las_pos
        las_pos = end
        record_pos_info.append((name, start, end, aln_record))

        # Index the alignment once, keeping the first record per key,
        # instead of rescanning every record for every genome.
        first_record = {}
        for rec in aln_record:
            key = rec.id if simple_concat else rec.id.split('_')[0]
            first_record.setdefault(key, rec)

        gap_block = '-' * length_this_aln
        for gid, chunks in gid2chunks.items():
            rec = first_record.get(gid)
            if rec is None:
                chunks.append(gap_block)
                g2num_miss[aln_file_name] += 1
            else:
                chunks.append(str(rec.seq))
    gid2record = {gid: ''.join(chunks) for gid, chunks in gid2chunks.items()}

    if outfile is None:
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        outfile = process_path(outfile)
        out_dir = dirname(outfile)
        if out_dir:
            # exist_ok avoids the check-then-create race.
            os.makedirs(out_dir, exist_ok=True)
        stem = outfile.rpartition('.')[0]
        outpartition = stem + '.partition'
        outphy = stem + '.phy'
        ograph = join(out_dir, 'aln_stats.png')

    with open(outfile, 'w') as f1:
        for gid, seq in gid2record.items():
            if set(seq) == {'-'}:
                print(f"{gid} contains only gaps or missing data ")
                continue
            if simple_concat:
                header = gid
            else:
                header = convert_genome_ID_rev(
                    gid, prefix=prefix, not_add_prefix_ids=not_add_prefix_ids)
            f1.write(f'>{header}\n')
            f1.write(f'{seq}\n')

    if remove_identical:
        remove_identical_seqs(outfile, seed=seed)
    output_kind = concat_type.lower()
    if output_kind in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if output_kind in ['both', 'phy']:
        # The phy writer receives the names as listed, not the converted IDs.
        gids = list(listed_names)

        if not simple_concat:
            # NOTE(review): unlike the FASTA headers above, `prefix` is not
            # passed here, so fix_refseq does not affect the phy names -
            # confirm whether that is intended.
            name_convertor = lambda x: convert_genome_ID_rev(
                x, not_add_prefix_ids=not_add_prefix_ids)
        else:
            name_convertor = lambda x: x
        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)
Ejemplo n.º 10
0
def main(indir,
         outfile,
         genome_list,
         gene_list,
         concat_type,
         graph,
         fill_gaps,
         suffix='aln',
         fix_refseq=False,
         remove_identical=False,
         partition_method='genes',
         simple_concat=True):
    """Concatenate per-gene alignments into one alignment per genome.

    ``simple_concat`` indicates that the names in ``genome_list`` are the
    genome names themselves. When it is False, the names are
    converted/formatted genome names such as the prefix of a locus tag.

    Genome and gene selection and the concatenation itself are delegated
    to ``get_genomes``, ``get_genes`` and ``concat_records`` (defined
    elsewhere). This function writes the FASTA output and, depending on
    ``concat_type``, the partition and phy files, plus an optional
    missing-data graph.

    Args:
        indir: Directory (or whatever ``get_genes`` accepts) holding the
            alignment files.
        outfile: Output FASTA path. May be ``None`` only when ``indir``
            contains no comma, in which case ``concat_aln.*`` files are
            written into ``indir``.
        genome_list: Forwarded to ``get_genomes``.
        gene_list: Forwarded to ``get_genes``.
        concat_type: ``'partition'``, ``'phy'`` or ``'both'``
            (case-insensitive) selects the extra outputs.
        graph: When truthy, a missing-data graph is produced.
        fill_gaps: Forwarded to ``generate_phy_file``.
        suffix: Extension of the alignment files (without the dot).
        fix_refseq: Use the ``GCF_`` prefix instead of ``GCA_`` when
            converting names back for the FASTA headers.
        remove_identical: When truthy, ``remove_identical_seqs`` is run on
            the FASTA output and the flag is forwarded to
            ``generate_phy_file``.
        partition_method: Forwarded to ``generate_phy_file``.
        simple_concat: See above.
    """
    prefix = 'GCF_' if fix_refseq else 'GCA_'
    # Sample the genomes.
    name2prefix = get_genomes(genome_list, simple_concat)
    # Bound up front: both the phy and the graph branches need it. It used
    # to be defined only inside the phy branch, so concat_type='partition'
    # combined with graph=True raised NameError.
    gids = list(name2prefix)
    # Sample the genes.
    order_seqs = get_genes(indir, suffix, gene_list)

    # gene name -> missing-genome counter, presumably filled in by
    # concat_records (it is passed in below) - confirm.
    g2num_miss = {basename(_).replace(f'.{suffix}', ''): 0 for _ in order_seqs}

    # Concatenate the sequences.
    record_pos_info, name2record = concat_records(order_seqs,
                                                  name2prefix,
                                                  g2num_miss,
                                                  suffix,
                                                  simple_concat)
    num_backbone = sum(1 for v in g2num_miss.values() if v == 0)
    print(f"Found {num_backbone} backbone genes")

    if outfile is None and ',' not in indir:
        outfile = join(indir, 'concat_aln.aln')
        outpartition = join(indir, 'concat_aln.partition')
        outphy = join(indir, 'concat_aln.phy')
        ograph = join(indir, 'aln_stats.png')
    else:
        # NOTE(review): outfile=None with a comma in indir also lands here;
        # whether process_path accepts None is not visible from this block.
        outfile = process_path(outfile)
        out_dir = dirname(outfile)
        if out_dir:
            # exist_ok avoids the check-then-create race.
            os.makedirs(out_dir, exist_ok=True)
        stem = outfile.rpartition('.')[0]
        outpartition = stem + '.partition'
        outphy = stem + '.phy'
        ograph = join(out_dir, 'aln_stats.png')

    with open(outfile, 'w') as f1:
        for final_name, seq in name2record.items():
            if set(str(seq)) == {'-'}:
                print(f"{final_name} contains only gaps or missing data ")
                continue
            if simple_concat:
                header = final_name
            else:
                header = convert_genome_ID_rev(final_name, prefix=prefix)
            f1.write(f'>{header}\n')
            f1.write(f'{seq}\n')

    if remove_identical:
        remove_identical_seqs(outfile)
    output_kind = concat_type.lower()
    if output_kind in ['both', 'partition']:
        generate_partition_file(outpartition, record_pos_info)
    if output_kind in ['both', 'phy']:

        def name_convertor(x):
            """Return the first genome name matching ``x``, or ``None``.

            A name matches when ``x``, or the part of ``x`` before the
            first ``_``, is contained in its ``name2prefix`` value.
            """
            head = x.split('_')[0]
            for k, v in name2prefix.items():
                if head in v or x in v:
                    return k
            return None

        generate_phy_file(outphy,
                          record_pos_info,
                          gids,
                          fill_gaps=fill_gaps,
                          remove_identical=remove_identical,
                          partition_method=partition_method,
                          name_convertor=name_convertor)
    if graph:
        generate_stats_graph(g2num_miss, total=len(gids), ofile=ograph)