Esempio n. 1
0
def main(fasta_file, nb_chunks, output):
    """Split ``fasta_file`` into ``nb_chunks`` fasta files inside ``output``.

    Sequences are ranked from longest to shortest and dealt round-robin to
    the chunks, so that chunk sizes roughly even out. Chunk files are named
    ``<output>/<stem>_<chunk index><ext>`` where ``<stem>`` is the input file
    name up to its first dot and ``<ext>`` is its last dot-separated suffix
    (empty when the file name contains no dot).

    Args:
        fasta_file: path of the fasta file to split; it is read twice.
        nb_chunks: number of output files (used with ``range``, so an int).
        output: output directory, created if missing.
    """
    # first pass: only the length of each record is needed
    with open(fasta_file) as handle:
        header_len = [(header, len(seq)) for header, seq in sfp(handle)]
    # biggest first; the sort is stable so ties keep their file order
    header_len.sort(key=lambda x: -x[1])

    # no shell involved, so unusual characters in `output` are harmless
    os.makedirs(output, exist_ok=True)

    # derive stem and extension from the file name only, so that dots in
    # parent directories (e.g. "./data/reads") cannot leak into the names
    file_name = os.path.basename(fasta_file)
    name_parts = file_name.split(".")
    ext = "." + name_parts[-1] if len(name_parts) > 1 else ""
    fasta_path = output + "/" + name_parts[0]

    handles = []
    try:
        # open nb_chunks handles
        for nb in range(nb_chunks):
            handles.append(open("%s_%s%s" % (fasta_path, nb, ext), "w"))
        # map header to handle, biggest to smallest seq, so it sort of
        # evens out
        index = 0
        header_to_handle = {}
        for header, _ in header_len:
            header_to_handle[header] = handles[index]
            index = (index + 1) % nb_chunks
        # read the file again, and this time write on all handles at the
        # same time
        with open(fasta_file) as handle:
            for header, seq in sfp(handle):
                header_to_handle[header].write(">%s\n%s\n" % (header, seq))
    finally:
        # close every chunk, even if something failed half way
        for handle in handles:
            handle.close()
Esempio n. 2
0
def Rewrite_gfa(fasta_file, gfa_file, output):
    """Write a copy of ``gfa_file`` whose segments carry the fasta names.

    Each "S" record is renamed to the first space-separated word of the
    fasta header whose sequence equals the segment sequence; "L" records are
    then rewritten with the new names. Only "S" and "L" records are kept, all
    segments first, then all links. Lines are joined with a newline and no
    trailing newline is written.

    Args:
        fasta_file: fasta file holding the named contigs.
        gfa_file: gfa file to rename (read twice).
        output: path of the renamed gfa.
    """
    # contig names are recovered through their sequence, since the gfa
    # produced upstream no longer carries them
    seq_to_contig = {}
    with open(fasta_file) as fasta:
        for title, sequence in sfp(fasta):
            seq_to_contig[sequence] = title.split(' ')[0]

    # first pass: segments, remembering old name -> new name
    old_to_new = {}
    renamed_segments = []
    with open(gfa_file) as gfa:
        for raw in gfa:
            fields = raw.rstrip().split('\t')
            if fields[0] != "S":
                continue
            contig = seq_to_contig[fields[2]]
            old_to_new[fields[1]] = contig
            fields[1] = contig
            renamed_segments.append("\t".join(fields))

    # second pass: links, once every segment name is known
    renamed_links = []
    with open(gfa_file) as gfa:
        for raw in gfa:
            fields = raw.rstrip().split('\t')
            if fields[0] != "L":
                continue
            fields[1], fields[3] = old_to_new[fields[1]], old_to_new[fields[3]]
            renamed_links.append("\t".join(fields))

    with open(output, 'w') as out:
        out.write("\n".join(renamed_segments + renamed_links))
Esempio n. 3
0
def main(paths, output, gfa_files):
    """Merge per-cog gfa files and strain paths into one joint gfa.

    ``paths`` is parsed with ``sfp``: each header is split on "_" into
    exactly (cog, strain) and each sequence is a comma separated list of
    unitigs. Every contig of a cog gfa is renamed ``<cog>_<contig>``, strain
    paths are turned into extra edges by ``generate_edges`` and the buffer
    vertices it registers are written as segments of 100 "N".

    Args:
        paths: fasta-like file of strain paths.
        output: path of the joint gfa to write.
        gfa_files: gfa paths; the file name up to its first "_" is the cog.
    """
    cog_to_gfa = {basename(gfa).split("_")[0]: gfa for gfa in gfa_files}
    with open(paths) as handle:
        cog_lines = [[header] + seq.split(",") for header, seq in sfp(handle)]

    # sort by strain
    strain_paths = defaultdict(list)
    cogs2 = set()  # selected cogs for bayespath are not always outputed
    for line in cog_lines:
        cog, strain = line[0].split("_")
        cogs2.add(cog)
        strain_paths[strain].append(
            ["%s_%s" % (cog, unitig) for unitig in line[1:]])

    # get new edges and buffer vertices; generate_edges is handed the set so
    # it can register vertices in it, hence edges are computed first
    new_vertices = set()
    strain_edge_lines = [
        new_edgs for strain_path in strain_paths.values()
        for new_edgs in generate_edges(strain_path, new_vertices)
    ]

    # add buffer vertices
    vertex_lines = [
        "S\t%s\t%s\tKC:i:10\tCL:z:#000000\tC2:z:#000000\t\n" %
        (name, 100 * 'N') for name in new_vertices
    ]

    # rename contigs and add edges; lines are collected in lists rather than
    # grown with += on a string
    edge_lines = []
    for cog in cogs2:
        with open(cog_to_gfa[cog]) as handle:
            for line in handle:
                splitline = line.rstrip().split("\t")
                record_type = splitline[0]
                if record_type == "S":
                    new_contig = "%s_%s" % (cog, splitline[1])
                    vertex_lines.append("S\t%s\t%s\n" % (
                        new_contig, "\t".join(splitline[2:])))
                elif record_type == "L":
                    new_contig1 = "%s_%s" % (cog, splitline[1])
                    new_contig2 = "%s_%s" % (cog, splitline[3])
                    edge_lines.append("\t".join([
                        "L", new_contig1, splitline[2], new_contig2,
                        splitline[4], splitline[5]
                    ]) + "\n")

    # output joint gfa: vertices, then gfa edges, then strain edges
    with open(output, "w") as handle:
        handle.writelines(vertex_lines)
        handle.writelines(edge_lines)
        handle.writelines(strain_edge_lines)
Esempio n. 4
0
def main(argv):
    """Collect, for every COG, its longest sequence in each selected MAG.

    One ``<COG>.fna`` file is written per COG inside the output folder. Each
    record is named ``Bin_<bin>_<COG> <original header>``.

    Args:
        argv: currently unused, ``parse_args()`` reads ``sys.argv``.
            NOTE(review): confirm whether callers expect ``argv`` to be
            parsed instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("cog_annotation",
                        help="annotation of the full assembly")
    parser.add_argument(
        "cluster_definition",
        help="csv file, firts column is the contig, second is the bin")
    parser.add_argument("mag_list", help="list of mags")
    parser.add_argument("cogs_list", help="list of COGs to collate from bins")
    parser.add_argument(
        "output_folder",
        help="output folder name, where do we want to put the sequences")
    args = parser.parse_args()
    # set of SCG
    # NOTE(review): core_cogs is loaded but never used to filter the COGs
    # below -- confirm whether a filter is missing.
    with open(args.cogs_list, 'r') as f:
        core_cogs = {x.rstrip() for x in f}
    # get mags
    with open(args.mag_list) as handle:
        mags = {element.rstrip() for element in handle}
    # get mags definition
    contigs_to_mags = {}
    with open(args.cluster_definition) as handle:
        for line in handle:
            contig, bin_ = line.rstrip().split(",")
            if bin_ in mags:
                contigs_to_mags[contig] = bin_
    # get SCGs linked to bin: COG -> bin -> [header, seq] of the longest seq
    corecog_bin = defaultdict(lambda: defaultdict(lambda: ["", ""]))
    with open(args.cog_annotation) as handle:
        for header, seq in sfp(handle):
            # headers must be exactly "<orf> <COG> <strand>"
            orf, COG, strand = header.rstrip().split(' ')
            # the contig is the orf name without its last "_<nb>" part
            contig = "_".join(orf.split("_")[:-1])
            if contig in contigs_to_mags:
                mag = contigs_to_mags[contig]
                if len(seq) > len(corecog_bin[COG][mag][1]):
                    corecog_bin[COG][mag] = [header, seq]
    # output
    path_output = args.output_folder
    if path_output and not os.path.isdir(path_output):
        os.makedirs(path_output, exist_ok=True)
    for corecog, dict_List_bins in corecog_bin.items():
        # join so the file lands inside the folder even when the folder name
        # has no trailing "/"
        fna_path = os.path.join(path_output, corecog + ".fna")
        with open(fna_path, "w") as output_handle:
            for bin_name, (header, seq) in dict_List_bins.items():
                new_name = "Bin_" + bin_name + "_" + corecog + " " + header
                output_handle.write(">" + new_name + "\n" + seq + "\n")
0
def main(scg_cov, map_name_to_orfs, output):
    """Concatenate the per-cog trimmed alignments into one fasta per strain.

    Every ``*_trim.msa`` file found next to ``output`` is read; the file name
    without ``_trim.msa`` is the cog. For each strain the cog sequences are
    concatenated in sorted cog order, a missing cog being replaced by a run
    of "-" as long as the (truncated) mean sequence length of that cog.
    Strains flagged by ``is_mag`` are replaced by the entries returned by
    ``mags_seqs``, which maps each new mag name to, per cog, the original
    strain name whose sequence must be used.

    Args:
        scg_cov: forwarded to ``get_cog_coverage``.
        map_name_to_orfs: forwarded to ``get_cog_coverage``.
        output: path of the concatenated fasta to write.
    """
    path = dirname(output)
    # an empty dirname means "current directory"; without this the pattern
    # would become "/*_trim.msa" and search the filesystem root
    msa_files = glob.glob("%s/*_trim.msa" % (path or "."))

    strain_to_cog_seq = defaultdict(dict)
    cog_to_strain_seq = defaultdict(dict)
    for msa_file in msa_files:
        cog = basename(msa_file).replace("_trim.msa", "")
        with open(msa_file) as handle:
            for strain, seq in sfp(handle):
                strain_to_cog_seq[strain][cog] = seq
                cog_to_strain_seq[cog][strain] = seq
    sorted_cogs = sorted(cog_to_strain_seq.keys())
    sorted_strains = sorted(strain_to_cog_seq.keys())
    mean_cog_len = {
        cog: int(np.mean([len(seq) for seq in dict_strain.values()]))
        for cog, dict_strain in cog_to_strain_seq.items()
    }

    ### deal with multiples mag cog : find which should go with which
    # get coverage tot of each mag cog orf
    cog_to_haplo_cov_tot = get_cog_coverage(scg_cov, map_name_to_orfs,
                                            cog_to_strain_seq)
    # choose the consensus sequence orfs, this way too complicated for what
    # it does
    new_mag_to_old = mags_seqs(path, sorted_cogs, sorted_strains,
                               cog_to_haplo_cov_tot)

    # get strain concatenated sequence
    sorted_strains = sorted(
        [strain for strain in sorted_strains if not is_mag(strain)] +
        list(new_mag_to_old.keys()))
    # pieces are collected per strain and joined once at write time
    strain_to_pieces = defaultdict(list)
    for strain in sorted_strains:
        strain_is_mag = is_mag(strain)
        for cog in sorted_cogs:
            if strain_is_mag:
                if cog in new_mag_to_old[strain]:
                    cog_to_seq = strain_to_cog_seq[new_mag_to_old[strain][cog]]
                else:
                    cog_to_seq = {}
            else:
                cog_to_seq = strain_to_cog_seq[strain]
            if cog in cog_to_seq:
                strain_to_pieces[strain].append(cog_to_seq[cog])
            else:
                # deal with missing cogs
                strain_to_pieces[strain].append(mean_cog_len[cog] * "-")
    with open(output, "w") as handle:
        handle.writelines(
            ">%s\n%s\n" % (strain, "".join(pieces))
            for strain, pieces in strain_to_pieces.items())
0
        '-a',
        help=
        "contig assignmeent file : bin/group/strain assigned to contig, first term is contig name following are list of bin/... contig it is in. Some contigs may not be assigned to anything, they are then colored grey, .tsv file"
    )
    parser.add_argument("-s",
                        action='store_true',
                        help="flag, output 1 gfa per strain ")
    parser.add_argument("output", help="output name for colored gfa")
    args = parser.parse_args()
    # deal with multiple way of passing contig assignment
    if args.p:
        # we assume the name of the gfa file contain the name of the cog of interest
        cog = basename(args.gfa_file.replace(".gfa", ""))
        # each kept record becomes [header, unitig, unitig, ...] with the
        # "+"/"-" orientation signs removed from the unitig names
        with open(args.p) as handle:
            cog_lines = [
                [header] +
                [s.replace("-", "").replace("+", "") for s in seq.split(",")]
                for header, seq in sfp(handle) if cog in header
            ]
        contig_to_strains = defaultdict(list)
        for line in cog_lines:
            # the strain is the second "_"-separated part of the header
            strain = line[0].split("_")[1]
            for contig in line[1:]:
                contig_to_strains[contig].append(strain)
    else:
        contig_to_strains = {}
        with open(args.a) as handle:
            for line in handle:
                # strip before splitting, otherwise a contig with no
                # assignment keeps the trailing newline in its name
                line = line.rstrip()
                if not line:
                    continue
                fields = line.split(",")
                contig_to_strains[fields[0]] = fields[1:]

    main(args.gfa_file, contig_to_strains, args.output, args.s)
Esempio n. 7
0
def get_initial_number_of_bins(file, MAX_BIN_NB):
    """Return an initial bin count derived from ``file``, capped.

    The second space-separated word of every header is counted; the result
    is twice the median of those counts, truncated to an int, and never more
    than ``int(MAX_BIN_NB)``.

    Args:
        file: fasta-like file parsed with ``sfp``; every header needs at
            least two space-separated words.
        MAX_BIN_NB: upper bound, anything ``int()`` accepts.
    """
    with open(file) as handle:
        label_counts = Counter(
            header.split(" ")[1] for header, _ in sfp(handle))
    nb_bin = int(2 * np.median(list(label_counts.values())))
    return min(nb_bin, int(MAX_BIN_NB))
Esempio n. 8
0
def map_strain_to_seq(fna_file, fa_file, strain_to_cog_to_orfs):
    """Build strain -> cog -> list of sequences from the orfs of each cog.

    A cog with a single orf gets that orf's sequence. With several orfs, the
    orfs are grouped by contig (orf name without its last "_<nb>" part) and
    contigs are handled in sorted order: a lone orf contributes its sequence,
    orfs with consecutive numbers are replaced by one slice of the contig
    spanning all of them, and non-consecutive orfs contribute one sequence
    each.

    Args:
        fna_file: fasta of orf sequences; the first word of a header is the
            orf name and the header holds " # "-separated fields, of which
            the second and third are used as coordinates.
        fa_file: fasta of contigs, scanned each time orfs are merged.
        strain_to_cog_to_orfs: dict strain -> dict cog -> list of orf names.

    Returns:
        defaultdict strain -> defaultdict cog -> list of sequences.
    """
    orfs = {
        orf for cog_to_orf in strain_to_cog_to_orfs.values()
        for cog_orfs in cog_to_orf.values() for orf in cog_orfs
    }
    with open(fna_file) as handle:
        orf_to_seq_full = {
            header: seq
            for header, seq in sfp(handle) if header.split(" ")[0] in orfs
        }
    orf_to_coordinate = {
        header.split(" ")[0]: header.split(" # ")[1:3]
        for header in orf_to_seq_full
    }
    orf_to_seq = {
        header.split(" ")[0]: seq
        for header, seq in orf_to_seq_full.items()
    }

    def merge_orfs(contig, start, end):
        # Return seq[start:end + 1] of the contig, or None if it is absent.
        # An exact match on the first word of the header wins; the first
        # header merely containing the name is kept as a fallback, so that
        # "k1_1" is not mistaken for "k1_10" when both are present.
        # NOTE(review): if the coordinates are 1-based inclusive this slice
        # is shifted by one base -- confirm against the orf caller.
        fallback = None
        with open(fa_file) as handle:
            for header, seq in sfp(handle):
                if header.split(" ")[0] == contig:
                    return seq[start:end + 1]
                if fallback is None and contig in header:
                    fallback = seq[start:end + 1]
        return fallback

    def same_contig(contig, orfs):
        # case where all orfs are on the same contig
        nbs = [int(orf.split("_")[-1]) for orf in orfs]
        if list(range(min(nbs), max(nbs) + 1)) == sorted(nbs):
            # case where they are all next to each other
            coordinates = [
                int(coords) for orf in orfs
                for coords in orf_to_coordinate[orf]
            ]
            seq = merge_orfs(contig, min(coordinates), max(coordinates))
            # also rejects None, returned when the contig is not found
            assert seq, "no sequence found for %s" % contig
            return [seq]
        # in this case, we assume they are distinct sequences and should be
        # accounted for.
        return [orf_to_seq[orf] for orf in orfs]

    #### deal with case where multiple orfs are present
    # we merge cogs if are on the same contig next to each other (see same_contig)
    # we append the seq if they are on different contigs or not following each other
    strain_to_cog_to_seq = defaultdict(lambda: defaultdict(list))
    for strain, cog_to_orfs in strain_to_cog_to_orfs.items():
        for cog, cog_orfs in cog_to_orfs.items():
            if len(cog_orfs) == 1:
                strain_to_cog_to_seq[strain][cog] = [orf_to_seq[cog_orfs[0]]]
                continue
            contigs_to_orfs = defaultdict(list)
            for orf in cog_orfs:
                contig = "_".join(orf.split("_")[:-1])
                contigs_to_orfs[contig].append(orf)
            # we need orfs always on the same order for futur concatenation
            for contig in sorted(contigs_to_orfs):
                contig_orfs = sorted(contigs_to_orfs[contig])
                if len(contig_orfs) == 1:
                    # use this contig's own orf, not the variable left over
                    # from the grouping loop above
                    strain_to_cog_to_seq[strain][cog] += [
                        orf_to_seq[contig_orfs[0]]
                    ]
                else:
                    strain_to_cog_to_seq[strain][cog] += same_contig(
                        contig, contig_orfs)
    return strain_to_cog_to_seq
Esempio n. 9
0
 def merge_orfs(contig, start, end):
     """Return ``seq[start:end + 1]`` of ``contig`` read from ``fa_file``.

     ``fa_file`` comes from the enclosing scope. A header whose first word
     equals ``contig`` wins; the first header merely containing ``contig`` is
     used as a fallback, so that "k1_1" is not mistaken for "k1_10" when both
     are present. Returns None when no header matches.
     """
     fallback = None
     with open(fa_file) as handle:
         for header, seq in sfp(handle):
             if header.split(" ")[0] == contig:
                 return seq[start:end + 1]
             if fallback is None and contig in header:
                 fallback = seq[start:end + 1]
     return fallback
Esempio n. 10
0
#!/usr/bin/env python3

from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
import argparse

if __name__ == "__main__":
    # Keep only the bed lines whose 4th column is the name of a sequence
    # present in the SCG fasta file.
    parser = argparse.ArgumentParser()
    parser.add_argument("-b", help="orfs bed")
    parser.add_argument("-f", help="SCG file")
    parser.add_argument("-o", help="output SCG bed")
    args = parser.parse_args()
    orfs_bed = args.b
    scg_file = args.f
    scg_bedfile = args.o

    # a sequence name is the first whitespace-separated word of its header
    with open(scg_file) as scg_handle:
        seq_names = {name.split()[0] for name, seq in sfp(scg_handle)}

    # the input is opened first so that a missing bed file does not leave an
    # empty output behind
    with open(orfs_bed) as bed_handle, open(scg_bedfile, 'w') as scg_bed:
        for line in bed_handle:
            fields = line.split()
            # lines with fewer than 4 columns (e.g. blank) cannot match
            if len(fields) > 3 and fields[3] in seq_names:
                scg_bed.write(line)

Esempio n. 11
0
    parser.add_argument("-o", help="output folder")
    args = parser.parse_args()
    GROUPS = args.g
    output_folder = args.o

    # Copy every consensus MAG of every assembly to the output folder under
    # a unique name "a<assembly index>_m<mag number>.fa", prefixing contig
    # headers with "assembly_<assembly index>_", and record the mapping.
    map_mag_to_bin = [[
        "mag_name", "assembly_nb", "mag_nb", "assembly_name", "bin_name"
    ]]
    mag_nb = 0  # running number shared by all assemblies
    for index_group, group in enumerate(GROUPS):
        path = "%s/binning/consensus" % group
        file = "%s/consensus_MAG_list.txt" % path
        bins_path = "%s/bins" % path
        with open(file) as mag_list:
            for line in mag_list:
                bin_name = line.rstrip()
                if not bin_name:
                    # a blank line would otherwise point to "Bin_.fa"
                    continue
                mag_nb += 1
                name = "a%s_m%s.fa" % (index_group, mag_nb)
                map_mag_to_bin.append([
                    name.split(".fa")[0], index_group, mag_nb, group,
                    "Bin_%s" % bin_name
                ])
                bin_fasta = "%s/Bin_%s.fa" % (bins_path, bin_name)
                # the bin is opened first so a missing bin does not leave an
                # empty output file behind
                with open(bin_fasta) as bin_handle, \
                        open("%s/%s" % (output_folder, name), "w") as handle:
                    for header, seq in sfp(bin_handle):
                        new_header = "assembly_%s_%s" % (index_group, header)
                        handle.write(">%s\n%s\n" % (new_header, seq))
    # NOTE(review): `output` is not defined in the visible part of this
    # script -- confirm where output["map"] comes from.
    with open(output["map"], "w") as handle:
        handle.write("\n".join(
            ["\t".join(list(map(str, line)))
             for line in map_mag_to_bin]) + "\n")