def main(fasta_file, nb_chunks, output):
    """Split a fasta file into ``nb_chunks`` files of roughly even size.

    Sequences are ranked from longest to shortest and dealt round-robin to
    the chunks, so the total sequence length sort of evens out. Chunk files
    are written in the ``output`` directory (created if needed) and are named
    ``<stem>_<chunk index><ext>``, where ``<stem>`` is the part of the input
    file name before its first dot and ``<ext>`` is its last dot-separated
    suffix (empty when the file name has no dot).

    Args:
        fasta_file: path of the fasta file to split.
        nb_chunks: number of chunk files to create.
        output: directory receiving the chunk files.
    """
    # first pass: only the lengths are kept, not the sequences themselves
    with open(fasta_file) as handle:
        header_len = [(header, len(seq)) for header, seq in sfp(handle)]
    header_len.sort(key=lambda x: -x[1])

    # no shell involved: works with spaces or shell metacharacters in the path
    if output:
        os.makedirs(output, exist_ok=True)

    # the extension is taken from the file name only, so a dot in a parent
    # directory name can no longer leak into the chunk names
    file_name = os.path.basename(fasta_file)
    stem = file_name.split(".")[0]
    ext = "." + file_name.split(".")[-1] if "." in file_name else ""
    fasta_path = os.path.join(output, stem)

    handles = []
    try:
        # open nb_chunks handles
        for nb in range(nb_chunks):
            handles.append(open("%s_%s%s" % (fasta_path, nb, ext), "w"))

        # map header to handle, biggest to smallest seq, so it sort of even out
        index = 0
        header_to_handle = {}
        for header, _ in header_len:
            header_to_handle[header] = handles[index]
            index = (index + 1) % nb_chunks

        # read the file again, and this time write on all handles at the same time
        with open(fasta_file) as handle:
            for header, seq in sfp(handle):
                header_to_handle[header].write(">%s\n%s\n" % (header, seq))
    finally:
        # close all handles, even if something failed half way
        for handle in handles:
            handle.close()
def Rewrite_gfa(fasta_file, gfa_file, output):
    """Put the fasta contig names back into a gfa file.

    Each "S" record of ``gfa_file`` is renamed after the fasta entry that has
    exactly the same sequence (name truncated at its first space), and the
    two contig fields of each "L" record are renamed accordingly. The result,
    all "S" records first and then all "L" records, is written to ``output``
    with lines joined by a newline (no trailing newline). Other record types
    are dropped.

    Args:
        fasta_file: fasta file holding the named contigs.
        gfa_file: gfa file whose contig names must be replaced.
        output: path of the rewritten gfa file.

    Raises:
        KeyError: if a segment sequence is absent from the fasta file, or if
            a link refers to a segment that was not seen.
    """
    # get contig name since megahit_toolkit delete them (grr)
    # may look inefficient but takes only 10 min in total on 16G assembly
    name_by_seq = {}
    with open(fasta_file) as fasta_handle:
        for name, seq in sfp(fasta_handle):
            name_by_seq[seq] = name.split(' ')[0]

    records = []
    renamed = {}

    # first pass: segments, which also fills the old name -> new name map
    with open(gfa_file) as gfa_handle:
        for raw in gfa_handle:
            fields = raw.rstrip().split('\t')
            if fields[0] != "S":
                continue
            contig_name = name_by_seq[fields[2]]
            renamed[fields[1]] = contig_name
            fields[1] = contig_name
            records.append("\t".join(fields))

    # second pass: links, once every segment name is known
    with open(gfa_file) as gfa_handle:
        for raw in gfa_handle:
            fields = raw.rstrip().split('\t')
            if fields[0] != "L":
                continue
            fields[1] = renamed[fields[1]]
            fields[3] = renamed[fields[3]]
            records.append("\t".join(fields))

    with open(output, 'w') as out_handle:
        out_handle.write("\n".join(records))
def main(paths, output, gfa_files):
    """Merge per-cog gfa files into one gfa, linked by strain paths.

    ``paths`` is read as a fasta-like file: each header is ``<cog>_<strain>``
    and each "sequence" is a comma separated list of unitigs. Contigs of
    every cog gfa are prefixed with ``<cog>_``; the edges and buffer vertices
    returned by ``generate_edges`` for each strain are added on top.

    Cogs and buffer vertices are emitted in sorted order so that the output
    is identical from one run to the next.

    Args:
        paths: file describing, per cog and strain, the path of unitigs.
        output: path of the joint gfa file to write.
        gfa_files: gfa files, one per cog; the cog is the part of the file
            name before the first underscore.

    Raises:
        KeyError: if a cog found in ``paths`` has no gfa file.
        ValueError: if a header does not hold exactly one underscore.
    """
    cog_to_gfa = {basename(gfa).split("_")[0]: gfa for gfa in gfa_files}
    with open(paths) as handle:
        cog_lines = [[header] + seq.split(",") for header, seq in sfp(handle)]

    # sort by strain
    strain_paths = defaultdict(list)
    # selected cogs for bayespath are not always outputed
    cogs = set()
    for line in cog_lines:
        cog, strain = line[0].split("_")
        cogs.add(cog)
        strain_paths[strain].append(
            ["%s_%s" % (cog, unitig) for unitig in line[1:]])

    # get new edges and buffer vertices (generate_edges fills the set)
    buffer_vertices = set()
    strain_edges = [
        edge
        for unitig_paths in strain_paths.values()
        for edge in generate_edges(unitig_paths, buffer_vertices)
    ]

    # add buffer vertices
    vertex_lines = [
        "S\t%s\t%s\tKC:i:10\tCL:z:#000000\tC2:z:#000000\t\n" % (name, 100 * 'N')
        for name in sorted(buffer_vertices, key=str)
    ]

    # rename contigs and add edges; lines are collected in lists and written
    # in one go instead of growing strings with +=
    edge_lines = []
    for cog in sorted(cogs):
        with open(cog_to_gfa[cog]) as handle:
            for line in handle:
                fields = line.rstrip().split("\t")
                if line[0] == "S":
                    new_contig = "%s_%s" % (cog, fields[1])
                    vertex_lines.append(
                        "S\t%s\t%s\n" % (new_contig, "\t".join(fields[2:])))
                elif line[0] == "L":
                    new_contig1 = "%s_%s" % (cog, fields[1])
                    new_contig2 = "%s_%s" % (cog, fields[3])
                    edge_lines.append("\t".join([
                        "L", new_contig1, fields[2], new_contig2, fields[4],
                        fields[5]
                    ]) + "\n")
    edge_lines.extend(strain_edges)

    # output joint gfa
    with open(output, "w") as handle:
        handle.writelines(vertex_lines)
        handle.writelines(edge_lines)
def main(argv):
    """Write, for each COG, the longest annotated sequence of every mag.

    One ``<COG>.fna`` file is created in the output folder; it holds one
    entry per bin, named ``Bin_<bin>_<COG> <original header>``.

    Args:
        argv: unused. NOTE(review): the parser reads ``sys.argv`` directly
            since ``argv`` is not forwarded to ``parse_args``; kept as is so
            existing callers behave the same.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("cog_annotation",
                        help="annotation of the full assembly")
    parser.add_argument(
        "cluster_definition",
        help="csv file, firts column is the contig, second is the bin")
    parser.add_argument("mag_list", help="list of mags")
    parser.add_argument("cogs_list", help="list of COGs to collate from bins")
    parser.add_argument(
        "output_folder",
        help="output folder name, where do we want to put the sequences")
    args = parser.parse_args()

    # set of SCG
    # NOTE(review): core_cogs is loaded but never used to filter the COGs
    # written below -- confirm whether a filter was intended.
    with open(args.cogs_list, 'r') as f:
        core_cogs = {x.rstrip() for x in f}

    # get mags
    with open(args.mag_list) as handle:
        mags = {element.rstrip() for element in handle}

    # get mags definition
    contigs_to_mags = {}
    with open(args.cluster_definition) as handle:
        for line in handle:
            contig, bin_ = line.rstrip().split(",")
            if bin_ in mags:
                contigs_to_mags[contig] = bin_

    # get SCGs linked to bin: COG -> bin -> [header, longest seq so far]
    corecog_bin = defaultdict(lambda: defaultdict(lambda: ["", ""]))
    with open(args.cog_annotation) as handle:
        for header, seq in sfp(handle):
            orf, COG, strand = header.rstrip().split(' ')
            # the contig name is the orf name minus its last "_" field
            contig = "_".join(orf.split("_")[:-1])
            if contig in contigs_to_mags:
                bin_ = contigs_to_mags[contig]
                if len(seq) > len(corecog_bin[COG][bin_][1]):
                    corecog_bin[COG][bin_] = [header, seq]

    # output
    path_output = args.output_folder
    if path_output and not os.path.isdir(path_output):
        os.makedirs(path_output, exist_ok=True)
    for corecog, dict_List_bins in corecog_bin.items():
        # os.path.join: files land inside the folder whether or not the
        # folder name was given with a trailing slash
        fna_path = os.path.join(path_output, corecog + ".fna")
        with open(fna_path, "w") as output_handle:
            for bin_name, (header, seq) in dict_List_bins.items():
                new_name = "Bin_" + bin_name + "_" + corecog + " " + header
                output_handle.write(">" + new_name + "\n" + seq + "\n")
def main(scg_cov, map_name_to_orfs, output):
    """Concatenate the trimmed cog alignments into one sequence per strain.

    Every ``*_trim.msa`` file found next to ``output`` is read; the cog name
    is the file name without its ``_trim.msa`` suffix. For each strain the
    cog sequences are concatenated in sorted cog order; a missing cog is
    replaced by a run of "-" as long as the mean length of that cog.
    Mag entries are resolved through ``get_cog_coverage`` and ``mags_seqs``.

    Args:
        scg_cov: passed untouched to ``get_cog_coverage``.
        map_name_to_orfs: passed untouched to ``get_cog_coverage``.
        output: path of the fasta file to write; its directory is where the
            alignments are looked for.
    """
    path = dirname(output)
    # an output given without directory means the current directory, not "/"
    msa_files = glob.glob("%s/*_trim.msa" % (path or "."))

    strain_to_cog_seq = defaultdict(dict)
    cog_to_strain_seq = defaultdict(dict)
    for msa_file in msa_files:
        cog = basename(msa_file).replace("_trim.msa", "")
        with open(msa_file) as handle:
            for strain, seq in sfp(handle):
                strain_to_cog_seq[strain][cog] = seq
                cog_to_strain_seq[cog][strain] = seq
    sorted_cogs = sorted(cog_to_strain_seq)
    sorted_strains = sorted(strain_to_cog_seq)
    mean_cog_len = {
        cog: int(np.mean([len(seq) for seq in dict_strain.values()]))
        for cog, dict_strain in cog_to_strain_seq.items()
    }

    ### deal with multiples mag cog : find which should go with which
    # get coverage tot of each mag cog orf
    cog_to_haplo_cov_tot = get_cog_coverage(scg_cov, map_name_to_orfs,
                                            cog_to_strain_seq)

    # choose the consensus sequence orfs, this way too complicated for what it does
    new_mag_to_old = mags_seqs(path, sorted_cogs, sorted_strains,
                               cog_to_haplo_cov_tot)

    # get strain concatenated sequence
    sorted_strains = sorted(
        [strain for strain in sorted_strains if not is_mag(strain)] +
        list(new_mag_to_old.keys()))
    strain_to_seq = {}
    for strain in sorted_strains:
        pieces = []
        for cog in sorted_cogs:
            if is_mag(strain):
                if cog in new_mag_to_old[strain]:
                    cog_to_seq = strain_to_cog_seq[new_mag_to_old[strain][cog]]
                else:
                    cog_to_seq = {}
            else:
                cog_to_seq = strain_to_cog_seq[strain]
            if cog in cog_to_seq:
                pieces.append(cog_to_seq[cog])
            else:
                # deal with missing cogs
                pieces.append(mean_cog_len[cog] * "-")
        # a strain only gets an entry when at least one cog was processed
        if pieces:
            strain_to_seq[strain] = "".join(pieces)

    with open(output, "w") as handle:
        handle.writelines(">%s\n%s\n" % (strain, seq)
                          for strain, seq in strain_to_seq.items())
'-a', help= "contig assignmeent file : bin/group/strain assigned to contig, first term is contig name following are list of bin/... contig it is in. Some contigs may not be assigned to anything, they are then colored grey, .tsv file" ) parser.add_argument("-s", action='store_true', help="flag, output 1 gfa per strain ") parser.add_argument("output", help="output name for colored gfa") args = parser.parse_args() # deal with multiple way of passing contig assignment if args.p: # we assume the name of the gfa file contain the name of the cog of interest cog = basename(args.gfa_file.replace(".gfa", "")) cog_lines = [ [header] + [s.replace("-", "").replace("+", "") for s in seq.split(",")] for header, seq in sfp(open(args.p)) if cog in header ] contig_to_strains = defaultdict(list) for line in cog_lines: strain = line[0].split("_")[1] for contig in line[1:]: contig_to_strains[contig].append(strain) else: contig_to_strains = { line.split(",")[0]: line.rstrip().split(",")[1:] for line in open(args.a) } main(args.gfa_file, contig_to_strains, args.output, args.s)
def get_initial_number_of_bins(file, MAX_BIN_NB):
    """Return an initial number of bins, capped by ``MAX_BIN_NB``.

    The second space-separated field of every fasta header of ``file`` is
    counted (presumably the SCG/COG identifier -- confirm against the file
    format); the estimate is twice the median of these counts, truncated to
    an int.

    Args:
        file: fasta file whose headers hold at least two space-separated
            fields.
        MAX_BIN_NB: upper bound of the returned value, anything ``int()``
            accepts.

    Returns:
        ``min(int(2 * median count), int(MAX_BIN_NB))``.

    Raises:
        ValueError: if ``file`` holds no sequence (the median is undefined).
        IndexError: if a header has no second field.
    """
    with open(file) as handle:
        counts = Counter(header.split(" ")[1] for header, _ in sfp(handle))
    if not counts:
        # otherwise np.median([]) is nan and int(nan) fails with an obscure message
        raise ValueError(
            "no sequence found in %s, cannot estimate a number of bins" % file)
    nb_bin = int(2 * np.median(list(counts.values())))
    return min(nb_bin, int(MAX_BIN_NB))
def map_strain_to_seq(fna_file, fa_file, strain_to_cog_to_orfs):
    """Map every strain and cog to the list of its sequences.

    Args:
        fna_file: fasta file of the orfs. The orf name is the header up to
            its first space; the second and third " # "-separated fields of
            the header are read as integer coordinates (looks like the
            Prodigal header layout -- confirm).
        fa_file: fasta file of the contigs, used when orfs are merged.
        strain_to_cog_to_orfs: dict strain -> dict cog -> list of orf names.
            The contig of an orf is its name minus the last "_" field, the
            last field being the orf number.

    Returns:
        defaultdict strain -> defaultdict cog -> list of sequences. A cog
        with one orf gives that orf sequence. With several orfs, contigs are
        handled in sorted order: orfs that are consecutive on a contig are
        merged into one sequence cut from the contig, any other orf
        contributes its own sequence.

    Raises:
        ValueError: if no sequence can be cut from the contig for orfs that
            must be merged.
        KeyError: if an orf is absent from ``fna_file``.
    """
    wanted_orfs = {
        orf
        for cog_to_orfs in strain_to_cog_to_orfs.values()
        for orfs in cog_to_orfs.values()
        for orf in orfs
    }
    with open(fna_file) as handle:
        orf_to_seq_full = {
            header: seq
            for header, seq in sfp(handle)
            if header.split(" ")[0] in wanted_orfs
        }
    orf_to_coordinate = {
        header.split(" ")[0]: header.split(" # ")[1:3]
        for header in orf_to_seq_full
    }
    orf_to_seq = {
        header.split(" ")[0]: seq
        for header, seq in orf_to_seq_full.items()
    }

    def merge_orfs(contig, start, end):
        # sequence of the first entry of fa_file whose header contains contig
        # NOTE(review): substring match, "c_1" also matches "c_10"; and the
        # slice treats start as a 0-based index -- confirm both are intended.
        with open(fa_file) as handle:
            for header, seq in sfp(handle):
                if contig in header:
                    return seq[start:end + 1]
        return None

    def same_contig(contig, orfs):
        # case where all orfs are on the same contig
        nbs = [int(orf.split("_")[-1]) for orf in orfs]
        if list(range(min(nbs), max(nbs) + 1)) == sorted(nbs):
            # case where they are all next to each other
            coordinates = [
                int(coords) for orf in orfs
                for coords in orf_to_coordinate[orf]
            ]
            seq = merge_orfs(contig, min(coordinates), max(coordinates))
            # also catches a contig missing from fa_file (seq is None)
            if not seq:
                raise ValueError("no sequence found for %s" % contig)
            return [seq]
        # in this case, we assume they are distincts sequences and should be
        # accounted for.
        return [orf_to_seq[orf] for orf in orfs]

    #### deal with case where multiple orfs are present
    # we merge cogs if are on the same contig next to each other (see same_contig)
    # we append the seq if they are on different contigs or not following each other
    strain_to_cog_to_seq = defaultdict(lambda: defaultdict(list))
    for strain, cog_to_orfs in strain_to_cog_to_orfs.items():
        for cog, orfs in cog_to_orfs.items():
            if len(orfs) == 1:
                strain_to_cog_to_seq[strain][cog] = [orf_to_seq[orfs[0]]]
                continue
            contigs_to_orfs = defaultdict(list)
            for orf in orfs:
                contig = "_".join(orf.split("_")[:-1])
                contigs_to_orfs[contig].append(orf)
            # we need orfs always on the same order for futur concatenation
            for contig in sorted(contigs_to_orfs):
                contig_orfs = sorted(contigs_to_orfs[contig])
                if len(contig_orfs) == 1:
                    # the only orf of THIS contig, not the last orf seen in
                    # the grouping loop above
                    strain_to_cog_to_seq[strain][cog] += [
                        orf_to_seq[contig_orfs[0]]
                    ]
                else:
                    strain_to_cog_to_seq[strain][cog] += same_contig(
                        contig, contig_orfs)
    return strain_to_cog_to_seq
def merge_orfs(contig, start, end):
    """Return the slice ``[start:end + 1]`` of a contig sequence.

    The file ``fa_file`` (a name taken from the enclosing scope) is read
    until an entry whose header contains ``contig`` is found.

    Args:
        contig: text looked for in the fasta headers (substring match).
        start: index of the first character to keep.
        end: index of the last character to keep.

    Returns:
        The sliced sequence of the first matching entry, or None when no
        header contains ``contig``.
    """
    # the with block closes the file on the early return too
    with open(fa_file) as handle:
        for header, seq in sfp(handle):
            if contig in header:
                return seq[start:end + 1]
    return None
#!/usr/bin/env python3
"""Keep the lines of an orfs bed file whose name is a sequence of the SCG file.

The name of a bed line is its 4th whitespace-separated field; the name of a
sequence is its fasta header up to the first whitespace.
"""
from Bio.SeqIO.FastaIO import SimpleFastaParser as sfp
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # the three options are needed: fail at parsing time with a usage message
    # rather than later on open(None)
    parser.add_argument("-b", required=True, help="orfs bed")
    parser.add_argument("-f", required=True, help="SCG file")
    parser.add_argument("-o", required=True, help="output SCG bed")
    args = parser.parse_args()
    orfs_bed = args.b
    scg_file = args.f
    scg_bedfile = args.o

    with open(scg_file) as scg_handle:
        seq_names = {name.split()[0] for name, seq in sfp(scg_handle)}

    with open(orfs_bed) as bed_handle, open(scg_bedfile, 'w') as scg_bed:
        for line in bed_handle:
            fields = line.split()
            # blank or truncated lines have no name field: skip them
            if len(fields) > 3 and fields[3] in seq_names:
                scg_bed.write(line)
parser.add_argument("-o", help="output folder") args = parser.parse_args() GROUPS = args.g output_folder = args.o map_mag_to_bin = [[ "mag_name", "assembly_nb", "mag_nb", "assembly_name", "bin_name" ]] mag_nb = 0 for index_group, group in enumerate(GROUPS): path = "%s/binning/consensus" % group file = "%s/consensus_MAG_list.txt" % path bins_path = "%s/bins" % path for line in open(file): bin_name = line.rstrip() mag_nb += 1 name = "a%s_m%s.fa" % (index_group, mag_nb) map_mag_to_bin.append([ name.split(".fa")[0], index_group, mag_nb, group, "Bin_%s" % bin_name ]) with open("%s/%s" % (output_folder, name), "w") as handle: for header, seq in sfp( open("%s/Bin_%s.fa" % (bins_path, bin_name))): new_header = "assembly_%s_%s" % (index_group, header) handle.write(">%s\n%s\n" % (new_header, seq)) with open(output["map"], "w") as handle: handle.write("\n".join( ["\t".join(list(map(str, line))) for line in map_mag_to_bin]) + "\n")