def check_circular(file, name):
    # Detect putative circular contigs: a contig is flagged circular ("+") if its
    # first k bases match its last k bases for some k in (50, 200].
    contigs = fastaparser.read_fasta(file)
    circular_contigs = {}
    input_fasta = name + "_input_with_circ.fasta"
    with open(input_fasta, 'w') as output:
        for contig in contigs:
            short_name = contig[0].split(" ")[0][1:]
            circular_contigs[short_name] = [len(contig[1]), "-"]
            for kval in range(200, 50, -1):
                if kval >= len(contig[1]) or len(contig[1]) < 500:
                    continue
                start = contig[1][:kval]
                end = contig[1][-kval:]
                if start == end:
                    circular_contigs[short_name] = [len(contig[1]), "+"]
                    break
            output.write(contig[0] + "\n")
            if circular_contigs[short_name][1] == "-":
                output.write(contig[1] + "\n")
            elif circular_contigs[short_name][1] == "+":
                # extend circular contigs by (up to 5 kbp of) their own start,
                # skipping the kval bases already shared with the end
                output.write(contig[1] + contig[1][kval:5000] + "\n")
    return circular_contigs
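# A minimal, self-contained sketch of the circularity test used in check_circular()
# above (the helper name and toy values are illustrative, not part of the original):
# a sequence is called circular if its first k bases equal its last k bases for some
# k in (50, 200].
def _is_circular(seq, k_max=200, k_min=50, min_len=500):
    if len(seq) < min_len:
        return None
    for kval in range(k_max, k_min, -1):
        if kval >= len(seq):
            continue
        if seq[:kval] == seq[-kval:]:
            return kval  # size of the shared start/end overlap
    return None

# Example: for seq = "A" * 60 + "C" * 440 + "A" * 60 the first and last 60 bases
# coincide, so _is_circular(seq) returns 60; a sequence without such an overlap
# returns None.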
def split_and_rename(infile, outdir):
    contigs = fastaparser.read_fasta(infile)
    for contig in contigs:
        filename = contig[0].split()[0][1:]
        filename = join(outdir, filename + ".fasta")
        print filename
        fastaparser.write_fasta_to_file(filename, [contig])
def extract_circular_from_file(file, indir, outdir):
    out_file = join(outdir, os.path.splitext(file)[0] + ".circular.fasta")
    contigs = fastaparser.read_fasta(join(indir, file))
    circulars = []
    for contig in contigs:
        if len(contig[1]) < 500:
            continue
        for kval in range(200, 50, -1):
            if kval >= len(contig[1]):
                continue
            start = contig[1][:kval]
            end = contig[1][-kval:]
            if start == end:
                print(contig[0] + " is circular " + str(kval))
                circulars.append(contig)
                break
    fastaparser.write_fasta_to_file(out_file, circulars)
def parse_mash(contig_file, table):
    contigs = fastaparser.read_fasta(contig_file)
    similar_lists = {}
    for contig in contigs:
        similar_lists[get_short_name(contig[0])] = []
    for line in open(table, 'r'):
        arr = line.split()
        dist = float(arr[2])
        if dist < 0.1:
            similar_lists[arr[0]].append(arr[1])
    print "processed input"
    to_sort = []
    for l in similar_lists:
        to_sort.append([l, len(similar_lists[l])])
    sorted_similar = sorted(to_sort, key=itemgetter(1), reverse=True)
    outcontigs = []
    used = set()
    for contig_info in sorted_similar:
        if contig_info[0] not in used:
            for similar in similar_lists[contig_info[0]]:
                used.add(similar)
            if contig_info[1] > 10:
                print contig_info
                print similar_lists[contig_info[0]]
            # far from optimal but why not
            for contig in contigs:
                if get_short_name(contig[0]) == contig_info[0]:
                    outcontigs.append(contig)
                    break
    result_f = join(os.path.dirname(contig_file), "interesting.fasta")
    os.system("rm " + result_f)
    fastaparser.write_fasta_to_file(result_f, outcontigs)
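# A self-contained sketch of the greedy selection step in parse_mash() above (the
# helper name is illustrative): process contigs in decreasing order of how many
# neighbours they have; a contig becomes a representative unless an earlier
# representative already listed it as similar.
def _pick_representatives(similar_lists):
    order = sorted(similar_lists, key=lambda name: len(similar_lists[name]), reverse=True)
    used, representatives = set(), []
    for name in order:
        if name not in used:
            representatives.append(name)
            used.update(similar_lists[name])
    return representatives

# _pick_representatives({"a": ["b", "c"], "b": ["a"], "d": []}) -> ["a", "d"]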
def make_rectangles_from_genome(options):
    k = options.k
    ingraph = Graph()
    _, genome = fastaparser.read_fasta(options.genome).next()
    ingraph.make_graph(genome, int(k))
    edges_before_loop_DG = ingraph.find_loops(10, 1000)
    ingraph.save(os.path.join(options.out_dir, "graph"))
    rs = RectangleSet(ingraph, int(options.d))
    rs.filter_without_prd()
    f_left = open(os.path.join(options.out_dir, "paired_genom_contigs_1.fasta"), "w")  # TODO: what is it?
    f_right = open(os.path.join(options.out_dir, "paired_genom_contigs_2.fasta"), "w")  # TODO: what is it?
    contigs_id = 0
    for key, rect in rs.rectangles.items():
        for key, diag in rect.diagonals.items():
            e1 = rect.e1.seq
            e2 = rect.e2.seq
            f_left.write(">" + str(contigs_id) + "/1\n")
            f_left.write(e1[diag.offseta:diag.offsetc])
            f_left.write("\n")
            f_right.write(">" + str(contigs_id) + "/2\n")
            f_right.write(e2[diag.offsetb:diag.offsetd])
            f_right.write("\n")
            contigs_id += 1
    bgraph = rs.bgraph_from_genome()
    bgraph.condense()
    outgraph = bgraph.project(options.out_dir, False)
    outgraph.fasta(open(os.path.join(options.out_dir, 'rectangles.fasta'), 'w'))
def GC_content(contigs_fpath, skip=False):
    """
    Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)
    """
    total_GC_amount = 0
    total_contig_length = 0
    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(0, GC_bin_num)]  # list of X-coordinates, i.e. GC %
    GC_distribution_y = [0] * GC_bin_num  # list of Y-coordinates, i.e. # windows with GC % = x
    total_GC = None
    if skip:
        return total_GC, (GC_distribution_x, GC_distribution_y)

    for name, seq_full in fastaparser.read_fasta(contigs_fpath):  # in tuples: (name, seq)
        total_GC_amount += seq_full.count("G") + seq_full.count("C")
        total_contig_length += len(seq_full) - seq_full.count("N")
        n = 100  # blocks of length 100
        # non-overlapping windows
        for seq in [seq_full[i:i + n] for i in range(0, len(seq_full), n)]:
            # skip block if it has less than half of ACGT letters (it also helps with "ends of contigs")
            ACGT_len = len(seq) - seq.count("N")
            if ACGT_len < (n / 2):
                continue
            GC_len = seq.count("G") + seq.count("C")
            GC_percent = 100.0 * GC_len / ACGT_len
            GC_distribution_y[int(int(GC_percent / qconfig.GC_bin_size) * qconfig.GC_bin_size)] += 1

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y)
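# A self-contained sketch of the windowed GC computation in GC_content() above,
# with the bin size fixed at 1% instead of taken from qconfig (an assumption made
# only for this illustration; the helper name is not part of the original code).
def _gc_window_histogram(seq, window=100):
    bins = [0] * 101  # one bin per integer GC percentage
    for i in range(0, len(seq), window):
        block = seq[i:i + window]
        acgt_len = len(block) - block.count("N")
        if acgt_len < window / 2:  # skip mostly-N blocks and short contig tails
            continue
        gc = block.count("G") + block.count("C")
        bins[int(100.0 * gc / acgt_len)] += 1
    return bins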
def glue_and_rename(indir, outfile):
    for file in os.listdir(indir):
        arr = file.split('.')
        if len(arr) < 4:
            continue
        contigs = fastaparser.read_fasta(join(indir, file))
        for contig in contigs:
            new_name = contig[0] + " " + arr[0] + "." + arr[1]
            print new_name
            fastaparser.write_fasta_to_file(outfile, zip([new_name], [contig[1]]))
def found_most_similar(work_dir):
    contigs_info = []
    for file in os.listdir(work_dir):
        arr = file.split('.')
        if arr[-1] == "fasta":
            contigs = fastaparser.read_fasta(join(work_dir, file))
            contigs_info.append([file, len(contigs[0][1])])
    all_sorted = sorted(contigs_info, key=itemgetter(1))
    max_ind = len(all_sorted)
    low_ind = 0
    high_ind = 0
    similar_list = []
    used = []
    for i in range(0, max_ind):
        used.append(False)
        cur_len = all_sorted[i][1]
        first_mash = join(work_dir, all_sorted[i][0] + ".msh")
        while low_ind < max_ind - 1 and all_sorted[low_ind][1] < cur_len * 0.8:
            low_ind += 1
        while high_ind < max_ind and all_sorted[high_ind][1] < cur_len * 1.2:
            high_ind += 1
        if i % 10 == 0:
            print "processing... " + str(i) + " range: " + str(low_ind) + "-" + str(high_ind)
        sim = []
        for j in range(low_ind, high_ind):
            second_mash = join(work_dir, all_sorted[j][0] + ".msh")
            process = subprocess.Popen([mash_bin, 'dist', first_mash, second_mash],
                                       stdout=subprocess.PIPE)
            stdout = process.communicate()[0]
            arr = stdout.split()
            dist = float(arr[2])
            if dist < 0.2:
                sim.append(j)
        similar_list.append([i, len(sim), sim])
        if i % 10 == 0:
            print(len(sim))
    most_similar = sorted(similar_list, key=itemgetter(1), reverse=True)
    for k in most_similar:
        print k
    for contigs in most_similar:
        print all_sorted[contigs[0]][0] + " " + str(contigs[1]) + " " + str(used[contigs[0]])
        for j in contigs[2]:
            used[j] = True
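# Design note on found_most_similar() above: a Mash distance below 0.2 is only
# plausible for contigs of comparable length, so each contig is compared only
# against contigs within roughly [0.8x, 1.2x] of its own length. The two while
# loops maintain that window over the length-sorted list, which keeps the number
# of pairwise `mash dist` calls well below a full all-vs-all comparison.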
def extract_not_listed(infasta, list_fpath):
    listed = set()
    for line in open(list_fpath, 'r'):
        listed.add(">" + line.split()[0])
    print len(listed)
    contigs = fastaparser.read_fasta(infasta)
    print len(contigs)
    outcontigs = []
    for contig in contigs:
        if not contig[0].split()[0] in listed:
            outcontigs.append(contig)
    print len(outcontigs)
    outfasta = infasta[:-6] + ".unknown.fasta"
    os.system("rm " + outfasta)
    fastaparser.write_fasta_to_file(outfasta, outcontigs)
def break_scaffolds(argv):
    if (len(argv) != 4) and (len(argv) != 2):
        print("Usage: " + argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")
        print("Usage: " + argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> "
              "(to break contigs on Ns regions of size >= THRESHOLD)")
        sys.exit()

    BREAK_SCAFFOLDS = False
    if len(argv) == 4:
        BREAK_SCAFFOLDS = True

    N_NUMBER = None
    counter = 0
    if BREAK_SCAFFOLDS:
        N_NUMBER = int(argv[2])

    sizes_of_Ns_regions = dict()
    new_fasta = []
    for id, (name, seq) in enumerate(fastaparser.read_fasta(argv[1])):
        i = 0
        cur_contig_number = 1
        cur_contig_start = 0
        while (i < len(seq)) and (seq.find("N", i) != -1):
            start = seq.find("N", i)
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1
            i = end + 1
            if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER:
                new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                                  seq[cur_contig_start:start]))
                cur_contig_number += 1
                cur_contig_start = end
            if not BREAK_SCAFFOLDS:
                if (end - start) in sizes_of_Ns_regions:
                    sizes_of_Ns_regions[(end - start)] += 1
                else:
                    sizes_of_Ns_regions[(end - start)] = 1
        if BREAK_SCAFFOLDS:
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                              seq[cur_contig_start:]))
            counter += cur_contig_number

    if BREAK_SCAFFOLDS:
        fastaparser.write_fasta_to_file(argv[3], new_fasta)
        #print (" * " + str(id + 1) + " scaffold(s) were broken into " + str(counter) + " contig(s)")
    else:
        list_of_sizes = sizes_of_Ns_regions.keys()
        list_of_sizes.sort()
        avg_len = 0.0
        nruns = 0
        for k in list_of_sizes:
            v = sizes_of_Ns_regions[k]
            avg_len += k * v
            nruns += v
            print k, v
        avg_len /= nruns
        print "N-runs: " + str(nruns) + ", avg. len: " + str(avg_len)
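# A compact equivalent of the splitting branch in break_scaffolds() above, using a
# regex instead of the manual scan (illustration only; the helper name is not part
# of the original script).
import re

def _split_on_n_runs(seq, threshold):
    return [chunk for chunk in re.split("N{%d,}" % threshold, seq) if chunk]

# _split_on_n_runs("ACGTNNNNACGTNAC", 3) -> ["ACGT", "ACGTNAC"]:
# the 4-base N-run splits the sequence, the single N does not.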
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' +
                    str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)
        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################
    logger.info('  Calculating N50 and L50...')
    list_of_GC_distributions = []
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT,
                         ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter

        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')

        # Drawing GC content plot...
        list_of_GC_distributions_with_ref = list_of_GC_distributions
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                            [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
if len(sys.argv) < 3:
    print 'Contigs concatenator: makes one big contig from the assembly'
    print 'Usage: ', sys.argv[0], ' INPUT_FILE OUTPUT_FILE [COORDS_FILE]'
    sys.exit(0)

infilename = sys.argv[1]
outfilename = sys.argv[2]
coords = None
if len(sys.argv) > 3:
    coords = open(sys.argv[3], 'w')

padding = "N" * padding_length

fasta = fastaparser.read_fasta(infilename)
summary_seq = ""
cur_coord = 1
for name, seq in fasta:
    if len(seq) >= min_contig:
        if coords is not None:
            coords.write(str(cur_coord) + " " + str(cur_coord + len(seq) - 1) + "\n")
        cur_coord += len(seq) + padding_length
        summary_seq += (seq + padding)

out = open(outfilename, 'w')
out.write(">sum_contig total_length=" + str(len(summary_seq)) + '\n')
for i in xrange(0, len(summary_seq), 60):
    out.write(summary_seq[i:i + 60] + '\n')
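# Worked example of the coordinate bookkeeping above: with padding_length = 10 and
# two kept contigs of lengths 100 and 50, COORDS_FILE receives "1 100" and
# "111 160", since each contig starts padding_length bases after the previous one
# ends in the concatenated sequence.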
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    # for cumulative plots:
    files_genes_in_contigs = {}    # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                          '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')

    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %',
                          top_value=100)

    logger.info('Done.')
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent='  ')

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    # [S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [% IDY] | [TAGS]
    #=====================================================================================
    # 338980 339138 | 2298 2134 | 159 165 | 79.76 | gi|48994873|gb|U00096.2| NODE_0_length_6088
    # 374145 374355 | 2306 2097 | 211 210 | 85.45 | gi|48994873|gb|U00096.2| NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file ("
                         + nucmer_base_fpath + ") differ from the names in the reference. "
                         "Try to remove the file and restart QUAST.")

        aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else:  # if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        print >>gaps_file, chr_name
        cur_gap_size = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    print >>gaps_file, i - cur_gap_size, i - 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            print >>gaps_file, chr_len - cur_gap_size + 1, chr_len
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:
        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        found_file = open(found_fpath, 'w')
        print >>found_file, '%s\t\t%s\t%s' % ('ID or #', 'Start', 'End')
        print >>found_file, '============================'

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # computing circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            print >>found_file, '%s\t\t%d\t%d' % (region_id, region.start, region.end)
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and min(region.end, block.end) - \
                                max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
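# A minimal sketch of the full/partial classification used in the gene-finding loop
# above (the return codes mirror the found_list convention; the helper itself is
# illustrative, not part of the original module):
def _classify_feature(region_start, region_end, block_start, block_end, min_overlap):
    if region_end <= block_start or block_end <= region_start:
        return 0  # no usable overlap
    if block_start <= region_start and region_end <= block_end:
        return 1  # feature fully covered by the aligned block
    if min(region_end, block_end) - max(region_start, block_start) >= min_overlap:
        return 2  # partial hit of at least min_overlap bases
    return 0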
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list,
                                                              reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}    # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [process_results[i][1] for i in range(len(process_results))]

    ref_lengths_by_contigs = {}  # chromosome -> covered length in each assembly
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [ref_lengths[i][ref] for i in range(len(ref_lengths))]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                          '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')

    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
    if last_gap not in trusted_gaps:
        last_gap = (0, 0)
    total_gaps = len(chunks) - 1
    if not circular:
        total_gaps += 1
    print("  Total gaps between aligned regions: " + str(total_gaps) +
          "; gaps with read pairs spanning over them: " + str(len(trusted_gaps)))
    trusted_chunks_index = [(0, 0) for i in range(0, len(chunks))]
    for i in range(0, len(chunks) - 1):
        if (chunks[i][1], chunks[i + 1][0]) in trusted_gaps:
            trusted_chunks_index[i] = (chunks[i][1], chunks[i + 1][0])

    chunks_file = os.path.join(output_dir, os.path.splitext(os.path.basename(reference))[0] +
                               "gaps_" + dataset + ".fasta")
    fasta = fastaparser.read_fasta(chunks_file)
    singlef = os.path.join(output_dir, dataset + "_single_reads.fasta")
    simulate_ideal_by_fasta.simulate_single(singlef, fasta, rl, circular)
    if ins < rl:
        continue
    pairedf = os.path.join(output_dir, dataset + "_paired_reads.fasta")
    simulate_ideal_by_fasta.simulate_paired(pairedf, fasta, ins, rl, circular)
    gapf = os.path.join(output_dir, dataset + "_gapped_reads.fasta")
    simulate_ideal_by_fasta.simulate_paired_over_gaps(gapf, fasta, chunks, trusted_chunks_index,
                                                      last_gap, ref_len, ins, rl, circular)

# total report
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' +
                    str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)
        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        import math
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs / multiplicator
        lists_of_lengths = [sorted(list, reverse=True) for list in lists_of_lengths]
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
                                  for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)]
                                 for list_of_length in lists_of_lengths]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(
                sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################
    logger.info('  Calculating N50 and L50...')
    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    ((' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))
                     if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC,
                                 ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)
    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))],
                        json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            # Drawing GC content plot...
            list_of_GC_distributions_with_ref = list_of_GC_distributions
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')
for line in in_file:
    if line.startswith(" CONTIG:"):
        cur_contig_id = line.split(" CONTIG:")[1].strip()
    if (line.find("Extensive misassembly") != -1) and (cur_contig_id != ""):
        mis_contigs_ids.append(cur_contig_id.split()[0])
        cur_contig_id = ""
    if line.startswith("Analyzing coverage..."):
        break

# printing IDs of misassembled contigs
print("Misassembled contigs:")
for contig_id in mis_contigs_ids:
    print(contig_id)
in_file.close()

if len(sys.argv) == 4:
    import fastaparser
    input_contigs = fastaparser.read_fasta(sys.argv[2])
    mis_contigs = open(sys.argv[3], "w")

    for (name, seq) in input_contigs:
        corr_name = re.sub(r'\W', '', re.sub(r'\s', '_', name))
        if mis_contigs_ids.count(corr_name) != 0:
            mis_contigs.write(name + '\n')
            for i in xrange(0, len(seq), 60):
                mis_contigs.write(seq[i:i + 60] + '\n')
    mis_contigs.close()
#!/usr/bin/python

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# Convert contigs (i.e. a reference) for experiment of running SPAdes on E. coli MC reads
# in "IonTorrent" mode (all series of repeated nucleotides are changed to single nucleotides).

import sys
import os
import fastaparser

# MAIN
if len(sys.argv) < 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <output fasta>")
    sys.exit()

new_fasta = []
for name, seq in fastaparser.read_fasta(sys.argv[1]):
    new_seq = seq[0]
    for i in range(1, len(seq)):
        if seq[i - 1] != seq[i]:
            new_seq += seq[i]
    new_fasta.append((name, new_seq))

fastaparser.write_fasta_to_file(sys.argv[2], new_fasta)
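# The same homopolymer compression can be written with itertools.groupby; this is
# an equivalent alternative for illustration, not what the script above uses:
from itertools import groupby

def _compress_runs(seq):
    return "".join(base for base, _ in groupby(seq))

# _compress_runs("AAACCGTT") -> "ACGT"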
# MAIN
if len(sys.argv) != 4:
    print("Usage: " + sys.argv[0] + " <input fasta> <K or K1,K2,K3> <output_dir>")
    sys.exit()

if len(sys.argv[2].split(',')) > 1:
    K_list = map(int, sys.argv[2].split(','))
else:
    K_list = [int(sys.argv[2])]

output_dir = os.path.abspath(sys.argv[3])
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# creating single-entry references and chains
params_subst_dict = dict()
input_fasta = fastaparser.read_fasta(sys.argv[1])
cwd = os.getcwd()
os.chdir(ideal_assembler_bin_dir)
for K in K_list:
    print("Starting with K=" + str(K))
    result_fasta = []
    for id, fasta_entry in enumerate(input_fasta):
        cur_ref_name = os.path.join(output_dir, 'chr_' + str(id) + '.fasta')
        cur_chain_name = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '_chain')
        log_filename = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '.log')
        fastaparser.write_fasta_to_file(cur_ref_name, [fasta_entry])
        shutil.copy(chain_template, cur_chain_name)
        cur_params_subst_dict = dict(params_subst_dict)
        cur_params_subst_dict['OUT_BASE'] = 'chr_' + str(id) + '_K' + str(K)
        tmp_dir = os.path.join(ideal_assembler_bin_dir,
                               'data/cap/cache/env_' + cur_params_subst_dict['OUT_BASE'])
        cur_params_subst_dict['REFERENCE'] = cur_ref_name
def main():
    args = parse_args(sys.argv[1:])
    base = os.path.basename(args.f)
    name_file = os.path.splitext(base)[0]
    dirname = os.path.dirname(__file__)
    outdir = args.o
    try:
        os.makedirs(outdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    name = os.path.join(outdir, name_file)

    ids = []
    with open(args.f, "r") as ins:
        for line in ins:
            if line[0] == ">":
                ids.append(line.split()[0][1:])

    if args.hmm:
        hmm = args.hmm
    else:
        print("No HMM database provided")
        exit(1)

    if args.db:
        from parse_blast_xml import parser
        blastdb = args.db

    if args.t:
        threads = str(args.t)
    else:
        threads = str(20)

    # Check for circularity:
    contig_len_circ = check_circular(args.f, name)
    infile_circ = name + "_input_with_circ.fasta"

    # Run gene prediction
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print("Gene prediction...")
    res = os.system("prodigal -p meta -c -i " + infile_circ + " -a " + name +
                    "_proteins.fa -o " + name + "_genes.fa 2>" + name + "_prodigal.log")
    if res != 0:
        print("Prodigal run failed")
        exit(1)

    # Filter out genes predicted over the end of the contig
    proteins = fastaparser.read_fasta(name + "_proteins.fa")
    with open(name + "_proteins_circ.fa", 'w') as protein_output:
        for i in proteins:
            contig_name = i[0].split()[0].rsplit("_", 1)[0][1:]
            gene_start = i[0].split("#")[1]
            if int(gene_start.strip()) < int(contig_len_circ[contig_name][0]):
                protein_output.write(i[0] + "\n")
                protein_output.write(i[1] + "\n")

    # HMM search
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print("HMM domains prediction...")
    res = os.system("hmmsearch --noali --cut_nc -o " + name + "_out_pfam --domtblout " +
                    name + "_domtblout --cpu " + threads + " " + hmm + " " +
                    name + "_proteins_circ.fa")
    if res != 0:
        print("hmmsearch run failed")
        exit(1)

    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print("Parsing...")
    tblout_pfam = name + "_domtblout"
    feature_table = get_table_from_tblout(tblout_pfam)
    feature_table = [i.strip().split(' ', 1) for i in feature_table]
    with open(name + '_feature_table.txt', 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(feature_table)

    feature_table_names = []
    feature_table_genes = []
    for i in feature_table:
        feature_table_names.append(i[0])
        feature_table_genes.append(i[1])

    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print("Classification...")
    t = feature_table_genes
    k = naive_bayes(t)
    names_result = {}
    for i in range(0, len(k)):
        names_result[feature_table_names[i]] = [k[i][0], k[i][3], feature_table_genes[i]]

    if args.db:
        # run BLAST
        print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("Running BLAST...")
        os.system("blastn -query " + args.f + " -db " + blastdb +
                  " -evalue 0.0001 -outfmt 5 -out " + name + ".xml -num_threads " +
                  threads + " -num_alignments 50")
        print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
        print("Parsing BLAST")
        parser(name + ".xml", outdir)
        print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))

        # add BLAST results
        plasmids = [line.strip().split("\t") for line in open(name + "_plasmid.names")]
        plasmids_list = {}
        for i in range(0, len(plasmids) - 1):
            if len(plasmids[i]) == 1:
                plasmids_list[plasmids[i][0].split()[0]] = [
                    float(plasmids[i + 1][1].split(":")[1]),
                    float(plasmids[i + 1][2].split(":")[1]),
                    plasmids[i + 1][0]
                ]

        chrom = [line.rstrip().split("\t") for line in open(name + "_chromosome.names")]
        chrom_list = {}
        for i in range(0, len(chrom) - 1):
            if len(chrom[i]) == 1:
                chrom_list[chrom[i][0].split()[0]] = [
                    float(chrom[i + 1][1].split(":")[1]),
                    float(chrom[i + 1][2].split(":")[1]),
                    chrom[i + 1][0]
                ]

        vir = [line.rstrip().split("\t") for line in open(name + "_viruses.names")]
        vir_list = {}
        for i in range(0, len(vir) - 1):
            if len(vir[i]) == 1:
                vir_list[vir[i][0].split()[0]] = [
                    float(vir[i + 1][1].split(":")[1]),
                    float(vir[i + 1][2].split(":")[1]),
                    vir[i + 1][0]
                ]

        nos = [line.rstrip() for line in open(name + "_no_significant.names")]
        nos_list = []
        for i in nos:
            if len(i.split()) > 0:
                nos_list.append(i.split()[0])
        nos_list = [i.strip().split()[0] for i in nos_list]

        other = [line.rstrip() for line in open(name + "_other.names")]
        other_list = []
        for i in other:
            if len(i.split()) > 0:
                other_list.append(i.split()[0])
        other_list = [i.strip().split()[0] for i in other_list]

    final_table = collections.OrderedDict()
    if args.db:
        for i in ids:
            if i in names_result:
                if names_result[i][0] == "Uncertain - too short":
                    if (contig_len_circ[i][0] > 3000) or (contig_len_circ[i][1] == "+"):
                        names_result[i][0] = "Uncertain - viral or bacterial"
                if i in plasmids_list:
                    final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                      contig_len_circ[i][1], names_result[i][1],
                                      names_result[i][2], "Plasmid",
                                      round(plasmids_list[i][0], 2),
                                      round(plasmids_list[i][1], 2), plasmids_list[i][2]]
                if i in chrom_list:
                    final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                      contig_len_circ[i][1], names_result[i][1],
                                      names_result[i][2], "Chromosome",
                                      chrom_list[i][0], chrom_list[i][1], chrom_list[i][2]]
                if i in vir_list:
                    final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                      contig_len_circ[i][1], names_result[i][1],
                                      names_result[i][2], "Virus",
                                      vir_list[i][0], vir_list[i][1], vir_list[i][2]]
                if i in nos_list:
                    final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                      contig_len_circ[i][1], names_result[i][1],
                                      names_result[i][2], "Non-significant"]
                if i in other_list:
                    final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                      contig_len_circ[i][1], names_result[i][1],
                                      names_result[i][2], "Other"]
            else:
                if (contig_len_circ[i][0] > 3000) or (contig_len_circ[i][1] == "+"):
                    names_result[i] = "Uncertain - viral or bacterial"
                else:
                    names_result[i] = "Uncertain - too short"
                if i in plasmids_list:
                    final_table[i] = [names_result[i], contig_len_circ[i][0],
                                      contig_len_circ[i][1], "-", "-", "Plasmid",
                                      plasmids_list[i][0], plasmids_list[i][1],
                                      plasmids_list[i][2]]
                if i in chrom_list:
                    final_table[i] = [names_result[i], contig_len_circ[i][0],
                                      contig_len_circ[i][1], "-", "-", "Chromosome",
                                      chrom_list[i][0], chrom_list[i][1], chrom_list[i][2]]
                if i in vir_list:
                    final_table[i] = [names_result[i], contig_len_circ[i][0],
                                      contig_len_circ[i][1], "-", "-", "Virus",
                                      vir_list[i][0], vir_list[i][1], vir_list[i][2]]
                if i in nos_list:
                    final_table[i] = [names_result[i], contig_len_circ[i][0],
                                      contig_len_circ[i][1], "-", "-", "Non-significant"]
                if i in other_list:
                    # other_list holds contig names only, so there are no scores to
                    # report here (indexing it with the contig name was a bug)
                    final_table[i] = [names_result[i], contig_len_circ[i][0],
                                      contig_len_circ[i][1], "-", "-", "Other"]
    else:
        for i in ids:
            if i in names_result:
                if names_result[i][0] == "Uncertain - too short":
                    if (contig_len_circ[i][0] > 3000) or (contig_len_circ[i][1] == "+"):
                        names_result[i][0] = "Uncertain - viral or bacterial"
                final_table[i] = [names_result[i][0], contig_len_circ[i][0],
                                  contig_len_circ[i][1], names_result[i][1],
                                  names_result[i][2]]
            else:
                if (contig_len_circ[i][0] > 3000) or (contig_len_circ[i][1] == "+"):
                    final_table[i] = ["Uncertain - viral or bacterial",
                                      contig_len_circ[i][0], contig_len_circ[i][1], "-"]
                else:
                    final_table[i] = ["Uncertain - too short",
                                      contig_len_circ[i][0], contig_len_circ[i][1], "-"]

    result_file = name + "_result_table.csv"
    with open(result_file, 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        for i in final_table:
            writer.writerow([i] + final_table[i])

    if not os.path.exists(outdir + "/Prediction_results_fasta/"):
        os.mkdir(outdir + "/Prediction_results_fasta/")

    fasta_prefix = outdir + "/Prediction_results_fasta/" + name_file
    with open(fasta_prefix + "_virus.fasta", "w") as vir_file, \
         open(fasta_prefix + "_plasmid.fasta", "w") as plasmid_file, \
         open(fasta_prefix + "_chromosome.fasta", "w") as chrom_file, \
         open(fasta_prefix + "_virus_uncertain.fasta", "w") as vc_file, \
         open(fasta_prefix + "_plasmid_uncertain.fasta", "w") as pc_file:
        contigs = fastaparser.read_fasta(args.f)
        for i in contigs:
            contig_name = i[0].split(" ")[0][1:]
            if final_table[contig_name][0] == "Virus":
                vir_file.write(i[0] + "\n")
                vir_file.write(i[1] + "\n")
            elif final_table[contig_name][0] == "Chromosome":
                chrom_file.write(i[0] + "\n")
                chrom_file.write(i[1] + "\n")
            elif final_table[contig_name][0] == "Plasmid":
                if args.p:
                    plasmid_file.write(i[0] + "\n")
                    plasmid_file.write(i[1] + "\n")
                else:
                    chrom_file.write(i[0] + "\n")
                    chrom_file.write(i[1] + "\n")
            elif final_table[contig_name][0] == "Uncertain - viral or bacterial":
                vc_file.write(i[0] + "\n")
                vc_file.write(i[1] + "\n")
            elif final_table[contig_name][0] == "Uncertain - plasmid or chromosomal":
                if args.p:
                    pc_file.write(i[0] + "\n")
                    pc_file.write(i[1] + "\n")
                else:
                    chrom_file.write(i[0] + "\n")
                    chrom_file.write(i[1] + "\n")

    if not args.p:
        os.remove(fasta_prefix + "_plasmid.fasta")
        os.remove(fasta_prefix + "_plasmid_uncertain.fasta")

    print("Done!")
    print(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print("Verification results can be found in " + os.path.abspath(result_file))
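
# naive_bayes() above returns, per contig, a predicted class and a score based
# on the Pfam domains found on it. A minimal, hypothetical sketch of such a
# classifier (log-space Naive Bayes over per-class domain frequency tables;
# the table contents, priors, and smoothing constant are illustrative
# assumptions, not the tool's actual training data):
import math

def naive_bayes_sketch(domains, freq_tables, priors):
    """domains: list of Pfam domain names seen on one contig;
    freq_tables: {class: {domain: P(domain | class)}};
    priors: {class: P(class)}.  Returns (best_class, best_log_score)."""
    scores = {}
    for cls in freq_tables:
        score = math.log(priors[cls])
        for d in domains:
            # small pseudo-probability for unseen domains (Laplace-style smoothing)
            score += math.log(freq_tables[cls].get(d, 1e-6))
        scores[cls] = score
    best = max(scores, key=scores.get)
    return best, scores[best]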
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# Deletes reverse-complement duplicates and exact duplicates from a FASTA file

import sys
import os

sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '..'))
import fastaparser

if len(sys.argv) < 2:
    print 'Usage:', sys.argv[0], 'in.fasta > out.fasta'
    exit(1)

fastafilename = sys.argv[1]
fasta = fastaparser.read_fasta(fastafilename)
fasta_res = {}
for name, seq in fasta:
    if (seq not in fasta_res) and (fastaparser.rev_comp(seq) not in fasta_res):
        fasta_res[seq] = name

fastaparser.write_fasta((name, seq) for seq, name in fasta_res.iteritems())
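
# fastaparser.rev_comp presumably reverse-complements a nucleotide string; a
# minimal sketch of that operation (assuming a plain ACGTN alphabet), useful
# for checking the dedup logic above by hand:
def rev_comp_sketch(seq):
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(complement[c] for c in reversed(seq.upper()))

# Example: rev_comp_sketch("AACGT") == "ACGTT"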
def main_utils():
    program_name = sys.argv[0][:sys.argv[0].rfind('.')]

    # parse the command line of the main program and get all arguments:
    args = UtilsPipeline.get_arguments()

    WELL_FULLY_COVERAGE_THRESHOLDS = rqconfig.well_fully_coverage_thresholds(
        args.lower_threshold, args.upper_threshold)

    ALIGNMENT_THRESHOLDS = rqconfig.alignment_thresholds()

    # run rnaQUAST on test_data:
    if args.test:
        UtilsPipeline.run_rnaQUAST_on_test_data(args, rquast_dirpath, program_name)
        # UtilsPipeline.run_rnaQUAST_on_debug_data(args, rquast_dirpath, program_name)
        sys.exit()

    UtilsPipeline.get_abspath_input_data(args)

    # create output directory:
    args.output_dir = UtilsPipeline.create_output_folder(args.output_dir, program_name)
    # create temporary directory:
    tmp_dir = UtilsPipeline.create_empty_folder(os.path.join(args.output_dir, 'tmp'))
    # create directory for log files:
    log_dir = UtilsPipeline.create_empty_folder(os.path.join(args.output_dir, 'logs'))

    # SET LOGGER:
    if args.debug:
        rqconfig.debug = True
        logger.set_up_console_handler(debug=True)
    else:
        logger.set_up_console_handler()
    logger.set_up_file_handler(log_dir)
    logger.print_command_line([os.path.realpath(__file__)] + sys.argv[1:], wrap_after=None)
    logger.start(args.blat, tmp_dir)

    UtilsPipeline.get_input_data_exist_error(args, logger)

    # THREADING:
    args.threads = UtilsPipeline.get_num_threads(args.threads, logger)

    if args.meta:
        logger.info('\nYOU RUN QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES')

    # GET SINGLE (MERGED) INPUT FILES:
    if args.reference and args.gtf and len(args.reference) != len(args.gtf):
        logger.error('Numbers of references and gene databases are different',
                     exit_with_code=1)

    args.reference = \
        UtilsPipeline.get_single_file(args.reference, tmp_dir, 'reference',
                                      rqconfig.list_ext_fa, args.meta, logger)
    args.gtf = \
        UtilsPipeline.get_single_file(args.gtf, tmp_dir, 'gene_database',
                                      rqconfig.list_ext_gtf, args.meta, logger)

    # READ REFERENCE FROM MULTIFASTA:
    reference_dict = None
    ids_chrs = None
    if args.reference is not None:
        logger.print_timestamp()
        logger.info('Getting reference...')
        reference_dict = UtilsGeneral.list_to_dict(fastaparser.read_fasta(args.reference))
        logger.info('Done.')

        genome_len = UtilsGeneral.get_genome_len(reference_dict)

        ids_chrs = reference_dict.keys()

        # correction for FASTA containing Y, W, etc.:
        # for id_chr in ids_chrs:
        #     reference_dict[id_chr] = UtilsGeneral.correct_nucl_seq(reference_dict[id_chr])

    # for strand-specific data we store + and - keys in dictionaries, and only + otherwise:
    strands = UtilsGeneral.get_strands(args, logger)

    if args.prokaryote:
        type_organism = 'prokaryotes'
    else:
        type_organism = 'eukaryotes'

    # USE ANNOTATION:
    sqlite3_db_genes = None
    sorted_exons_attr = None
    db_genes_metrics = None
    type_genes, type_isoforms, type_exons = \
        UtilsAnnotations.default_type_genes, \
        UtilsAnnotations.default_type_isoforms, \
        UtilsAnnotations.default_type_exons

    if args.gtf is not None or args.gene_db is not None:
        if args.gene_db is not None:
            gene_db_name = os.path.split(args.gene_db)[1]
            label_db = gene_db_name[:gene_db_name.rfind('.db')]
        else:
            gtf_name = os.path.split(args.gtf)[1]
            label_db = gtf_name[:gtf_name.rfind('.g')]
            if ids_chrs is not None:
                args.gtf = UtilsAnnotations.clear_gtf_by_reference_chr(
                    args.gtf, ids_chrs, tmp_dir, label_db, logger)

        sqlite3_db_genes = \
            UtilsAnnotations.create_sqlite3_db(args.gene_db, args.gtf, label_db,
                                               args.disable_infer_genes,
                                               args.disable_infer_transcripts,
                                               args.output_dir, tmp_dir, logger)

        type_genes, type_isoforms, type_exons = \
            UtilsAnnotations.get_type_features(sqlite3_db_genes,
                                               UtilsAnnotations.default_type_genes,
                                               UtilsAnnotations.default_type_isoforms,
                                               UtilsAnnotations.default_type_exons,
                                               args.prokaryote, logger)

        # if UtilsAnnotations.default_type_exons == type_exons:
        #     type_organism = 'eukaryotes'
        # else:
        #     type_organism = 'prokaryotes'

        db_genes_metrics = GeneDatabaseMetrics.GeneDatabaseMetrics(
            sqlite3_db_genes, type_genes, type_isoforms, logger)

        ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT = db_genes_metrics.max_intron_len + 100
        logger.info('\nSetting maximum intron size to {}. Default is 1500000 bp.\n'.format(
            ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT))

        # set exon starts / ends and ids for the binning strategy:
        if ids_chrs is not None:
            sorted_exons_attr = \
                SortedExonsAttributes.SortedExonsAttributes(
                    sqlite3_db_genes, type_exons, strands, ids_chrs, reference_dict, logger)

    reads_coverage = None
    if args.reads_alignment is not None or \
            ((args.single_reads is not None or
              (args.left_reads is not None and args.right_reads is not None))
             and args.reference is not None and sqlite3_db_genes is not None):
        reads_coverage = \
            ReadsCoverage.ReadsCoverage(
                args.reads_alignment, args.tophat, args.reference,
                args.single_reads, args.left_reads, args.right_reads,
                reference_dict, sqlite3_db_genes, type_isoforms, sorted_exons_attr,
                args.strand_specific, db_genes_metrics.tot_isoforms_len, genome_len,
                tmp_dir, args.threads, WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir)

    if args.transcripts is not None:
        # GET TRANSCRIPTS:
        transcripts_dicts = []
        for i_transcripts in range(len(args.transcripts)):
            logger.print_timestamp('  ')
            logger.info('  Getting transcripts from {}...'.format(
                args.transcripts[i_transcripts]))
            transcripts_dicts.append(
                UtilsGeneral.list_to_dict(
                    fastaparser.read_fasta(args.transcripts[i_transcripts])))
            logger.info('  Done.')

        # get labels for folder names and for transcript names in reports:
        all_labels_from_dirs = False
        if args.labels is None:
            args.labels = UtilsPipeline.process_labels(args.transcripts, args.labels,
                                                       all_labels_from_dirs)
    else:
        logger.warning('No transcripts. Use --transcripts option.')

    # GET PSL ALIGNMENT FILE:
    if args.alignment is None and args.reference is not None and args.transcripts is not None:
        if args.blat:
            args.alignment = UtilsTools.run_blat(None, args.reference, transcripts_dicts,
                                                 args.labels, args.threads, tmp_dir,
                                                 logger, log_dir)
        else:
            args.alignment = UtilsTools.run_gmap(args.reference, genome_len,
                                                 args.transcripts, args.labels,
                                                 args.threads, args.gmap_index,
                                                 tmp_dir, logger, log_dir)

    # if args.fusion_misassemble_analyze:
    #     if not (args.left_reads is not None and args.right_reads is not None):
    #         logger.error('Usage: --left_reads LEFT_READS --right RIGHT_READS '
    #                      'to analyze fusions and misassemblies',
    #                      exit_with_code=2, to_stderr=True)
    #         sys.exit(2)

    # FOR MISASSEMBLY SEARCH:
    # GET BLAST DATABASE OF FASTA ISOFORMS:
    args.blast = False
    if args.reference is not None and sqlite3_db_genes is not None and args.alignment is not None:
        blastn_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blastn')
        if not os.path.isfile(blastn_run):
            blastn_run = "blastn"

        if UtilsGeneral.which(blastn_run) is None:
            logger.warning('blastn not found! Please add blastn to PATH '
                           'for better MISASSEMBLIES metrics.')
        else:
            args.blast = True

            isoforms_fa_path = os.path.join(tmp_dir, '{}.isoforms.fa'.format(label_db))
            isoforms_list = UtilsGeneral.dict_to_list(
                UtilsAnnotations.get_fa_isoforms(sqlite3_db_genes, type_isoforms,
                                                 type_exons, reference_dict, logger))
            fastaparser.write_fasta(isoforms_fa_path, isoforms_list)

            isoforms_blast_db = UtilsTools.get_blast_db(isoforms_fa_path, label_db,
                                                        tmp_dir, logger, log_dir)

    # LOGGING INPUT DATA:
    logger.print_input_files(args)

    # INITIALIZATION OF TRANSCRIPTS METRICS AND REPORTS:
    transcripts_metrics = []
    separated_reports = []
    if args.transcripts is not None:
        alignments_reports = []
        blast_alignments = []
        for i_transcripts in range(len(args.transcripts)):
            # INITIALIZE TRANSCRIPTS METRICS:
            # if args.sam_file is not None:
            #     sam_file_tmp = args.sam_file[i_transcripts]
            # else:
            transcripts_metrics.append(
                TranscriptsMetrics.TranscriptsMetrics(args, args.labels[i_transcripts]))

            # INITIALIZE SEPARATED REPORTS:
            separated_reports.append(
                SeparatedReport.SeparatedReport(args.labels[i_transcripts],
                                                args.output_dir,
                                                transcripts_metrics[i_transcripts],
                                                WELL_FULLY_COVERAGE_THRESHOLDS))

            '''from joblib import Parallel, delayed
            n = len(args.transcripts)
            run_n = n / args.threads
            for i_run in range(run_n):
                tmp = Parallel(n_jobs=args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict, annotated_exons, annotated_isoforms, strands, transcripts_metrics, basic_isoforms_metrics, separated_reports)
                                                    for i_transcripts in range(i_run * args.threads, args.threads * (i_run + 1), 1))
                for i in range(args.threads):
                    i_transcripts = i + i_run * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]

            if n - run_n * args.threads != 0:
                tmp = Parallel(n_jobs=n - run_n * args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict, annotated_exons, annotated_isoforms, strands, transcripts_metrics, basic_isoforms_metrics, separated_reports)
                                                                for i_transcripts in range(run_n * args.threads, n, 1))
                for i in range(n - run_n * args.threads):
                    i_transcripts = i + run_n * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]'''

            logger.info()
            logger.info('Processing transcripts from {}:'.format(
                args.transcripts[i_transcripts]))

            if args.blast:
                blast_alignments.append(
                    UtilsTools.align_transcripts_to_isoforms_by_blastn(
                        args.transcripts[i_transcripts], isoforms_blast_db, tmp_dir,
                        args.labels[i_transcripts], logger, log_dir))
            else:
                blast_alignments.append(None)

            # PROCESS TRANSCRIPT ALIGNMENTS:
            if transcripts_metrics[i_transcripts].simple_metrics is not None:
                # GET FILES WITH ALIGNMENT REPORTS:
                alignments_reports.append(
                    UtilsAlignment.AlignmentsReport.get_alignments_report(
                        args.labels[i_transcripts], args.alignment[i_transcripts],
                        blast_alignments[i_transcripts], transcripts_dicts[i_transcripts],
                        tmp_dir, args.min_alignment, logger, ALIGNMENT_THRESHOLDS))

                # UPDATE METRICS BY ASSEMBLED TRANSCRIPTS:
                transcripts_metrics[i_transcripts].processing_assembled_psl_file(
                    alignments_reports[i_transcripts].blat_report.assembled_psl_file,
                    sorted_exons_attr, args.strand_specific, logger, sqlite3_db_genes,
                    type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS)

                # UPDATE METRICS BY MISASSEMBLED TRANSCRIPTS:
                # by blat:
                transcripts_metrics[i_transcripts].processing_misassembled_psl_file(
                    alignments_reports[i_transcripts].blat_report.misassembled_psl_union_file,
                    logger, True)
                # by blast:
                if args.blast:
                    transcripts_metrics[i_transcripts].processing_misassembled_psl_file(
                        alignments_reports[i_transcripts].blast6_report.misassembled_blast6_union_file,
                        logger, False)

            # GET METRICS:
            transcripts_metrics[i_transcripts].get_transcripts_metrics(
                args, type_organism, reference_dict, args.transcripts[i_transcripts],
                transcripts_dicts[i_transcripts], args.labels[i_transcripts],
                args.threads, sqlite3_db_genes, db_genes_metrics, reads_coverage,
                logger, tmp_dir, log_dir, WELL_FULLY_COVERAGE_THRESHOLDS,
                rqconfig.TRANSCRIPT_LENS)

            # GET SEPARATED REPORT:
            separated_reports[i_transcripts].get_separated_report(
                args, args.labels[i_transcripts], transcripts_dicts[i_transcripts],
                transcripts_metrics[i_transcripts], db_genes_metrics, reads_coverage,
                logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
                rqconfig.TRANSCRIPT_LENS)

    # GET COMPARISON REPORT:
    comparison_report = None
    if len(separated_reports) != 1:
        comparison_report = ComparisonReport.ComparisonReport()
        comparison_report.get_comparison_report(
            args, args.output_dir, args.labels, transcripts_metrics, db_genes_metrics,
            reads_coverage, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
            rqconfig.TRANSCRIPT_LENS)

    # GET SHORT REPORT:
    short_report = \
        ShortReport.ShortReport(args, db_genes_metrics, transcripts_metrics,
                                args.output_dir, separated_reports, comparison_report,
                                logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
                                rqconfig.TRANSCRIPT_LENS)

    # REMOVE TEMPORARY DIRECTORY FROM OUTPUT DIRECTORY:
    if os.path.exists(tmp_dir) and not args.debug:
        logger.debug('Removing temporary directory {}'.format(tmp_dir))
        shutil.rmtree(tmp_dir)
        logger.debug('Done.')

    # LOGGING RESULT PATHS:
    logger.print_path_results(args, separated_reports, comparison_report, short_report)

    if args.debug:
        UtilsGeneral.profile_memory(args, reference_dict, db_genes_metrics,
                                    transcripts_metrics, separated_reports,
                                    comparison_report, logger)

    # FINISH LOGGING:
    logger.finish_up()
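
# UtilsGeneral.list_to_dict above turns fastaparser.read_fasta output (a list
# of (name, seq) tuples) into a name -> seq mapping. A minimal sketch of the
# assumed behavior; stripping the FASTA '>' prefix and the description after
# the first whitespace are assumptions, not necessarily rnaQUAST's exact rules:
def list_to_dict_sketch(fasta_entries):
    return dict((name.lstrip('>').split()[0], seq) for name, seq in fasta_entries)

# Example: list_to_dict_sketch([('>chr1 some description', 'ACGT')]) == {'chr1': 'ACGT'}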
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference length in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for id, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(id) + assembly_label)
        # lists_of_lengths.append(fastaparser.get_lengths_from_fastafile(contigs_fpath))
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for id, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        logger.info('    ' + qutils.index_to_str(id) + qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    # guarded against division by zero for empty assemblies
                    (' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
                     if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        # 'is not None' matters: a (theoretical) GC of 0.0 is falsy but still valid
        report.add_field(reporting.Fields.GC,
                         ('%.2f' % total_GC if total_GC is not None else None))
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT,
                         ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter

        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        list_of_GC_distributions_with_ref = list_of_GC_distributions
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                            output_dirpath + '/NGx_plot', 'NGx',
                            [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
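
# GC_content() above returns both the overall GC percentage and a distribution
# used for the GC plot. A minimal sketch of the overall value only (the binned
# distribution and the exact handling of ambiguous bases are assumptions, not
# QUAST's actual code):
def gc_percent_sketch(fasta_entries):
    """fasta_entries: iterable of (name, seq); returns GC % or None if empty."""
    gc = total = 0
    for _, seq in fasta_entries:
        s = seq.upper()
        gc += s.count('G') + s.count('C')
        total += len(s) - s.count('N')  # uncalled bases do not count
    return 100.0 * gc / total if total else None

# Example: gc_percent_sketch([('>c1', 'GGCCAT')]) ~= 66.67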
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath +
                     ') not found! Try to restart QUAST.', indent='  ')
        return None

    coordfile = open(nucmer_fpath, 'r')
    for line in coordfile:
        if line.startswith('='):
            break

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    # =====================================================================================
    #   338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|  NODE_0_length_6088
    #   374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|  NODE_0_length_6088

    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in the i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    for line in coordfile:
        if line.strip() == '':
            break
        s1 = int(line.split('|')[0].split()[0])
        e1 = int(line.split('|')[0].split()[1])
        s2 = int(line.split('|')[1].split()[0])
        e2 = int(line.split('|')[1].split()[1])
        contig_name = line.split()[12].strip()
        chr_name = line.split()[11].strip()

        if chr_name not in genome_mapping:
            logger.error("Something went wrong and chromosome names in your coords file (" +
                         nucmer_base_fpath + ") differ from the names in the reference. " +
                         "Try to remove the file and restart QUAST.")
            return None

        aligned_blocks_by_contig_name[contig_name].append(
            AlignedBlock(seqname=chr_name, start=s1, end=e1))
        if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts at the end of a chromosome and ends at its beginning
            for i in range(s1, len(genome_mapping[chr_name])):
                genome_mapping[chr_name][i] = 1
            for i in range(1, e1 + 1):
                genome_mapping[chr_name][i] = 1
        else:  # if s1 <= e1:
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1
    coordfile.close()

    # counting genome coverage and the number of gaps
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_label + '_gaps.txt')
    gaps_file = open(gaps_fpath, 'w')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        print >> gaps_file, chr_name
        cur_gap_size = 0
        aligned_len = 0
        for i in range(1, chr_len + 1):
            if genome_mapping[chr_name][i] == 1:
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    print >> gaps_file, i - cur_gap_size, i - 1
                aligned_len += 1
                covered_bp += 1
                cur_gap_size = 0
            else:
                cur_gap_size += 1
        ref_lengths[chr_name] = aligned_len
        if cur_gap_size >= qconfig.min_gap_size:
            gaps_count += 1
            print >> gaps_file, chr_len - cur_gap_size + 1, chr_len
    gaps_file.close()

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:
        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_label + suffix)
        found_file = open(found_fpath, 'w')
        print >> found_file, '%s\t\t%s\t%s\t%s' % ('ID or #', 'Start', 'End', 'Type')
        print >> found_file, '========================================='

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of the gene is found
        found_list = [0] * len(container.region_list)
        for i, region in enumerate(container.region_list):
            found_list[i] = 0
            for contig_id, name in enumerate(sorted_contigs_names):
                cur_feature_is_found = False
                for cur_block in aligned_blocks_by_contig_name[name]:
                    if container.chr_names_dict[region.seqname] != cur_block.seqname:
                        continue

                    # handling circular genomes
                    if cur_block.start > cur_block.end:
                        blocks = [AlignedBlock(seqname=cur_block.seqname,
                                               start=cur_block.start, end=region.end + 1),
                                  AlignedBlock(seqname=cur_block.seqname,
                                               start=1, end=cur_block.end)]
                    else:
                        blocks = [cur_block]

                    for block in blocks:
                        if region.end <= block.start or block.end <= region.start:
                            continue
                        elif block.start <= region.start and region.end <= block.end:
                            if found_list[i] == 2:  # already counted as a partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            region_id = str(region.id)
                            if region_id == 'None':
                                region_id = '# ' + str(region.number + 1)
                            print >> found_file, '%s\t\t%d\t%d\tcomplete' % (
                                region_id, region.start, region.end)
                            feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in the id-th contig
                            cur_feature_is_found = True
                            break
                        elif found_list[i] == 0 and \
                                min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                            found_list[i] = 2
                            total_partial += 1
                    if cur_feature_is_found:
                        break
                if cur_feature_is_found:
                    break
            # adding info about partially found genes/operons
            if found_list[i] == 2:  # partial gene/operon
                region_id = str(region.id)
                if region_id == 'None':
                    region_id = '# ' + str(region.number + 1)
                print >> found_file, '%s\t\t%d\t%d\tpartial' % (
                    region_id, region.start, region.end)

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial
        found_file.close()

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
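
# The coverage/gap pass above walks a per-base 0/1 array per chromosome. A toy
# illustration of the same idea in isolation (min_gap_size=2 is just an
# example value, not QUAST's default):
def count_gaps_sketch(mapping, min_gap_size=2):
    """mapping: list of 0/1 coverage flags; returns (covered_bp, gaps_count)."""
    covered, gaps, cur_gap = 0, 0, 0
    for flag in mapping:
        if flag:
            if cur_gap >= min_gap_size:
                gaps += 1
            covered += 1
            cur_gap = 0
        else:
            cur_gap += 1
    if cur_gap >= min_gap_size:  # a gap running to the end of the chromosome
        gaps += 1
    return covered, gaps

# Example: count_gaps_sketch([1, 1, 0, 0, 1, 0]) == (3, 1)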
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

import sys
import fastaparser

########################################################################

if len(sys.argv) != 3:
    print 'FASTA-file converter from multi-line records to one-line ones'
    print 'Usage: ', sys.argv[0], ' <input-file> <output-file>'
    sys.exit(0)

out = open(sys.argv[2], 'w')
fasta = fastaparser.read_fasta(sys.argv[1])
for name, seq in fasta:
    out.write(name + '\n')
    out.write(seq + '\n')
out.close()