def parse_gff(file, feature):
    """Collect Gene records for every GFF line whose feature column equals *feature*.

    file    -- iterable of text lines.
    feature -- feature name compared (case-sensitively) with the 'feature'
               group captured by ``gff_pattern``.

    Returns a list of Gene objects numbered 0, 1, 2, ... in input order.
    The 'ID' and 'Name' attributes (matched case-insensitively) are copied
    to ``gene.id`` and ``gene.name`` when present.
    """
    genes = []
    number = 0

    for line in file:
        m = gff_pattern.match(line)
        if not m or m.group('feature') != feature:
            continue

        gene = Gene(seqname=qutils.correct_name(m.group('seqname')),
                    start=int(m.group('start')),
                    end=int(m.group('end')))

        for attr in m.group('attributes').split(';'):
            # Split on the FIRST '=' only: a value may itself contain '='
            # (e.g. "Note=a=b"), which made a plain two-way unpacking of
            # attr.split('=') raise ValueError.
            key, sep, val = attr.partition('=')
            if not sep:
                # empty chunk or a flag without a value
                continue
            key = key.lower()
            if key == 'id':
                gene.id = val
            elif key == 'name':
                gene.name = val

        gene.number = number
        number += 1
        genes.append(gene)

    return genes
def parse_ncbi(ncbi_file):
    """Parse an NCBI gene listing into a list of Gene objects.

    ncbi_file -- object with a ``readline()`` method returning '' at EOF.

    A record starts at a line matched by ``ncbi_start_pattern`` and extends
    up to the next such line.  Inside a record the 'Chromosome:',
    'Annotation:' and 'ID:' lines are examined.  Only genes whose start and
    end are both set are returned; the others are dropped (a warning is
    logged when the annotation line cannot be parsed).
    """
    annotation_pattern = re.compile(r'Annotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)', re.I)
    chromosome_pattern = re.compile(r'Chromosome: (?P<chromosome>\S+);', re.I)
    id_pattern = re.compile(r'ID: (?P<id>\d+)', re.I)

    genes = []

    line = ncbi_file.readline()
    while line != '':
        stripped = line.rstrip()

        # Blank lines and '##' comment lines between records are ignored.
        if stripped == '' or line.startswith('##'):
            line = ncbi_file.readline()
            continue

        m = ncbi_start_pattern.match(stripped)
        if not m:
            # Not a record header.  Previously this case re-tested the very
            # same line in a loop that never read further input and thus
            # never terminated; move on to the next line instead.
            line = ncbi_file.readline()
            continue

        gene = Gene(number=int(m.group('number')),
                    name=qutils.correct_name(m.group('name')))

        # Gather every line up to (not including) the next record header.
        the_rest_lines = []
        line = ncbi_file.readline()
        while line != '' and not ncbi_start_pattern.match(line.rstrip()):
            the_rest_lines.append(line.rstrip())
            line = ncbi_file.readline()

        for info_line in the_rest_lines:
            if info_line.startswith('Chromosome:'):
                m = chromosome_pattern.match(info_line)
                if m:
                    gene.chromosome = m.group('chromosome')

            if info_line.startswith('Annotation:'):
                m = annotation_pattern.match(info_line)
                if m:
                    gene.seqname = m.group('seqname')
                    gene.start = int(m.group('start'))
                    gene.end = int(m.group('end'))

                    to_trim = 'Chromosome' + ' ' + str(gene.chromosome)
                    if gene.chromosome and gene.seqname.startswith(to_trim):
                        # str.lstrip returns a new string; the result has to
                        # be stored, otherwise the leading ' ,' stays.
                        gene.seqname = gene.seqname[len(to_trim):].lstrip(' ,')
                else:
                    logger.warning('Wrong NCBI annotation for gene ' + str(gene.number) + '. ' + gene.name + '. Skipping this gene.')

            if info_line.startswith('ID:'):
                m = id_pattern.match(info_line)
                if m:
                    gene.id = m.group('id')
                else:
                    logger.warning('Can\'t parse gene\'s ID in NCBI format. Gene is ' + str(gene.number) + '. ' + gene.name + '. Skipping it.')

        # Genes without coordinates are skipped rather than treated as an error.
        if gene.start is not None and gene.end is not None:
            genes.append(gene)

    return genes
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Append one reference sequence to its corrected file and to the combined reference.

    When total_references > 1 the most recently registered path in
    ``corrected_ref_fpaths`` is reused; otherwise a fresh unique path is
    created from ref_name + ref_fasta_ext and registered.

    Unless ``qconfig.no_check`` is set, the sequence is validated: after
    upper-casing and mapping ambiguity codes to 'N' it must consist of
    A, C, G, T, N only.  Note that the sequence written out is *seq* as
    passed in, not the normalized copy.

    Returns (corr_seq_name, corr_seq_fpath), or (None, None) when the
    validation fails.
    """
    seq_fname = ref_name + ref_fasta_ext

    if total_references > 1:
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)

    file_label = qutils.name_from_fpath(corr_seq_fpath)
    corr_seq_name = file_label + '_' + qutils.correct_name(seq_name[:20])

    if not qconfig.no_check:
        dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
        pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
        normalized = re.sub(pat, lambda m: dic[m.group()], seq.upper())
        if re.search(r'[^ACGTN]', normalized):
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            return None, None

    record = (corr_seq_name, seq)
    fastaparser.write_fasta(corr_seq_fpath, [record], 'a')
    fastaparser.write_fasta(combined_ref_fpath, [record], 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

    return corr_seq_name, corr_seq_fpath
def parse_gff(file, feature):
    """Build Gene records from the GFF lines of the requested feature type.

    file    -- iterable of text lines.
    feature -- compared with the lower-cased 'feature' group of
               ``gff_pattern``.

    Each matching line yields one Gene (numbered from 0 in input order)
    whose ``id`` / ``name`` come from the 'ID' / 'Name' attributes, looked
    up case-insensitively.  Returns the list of genes.
    """
    collected = []
    next_number = 0

    for row in file:
        hit = gff_pattern.match(row)
        if not hit or hit.group('feature').lower() != feature:
            continue

        gene = Gene(seqname=qutils.correct_name(hit.group('seqname')),
                    start=int(hit.group('start')),
                    end=int(hit.group('end')))

        for chunk in hit.group('attributes').split(';'):
            # everything after the first '=' belongs to the value
            name, sep, value = chunk.partition('=')
            if not sep:
                continue
            lowered = name.lower()
            if lowered == 'id':
                gene.id = value
            elif lowered == 'name':
                gene.name = value

        gene.number = next_number
        next_number += 1
        collected.append(gene)

    return collected
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Write one reference sequence to its own corrected file and to the combined file.

    The file name is ref_name, plus '_' and the corrected first 20
    characters of seq_name when total_references > 1, plus ref_fasta_ext.
    The resulting unique path is registered in ``corrected_ref_fpaths``.

    Returns the sequence name derived from the corrected file path.
    """
    name_parts = [ref_name]
    if total_references > 1:
        name_parts.append('_' + qutils.correct_name(seq_name[:20]))
    name_parts.append(ref_fasta_ext)
    seq_fname = ''.join(name_parts)

    corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    # the same record is appended to the per-reference and the combined file
    for target_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(target_fpath, [(corr_seq_name, seq)], 'a')

    return corr_seq_name
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a normalized copy of a FASTA file; split oversized references per chromosome.

    original_fpath  -- FASTA file to read.
    corrected_fpath -- where the normalized entries are written.
    min_contig      -- entries shorter than this are dropped unless
                       is_reference is true.
    is_reference    -- keep every entry and, when the total length exceeds
                       ``qconfig.MAX_REFERENCE_LENGTH``, also write each
                       chromosome to its own file under 'splitted_ref' and
                       record those paths in ``qconfig.splitted_ref``.

    Sequences are upper-cased and ambiguity codes are replaced by 'N'.

    Returns False when a sequence contains characters other than A, C, G,
    T, N (nothing is written in that case) or when every chromosome of an
    oversized reference had to be skipped; True otherwise.
    """
    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if (len(seq) >= min_contig) or is_reference:
            corr_name = qutils.correct_name(first_line)

            # seq to uppercase, because only uppercase letters are examined later
            corr_seq = seq.upper()

            # correcting alternatives (gage can't work with alternatives)
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], corr_seq)

            # make sure that only A, C, G, T or N are in the sequence
            if re.search(r'[^ACGTN]', corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return False

            modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_LENGTH:
            # Start from an empty list: entries left over from an earlier
            # call would otherwise hide the "all chromosomes skipped" case
            # checked below.
            qconfig.splitted_ref = []

            _, fasta_ext = os.path.splitext(corrected_fpath)
            splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
            # os.makedirs raises when the directory already exists (e.g. on
            # a rerun into the same output directory), so create it only
            # when it is missing.
            if not os.path.isdir(splitted_ref_dirpath):
                os.makedirs(splitted_ref_dirpath)

            for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
                if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                                   str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
                qconfig.splitted_ref.append(splitted_ref_fpath)
                fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

            if not qconfig.splitted_ref:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False

    return True
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Store a reference sequence in a fresh corrected file and in the combined reference.

    With more than one reference the file name gets an extra
    '_<corrected first 20 chars of seq_name>' part between ref_name and
    ref_fasta_ext.  The unique corrected path is appended to
    ``corrected_ref_fpaths``.

    Returns the sequence name taken from the corrected file path.
    """
    if total_references > 1:
        seq_fname = ref_name + '_' + qutils.correct_name(seq_name[:20]) + ref_fasta_ext
    else:
        seq_fname = ref_name + ref_fasta_ext

    raw_fpath = os.path.join(corrected_dirpath, seq_fname)
    corr_seq_fpath = qutils.unique_corrected_fpath(raw_fpath)
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    # append mode: both files may already hold earlier sequences
    fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
    fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

    return corr_seq_name
def parse_txt(file):
    """Parse a plain-text gene list into Gene objects.

    file -- iterable of text lines.

    Every line is tried against ``txt_pattern_gi`` first and
    ``txt_pattern`` second; lines matching neither are ignored.  Start and
    end are stored in ascending order regardless of how they appear in the
    line, and the 'number' group (as text) doubles as the gene id.

    Returns the list of genes in input order.
    """
    genes = []
    for row in file:
        hit = txt_pattern_gi.match(row) or txt_pattern.match(row)
        if not hit:
            continue

        number_text = hit.group('number')
        gene = Gene(number=int(number_text),
                    seqname=qutils.correct_name(hit.group('seqname')))

        first = int(hit.group('start'))
        second = int(hit.group('end'))
        gene.start, gene.end = (first, second) if first <= second else (second, first)
        gene.id = number_text
        genes.append(gene)

    return genes
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Validate a reference sequence and append it to the corrected and combined files.

    Path selection: with more than one reference the last entry of
    ``corrected_ref_fpaths`` is reused, otherwise a unique path built from
    ref_name + ref_fasta_ext is created and registered.

    Validation (skipped when ``qconfig.no_check`` is set): the upper-cased
    sequence, with ambiguity codes turned into 'N', must contain only
    A, C, G, T, N.  The data written out is the unmodified *seq*.

    Side effects on success: the label of the corrected file is stored in
    ``contigs_analyzer.ref_labels_by_chromosomes`` under the new sequence
    name, and (name, length) is appended to ``chromosomes_by_refs[ref_name]``.

    Returns (corr_seq_name, corr_seq_fpath) or (None, None) on failed validation.
    """
    seq_fname = ''.join((ref_name, ref_fasta_ext))

    reuse_last_path = total_references > 1
    if reuse_last_path:
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)

    corr_seq_name = '_'.join((qutils.name_from_fpath(corr_seq_fpath),
                              qutils.correct_name(seq_name[:20])))

    if not qconfig.no_check:
        dic = {
            'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
            'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
        }
        pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
        checked_seq = re.sub(pat, lambda m: dic[m.group()], seq.upper())
        has_foreign_chars = re.search(r'[^ACGTN]', checked_seq) is not None
        if has_foreign_chars:
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            return None, None

    for out_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

    return corr_seq_name, corr_seq_fpath
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Generate the JavaScript data files and HTML pages of the contig alignment browser.

    For every plotted chromosome (or group of chromosomes) this writes
    'data_<name>.js' and '_<name>.html' into the alignment plots directory,
    then writes 'alignment_summary.html' into output_dir_path and copies the
    static CSS/JS assets next to the per-chromosome pages.

    assemblies         -- object whose ``assemblies`` attribute is iterated;
                          each item provides ``alignments``.
    contigs_fpaths     -- assembly file paths; one placeholder alignment per
                          path is created for every chromosome.
    chr_names          -- names of the reference sequences.
    chromosomes_length -- mapping from reference sequence name to its length.
    output_dir_path    -- directory receiving the summary page and the
                          plots subdirectory.
    cov_fpath          -- coverage file path; when falsy, no coverage data
                          is emitted.
    ref_fpath          -- not referenced in this function body.
    genome_size        -- compared with MAX_SIZE_FOR_COMB_PLOT to decide
                          whether all sequences share one plot.
    """
    # Seed every chromosome with one 'FICTIVE' placeholder alignment per assembly.
    chr_to_aligned_blocks = dict()
    for chr in chr_names:
        chr_init = []
        for fpath in contigs_fpaths:
            f = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            f.label = qutils.label_from_fpath(fpath)
            f.unshifted_start = 0
            f.unshifted_end = 0
            chr_init.append(f)
        chr_to_aligned_blocks.setdefault(chr, chr_init)
    # Real alignments are appended after the placeholders.
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)
    import contigs_analyzer
    # Decide which plots to produce: one per reference label, a single
    # combined plot, or one per sequence.
    if contigs_analyzer.ref_labels_by_chromosomes:
        contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        with open(cov_fpath, 'r') as coverage:
            name = chr_names[0]
            # Map each reference sequence to the plot it belongs to.
            contig_to_chr = {}
            for chr in chr_full_names:
                cov_data.setdefault(chr, [])
                not_covered.setdefault(chr, [])
                cur_len.setdefault(chr, 0)
                if contigs_analyzer.ref_labels_by_chromosomes:
                    contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                elif chr == NAME_FOR_ONE_PLOT:
                    contigs = chr_names
                else:
                    contigs = [chr]
                for contig in contigs:
                    contig_to_chr[contig] = chr
            # Whitespace-separated columns: c[0] is a sequence name, c[2] is
            # added up as a number and c[1] is recorded when c[2] is '0'.
            # presumably (name, position, depth) -- TODO confirm the file format
            for index, line in enumerate(coverage):
                c = list(line.split())
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # Every 100th line of the file the accumulated sum is
                # divided by 100, stored, and the accumulator is reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name]/100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])
    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for i, chr in enumerate(chr_full_names):
        # File names use at most the first 30 characters of the plot name.
        short_chr = chr[:30]
        num_misassemblies[chr] = 0
        aligned_bases_by_chr[chr] = []
        aligned_assemblies[chr] = []
        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contigs_analyzer.ref_labels_by_chromosomes:
                contigs = [contig for contig in chr_names if contig_names_by_refs[contig] == chr]
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []
            elif chr == NAME_FOR_ONE_PLOT:
                contigs = chr_names
            else:
                contigs = [chr]
            chr_size = sum([chromosomes_length[contig] for contig in contigs])
            chr_sizes[chr] = chr_size
            num_contigs[chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[chr].extend(aligned_bases[contig])
            # The '{...}' placeholders below are filled from local variables
            # via format(**locals()).
            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                l = chromosomes_length[contig]
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(**locals())
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(**locals())
            # prev_len is the summed length of the sequences that precede
            # the current one in this plot; it shifts coordinates.
            prev_len = 0
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                if len(chr_to_aligned_blocks[contig]) > 0:
                    for alignment in chr_to_aligned_blocks[contig]:
                        if alignment.misassembled:
                            num_misassemblies[chr] += 1
                        corr_start = prev_len + alignment.unshifted_start
                        corr_end = prev_len + alignment.unshifted_end
                        data_str += '{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},' \
                                    'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" '.format(**locals())
                        if alignment.name != 'FICTIVE':
                            # Remember which assemblies have at least one
                            # real alignment on this plot.
                            if len(aligned_assemblies[chr]) < len(contigs_fpaths) and alignment.label not in aligned_assemblies[chr]:
                                aligned_assemblies[chr].append(alignment.label)
                            data_str += ', structure: ['
                            for el in alignment.misassembled_structure:
                                # list elements describe an aligned piece,
                                # str elements a misassembly type; anything
                                # else is ignored.
                                if type(el) == list:
                                    if el[5] in contigs:
                                        num_chr = contigs.index(el[5])
                                        corr_len = sum(chr_lengths[:num_chr+1])
                                    else:
                                        # Piece lies on a sequence outside this
                                        # plot: its corrected end becomes 0 and
                                        # a link to the other plot is recorded once.
                                        corr_len = -int(el[1])
                                        if contigs_analyzer.ref_labels_by_chromosomes and el[5] not in used_chromosomes:
                                            used_chromosomes.append(el[5])
                                            new_chr = contig_names_by_refs[el[5]]
                                            links_to_chromosomes.append('links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(**locals()))
                                    corr_start = corr_len + int(el[0])
                                    corr_end = corr_len + int(el[1])
                                    data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(**locals())
                                elif type(el) == str:
                                    data_str += '{{type: "M", mstype: "{el}"}},'.format(**locals())
                            # Close the structure array, dropping the trailing
                            # comma when at least one element was written.
                            if data_str[-1] == '[':
                                data_str = data_str + ']},'
                            else:
                                data_str = data_str[: -1] + ']},'
                        else:
                            data_str += '},'
            # Replace the last character (trailing comma, or the space after
            # '[' when nothing was emitted) with the array terminator.
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contigs_analyzer.ref_labels_by_chromosomes:
                result.write(''.join(links_to_chromosomes))
            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(**locals())
                    for e in cov_data[chr]:
                        data_str += '{e},'.format(**locals())
                        # Flush in pieces to keep the buffer small, but never
                        # right after a value equal to the last one, so the
                        # trailing comma can still be stripped below.
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1] + '];\n'
                result.write(data_str)
                data_str = ''

                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(**locals())
                if len(not_covered[chr]) > 0:
                    for e in not_covered[chr]:
                        data_str += '{e},'.format(**locals())
                        # NOTE(review): this compares against cov_data[chr][-1],
                        # not not_covered[chr][-1]; it looks copied from the loop
                        # above and would raise IndexError if cov_data[chr] is
                        # empty here -- confirm the intent.
                        if len(data_str) > 10000 and e != cov_data[chr][-1]:
                            result.write(data_str)
                            data_str = ''
                    data_str = data_str[:-1]
                data_str += '];\n'
                result.write(data_str)
                data_str = ''

        # Per-chromosome HTML page: copy the template, filling in the data
        # script reference, the title block and the chromosome variables.
        with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
            with open(os.path.join(output_all_files_dir_path, '_{short_chr}.html'.format(**locals())), 'w') as result:
                for line in template:
                    if line.find('<script type="text/javascript" src=""></script>') != -1:
                        result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(**locals()))
                    else:
                        result.write(line)
                        if line.find('<body>') != -1:
                            chr_size = chr_sizes[chr]
                            chr_name = chr.replace('_', ' ')
                            if len(chr_name) > 50:
                                chr_name = chr_name[:50] + '...'
                            title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + ('%s fragments, ' % num_contigs[chr] if num_contigs[chr] > 1 else '') + '%s bp)' % format_long_numbers(chr_size)
                            result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">↵</button></a>{title}</div>\n'.format(**locals()))
                        if line.find('<script type="text/javascript">') != -1:
                            chromosome = '","'.join(contigs)
                            result.write('var CHROMOSOME = "{chr}";\n'.format(**locals()))
                            result.write('var chrContigs = ["{chromosome}"];\n'.format(**locals()))

    # Summary page: the template is copied line by line and extra markup is
    # inserted after each marker comment.
    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[chr]) for chr in chr_full_names]
            # True when the plots differ in how many assemblies aligned to
            # them; only then a per-row "# assemblies" column is shown.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if line.find('<!--- assemblies: ---->') != -1:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if line.find('<!--- th_assemblies: ---->') != -1:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if line.find('<!--- references: ---->') != -1:
                    for chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = chr[:30]
                        chr_link = os.path.join(alignment_plots_dirname, '_{short_chr}.html'.format(**locals()))
                        chr_name = chr.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[chr] if aligned_len is not None]
                        # Percentage of aligned bases relative to the plot size
                        # multiplied by the number of assemblies.
                        chr_genome = sum(aligned_lengths) * 100.0 / (chr_sizes[chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[chr]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[chr])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[chr])
                        result.write('</tr>')
    # Static assets used by the generated pages.
    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')), os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')), os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')), os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))
def get_corr_name(name):
    """Return *name* as corrected by ``qutils.correct_name``."""
    corrected = qutils.correct_name(name)
    return corrected
def parse_nucmer_contig_report(report_fpath, sorted_ref_names, cumulative_ref_lengths):
    """Extract aligned blocks and misassembly structure from a contig report.

    report_fpath           -- path of the report file to read.
    sorted_ref_names       -- reference names; the position of a name selects
                              its entry in cumulative_ref_lengths.
    cumulative_ref_lengths -- coordinate shift for each reference, indexed
                              like sorted_ref_names.

    The report is read in three passes over the same file handle:
    everything before 'Analyzing contigs...' is skipped, the part up to
    'Analyzing coverage...' yields the per-contig structure, and the rest
    yields one Alignment per 'Align' line (lines containing the words
    'Excluding' or 'Fake' are ignored).

    Returns the list of Alignment objects, or None (after logging a
    warning) when a reference named in the report is not in sorted_ref_names.
    """
    aligned_blocks = []

    # Position of the first occurrence of each name, i.e. what
    # sorted_ref_names.index(name) returns, without rescanning the list
    # for every 'Reference' line.
    ref_index_by_name = {}
    for idx, name in enumerate(sorted_ref_names):
        ref_index_by_name.setdefault(name, idx)

    with open(report_fpath) as report_file:
        # Pass 1: skip the preamble.
        for line in report_file:
            if line.startswith('Analyzing contigs...'):
                break

        # Pass 2: per-contig structure.  A set gives constant-time
        # membership tests when the alignments are classified below.
        misassembled_contigs_ids = set()
        cur_contig_id = ''
        last_contig_id = ''
        misassembled_id_to_structure = dict()
        for line in report_file:
            if line.startswith('CONTIG:'):
                cur_contig_id = line.split('CONTIG:')[1].strip()
                # last_contig_id survives the reset of cur_contig_id below
                last_contig_id = cur_contig_id.split(' ')[0]
                if last_contig_id not in misassembled_id_to_structure:
                    misassembled_id_to_structure[last_contig_id] = [False]
            if ('Alignment' in line or 'most ' in line) and 'Excluding' not in line:
                l = line.split(':')[1].split(' ')
                misassembled_id_to_structure[last_contig_id].append(
                    [l[1], l[2], l[4], l[5], l[10], qutils.correct_name(l[12])])
            if 'misassembly' in line and 'Fake' not in line:
                # text between the first '(' and the following ')'
                misassembled_id_to_structure[last_contig_id].append(line.split('(')[1].split(')')[0])
            if 'Extensive misassembly' in line and cur_contig_id != '':
                misassembled_contigs_ids.add(cur_contig_id.split()[0])
                cur_contig_id = ''
            if line.startswith('Analyzing coverage...'):
                break

        # Pass 3: alignments, shifted by the offset of their reference.
        cur_shift = 0
        for line in report_file:
            split_line = line.strip().split(' ')
            if split_line and split_line[0] == 'Reference':
                ref_name = split_line[1][:-1]  # drop the trailing character of the name token
                if ref_name in ref_index_by_name:
                    cur_shift = cumulative_ref_lengths[ref_index_by_name[ref_name]]
                else:
                    logger.warning('reference name ' + ref_name + ' not found in file with reference!\nCannot draw contig alignment plot!')
                    return None
            elif split_line and split_line[0] == 'Align' and 'Excluding' not in split_line and 'Fake' not in split_line:
                unshifted_start = int(split_line[2])
                unshifted_end = int(split_line[3])
                start = unshifted_start + cur_shift
                end = unshifted_end + cur_shift
                contig_id = split_line[4]
                start_in_contig = int(split_line[5])
                end_in_contig = int(split_line[6])
                # opposite directions on reference and contig
                is_rc = ((start - end) * (start_in_contig - end_in_contig)) < 0
                position_in_contig = min(start_in_contig, end_in_contig)
                position_in_ref = max(unshifted_start, unshifted_end)
                block = Alignment(
                    contig_id, start, end, unshifted_start, unshifted_end, is_rc,
                    position_in_contig, position_in_ref, ref_name)
                block.misassembled_structure = misassembled_id_to_structure[contig_id]
                if contig_id in misassembled_contigs_ids:
                    block.misassembled = True
                aligned_blocks.append(block)

    return aligned_blocks
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a corrected copy of a FASTA file and split large references into parts.

    original_fpath  -- FASTA file to read.
    corrected_fpath -- destination of the corrected entries.
    min_contig      -- entries shorter than this are dropped unless
                       is_reference is true.
    is_reference    -- keep all entries; additionally, when the total length
                       exceeds ``qconfig.MAX_REFERENCE_FILE_LENGTH``, the
                       chromosomes are distributed over 'part_N' files in a
                       'split_ref' directory next to corrected_fpath and the
                       part paths are stored in ``qconfig.splitted_ref``.

    Unless ``qconfig.no_check`` is set, sequences are upper-cased, ambiguity
    codes become 'N', and any other non-ACGTN character aborts the call.

    Returns False on invalid characters or when every chromosome of a large
    reference was skipped; True otherwise.
    """
    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue

        corr_name = qutils.correct_name(first_line)
        if qconfig.no_check:
            corr_seq = seq
        else:
            # correcting alternatives (gage can't work with alternatives);
            # upper-case first, because only uppercase letters are looked at
            dic = {
                'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
            }
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if re.search(r'[^ACGTN]', corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(len(entry[1]) for entry in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    # A part is closed once adding a chromosome pushes it over max_len.
    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

    for chr_name, chr_seq in modified_fasta_entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # cur_part_len == cur_chr_len means the part held nothing before
        # this chromosome, so there is nothing to close yet.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)

    if len(qconfig.splitted_ref) == 0:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False

    return True
def parse_nucmer_contig_report(report_fpath, sorted_ref_names, cumulative_ref_lengths):
    """Read a contig report and return the alignments it lists.

    report_fpath           -- path of the report file.
    sorted_ref_names       -- reference names, parallel to
                              cumulative_ref_lengths.
    cumulative_ref_lengths -- coordinate offset added to the alignments of
                              the reference at the same position.

    The file is consumed sequentially: lines up to 'Analyzing contigs...'
    are ignored, lines up to 'Analyzing coverage...' describe contigs and
    their misassemblies, and the remaining lines hold 'Reference' headers
    and 'Align' records (records containing 'Excluding' or 'Fake' as a
    word are ignored).

    Returns a list of Alignment objects.  Returns None, after logging a
    warning, if the report names a reference missing from sorted_ref_names.
    """
    aligned_blocks = []

    # name -> index of its first occurrence (same result as list.index,
    # but a single pass instead of one scan per 'Reference' line)
    shift_index = {}
    for position, known_name in enumerate(sorted_ref_names):
        if known_name not in shift_index:
            shift_index[known_name] = position

    with open(report_fpath) as report_file:
        for line in report_file:
            if line.startswith('Analyzing contigs...'):
                break

        # ids of contigs with an extensive misassembly; a set keeps the
        # lookups in the alignment loop constant-time
        misassembled_contigs_ids = set()
        misassembled_id_to_structure = dict()
        cur_contig_id = ''
        last_contig_id = ''
        for line in report_file:
            if line.startswith('CONTIG:'):
                cur_contig_id = line.split('CONTIG:')[1].strip()
                last_contig_id = cur_contig_id.split(' ')[0]
                if last_contig_id not in misassembled_id_to_structure:
                    misassembled_id_to_structure[last_contig_id] = [False]

            mentions_alignment = line.find('Alignment') != -1 or line.find('most ') != -1
            if mentions_alignment and line.find('Excluding') == -1:
                l = line.split(':')[1].split(' ')
                misassembled_id_to_structure[last_contig_id].append([
                    l[1], l[2], l[4], l[5], l[10], qutils.correct_name(l[12])
                ])

            if line.find('misassembly') != -1 and line.find('Fake') == -1:
                # keep the text enclosed by the first pair of parentheses
                misassembled_id_to_structure[last_contig_id].append(
                    line.split('(')[1].split(')')[0])

            if line.find('Extensive misassembly') != -1 and cur_contig_id != '':
                misassembled_contigs_ids.add(cur_contig_id.split()[0])
                # cleared so that further misassembly lines of this contig
                # do not register it again
                cur_contig_id = ''

            if line.startswith('Analyzing coverage...'):
                break

        cur_shift = 0
        for line in report_file:
            split_line = line.strip().split(' ')
            if not split_line:
                continue
            record_type = split_line[0]

            if record_type == 'Reference':
                ref_name = split_line[1][:-1]
                if ref_name not in shift_index:
                    logger.warning(
                        'reference name ' + ref_name +
                        ' not found in file with reference!\nCannot draw contig alignment plot!')
                    return None
                cur_shift = cumulative_ref_lengths[shift_index[ref_name]]
            elif record_type == 'Align' and 'Excluding' not in split_line and 'Fake' not in split_line:
                unshifted_start = int(split_line[2])
                unshifted_end = int(split_line[3])
                start = unshifted_start + cur_shift
                end = unshifted_end + cur_shift
                contig_id = split_line[4]
                start_in_contig = int(split_line[5])
                end_in_contig = int(split_line[6])
                is_rc = ((start - end) * (start_in_contig - end_in_contig)) < 0
                position_in_contig = min(start_in_contig, end_in_contig)
                position_in_ref = max(unshifted_start, unshifted_end)
                block = Alignment(contig_id, start, end, unshifted_start,
                                  unshifted_end, is_rc, position_in_contig,
                                  position_in_ref, ref_name)
                block.misassembled_structure = misassembled_id_to_structure[contig_id]
                if contig_id in misassembled_contigs_ids:
                    block.misassembled = True
                aligned_blocks.append(block)

    return aligned_blocks
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Produce the corrected FASTA file and, for big references, the split parts.

    original_fpath  -- input FASTA path.
    corrected_fpath -- output path for the corrected entries.
    min_contig      -- minimum length of a kept entry (ignored for references).
    is_reference    -- treat the input as a reference: keep every entry and,
                       if the total length is above
                       ``qconfig.MAX_REFERENCE_FILE_LENGTH``, write 'part_N'
                       files into a freshly created 'split_ref' directory and
                       list them in ``qconfig.splitted_ref``.

    With ``qconfig.no_check`` unset, each kept sequence is upper-cased and
    its ambiguity codes are replaced by 'N'; a remaining character outside
    A, C, G, T, N makes the function log a warning and return False before
    anything is written.

    Returns True on success, False on invalid input or when no chromosome
    of a big reference could be kept.
    """
    check_sequences = not qconfig.no_check
    modified_fasta_entries = []

    for first_line, seq in fastaparser.read_fasta(original_fpath):
        keep_entry = is_reference or len(seq) >= min_contig
        if not keep_entry:
            continue

        corr_name = qutils.correct_name(first_line)
        corr_seq = seq
        if check_sequences:
            # seq to uppercase, because later only uppercase letters are examined
            corr_seq = seq.upper()

            # correcting alternatives (gage can't work with alternatives)
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N', 'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], corr_seq)

            # make sure that only A, C, G, T or N are in the sequence
            foreign_char = re.search(r'[^ACGTN]', corr_seq)
            if foreign_char:
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = 0
        for _name, chr_seq in modified_fasta_entries:
            ref_len += len(chr_seq)

        if ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
            qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
            _, fasta_ext = os.path.splitext(corrected_fpath)
            parent_dirpath = os.path.dirname(corrected_fpath)
            split_ref_dirpath = os.path.join(parent_dirpath, 'split_ref')
            if os.path.exists(split_ref_dirpath):
                shutil.rmtree(split_ref_dirpath, ignore_errors=True)
            os.makedirs(split_ref_dirpath)

            max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
            cur_part_num = 1
            cur_part_len = 0
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

            for chr_name, chr_seq in modified_fasta_entries:
                cur_chr_len = len(chr_seq)
                if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                                   str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                cur_part_len += cur_chr_len
                part_is_full = cur_part_len > max_len
                part_had_content = cur_part_len != cur_chr_len
                if part_is_full and part_had_content:
                    # finish the current part and start the next one with
                    # this chromosome
                    qconfig.splitted_ref.append(cur_part_fpath)
                    cur_part_len = cur_chr_len
                    cur_part_num += 1
                    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
                fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

            # register the part that was still being filled
            if cur_part_len > 0:
                qconfig.splitted_ref.append(cur_part_fpath)

            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False

    return True
def js_data_gen(assemblies, contigs_fpaths, chr_names, chromosomes_length, output_dir_path, cov_fpath, ref_fpath, genome_size):
    """Write the data, per-plot HTML and summary files of the alignment browser.

    Files produced:
      * ``<output_dir_path>/alignment_summary.html``, built from the
        ``alignment_summary_templ.html`` template;
      * inside ``<output_dir_path>/<alignment_plots_dirname>``: one
        ``data_<name>.js`` and one ``_<name>.html`` per plot (``<name>`` is
        the plot name cut to 30 characters) and copies of the static
        ``contig_alignment_plot.css``, ``d3.js`` and
        ``contig_alignment_plot_script.js`` files.

    Plots are chosen like this: when
    ``contigs_analyzer.ref_labels_by_chromosomes`` is non-empty, there is one
    plot per distinct value it maps the names in ``chr_names`` to; otherwise,
    when ``genome_size < MAX_SIZE_FOR_COMB_PLOT`` and there are at least
    ``MIN_CONTIGS_FOR_COMB_PLOT`` names, a single ``NAME_FOR_ONE_PLOT`` plot
    holds everything; otherwise there is one plot per name in ``chr_names``.

    Args:
        assemblies: object whose ``assemblies`` attribute is iterated; the
            ``alignments`` of each item are grouped by their ``ref_name``.
        contigs_fpaths: assembly file paths; one zero-length 'FICTIVE'
            alignment per path is created for every reference sequence.
        chr_names: reference sequence names.
        chromosomes_length: mapping from reference sequence name to length.
        output_dir_path: directory receiving the summary page and the plots
            sub-directory.
        cov_fpath: optional coverage file. Each line is split on whitespace
            and fields 0, 1 and 2 are used -- presumably sequence name,
            position and depth; confirm against the producer of this file.
        ref_fpath: not used by this function.
        genome_size: compared against ``MAX_SIZE_FOR_COMB_PLOT`` only.
    """
    # Every reference sequence starts out with one zero-length placeholder
    # alignment per assembly; real alignments are appended afterwards.
    chr_to_aligned_blocks = dict()
    for ref_chr in chr_names:
        placeholders = []
        for fpath in contigs_fpaths:
            fictive = Alignment('FICTIVE', 0, 0, 0, 0, False, 0, 0, None)
            fictive.label = qutils.label_from_fpath(fpath)
            fictive.unshifted_start = 0
            fictive.unshifted_end = 0
            placeholders.append(fictive)
        chr_to_aligned_blocks.setdefault(ref_chr, placeholders)
    for assembly in assemblies.assemblies:
        for align in assembly.alignments:
            chr_to_aligned_blocks[align.ref_name].append(align)

    summary_fname = 'alignment_summary.html'
    summary_path = os.path.join(output_dir_path, summary_fname)
    output_all_files_dir_path = os.path.join(output_dir_path, alignment_plots_dirname)
    if not os.path.exists(output_all_files_dir_path):
        os.mkdir(output_all_files_dir_path)

    import contigs_analyzer
    contig_names_by_refs = contigs_analyzer.ref_labels_by_chromosomes
    if contig_names_by_refs:
        chr_full_names = list(set([contig_names_by_refs[contig] for contig in chr_names]))
    elif genome_size < MAX_SIZE_FOR_COMB_PLOT and len(chr_names) >= MIN_CONTIGS_FOR_COMB_PLOT:
        chr_full_names = [NAME_FOR_ONE_PLOT]
    else:
        chr_full_names = chr_names

    def contigs_of(plot_chr):
        # Reference sequences that are drawn on the plot called plot_chr.
        if contig_names_by_refs:
            return [contig for contig in chr_names
                    if contig_names_by_refs[contig] == plot_chr]
        if plot_chr == NAME_FOR_ONE_PLOT:
            return chr_names
        return [plot_chr]

    if cov_fpath:
        cov_data = dict()
        not_covered = dict()
        cur_len = dict()
        contig_to_chr = {}
        for plot_chr in chr_full_names:
            cov_data.setdefault(plot_chr, [])
            not_covered.setdefault(plot_chr, [])
            cur_len.setdefault(plot_chr, 0)
            for contig in contigs_of(plot_chr):
                contig_to_chr[contig] = plot_chr
        with open(cov_fpath, 'r') as coverage:
            for index, line in enumerate(coverage):
                c = line.split()
                name = contig_to_chr[qutils.correct_name(c[0])]
                cur_len[name] += int(c[2])
                # On every 100th input line the sum accumulated for the
                # current plot is divided by 100, stored and reset.
                if index % 100 == 0 and index > 0:
                    cov_data[name].append(cur_len[name] / 100)
                    cur_len[name] = 0
                if c[2] == '0':
                    not_covered[name].append(c[1])

    chr_sizes = {}
    num_contigs = {}
    aligned_bases = genome_analyzer.get_ref_aligned_lengths()
    aligned_bases_by_chr = {}
    num_misassemblies = {}
    aligned_assemblies = {}

    for plot_chr in chr_full_names:
        short_chr = plot_chr[:30]
        num_misassemblies[plot_chr] = 0
        aligned_bases_by_chr[plot_chr] = []
        aligned_assemblies[plot_chr] = []
        contigs = contigs_of(plot_chr)

        with open(os.path.join(output_all_files_dir_path, 'data_%s.js' % short_chr), 'w') as result:
            result.write('"use strict";\n')
            if contig_names_by_refs:
                result.write('var links_to_chromosomes = {};\n')
                links_to_chromosomes = []
                used_chromosomes = []

            chr_size = sum(chromosomes_length[contig] for contig in contigs)
            chr_sizes[plot_chr] = chr_size
            num_contigs[plot_chr] = len(contigs)
            for contig in contigs:
                aligned_bases_by_chr[plot_chr].extend(aligned_bases[contig])

            data_str = 'var chromosomes_len = {};\n'
            for contig in contigs:
                data_str += 'chromosomes_len["{contig}"] = {l};\n'.format(
                    contig=contig, l=chromosomes_length[contig])
            result.write(data_str)

            # adding assembly data
            data_str = 'var contig_data = {};\n'
            data_str += 'contig_data["{chr}"] = [ '.format(chr=plot_chr)
            # prev_len is the summed length of the sequences placed before
            # the current one; it shifts coordinates on a combined plot.
            prev_len = 0
            chr_lengths = [0] + [chromosomes_length[contig] for contig in contigs]
            for num_contig, contig in enumerate(contigs):
                if num_contig > 0:
                    prev_len += chr_lengths[num_contig]
                for alignment in chr_to_aligned_blocks[contig]:
                    if alignment.misassembled:
                        num_misassemblies[plot_chr] += 1
                    corr_start = prev_len + alignment.unshifted_start
                    corr_end = prev_len + alignment.unshifted_end
                    data_str += ('{{name: "{alignment.name}", corr_start: {corr_start}, corr_end: {corr_end},'
                                 'start: {alignment.unshifted_start}, end: {alignment.unshifted_end}, assembly: "{alignment.label}", similar: "{alignment.similar}", misassembled: "{alignment.misassembled}" ').format(
                                     alignment=alignment, corr_start=corr_start, corr_end=corr_end)
                    if alignment.name == 'FICTIVE':
                        data_str += '},'
                        continue

                    if (len(aligned_assemblies[plot_chr]) < len(contigs_fpaths)
                            and alignment.label not in aligned_assemblies[plot_chr]):
                        aligned_assemblies[plot_chr].append(alignment.label)
                    data_str += ', structure: ['
                    for el in alignment.misassembled_structure:
                        if isinstance(el, list):
                            if el[5] in contigs:
                                num_chr = contigs.index(el[5])
                                corr_len = sum(chr_lengths[:num_chr + 1])
                            else:
                                # Block lies on a sequence outside this plot.
                                corr_len = -int(el[1])
                                if contig_names_by_refs and el[5] not in used_chromosomes:
                                    used_chromosomes.append(el[5])
                                    new_chr = contig_names_by_refs[el[5]]
                                    links_to_chromosomes.append(
                                        'links_to_chromosomes["{el[5]}"] = "{new_chr}";\n'.format(
                                            el=el, new_chr=new_chr))
                            el_start = corr_len + int(el[0])
                            el_end = corr_len + int(el[1])
                            data_str += '{{type: "A", corr_start: {corr_start}, corr_end: {corr_end}, start: {el[0]}, end: {el[1]}, start_in_contig: {el[2]}, end_in_contig: {el[3]}, IDY: {el[4]}, chr: "{el[5]}"}},'.format(
                                corr_start=el_start, corr_end=el_end, el=el)
                        elif isinstance(el, str):
                            data_str += '{{type: "M", mstype: "{el}"}},'.format(el=el)
                    # Close the structure list, dropping the trailing comma
                    # when at least one element was emitted.
                    if data_str[-1] == '[':
                        data_str = data_str + ']},'
                    else:
                        data_str = data_str[:-1] + ']},'
            # Drops the last comma (or the space after '[' when nothing was
            # emitted) before closing the array.
            data_str = data_str[:-1] + '];\n\n'
            result.write(data_str)
            if contig_names_by_refs:
                result.write(''.join(links_to_chromosomes))

            if cov_fpath:
                # adding coverage data
                data_str = 'var coverage_data = {};\n'
                if cov_data[plot_chr]:
                    data_str += 'coverage_data["{chr}"] = [ '.format(chr=plot_chr)
                    data_str += ','.join(str(e) for e in cov_data[plot_chr]) + '];\n'
                result.write(data_str)

                # Joining avoids the earlier last-element test, which looked
                # at cov_data instead of not_covered and could leave a
                # trailing comma in the array.
                data_str = 'var not_covered = {};\n'
                data_str += 'not_covered["{chr}"] = [ '.format(chr=plot_chr)
                data_str += ','.join(str(e) for e in not_covered[plot_chr]) + '];\n'
                result.write(data_str)

        with open(html_saver.get_real_path('_chr_templ.html'), 'r') as template:
            with open(os.path.join(output_all_files_dir_path, '_{short_chr}.html'.format(short_chr=short_chr)), 'w') as result:
                for line in template:
                    if '<script type="text/javascript" src=""></script>' in line:
                        # The empty script tag is replaced by a reference to
                        # this plot's data file.
                        result.write('<script type="text/javascript" src="data_{short_chr}.js"></script>\n'.format(
                            short_chr=short_chr))
                        continue
                    result.write(line)
                    if '<body>' in line:
                        chr_size = chr_sizes[plot_chr]
                        chr_name = plot_chr.replace('_', ' ')
                        if len(chr_name) > 50:
                            chr_name = chr_name[:50] + '...'
                        title = 'CONTIG ALIGNMENT BROWSER: %s (' % chr_name + (
                            '%s fragments, ' % num_contigs[plot_chr]
                            if num_contigs[plot_chr] > 1 else ''
                        ) + '%s bp)' % format_long_numbers(chr_size)
                        result.write('<div class = "block title"><a href="../{summary_fname}"><button class="back_button">↵</button></a>{title}</div>\n'.format(
                            summary_fname=summary_fname, title=title))
                    if '<script type="text/javascript">' in line:
                        chromosome = '","'.join(contigs)
                        result.write('var CHROMOSOME = "{chr}";\n'.format(chr=plot_chr))
                        result.write('var chrContigs = ["{chromosome}"];\n'.format(chromosome=chromosome))

    with open(html_saver.get_real_path('alignment_summary_templ.html'), 'r') as template:
        with open(summary_path, 'w') as result:
            num_aligned_assemblies = [len(aligned_assemblies[plot_chr]) for plot_chr in chr_full_names]
            # True when the plots do not all have the same number of aligned
            # assemblies; the count then gets its own table column.
            is_unaligned_asm_exists = len(set(num_aligned_assemblies)) > 1
            for line in template:
                result.write(line)
                if '<!--- assemblies: ---->' in line:
                    if not is_unaligned_asm_exists:
                        result.write('<div class="subtitle"># assemblies: %s</div>' % len(contigs_fpaths))
                if '<!--- th_assemblies: ---->' in line:
                    if is_unaligned_asm_exists:
                        result.write('<th># assemblies</th>')
                if '<!--- references: ---->' in line:
                    for plot_chr in sorted(chr_full_names):
                        result.write('<tr>')
                        short_chr = plot_chr[:30]
                        chr_link = os.path.join(alignment_plots_dirname, '_{short_chr}.html'.format(short_chr=short_chr))
                        chr_name = plot_chr.replace('_', ' ')
                        aligned_lengths = [aligned_len for aligned_len in aligned_bases_by_chr[plot_chr]
                                           if aligned_len is not None]
                        chr_genome = sum(aligned_lengths) * 100.0 / (chr_sizes[plot_chr] * len(contigs_fpaths))
                        chr_size = chr_sizes[plot_chr]
                        result.write('<td><a href="%s">%s</a></td>' % (chr_link, chr_name))
                        result.write('<td>%s</td>' % num_contigs[plot_chr])
                        result.write('<td>%s</td>' % format_long_numbers(chr_size))
                        if is_unaligned_asm_exists:
                            result.write('<td>%s</td>' % len(aligned_assemblies[plot_chr]))
                        result.write('<td>%.3f</td>' % chr_genome)
                        result.write('<td>%s</td>' % num_misassemblies[plot_chr])
                        result.write('</tr>')

    copyfile(html_saver.get_real_path(os.path.join('static', 'contig_alignment_plot.css')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot.css'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'd3.js')),
             os.path.join(output_all_files_dir_path, 'd3.js'))
    copyfile(html_saver.get_real_path(os.path.join('static', 'scripts', 'contig_alignment_plot_script.js')),
             os.path.join(output_all_files_dir_path, 'contig_alignment_plot_script.js'))