def _buffer_prealigned_codons(args, outmvf, codec, inputbuffer, contig, pos,
                              entrybuffer, nentry):
    """Translate buffered sites of one contig and queue the output entries.

    ``codec`` is the MVF object handed to ``iter_codons``.  Returns the
    (possibly replaced) entry buffer together with its running entry count
    so the caller can keep accumulating into them.
    """
    for _, amino_acids, alleles in iter_codons(inputbuffer, codec):
        # Drop codons whose translation holds nothing but '-' and 'X'.
        if all(x in '-X' for x in amino_acids):
            continue
        if args.output_data == 'protein':
            entrybuffer.append((contig, pos, (amino_acids,)))
        else:
            entrybuffer.append((
                contig, pos,
                (amino_acids, alleles[0], alleles[1], alleles[2])))
        nentry += 1
        if nentry == args.line_buffer:
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
            nentry = 0
    return entrybuffer, nentry


def translate_mvf(args):
    """Translate a DNA-flavor MVF into protein, codon, or DNA entries.

    Without ``args.gff`` the input is read contig by contig and handed to
    ``iter_codons`` as already being in coding frame.  With ``args.gff`` the
    genes returned by ``parse_gff_exome`` become the contigs of the output
    file, and each gene's CDS coordinates are walked three at a time.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``qprint``, ``mvf``, ``out``, ``overwrite``, ``gff``,
            ``output_data``, ``command_string``, ``line_buffer`` and, in
            GFF mode, ``retain_contigs`` and ``retain_coords``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF flavor is not ``'dna'``.
    """
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if args.gff:
        # The gene table only exists in GFF mode; without this guard the
        # no-GFF path fails on the undefined gff_genes/gene_order names.
        outmvf.contig_data = {
            i: {key: value for key, value in gff_genes[gene].items()
                if key != 'cds'}
            for i, gene in enumerate(gene_order)}
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[gene]['id'] for gene in gene_order]
        outmvf.contig_labels = [
            gff_genes[gene]['label'] for gene in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        # NOTE(review): queued entries carry `pos` of the most recently read
        # MVF line, not the first value yielded by iter_codons -- kept as in
        # the original; confirm this is intended.
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # Contig changed: translate what was buffered, then start over.
            entrybuffer, nentry = _buffer_prealigned_codons(
                args, outmvf, mvf, inputbuffer, current_contig, pos,
                entrybuffer, nentry)
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): the final flush passes outmvf (not mvf) to
            # iter_codons, exactly as the original did -- verify.
            entrybuffer, nentry = _buffer_prealigned_codons(
                args, outmvf, outmvf, inputbuffer, current_contig, pos,
                entrybuffer, nentry)
    else:
        for igene, gene in enumerate(gene_order):
            gene_data = gff_genes[gene]
            xcontiglabel = gene_data['contig']
            xcontig = mvf.get_contig_indices(labels=xcontiglabel)
            if xcontig is None:
                print("Warning: contig {} not found".format(xcontiglabel))
                # Skip the gene instead of continuing with a None contig.
                continue
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            cds_ranges = gene_data['cds']
            min_gene_coord = cds_ranges[0][0]
            max_gene_coord = cds_ranges[-1][1]
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            # Collect the first alleleset of every site inside the gene span.
            mvf_entries = {}
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gene_data['strand'] == '-'
            coords = []
            for elem in cds_ranges:
                coords.extend(range(elem[0], elem[1] + 1))
            if reverse_strand:
                coords = coords[::-1]
            # The gene's enumerate index matches the contig index written to
            # the output header, even when earlier genes were skipped.
            out_contig = xcontigid if args.retain_contigs else igene
            for codoncoord in range(0, len(coords), 3):
                codon_coords = coords[codoncoord:codoncoord + 3]
                alleles = tuple(
                    mvf_entries.get(x, '-') for x in codon_coords)
                if len(alleles) < 3:
                    # Pad a trailing partial codon with gaps.
                    alleles = alleles + ('-',) * (3 - len(alleles))
                if all(len(x) == 1 for x in alleles):
                    # Every site is a single character: translate directly.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(
                            outmvf.encode(''.join(x))
                            for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(
                            mvf.decode(x) for x in alleles)
                    # zip(*...) regroups the three decoded sites into one
                    # three-character codon per column.
                    amino_acids = outmvf.encode(''.join(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded_alleles)))
                if args.output_data == 'protein':
                    entrybuffer.append((
                        out_contig,
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids,)))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        out_contig,
                        (coords[codoncoord] if args.retain_coords
                         else codoncoord),
                        (amino_acids, alleles[0], alleles[1], alleles[2])))
                elif args.output_data == 'dna':
                    for j, coord in enumerate(codon_coords):
                        entrybuffer.append((
                            out_contig,
                            (coord if args.retain_coords
                             else codoncoord + j + 1),
                            (alleles[j],)))
                # Counted once per codon, also in 'dna' mode.
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def merge_mvf(args):
    """Merge several MVF files into one output MVF.

    The header of one input (``args.main_header_file`` or, by default, the
    first path in ``args.mvf``) seeds the output header.  Samples and contigs
    of every input are then mapped onto the output through one
    ``MvfTransformer`` per file, and entries are merged contig by contig.

    NOTE(review): another ``merge_mvf`` definition appears later in the
    visible source; if both live in the same module the later one is the
    definition bound to the name -- confirm which is meant to be used.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``qprint``, ``mvf`` (sequence of input paths), ``out``,
            ``overwrite``, ``main_header_file``, ``skip_index`` and
            ``command_string``.  ``args.main_header_file`` is overwritten
            with the integer position of the header file in ``args.mvf``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not one of the inputs,
            or if two inputs give different bases for the same sample at the
            same position (a new base of '-' or 'X' is ignored instead).
    """
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.copy_header(first_mvf)
    # Open each MVF file, read headers to make unified header
    transformers = []
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            # No index was built above, so load it from the index file.
            mvf.read_index_file()
        mvf.reset_max_contig()
        for i, sid in enumerate(mvf.get_sample_ids()):
            if sid not in concatmvf.get_sample_ids():
                # Register a sample the output has not seen yet.
                new_sindex = concatmvf.max_sample_index
                concatmvf.max_sample_index += 1
                concatmvf.sample_indices.append(new_sindex)
                concatmvf.sample_ids.append(sid)
                concatmvf.sample_data[new_sindex] = {'id': sid}
                concatmvf.sample_id_to_index[sid] = new_sindex
            transformer.set_label(i, concatmvf.sample_id_to_index[sid])
        for cindex in mvf.contig_indices:
            contig_info = mvf.contig_data[cindex]
            if contig_info['label'] not in concatmvf.contig_label_to_index:
                # NOTE(review): only contig_data is updated for a new contig;
                # presumably contig_indices / contig_label_to_index are kept
                # in sync elsewhere -- verify against MultiVariantFile.
                new_cindex = (
                    contig_info['id']
                    if contig_info['id'] not in concatmvf.contig_ids
                    else concatmvf.get_next_contig_index())
                concatmvf.contig_data[new_cindex] = contig_info.copy()
            else:
                new_cindex = concatmvf.contig_label_to_index[
                    contig_info['label']]
            transformer.set_contig(cindex, new_cindex)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_max_sample()
    concatmvf.notes.append(args.command_string)
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    blank_entry = '-' * len(concatmvf.sample_indices)
    for cons_contig in concatmvf.contig_indices:
        # position -> list with one character per output sample.  Lists are
        # updated in place and joined once at write time, instead of
        # rebuilding the whole string for every base.
        contig_merged_entries = {}
        args.qprint("Merging Contig Index: {}".format(cons_contig))
        for transformer, mvffile in zip(transformers, inputfiles):
            if cons_contig not in transformer.contigs:
                continue
            localcontig = transformer.contigs[cons_contig]
            if 'idx' not in mvffile.contig_data[localcontig]:
                print("not found")
                continue
            labels_rev = transformer.labels_rev
            for _, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                row = contig_merged_entries.get(pos)
                if row is None:
                    row = contig_merged_entries[pos] = list(blank_entry)
                for j, base in enumerate(allelesets[0]):
                    xcoord = labels_rev[j]
                    current = row[xcoord]
                    if current != '-':
                        # Already filled: accept an identical or missing
                        # base, reject a conflicting one.
                        if current == base or base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(pos, current, base))
                    row[xcoord] = base
        if contig_merged_entries:
            concatmvf.write_entries(
                ((cons_contig, coord,
                  (''.join(contig_merged_entries[coord]), ))
                 for coord in sorted(contig_merged_entries)),
                encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            cons_contig, len(contig_merged_entries)))
    return ''
def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF using the legacy GFF codon table.

    Without ``args.gff`` the input is read contig by contig and passed to
    ``iter_codons`` as already being in coding frame.  With ``args.gff`` the
    codon coordinates returned by ``parse_gff_legacy_translate`` are looked
    up per contig label and translated one codon at a time.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``qprint``, ``mvf``, ``out``, ``overwrite``, ``gff``,
            ``parent_gene_pattern``, ``output_data``, ``line_buffer`` and
            ``verbose``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if the input MVF flavor is not ``'dna'``.
    """
    args.qprint("Running LegacyTranslateMVF")
    args.qprint(
        "Reading and Indexing MVF." if args.gff else "Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        pending = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                pending.append((pos, allelesets))
                continue
            # Contig changed: translate the buffered sites first.
            for _, amino_acids, alleles in iter_codons(pending, mvf):
                if not any(x not in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    payload = (amino_acids,)
                else:
                    payload = (
                        amino_acids, alleles[0], alleles[1], alleles[2])
                entrybuffer.append((current_contig, pos, payload))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
            pending = [(pos, allelesets)]
            current_contig = contigid[:]
        if pending:
            # The trailing flush hands outmvf (not mvf) to iter_codons.
            for _, amino_acids, alleles in iter_codons(pending, outmvf):
                if not any(x not in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    payload = (amino_acids,)
                else:
                    payload = (
                        amino_acids, alleles[0], alleles[1], alleles[2])
                entrybuffer.append((current_contig, pos, payload))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        for contig_index in outmvf.get_contig_indices():
            contig_label = outmvf.get_contig_labels(
                indices=contig_index)[0]
            contig_id = outmvf.get_contig_ids(indices=contig_index)[0]
            if contig_label not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             contig_index, contig_id, contig_label))
                continue
            if contig_index % 100 == 0:
                args.qprint("Processing contig: {} {}".format(
                    contig_id, contig_label))
            # First alleleset of every site on this contig, keyed by position.
            site_alleles = {}
            for _, pos, allelesets in mvf.itercontigentries(
                    contig_index, decode=False):
                site_alleles[pos] = allelesets[0]
            for coords in sorted(gff[contig_label]):
                on_reverse = coords[3] == '-'
                # On the reverse strand the three positions are read in
                # reverse order.
                codon_sites = coords[2::-1] if on_reverse else coords[0:3]
                alleles = tuple(
                    site_alleles.get(site, '-') for site in codon_sites)
                if all(len(x) == 1 for x in alleles):
                    if on_reverse:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if on_reverse:
                        decoded = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(
                            outmvf.encode(''.join(x)) for x in decoded)
                    else:
                        decoded = tuple(mvf.decode(x) for x in alleles)
                    per_sample = tuple(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded))
                    amino_acids = outmvf.encode(''.join(per_sample))
                # NOTE(review): protein entries are keyed by the contig
                # index while codon entries use the contig id -- confirm
                # this difference is intended.
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (contig_index, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        contig_id, coords[0],
                        (amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def merge_mvf(args):
    """Merge several MVF files into one output MVF (metadata-dict API).

    The metadata of one input (``args.main_header_file`` or, by default, the
    first path in ``args.mvf``) seeds the output metadata.  Sample labels and
    contigs of every input are then mapped onto the output through one
    ``MvfTransformer`` per file, and entries are merged contig by contig.

    NOTE(review): a function with the same name is defined earlier in the
    visible source; if both live in the same module, this later definition
    is the one bound to ``merge_mvf`` -- confirm which is meant to be used.

    Args:
        args: parsed command-line namespace.  Attributes read here are
            ``qprint``, ``mvf`` (sequence of input paths), ``out``,
            ``overwrite``, ``main_header_file`` and ``skip_index``.
            ``args.main_header_file`` is overwritten with the integer
            position of the header file in ``args.mvf``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not one of the inputs,
            or if two inputs give different bases for the same sample at the
            same position (a new base of '-' or 'X' is ignored instead).
    """
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    # NOTE(review): .copy() looks shallow, so the nested 'labels', 'samples'
    # and 'contigs' containers are presumably shared with first_mvf; that
    # object is not used again below.
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    # contig label -> contig id in the merged output
    concatmvf_reverse_contig = {
        data['label']: key
        for key, data in concatmvf.metadata['contigs'].items()}
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            # No index was built above, so load it from the index file.
            mvf.read_index_file()
        mvf.reset_max_contig_id()
        for i, label in enumerate(mvf.get_sample_labels()):
            merged_labels = concatmvf.metadata['labels']
            is_new = label not in concatmvf.get_sample_labels()
            if is_new:
                merged_labels.append(label)
            # Look the output column up once instead of on every use.
            new_index = merged_labels.index(label)
            if is_new:
                concatmvf.metadata['samples'][new_index] = {'label': label}
            transformer.set_label(i, new_index)
        for contigid, contigdata in mvf.metadata['contigs'].items():
            if contigdata['label'] not in concatmvf_reverse_contig:
                # Keep the input's contig id unless the output already
                # uses it for a different label.
                newid = (
                    contigid
                    if contigid not in concatmvf.metadata['contigs']
                    else concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
                concatmvf_reverse_contig[contigdata['label']] = newid
            else:
                newid = concatmvf_reverse_contig[contigdata['label']]
            transformer.set_contig(contigid, newid)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_ncol()
    concatmvf.write_data(concatmvf.get_header())
    contigs = concatmvf.metadata['contigs']
    # Now loop through each file
    blank_entry = '-' * len(concatmvf.metadata['samples'])
    for current_contig in contigs:
        # position -> list with one character per output sample.  Lists are
        # updated in place and joined once at write time, instead of
        # rebuilding the whole string for every base.
        contig_merged_entries = {}
        args.qprint("Merging Contig: {}".format(current_contig))
        for transformer, mvffile in zip(transformers, inputfiles):
            if current_contig not in transformer.contigs:
                continue
            localcontig = transformer.contigs[current_contig]
            labels_rev = transformer.labels_rev
            for _, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                row = contig_merged_entries.get(pos)
                if row is None:
                    row = contig_merged_entries[pos] = list(blank_entry)
                for j, base in enumerate(allelesets[0]):
                    xcoord = labels_rev[j]
                    current = row[xcoord]
                    if current != '-':
                        # Already filled: accept an identical or missing
                        # base, reject a conflicting one.
                        if current == base or base == '-' or base == 'X':
                            continue
                        raise RuntimeError(
                            "Merging columns have two different bases: "
                            "{} {} {}".format(pos, current, base))
                    row[xcoord] = base
        concatmvf.write_entries(
            ((current_contig, coord,
              (''.join(contig_merged_entries[coord]), ))
             for coord in sorted(contig_merged_entries)),
            encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            current_contig, len(contig_merged_entries)))
    return ''