def merge_mvf(args):
    """Merge several MVF files into one output MVF, combining samples by site.

    The output header starts as a copy of one input's metadata (the file
    named by ``args.main_header_file``, or the first input when that is
    unset) and is extended with every sample label and contig label found
    in the inputs.  Entries are then merged one output contig at a time:
    for every position, each input's decoded alleles are written into the
    columns of the merged sample list.

    Args:
        args: Options object.  Reads ``mvf`` (list of input paths), ``out``,
            ``overwrite``, ``main_header_file`` and ``skip_index``, and
            calls ``qprint`` for progress messages.  ``main_header_file``
            is overwritten with the index of the chosen file in
            ``args.mvf``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: If ``args.main_header_file`` is not one of the inputs,
            or if two inputs supply different bases (other than ``'-'`` or
            ``'X'``) for the same sample at the same position.
    """
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Seed the output header from the chosen input file's metadata.
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file and read its header to build the unified header.
    transformers = []
    inputfiles = []
    # Contig label -> merged contig id, so label lookups are O(1).
    concatmvf_reverse_contig = {
        contigdata['label']: contigid
        for contigid, contigdata in concatmvf.metadata['contigs'].items()}
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # The transformer records, for this file, how its sample columns
        # and contig ids correspond to those of the merged output.
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig_id()
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            # Every column is registered (even when the index is unchanged)
            # because the merge loop below looks up labels_rev for each one.
            transformer.set_label(
                i, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in mvf.metadata['contigs'].items():
            label = contigdata['label']
            if label in concatmvf_reverse_contig:
                newid = concatmvf_reverse_contig[label]
            else:
                # Keep the file's own id when it is free in the output,
                # otherwise allocate a fresh one.
                newid = (contigid
                         if contigid not in concatmvf.metadata['contigs']
                         else concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
                concatmvf_reverse_contig[label] = newid
            transformer.set_contig(contigid, newid)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_ncol()
    concatmvf.write_data(concatmvf.get_header())
    contigs = concatmvf.metadata['contigs']
    # Now merge the entries, one output contig at a time.
    blank_entry = '-' * len(concatmvf.metadata['samples'])
    for current_contig in contigs:
        contig_merged_entries = {}
        args.qprint("Merging Contig: {}".format(current_contig))
        for transformer, mvffile in zip(transformers, inputfiles):
            if current_contig not in transformer.contigs:
                continue
            # transformer.contigs is read here as merged id -> local id.
            localcontig = transformer.contigs[current_contig]
            for _, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                # Edit the row as a list and store it back as a string:
                # linear per entry while keeping the compact string storage.
                row = list(contig_merged_entries.get(pos, blank_entry))
                for local_col, base in enumerate(allelesets[0]):
                    merged_col = transformer.labels_rev[local_col]
                    existing = row[merged_col]
                    if existing != '-':
                        if existing == base or base in ('-', 'X'):
                            continue
                        # NOTE(review): 'X' is presumed to mean "no data".
                        # An incoming 'X' never conflicts, so a stored 'X'
                        # is allowed to be replaced as well; otherwise the
                        # result would depend on the order of the inputs.
                        if existing != 'X':
                            raise RuntimeError(
                                "Merging columns have two different bases: "
                                "{} {} {}".format(pos, existing, base))
                    row[merged_col] = base
                contig_merged_entries[pos] = ''.join(row)
        concatmvf.write_entries(
            ((current_contig, coord, (entry, ))
             for coord, entry in sorted(contig_merged_entries.items())),
            encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            current_contig, len(contig_merged_entries)))
    return ''
def mvf_join(args):
    """Join several MVF files into a single output MVF, file after file.

    The output header starts as a copy of one input's metadata (the file
    named by ``args.main_header_file``, or the first input when that is
    unset) and is extended with the sample labels and contigs of the other
    inputs.  Each input's entries are then re-mapped to the merged sample
    order and contig ids and appended to the output in buffered batches.

    Args:
        args: Options object.  Reads ``mvf`` (list of input paths), ``out``,
            ``overwrite``, ``main_header_file``, ``quiet`` and
            ``line_buffer``.  ``main_header_file`` is overwritten with the
            index of the chosen file in ``args.mvf``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: If ``args.main_header_file`` is not one of the inputs.
    """
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Seed the output header from the chosen input file's metadata.
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file and read its header to build the unified header.
    transformers = []
    for mvfname in args.mvf:
        # The transformer records only the columns/contigs whose position
        # or id differs between this file and the merged output.
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        mvf.reset_max_contig_id()
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            merged_col = concatmvf.metadata['labels'].index(label)
            if merged_col != i:
                transformer.set_label(i, merged_col)
        merged_contigs = concatmvf.metadata['contigs']
        for contigid, contigdata in mvf.metadata['contigs'].items():
            # Reuse the id of an existing contig with the same label ...
            for concatid, concatdata in merged_contigs.items():
                if contigdata['label'] == concatdata['label']:
                    newid = concatid
                    break
            else:
                # ... otherwise register it, keeping the file's own id when
                # that id is free.  A conditional expression is used so a
                # falsy id (0 or '') is not mistaken for "id already taken".
                newid = (contigid if contigid not in merged_contigs
                         else concatmvf.get_next_contig_id())
                merged_contigs[newid] = contigdata
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now copy the entries of each file into the output.
    entries = []
    for transformer, mvfname in zip(transformers, args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        mvf = MultiVariantFile(mvfname, 'read')
        # merge_mvf in this file reads transformer.contigs as
        # merged id -> local id, so invert it to translate the local ids
        # yielded by iterentries into merged ids.
        local_to_merged = {
            localid: mergedid
            for mergedid, localid in transformer.contigs.items()}
        for contigid, pos, allelesets in mvf.iterentries(
                decode=False, quiet=args.quiet):
            if transformer.labels:
                # Reorder the sample columns: output column x takes the
                # local column transformer.labels[x] when one is recorded.
                # NOTE(review): only len(alleles) output columns are
                # produced, as in the original code.
                allelesets = [
                    concatmvf.encode(''.join(
                        alleles[transformer.labels[x]]
                        if x in transformer.labels else alleles[x]
                        for x in range(len(alleles))))
                    for alleles in [mvf.decode(x) for x in allelesets]]
            contigid = local_to_merged.get(contigid, contigid)
            entries.append((contigid, pos, allelesets))
            if len(entries) == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
        # Flush whatever is left from this file before starting the next.
        if entries:
            concatmvf.write_entries(entries)
            entries = []
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
def vcf2mvf(args=None):
    """Convert a VCF file into an MVF file.

    Opens the input VCF and the output MVF, builds the MVF contig table
    (honouring explicit ``id;vcflabel;mvflabel`` mappings first, then
    assigning ids to the remaining VCF contigs), builds the sample list,
    writes the MVF header and finally streams the encoded VCF records to
    the output in buffered batches.

    Args:
        args: Options object.  Reads ``field_sep``, ``vcf``,
            ``no_autoindex``, ``out``, ``overwrite``, ``contig_ids``,
            ``skip_contig_label_check``, ``ref_label``, ``alleles_from``,
            ``sample_replace``, ``out_flavor`` and ``line_buffer``, and
            calls ``qprint`` for progress messages.  Sets ``fieldsep`` and,
            when given, replaces ``alleles_from`` with its ``':'``-split
            list.

    Returns:
        An empty string.

    Raises:
        RuntimeError: If a ``contig_ids`` entry names a VCF contig that does
            not exist, if a requested contig id or label is not unique, or
            if (unless ``skip_contig_label_check`` is set) a contig id
            equals the label of a different contig or vice versa.
    """
    sepchars = {"TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None}
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    # VCF contig label -> [MVF contig id, MVF contig label]
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            # Ids that look like integers are stored as integers.
            try:
                cid = int(cid)
            except ValueError:
                pass
            # Explicit raise rather than assert: this validates user input
            # and must not disappear when Python runs with -O.
            if not any(vcfcontigs[x]['label'] == cvcf for x in vcfcontigs):
                raise RuntimeError(
                    'Contig label {} not found in VCF contigs'.format(cvcf))
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] != cvcf:
                    continue
                contig_translate[cvcf] = [cid, cmvf]
                if cid in mvf.metadata['contigs']:
                    raise RuntimeError(
                        'Contig id {} is not unique'.format(cid))
                mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                if cmvf in mvf.get_contig_labels():
                    raise RuntimeError(
                        'Contig label {} is not unique'.format(cmvf))
                mvf.metadata['contigs'][cid]['label'] = cmvf
        mvf.reset_max_contig_id()
    args.qprint("Processing contigs.")
    static_contig_ids = mvf.get_contig_ids()
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        # Contigs already mapped through args.contig_ids keep that mapping;
        # re-registering them here would overwrite the requested id/label.
        if vlabel in contig_translate:
            continue
        if vlabel in static_contig_ids:
            continue
        # Short or integer-like labels double as the contig id.
        if is_int(vlabel) or len(vlabel) < 3:
            newid = vlabel
        else:
            newid = mvf.get_next_contig_id()
        mvf.metadata['contigs'][newid] = vcfcontigs[vcid].copy()
        static_contig_ids.append(newid)
        contig_translate[vlabel] = [newid, vlabel]
    mvf.reset_max_contig_id()
    new_contigs = [(x, mvf.metadata['contigs'][x]['label'])
                   for x in mvf.metadata['contigs']]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        for i, (newid, newlabel) in enumerate(new_contigs):
            # A contig may share its own id and label; only clashes with a
            # *different* contig are errors, hence the slices around i.
            if newid in xlabels[:i] or newid in xlabels[i+1:]:
                raise RuntimeError("Error contig id {} is the same as"
                                   " the label for another contig"
                                   " ({})".format(newid, xlabels))
            if newlabel in xids[:i] or newlabel in xids[i+1:]:
                raise RuntimeError("Error contig label {} is the same"
                                   " as the id for another contig"
                                   " ({})".format(newlabel, xlabels))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each item is "old:new"; a bare "name" maps to itself.
        newsample = [x.split(':') if ':' in x else (x, x)
                     for x in args.sample_replace]
        # (column index, label) pairs that have not been renamed yet, so
        # each column is renamed at most once.
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            matched_at = None
            for j, (i, name) in enumerate(unmatched):
                # Substring match: the first remaining label containing
                # `old` is replaced.
                if old in name:
                    samplelabels[i] = new
                    matched_at = j
                    break
            if matched_at is not None:
                del unmatched[matched_at]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        with_quality = args.out_flavor in ('dnaqual',)
        mvf_alleles = encode_mvfstring(''.join(vcfrecord['genotypes']))
        if with_quality:
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        # Records whose encoded allele string is empty are skipped.
        if not mvf_alleles:
            continue
        mvfentries.append(
            (contig_translate.get(vcfrecord['contig'])[0],
             vcfrecord['coord'],
             (mvf_alleles, qual_alleles) if with_quality
             else (mvf_alleles,)))
        if len(mvfentries) == args.line_buffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
    if mvfentries:
        # Same already-encoded data as the in-loop flush, so pass the same
        # flag rather than relying on write_entries' default.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''