def vcf2mvf(args=None):
    """Convert a VCF file into an MVF file.

    Opens the VCF named by ``args.vcf``, builds the contig and sample
    headers of a new MVF at ``args.out``, then writes the encoded VCF
    records to it in batches of ``args.line_buffer`` entries.

    Arguments:
        args: parsed command-line namespace.  Attributes read here are
            ``field_sep``, ``vcf``, ``no_autoindex``, ``out``, ``overwrite``,
            ``command_string``, ``versionx``, ``contig_ids``,
            ``skip_contig_label_check``, ``ref_label``, ``alleles_from``,
            ``sample_replace``, ``filter_nonref_empty``, ``out_flavor``,
            ``line_buffer`` and the ``qprint`` callable.  ``args.fieldsep``
            is set and ``args.alleles_from`` (when given) is replaced by its
            ``':'``-split list; ``args`` is also handed to
            ``vcf.iterentries``.

    Returns:
        An empty string.

    Raises:
        KeyError: if ``args.field_sep`` is not a known separator name.
        AssertionError: if a VCF label given in ``args.contig_ids`` is not
            among the VCF header contigs.
        RuntimeError: if a requested contig id or label is not unique, if a
            contig id collides with another contig's label (or vice versa),
            or if a VCF record names a contig that was never registered.
    """
    # DBLSPACE must be two spaces; otherwise it is indistinguishable
    # from SPACE.
    sepchars = {"TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None}
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    # Maps VCF contig label -> [MVF contig id/index, MVF label].
    contig_translate = {}
    if args.contig_ids:
        # Each item is "mvf_id;vcf_label;mvf_label".
        vcf_labels = [vcfcontigs[x]['label'] for x in vcfcontigs]
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                # Non-numeric ids are kept as strings.
                pass
            assert cvcf in vcf_labels
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    # NOTE(review): presumably lowers the counter so the first index handed
    # out by get_next_contig_index() is one smaller -- confirm against
    # MultiVariantFile.
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel in static_contig_ids:
            continue
        newindex = mvf.get_next_contig_index()
        # Short or integer labels are reused as the MVF contig id;
        # anything else gets the numeric index as its id.
        if is_int(vlabel) or len(vlabel) < 3:
            newid = vlabel[:]
        else:
            newid = str(newindex)
        mvf.contig_indices.append(newindex)
        mvf.contig_ids.append(newid)
        mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
        static_contig_ids.append(newid)
        contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        if xintersect:
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                # Only values present in both lists can collide, so the
                # set test skips the costly list scans for everything else.
                if newid in xintersect and (
                        newid in xlabels[:i] or newid in xlabels[i + 1:]):
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(
                                           newid, xlabels.index(newid)))
                if newlabel in xintersect and (
                        newlabel in xids[:i] or newlabel in xids[i + 1:]):
                    raise RuntimeError("Error contig label {} is the same"
                                       " as the id for another contig"
                                       " ({})".format(
                                           newlabel, xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        # Each item is "old:new"; a bare name maps to itself.
        newsample = [x.split(':') if ':' in x else (x, x)
                     for x in args.sample_replace]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            # Rename the first still-unmatched sample whose name contains
            # ``old`` and retire it so that it is not renamed twice.
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    del unmatched[j]
                    break
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    with_quality = args.out_flavor in ('dnaqual', )
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        # Optionally drop records where every non-reference column is
        # one of the characters X, x, - or ?.
        if (args.filter_nonref_empty is True and
                all(x in 'Xx-?' for x in mvfstring[1:])):
            continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if with_quality:
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if not mvf_alleles:
            continue
        translated = contig_translate.get(vcfrecord['contig'])
        if translated is None:
            raise RuntimeError(
                'Contig {} in VCF record was not found among the '
                'processed contigs'.format(vcfrecord['contig']))
        mvfentries.append(
            (translated[0], vcfrecord['coord'],
             (mvf_alleles, qual_alleles) if with_quality
             else (mvf_alleles, )))
        nentry += 1
        if nentry == args.line_buffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
            nentry = 0
    if mvfentries:
        # The leftover batch is built exactly like the buffered ones, so
        # it is written with the same ``encoded`` setting.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def fasta2mvf(args):
    """Convert one or more FASTA files into an MVF file.

    Every FASTA record is assigned a contig and a sample (taken from the
    file name, from fields of the split header, or from
    ``args.manual_coord``).  One MVF entry is then written per position
    of each contig, with ``'-'`` for samples that are absent from the
    contig or shorter than the position.

    Arguments:
        args: parsed command-line namespace.  Attributes read here are
            ``field_sep``, ``manual_coord``, ``fasta``, ``out``,
            ``overwrite``, ``contig_by_file``, ``contig_field``,
            ``sample_field``, ``ref_label``, ``command_string``, ``flavor``
            and ``write_buffer``.  ``args.field_sep`` is replaced by ``''``
            or a compiled pattern and ``args.manual_coord`` by a list of
            ``(contig, start, end)`` tuples.

    Returns:
        An empty string.

    Raises:
        KeyError: if ``args.field_sep`` names an unknown separator.
        AssertionError: if ``args.manual_coord`` and ``args.fasta`` differ
            in length.
    """
    sepchars = {"PIPE": "\\|", "TAB": "\\t", "SPACE": "\\s",
                "DBLSPACE": "\\s\\s", "COMMA": "\\,", "NONE": None,
                "AT": "\\@", "UNDER": "\\_", "DBLUNDER": "\\_\\_"}
    if args.field_sep is None:
        args.field_sep = ''
    else:
        # All requested separators are merged into one character class.
        args.field_sep = re.compile("[{}]".format(''.join(
            [sepchars[x] for x in args.field_sep])))
    if args.manual_coord:
        assert len(args.manual_coord) == len(args.fasta)
        # Each item is "contig:start..end".  Only the contig name is
        # used below; start/end are parsed but not read in this function.
        args.manual_coord = [(x.split(':')[0],
                              int(x.split(':')[1].split('..')[0]),
                              int(x.split(':')[1].split('..')[1]))
                             for x in args.manual_coord]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}  # contig -> {sample: (sequence length, sequence)}
    # NOTE(review): never incremented, so every unparsable header falls
    # into "UNK0" -- confirm that this is intended.
    current_contig = 0
    fsamples = []
    fcontigs = []
    for ifasta, fastapath in enumerate(args.fasta):
        print("Processing {}".format(fastapath))
        for header, seq in fasta_iter(fastapath):
            if args.field_sep != '':
                header = [str(x) for x in re.split(args.field_sep, header)]
            if args.contig_by_file is True:
                contig = os.path.basename(fastapath)
                sample = (header[:] if args.sample_field is None
                          else header[args.sample_field])
            elif (args.contig_field is None or args.sample_field is None or
                  len(header) <= max(args.contig_field, args.sample_field)):
                # The fields are used as indices, so the header needs MORE
                # elements than the largest requested field.
                contig = "UNK{}".format(current_contig)
                sample = header[:]
            elif args.manual_coord:
                contig = args.manual_coord[ifasta][0]
                # This branch previously left ``sample`` unset (stale value
                # or NameError).  Both fields are known to be set and in
                # range here, so the sample comes from the header.
                sample = header[args.sample_field]
            else:
                contig = header[args.contig_field]
                sample = header[args.sample_field]
            if contig not in fcontigs:
                fcontigs.append(contig)
                fasta[contig] = {}
            if sample not in fsamples:
                fsamples.append(sample)
            fasta[contig][sample] = (len(seq), seq)
    if args.ref_label:
        # Move the first sample whose name contains the reference label to
        # the front of the sample list.
        for i, samplename in enumerate(fsamples):
            if args.ref_label in samplename:
                fsamples.insert(0, fsamples.pop(i))
                break
    # Remember the index allocated to each contig instead of assuming that
    # indices are handed out as 1, 2, 3, ...
    contig_index = []
    for contig in fcontigs:
        new_index = mvf.get_next_contig_index()
        contig_index.append(new_index)
        mvf.contig_indices.append(new_index)
        mvf.contig_ids.append(str(new_index))
        mvf.contig_labels.append(contig)
        mvf.contig_label_to_index[contig] = new_index
        mvf.contig_id_to_index[str(new_index)] = new_index
        mvf.contig_data[new_index] = {
            'label': contig,
            'id': str(new_index),
            'length': max(fasta[contig][x][0] for x in fasta[contig]),
        }
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples):
        mvf.sample_indices.append(i)
        mvf.sample_id_to_index[label] = i
        mvf.sample_ids.append(label)
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    # ``metadata`` is used as a mapping throughout; the command string
    # belongs in ``notes``, as in the other converters.
    mvf.notes.append(args.command_string)
    mvf.flavor = args.flavor
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for cind, contig in enumerate(fcontigs):
        seqs = fasta[contig]
        for pos in range(mvf.contig_data[contig_index[cind]]['length']):
            mvf_alleles = encode_mvfstring(''.join(
                seqs[samp][1][pos]
                if samp in seqs and pos < seqs[samp][0] else '-'
                for samp in fsamples))
            if not mvf_alleles:
                continue
            if args.flavor == 'dna':
                mvf_alleles = ''.join(
                    ["X" if x in 'NX' else x for x in mvf_alleles])
            # NOTE(review): the entry carries the 0-based position of the
            # contig in ``fcontigs``, not the index registered in the
            # header above -- verify which one write_entries() expects.
            mvfentries.append((cind, pos + 1, (mvf_alleles, )))
            nentry += 1
            if nentry == args.write_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        # Same ``encoded`` setting as the buffered batches.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def merge_mvf(args):
    """Merge several MVF files into a single output MVF.

    The header of one input (``args.main_header_file``, or the first file
    when unset) is copied to the output.  Each input then gets a
    transformer recording how its sample columns and contigs map onto the
    output, and finally the entries are merged contig by contig.

    Arguments:
        args: parsed command-line namespace.  Attributes read here are
            ``mvf`` (list of input paths), ``out``, ``overwrite``,
            ``main_header_file``, ``skip_index``, ``command_string`` and
            the ``qprint`` callable.  ``args.main_header_file`` is replaced
            by the position of that file within ``args.mvf``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not one of the input
            files, or if two inputs give different bases for the same
            sample at the same position.
    """
    args.qprint("Running MergeMVF")
    if any(path.endswith('.gz') for path in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    merged = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Seed the output header from the chosen input file.
    args.qprint("Reading First File and Establishing Output")
    if not args.main_header_file:
        args.main_header_file = 0
    elif args.main_header_file in args.mvf:
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        raise RuntimeError("{} not found in files".format(
            args.main_header_file))
    header_source = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    merged.copy_header(header_source)
    # Read every input header and extend the output header to cover it.
    sources = []  # (input MVF, its transformer) pairs, in input order
    mvfmetadata = []  # collected per file; not read again in this function
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        mapper = MvfTransformer()
        source = MultiVariantFile(mvfname, 'read',
                                  contigindex=(not args.skip_index))
        if args.skip_index:
            source.read_index_file()
        source.reset_max_contig()
        mvfmetadata.append(source.metadata)
        # Samples: unseen ids get the next output column.
        for local_col, sample_id in enumerate(source.get_sample_ids()):
            if sample_id not in merged.get_sample_ids():
                fresh_col = merged.max_sample_index
                merged.max_sample_index += 1
                merged.sample_indices.append(fresh_col)
                merged.sample_ids.append(sample_id)
                merged.sample_data[fresh_col] = {'id': sample_id}
                merged.sample_id_to_index[sample_id] = fresh_col
            mapper.set_label(local_col, merged.sample_id_to_index[sample_id])
        # Contigs: match on label; otherwise keep the input's id when it
        # is free in the output, else take the next output index.
        # NOTE(review): an unmatched contig is added to ``contig_data``
        # only -- whether the other contig lookups of the output are
        # updated elsewhere is not visible from this function.
        for local_index in source.contig_indices:
            record = source.contig_data[local_index]
            if record['label'] in merged.contig_label_to_index:
                target_index = merged.contig_label_to_index[record['label']]
            else:
                if record['id'] not in merged.contig_ids:
                    target_index = record['id']
                else:
                    target_index = merged.get_next_contig_index()
                merged.contig_data[target_index] = record.copy()
            mapper.set_contig(local_index, target_index)
        sources.append((source, mapper))
    # Write output header
    args.qprint("Writing headers to merge output")
    merged.reset_max_sample()
    merged.notes.append(args.command_string)
    merged.write_data(merged.get_header())
    # Merge entries one output contig at a time.
    empty_row = '-' * len(merged.sample_indices)
    for out_contig in merged.contig_indices:
        rows = {}  # position -> merged allele string
        args.qprint("Merging Contig Index: {}".format(out_contig))
        for source, mapper in sources:
            if out_contig not in mapper.contigs:
                continue
            local_contig = mapper.contigs[out_contig]
            if 'idx' not in source.contig_data[local_contig]:
                print("not found")
                continue
            for _, pos, allelesets in source.itercontigentries(
                    local_contig, decode=True):
                row = rows.get(pos, empty_row)
                for local_col, base in enumerate(allelesets[0]):
                    out_col = mapper.labels_rev[local_col]
                    present = row[out_col]
                    if present != '-':
                        # A filled column may only be met again by the
                        # same base or by a '-'/'X' placeholder.
                        if present == base or base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(pos, present, base))
                    row = row[:out_col] + base + row[out_col + 1:]
                rows[pos] = row
        if rows:
            merged.write_entries(
                ((out_contig, coord, (entry, ))
                 for coord, entry in sorted(rows.items())),
                encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            out_contig, len(rows)))
    return ''