def legacy_annotate_mvf(args):
    """Annotate an MVF against gene intervals parsed from a GFF (legacy).

    Reads the MVF at ``args.mvf`` and writes a new MVF to ``args.out``.
    ``parse_gff_legacy_annotate`` supplies, per contig, an iterable of
    ``(geneid, start, stop)`` tuples.

    * Genic mode (``args.nongenic_mode`` false): only entries whose position
      lies strictly inside a gene interval are kept, and they are re-keyed
      by the id of the first matching gene.
    * Nongenic mode: only entries that match no gene are kept, under their
      original contig id. When ``args.unmargin > 0`` an entry also counts as
      genic if any position within ``unmargin`` of it lies strictly inside
      one of the intervals scanned before a direct hit.

    Entries are written in blocks of ``args.line_buffer``.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``,
            ``gff``, ``gene_pattern``, ``out``, ``overwrite``,
            ``nongenic_mode``, ``unmargin``, ``line_buffer`` and ``qprint``.

    Returns:
        An empty string.
    """
    args.qprint("Running LegacyAnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.flavor))
    gff, geneids = parse_gff_legacy_annotate(
        args.gff, mvf.contig_data, gene_pattern=args.gene_pattern)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.flavor)
    outmvf.copy_headers_from(mvf)
    if args.nongenic_mode is False:
        # Genic output is keyed by gene, so genes replace the contig table.
        outmvf.contig_data = geneids.copy()
        outmvf.contig_indices = list(range(len(geneids)))
        outmvf.contig_ids = [geneids[x]['id']
                             for x in outmvf.contig_indices]
        outmvf.contig_labels = [geneids[x]['label']
                                for x in outmvf.contig_indices]
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = None
        if contigid in gff:
            for (xgeneid, xstart, xstop) in gff[contigid]:
                if xstart < pos < xstop:
                    annotated_pos = xgeneid + 0
                    break
                if args.nongenic_mode is True and args.unmargin > 0:
                    # A margin hit marks the entry as genic but, as before,
                    # does not stop the scan over the remaining genes.
                    if any(xstart < xpos < xstop
                           for xpos in range(pos - args.unmargin,
                                             pos + args.unmargin + 1)):
                        annotated_pos = xgeneid + 0
        if args.nongenic_mode:
            keep = annotated_pos is None
            outcontig = contigid
        else:
            keep = annotated_pos is not None
            outcontig = annotated_pos
        if not keep:
            continue
        entrybuffer.append((outcontig, pos, allelesets))
        # Flush on the real buffer length. The previous separate counter
        # also counted skipped genic entries in nongenic mode, which led to
        # short (or empty) blocks being written.
        if len(entrybuffer) == args.line_buffer:
            args.qprint("Writing block of entries.")
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
    return ''
def annotate_mvf(args):
    """Annotate an MVF against per-position gene annotations from a GFF.

    Reads the MVF at ``args.mvf`` and writes a new MVF to ``args.out``.
    ``parse_gff_annotate`` supplies, per contig, a mapping from position to
    gene id.

    * Genic mode (``args.nongenic_mode`` false): only annotated positions
      are kept, re-keyed by their gene id; the output contig table is
      replaced by the gene table.
    * Nongenic mode: only unannotated positions are kept under their
      original contig id. When ``args.unmargin > 0`` a position within
      ``unmargin`` of an annotated position is also treated as annotated.

    Entries are written in blocks of ``args.line_buffer``.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``,
            ``gff``, ``gene_prefix``, ``out``, ``overwrite``,
            ``nongenic_mode``, ``unmargin``, ``line_buffer`` and ``qprint``.

    Returns:
        An empty string.
    """
    args.qprint("Running AnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.metadata['flavor']))
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'],
                                      gene_prefix=args.gene_prefix)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.metadata['flavor'])
    outmvf.metadata = deepcopy(mvf.metadata)
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                annotated_pos = any(
                    xpos in gff[contigid]
                    for xpos in range(pos - args.unmargin,
                                      pos + args.unmargin + 1))
        if annotated_pos and not args.nongenic_mode:
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
        elif args.nongenic_mode and not annotated_pos:
            entrybuffer.append((contigid, pos, allelesets))
        else:
            continue
        # Flush on the real buffer length. The previous separate counter
        # also counted skipped annotated entries in nongenic mode, which led
        # to short (or empty) blocks being written.
        if len(entrybuffer) == args.line_buffer:
            args.qprint("Writing block of entries.")
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
    return ''
def annotate_mvf(args):
    """Filter or re-key MVF entries using per-position GFF annotations.

    With ``args.nongenic_mode is False`` only annotated positions are
    written, keyed by the gene id found in the GFF lookup. With
    ``args.nongenic_mode is True`` only unannotated positions are written
    under their original contig id; positions within ``args.unmargin`` of an
    annotated position are treated as annotated as well.

    Returns an empty string.
    """
    source = MultiVariantFile(args.mvf, 'read')
    gff, geneids = parse_gff_annotate(args.gff, source.metadata['contigs'])
    if args.quiet is False:
        print("gff_processed")
    target = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    target.metadata = deepcopy(source.metadata)
    if args.nongenic_mode is False:
        target.metadata['contigs'] = geneids
    target.write_data(target.get_header())
    pending = []
    for contigid, pos, allelesets in source.iterentries(decode=False):
        in_gene = False
        if contigid in gff:
            if pos in gff[contigid]:
                in_gene = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                lower = pos - args.unmargin
                upper = pos + args.unmargin + 1
                in_gene = any(xpos in gff[contigid]
                              for xpos in range(lower, upper))
        # Pick the output record for this entry, or skip it entirely.
        if args.nongenic_mode is False and in_gene is True:
            record = (gff[contigid][pos], pos, allelesets)
        elif args.nongenic_mode is True and in_gene is False:
            record = (contigid, pos, allelesets)
        else:
            continue
        pending.append(record)
        if len(pending) == args.line_buffer:
            target.write_entries(pending)
            pending = []
    if pending:
        target.write_entries(pending)
    return ''
def calc_sample_coverage(args):
    """Tally, per contig and per sample, the calls that are not 'X', 'x'
    or '-'.

    Samples come from ``args.sample_indices`` or ``args.sample_labels``
    (default: all samples); contigs from ``args.contig_ids`` or
    ``args.contig_labels`` (default: no restriction). One row per contig is
    written to ``args.out``. Returns an empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    # Resolve the sample columns to tally.
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Resolve the contigs to scan.
    contig_indices = None
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    coverage = {}
    entries = mvf.iterentries(contig_indices=contig_indices,
                              subset=sample_indices, decode=True)
    for contig, _, allelesets in entries:
        if contig not in coverage:
            row = {label: 0 for label in sample_labels}
            row['contig'] = contig
            coverage[contig] = row
        calls = allelesets[0]
        # NOTE(review): sample_labels was requested for the chosen subset
        # but is indexed here by the sample index itself; presumably this
        # only lines up when the subset is 0..n-1 -- confirm against
        # get_sample_ids().
        for column, sample_index in enumerate(sample_indices):
            label = sample_labels[sample_index]
            coverage[contig][label] += int(calls[column] not in 'Xx-')
    header_labels = [sample_labels[x] for x in sample_indices]
    outfile = OutputFile(path=args.out, headers=["contig"] + header_labels)
    for row in coverage.values():
        outfile.write_entry(row)
    return ''
def mvf_join(args):
    """Concatenate several MVF files into one output MVF.

    The header of ``args.main_header_file`` (default: the first file in
    ``args.mvf``) seeds the output metadata. Each input header is then
    scanned so that unseen sample labels and contig labels are added to the
    output header, and a per-file ``MvfTransformer`` records how that
    file's sample columns and contig ids map onto the unified header.
    Finally every entry of every file is rewritten through its transformer
    and appended to the output in blocks of ``args.line_buffer``.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``
            (list of paths), ``out``, ``overwrite``, ``main_header_file``,
            ``line_buffer`` and ``quiet``. ``args.main_header_file`` is
            overwritten with the index of the header file.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.main_header_file`` is not in ``args.mvf``.
    """
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            unified_index = concatmvf.metadata['labels'].index(label)
            if unified_index != i:
                transformer.set_label(i, unified_index)
        unified_contigs = concatmvf.metadata['contigs']
        for contigid, contigdata in mvf.metadata['contigs'].items():
            # One scan both tests whether the label is already known and
            # finds the id it is known under.
            for concatid, concatdata in unified_contigs.items():
                if contigdata['label'] == concatdata['label']:
                    newid = concatid
                    break
            else:
                # New label: keep the file's own id unless it is taken.
                # (A conditional expression replaces `and/or`, which would
                # have discarded a falsy-but-free id.)
                newid = (contigid if contigid not in unified_contigs
                         else concatmvf.get_next_contig_id())
                unified_contigs[newid] = contigdata
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    entries = []
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        (alleles[transformer.labels[x]]
                         if x in transformer.labels else alleles[x])
                        for x in range(len(alleles))]))
            if transformer.contigs:
                # Fixed: this used to subscript the transformer object
                # (transformer['contigs']) although the mapping is read as
                # the attribute `contigs` everywhere else in this function.
                if contigid in transformer.contigs:
                    contigid = transformer.contigs[contigid]
            entries.append((contigid, pos, allelesets))
            if len(entries) == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
        if entries:
            concatmvf.write_entries(entries)
            entries = []
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
def calc_all_character_count_per_sample(args):
    """Count every allele character per sample, per window.

    For each window (as decided by ``same_window`` and ``args.windowsize``)
    and each selected sample, counts how many sites carry each character.
    Sites failing ``check_mincoverage`` are skipped. One table per sample
    is written to ``args.out``, each preceded by a ``#<sample label>`` line;
    characters that a sample never shows in a window are written as '0'.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``,
            ``out``, ``sample_indices``, ``sample_labels``, ``contig_ids``,
            ``contig_labels``, ``mincoverage``, ``windowsize`` and
            ``qprint``.

    Returns:
        An empty string.
    """
    args.qprint("Running CalcAllCharacterCountPerSample")
    mvf = MultiVariantFile(args.mvf, 'read')
    current_contig = None
    current_position = 0
    data_in_buffer = False
    # Set up sample indices
    sample_labels = mvf.get_sample_ids()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = None
    data = {i: {} for i in sample_indices}
    # Keyed by sample index. This used to be a positional list indexed by
    # sample index, which raised IndexError for subsets such as "2,3".
    data_characters = {i: {} for i in sample_indices}

    def store_window(contig, position, characters):
        """Copy the running per-sample counts into `data` for one window."""
        for i in sample_indices:
            record = {'contig': contig, 'position': position}
            record.update(characters[i])
            data[i][(contig, position)] = record

    for contig, pos, allelesets in mvf.iterentries(decode=False,
                                                   contig_ids=contig_ids):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            args.qprint("Processing contig {}".format(current_contig))
            store_window(current_contig, current_position, data_characters)
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            data_characters = {i: {} for i in sample_indices}
            data_in_buffer = False
        alleles = allelesets[0]
        if len(alleles) == 1:
            # Single-character allele string: counted once for every sample.
            for i in sample_indices:
                data_characters[i][alleles[0]] = (
                    data_characters[i].get(alleles[0], 0) + 1)
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                data_characters[i][alleles[i]] = (
                    data_characters[i].get(alleles[i], 0) + 1)
        data_in_buffer = True
    if data_in_buffer:
        store_window(current_contig, current_position, data_characters)
    # WRITE OUTPUT
    all_chars = set()
    for windows in data.values():
        for record in windows.values():
            all_chars.update(
                x for x in record if x not in ('contig', 'position'))
    headers = ['contig', 'position'] + sorted(all_chars)
    outfile = OutputFile(path=args.out, headers=headers)
    for sampleid in sample_indices:
        outfile.write("#{}\n".format(sample_labels[sampleid]))
        # Windows are written in the order they were recorded.
        for record in data[sampleid].values():
            outfile.write_entry(record, defaultvalue='0')
    return ''
def _pairwise_window_record(contig, position, base_matches, all_match,
                            sample_labels, pwdistance_function):
    """Build the output row (ndiff/ntotal/dist per sample pair) for one
    window from the pair-specific and shared base-pair counts."""
    record = {'contig': contig, 'position': position}
    all_diff, all_total = pwdistance_function(all_match)
    for samplepair in base_matches:
        ndiff, ntotal = pwdistance_function(base_matches[samplepair])
        taxa = "{};{}".format(sample_labels[samplepair[0]],
                              sample_labels[samplepair[1]])
        record.update({
            '{};ndiff'.format(taxa): ndiff + all_diff,
            '{};ntotal'.format(taxa): ntotal + all_total,
            '{};dist'.format(taxa): zerodiv(ndiff + all_diff,
                                            ntotal + all_total)})
    return record


def _pairwise_emit_counts(handle, contig, position, base_matches, all_match):
    """Write the full base-pair count table of one window to `handle`."""
    for p0, p1 in base_matches:
        pair_counts = base_matches[(p0, p1)]
        handle.write("#{}\t{}\t{}\t{}\n{}\n".format(
            p0, p1, position, contig,
            "\n".join([
                "{} {}".format(x, (pair_counts.get(x, 0) +
                                   all_match.get(x, 0)))
                for x in set(pair_counts).union(all_match)])))


def calc_pairwise_distances(args):
    """Count the pairwise nucleotide distance between combinations
    of samples in a window.

    For every window, base-pair counts are accumulated per sample pair
    (plus one shared table for sites encoded as a single character) and
    turned into ``ndiff``/``ntotal``/``dist`` columns by the function
    returned from ``get_pairwise_function``. Results are written to
    ``args.out``; with ``args.emit_counts`` the raw count tables are also
    written to ``args.out + ".pairwisecounts"``.

    Changes from the previous revision:

    * leftover debugging ``print`` calls were removed; they referenced
      ``samplepair``, which is unbound when only invariant sites were read;
    * debugging ``assert`` statements (including ``len(alleles) == 4``,
      which restricted the function to four-sample files) were removed;
    * the count-table file is closed even if processing fails;
    * on window rollover the count table is labelled with the window it
      belongs to, not with the coordinates of the next window.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``,
            ``out``, ``sample_indices``, ``sample_labels``, ``data_type``,
            ``ambig``, ``mincoverage``, ``windowsize``, ``emit_counts`` and
            ``qprint``. ``args.data_type`` may be overwritten.

    Returns:
        An empty string.
    """
    args.qprint("Running CalcPairwiseDistances")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF: Read")
    data = {}
    data_order = []
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # NOTE(review): labels are requested for the subset but later indexed
    # by sample index; presumably only consistent for 0..n-1 -- confirm.
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    args.qprint("Calculating for sample columns: {}".format(
        list(sample_indices)))
    current_contig = None
    current_position = 0
    data_in_buffer = False
    sample_pairs = [tuple(x) for x in combinations(sample_indices, 2)]
    base_matches = dict((x, {}) for x in sample_pairs)
    all_match = {}
    # NOTE(review): 'prot' flavor is forced to data_type 'dna' here, and an
    # unrecognised flavor leaves allele_frames unset -- confirm intent.
    if mvf.flavor == 'dna':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'prot':
        allele_frames = (0, )
        args.data_type = 'dna'
    elif mvf.flavor == 'codon':
        if args.data_type == 'prot':
            allele_frames = (0, )
        else:
            allele_frames = (1, 2, 3)
            args.data_type = 'dna'
    args.qprint("MVF flavor is: {}".format(mvf.flavor))
    args.qprint("Data type is: {}".format(args.data_type))
    args.qprint("Ambiguous mode: {}".format(args.ambig))
    args.qprint("Processing MVF Records")
    pwdistance_function = get_pairwise_function(args.data_type, args.ambig)
    outfile_emitcounts = None
    if args.emit_counts:
        outfile_emitcounts = open(args.out + ".pairwisecounts", 'w')
    try:
        for contig, pos, allelesets in mvf.iterentries(decode=None):
            # Check Minimum Site Coverage
            if check_mincoverage(args.mincoverage, allelesets[0]) is False:
                continue
            # Establish first contig
            if current_contig is None:
                current_contig = contig[:]
                if args.windowsize > 0:
                    while pos > current_position + args.windowsize - 1:
                        current_position += args.windowsize
            # Check if windows are specified.
            if not same_window((current_contig, current_position),
                               (contig, pos), args.windowsize):
                window_key = (current_contig, current_position)
                data[window_key] = _pairwise_window_record(
                    current_contig, current_position, base_matches,
                    all_match, sample_labels, pwdistance_function)
                data_order.append(window_key)
                if outfile_emitcounts is not None:
                    args.qprint("Writing Full Count Table")
                    _pairwise_emit_counts(
                        outfile_emitcounts, current_contig,
                        current_position, base_matches, all_match)
                if contig != current_contig:
                    current_contig = contig[:]
                    current_position = 0
                    if args.windowsize > 0:
                        while pos > current_position + args.windowsize - 1:
                            current_position += args.windowsize
                else:
                    current_position += args.windowsize
                base_matches = dict((x, {}) for x in sample_pairs)
                all_match = {}
                data_in_buffer = False
            for iframe in allele_frames:
                alleles = allelesets[iframe]
                if len(alleles) == 1:
                    # Invariant site: one shared count for every pair.
                    basepair = "{0}{0}".format(alleles)
                    all_match[basepair] = all_match.get(basepair, 0) + 1
                    data_in_buffer = True
                    continue
                if alleles[1] == '+':
                    # '+' encoding: only sample 0 and the sample whose index
                    # follows the '+<char>' carry a call at this site.
                    if alleles[2] in 'X-':
                        continue
                    samplepair = (0, int(alleles[3:]))
                    if any(x not in sample_indices for x in samplepair):
                        continue
                    basepair = "{0}{1}".format(alleles[0], alleles[2])
                    base_matches[samplepair][basepair] = (
                        base_matches[samplepair].get(basepair, 0) + 1)
                    data_in_buffer = True
                    continue
                alleles = mvf.decode(alleles)
                valid_positions = [
                    i for i, x in enumerate(alleles)
                    if x not in 'X-' and i in sample_indices]
                for i, j in combinations(valid_positions, 2):
                    samplepair = (i, j)
                    basepair = "{0}{1}".format(alleles[i], alleles[j])
                    base_matches[samplepair][basepair] = (
                        base_matches[samplepair].get(basepair, 0) + 1)
                data_in_buffer = True
        if data_in_buffer is True:
            # Check whether, windows, contigs, or total
            if args.windowsize == 0:
                current_contig = 'TOTAL'
                current_position = 0
            elif args.windowsize == -1:
                current_position = 0
            window_key = (current_contig, current_position)
            data[window_key] = _pairwise_window_record(
                current_contig, current_position, base_matches,
                all_match, sample_labels, pwdistance_function)
            data_order.append(window_key)
            if outfile_emitcounts is not None:
                args.qprint("Writing Full Count Table")
                _pairwise_emit_counts(
                    outfile_emitcounts, current_contig,
                    current_position, base_matches, all_match)
    finally:
        if outfile_emitcounts is not None:
            outfile_emitcounts.close()
    args.qprint("Writing Output")
    headers = ['contig', 'position']
    for samplepair in sample_pairs:
        headers.extend([
            '{};{};{}'.format(sample_labels[samplepair[0]],
                              sample_labels[samplepair[1]], x)
            for x in ('ndiff', 'ntotal', 'dist')])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    return ''
def calc_pattern_count(args):
    """Count biallelic site patterns per window.

    Each retained site is turned into a string of 'A'/'B' characters, one
    per selected sample, where 'A' means "same allele as the last selected
    sample" (so every pattern ends in 'A'). Sites with any character
    outside "ACGT" or with more than two distinct alleles are ignored.
    Intended for tools such as DFOIL / D-statistics
    (http://www.github.com/jbpease/dfoil).

    Per-window counts are written to ``args.out``; with
    ``args.output_lists is True`` a ``pattern,count`` list file is also
    written per window, plus a ``-TOTAL`` list. Returns an empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    current_contig = None
    current_position = 0
    sitepatterns = {}
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    nsamples = len(sample_indices)
    for contig, pos, allelesets in mvf.iterentries(decode=True,
                                                   subset=sample_indices):
        alleles = allelesets[0]
        # Skip sites below the minimum coverage.
        if check_mincoverage(args.mincoverage, alleles) is False:
            continue
        # First usable site: start in the window that contains it.
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Leaving the current window: store its counts and move on.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            window = {'contig': current_contig,
                      'position': current_position}
            window.update(sitepatterns)
            data[(current_contig, current_position)] = window
            sitepatterns = {}
            if contig != current_contig:
                current_position = 0
                current_contig = contig[:]
            elif args.windowsize != -1:
                current_position += args.windowsize
        distinct = set(alleles)
        if distinct - set("ACGT") or len(distinct) > 2:
            continue
        reference = alleles[-1]
        pattern = ''.join('A' if x == reference else 'B'
                          for x in alleles[:-1]) + 'A'
        sitepatterns[pattern] = sitepatterns.get(pattern, 0) + 1
    if sitepatterns:
        window = {'contig': current_contig, 'position': current_position}
        window.update(sitepatterns)
        data[(current_contig, current_position)] = window
    # WRITE OUTPUT
    headers = ['contig', 'position']
    headers.extend(
        [MLIB.abpattern(x, nsamples) for x in range(0, 2**nsamples, 2)])
    outfile = OutputFile(path=args.out, headers=headers)
    outfile.write(
        "#{}\n".format(",".join(mvf.get_sample_ids(sample_indices))))
    # Window keys are (contig, position), so sorting the keys gives the
    # contig/position ordering directly.
    ordered_keys = sorted(data)
    for key in ordered_keys:
        outfile.write_entry(data[key])
    # WRITE LIST OUTPUT
    if args.output_lists is True:
        skip = ['contig', 'position']
        total_counts = {}
        for key in ordered_keys:
            contig, pos = key
            outfilepath = "{}-{}-{}.counts.list".format(
                args.out, contig, pos)
            with open(outfilepath, 'w') as listfile:
                listfile.write("pattern,count\n")
                for pattern, pcount in sorted(data[key].items()):
                    if pattern in skip:
                        continue
                    listfile.write("{},{}\n".format(pattern, pcount))
                    total_counts[pattern] = (
                        total_counts.get(pattern, 0) + pcount)
        outfilepath = "{}-TOTAL.counts.list".format(args.out)
        with open(outfilepath, 'w') as listfile:
            listfile.write("pattern,count\n")
            for pattern, pcount in sorted(total_counts.items()):
                if pattern in skip:
                    continue
                listfile.write("{},{}\n".format(pattern, pcount))
    return ''
def calc_character_count(args):
    """Count the number of and relative rate of certain bases
    spatially along chromosomes.

    For each window and each selected sample this reports
    ``<label>.match`` (sites whose character is in ``args.base_match``),
    ``<label>.total`` (sites whose character is in ``args.base_total``) and
    ``<label>.prop`` (match / total, or 0 when total is 0). A ``None``
    character set matches every site. ``args.base_match`` and
    ``args.base_total`` may be one of the special words ``dna``,
    ``dnaambig2``, ``dnaambig3``, ``dnaambigall`` or ``prot``, which are
    expanded from ``MLIB.validchars``.

    Fixed in this revision: the site that triggers a window rollover is now
    counted in the window being started. Previously the counting code sat
    in the ``else`` branch of the rollover test, so the first site of every
    new window was dropped, unlike the other windowed counters in this
    file.

    Args:
        args: parsed command-line namespace; this function reads ``mvf``,
            ``out``, ``base_match``, ``base_total``, ``sample_indices``,
            ``sample_labels``, ``contig_ids``, ``contig_labels``,
            ``mincoverage`` and ``windowsize``. ``args.base_match`` and
            ``args.base_total`` are overwritten with their expanded form.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    data = {}
    data_order = []
    current_contig = None
    current_position = 0
    all_match = 0
    all_total = 0
    data_in_buffer = False
    # Set up base matching from special words
    special_words = {
        'dna': 'dna',
        'dnaambig2': 'dna+ambig2',
        'dnaambig3': 'dna+ambig3',
        'dnaambigall': 'dna+ambigall',
        'prot': 'amino',
    }

    def proc_special_word(argx):
        """Expand a special word into its character set; pass others on."""
        if isinstance(argx, str) and argx in special_words:
            return MLIB.validchars[special_words[argx]]
        return argx

    args.base_match = proc_special_word(args.base_match)
    args.base_total = proc_special_word(args.base_total)
    # Set up sample indices
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # NOTE(review): labels are requested for the subset but indexed below
    # by sample index; presumably only consistent for 0..n-1 -- confirm.
    sample_labels = mvf.get_sample_ids(indices=sample_indices)
    # Set up contig ids
    if args.contig_ids is not None:
        contig_indices = mvf.get_contig_indices(
            ids=args.contig_ids[0].split(","))
    elif args.contig_labels is not None:
        contig_indices = mvf.get_contig_indices(
            labels=args.contig_labels[0].split(","))
    else:
        contig_indices = None

    def new_counts():
        """Return a zeroed per-sample counter keyed by sample label."""
        return dict.fromkeys([sample_labels[i] for i in sample_indices], 0)

    def store_window(contig, position, matches, totals, shared_match,
                     shared_total):
        """Record the match/total/prop columns of one finished window."""
        record = {'contig': contig, 'position': position}
        for k in matches:
            nmatch = matches[k] + shared_match
            ntotal = totals[k] + shared_total
            record.update([
                (k + '.match', nmatch),
                (k + '.total', ntotal),
                (k + '.prop',
                 (float(nmatch) / float(ntotal)) if ntotal > 0 else 0)])
        data[(contig, position)] = record
        data_order.append((contig, position))

    match_counts = new_counts()
    total_counts = new_counts()
    for contig, pos, allelesets in mvf.iterentries(
            decode=False, contig_indices=contig_indices):
        # Check Minimum Site Coverage
        if check_mincoverage(args.mincoverage, allelesets[0]) is False:
            continue
        # Establish first contig
        if current_contig is None:
            current_contig = contig[:]
            if args.windowsize > 0:
                while pos > current_position + args.windowsize - 1:
                    current_position += args.windowsize
        # Check if windows are specified.
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            store_window(current_contig, current_position, match_counts,
                         total_counts, all_match, all_total)
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            else:
                current_position += (0 if args.windowsize == -1
                                     else args.windowsize)
            match_counts = new_counts()
            total_counts = new_counts()
            all_total = 0
            all_match = 0
            data_in_buffer = False
        # Count this site (including the one that opened a new window).
        alleles = allelesets[0]
        if len(alleles) == 1:
            # Single-character allele string: shared by every sample.
            if args.base_match is None or alleles in args.base_match:
                all_match += 1
            if args.base_total is None or alleles in args.base_total:
                all_total += 1
        else:
            alleles = mvf.decode(alleles)
            for i in sample_indices:
                label = sample_labels[i]
                if (args.base_match is None
                        or alleles[i] in args.base_match):
                    match_counts[label] += 1
                if (args.base_total is None
                        or alleles[i] in args.base_total):
                    total_counts[label] += 1
        data_in_buffer = True
    if data_in_buffer:
        store_window(current_contig, current_position, match_counts,
                     total_counts, all_match, all_total)
    # WRITE OUTPUT
    headers = ['contig', 'position']
    for label in sample_labels:
        headers.extend([label + x for x in ('.match', '.total', '.prop')])
    outfile = OutputFile(path=args.out, headers=headers)
    for okey in data_order:
        outfile.write_entry(data[okey])
    return ''
def calc_group_unique_allele_window(args):
    """Count the number of and relative rate of uniquely held alleles
    spatially along chromosomes (i.e. Lineage-specific rates).

    Requires a codon-flavor MVF. For each window the function tallies
    codons at which the sample groups in ``args.allele_groups`` carry
    disjoint codon sets, split into nonsynonymous (disjoint amino-acid
    sets) and synonymous (identical amino-acid sets) changes. Optionally it
    also accumulates a codon alignment (``args.output_align``) and runs a
    PAML branch-site test per window (``args.branch_lrt``).

    Outputs: the per-window table at ``args.out``, overall totals at
    ``args.out + '.total'``, and optionally the alignment and branch-LRT
    files. Returns an empty string.

    Raises:
        RuntimeError: if the MVF is not codon flavor, if
            ``args.mincoverage`` is below twice the number of groups, or if
            a group is empty.
    """
    args.qprint("Running InferGroupSpecificAllele")
    data = {}
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'codon':
        raise RuntimeError(
            "\n=====================\nERROR: MVF is not codon flavor!")
    ncol = mvf.metadata['ncol']
    args.qprint("Input MVF read with {} columns.".format(ncol))
    annotations = {}
    coordinates = {}
    labels = mvf.get_sample_ids()[:]
    current_contig = None
    current_position = 0
    # NOTE(review): Counter is used below with .add() and .iter_sorted(),
    # so it is presumably a project class rather than collections.Counter.
    counts = Counter()
    totals = Counter()
    # Default contig range for the branch-site test: effectively unbounded.
    args.start_contig = (
        args.start_contig if args.start_contig is not None else 0)
    args.end_contig = (
        args.end_contig if args.end_contig is not None else 100000000000)
    # NOTE(review): outputalign is only initialised when output_align is
    # exactly True, but later code tests `is not None` and passes
    # args.output_align to open() -- confirm what type this option has.
    if args.output_align is True:
        outputalign = []
    if args.gff is not None:
        annotations, coordinates = (parse_gff_analysis(args.gff))
    if args.allele_groups is not None:
        args.allele_groups = procarg_allelegroups(
            args.allele_groups, mvf)
    # Species groups default to the allele groups.
    if args.species_groups is None:
        args.species_groups = args.allele_groups
    else:
        args.species_groups = procarg_speciesgroups(
            args.species_groups, mvf)
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b',
        'fgdnds0', 'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree',
        'errorstate']
    if args.branch_lrt is not None:
        # Start the branch-LRT report with a header row; later rows are
        # appended window by window.
        with open(args.branch_lrt, 'w') as branchlrt:
            genealign = []
            branchlrt.write("\t".join(
                ['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
                ["null.{}".format(x) for x in fieldtags] +
                ["test.{}".format(x) for x in fieldtags] +
                ['tree']) + "\n")
    # NOTE(review): this dereferences args.allele_groups unconditionally,
    # so a missing --allele-groups fails here with AttributeError.
    groups = args.allele_groups.values()
    if args.species_groups is not None:
        speciesgroups = args.species_groups.values()
    # Sorted list of every sample index that belongs to some group.
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    speciesnames = args.species_groups.keys()
    # Reverse lookup: sample index -> species name.
    speciesrev = {}
    if args.species_groups is not None:
        for species in args.species_groups:
            speciesrev.update([
                (x, species) for x in args.species_groups[species]])
    if args.mincoverage is not None:
        if args.mincoverage < len(groups) * 2:
            raise RuntimeError(""" Error: InferGroupSpecificAllele: --mincoverage cannot be lower than the twice the number of specified groups in --allele-groups """)
    genealign = []
    args.qprint("Parameter Check Complete.")
    args.qprint("Number of Groups Specified: {}".format(len(groups)))
    for group in groups:
        args.qprint(group)
        args.qprint([labels[x] for x in group])
        if not(group):
            raise RuntimeError(
                "Group is Empty! Check group labels/indicies specified.")
    args.qprint("Processing Entries.")
    for contig, pos, allelesets in mvf.iterentries(decode=False):
        # Window rollover: store the finished window before handling this
        # entry. current_contig starts as None, so whether the very first
        # entry triggers a rollover depends on same_window().
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            xkey = (current_contig, current_position,)
            data[xkey] = counts.copy()
            # NOTE(review): the two '...synyonymous_changes' keys below are
            # misspelled relative to the output headers. Correcting them
            # would put explicit zeros under 'synonymous_changes' and make
            # the ns_ratio division below divide by zero, so do not change
            # one without the other.
            data[xkey].update([
                ('contig', (mvf.get_contig_labels(ids=current_contig)
                            if args.use_labels is True
                            else current_contig)),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes',
                 counts.get('synonymous_changes', 0) or 0)
                ])
            # ns_ratio divides by the synonymous count, using 1.0 when no
            # synonymous change was recorded for the window.
            data[xkey].update([
                ('ns_ratio', (float(data[xkey].get(
                    'nonsynonymous_changes', 0)) / (
                        data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation',
                 annotations.get(data[xkey]['contig'], '.')),
                ('coordinates',
                 coordinates.get(data[xkey]['contig'], '.'))
                ])
            if genealign:
                # Run the branch-site test only for contigs whose id,
                # converted to int, lies in [start_contig, end_contig].
                if (args.end_contig >= int(current_contig)) and (
                        args.start_contig <= int(current_contig)):
                    (pamlnull, pamltest, tree) = paml_branchsite(
                        genealign, labels[:],
                        species=speciesnames,
                        speciesrev=speciesrev,
                        codemlpath=args.codeml_path,
                        raxmlpath=args.raxml_path,
                        pamltmp=args.paml_tmp,
                        target=args.target,
                        targetspec=args.num_target_species,
                        allsampletrees=args.all_sample_trees,
                        outgroup=args.outgroup)
                    # -1 marks "no score"; otherwise twice the difference
                    # between the test and null likelihood values.
                    lrtscore = -1
                    if (pamlnull.get('likelihood', -1) != -1 and
                            pamltest.get('likelihood', -1) != -1):
                        lrtscore = 2 * (pamltest['likelihood'] -
                                        pamlnull['likelihood'])
                    with open(args.branch_lrt, 'a') as branchlrt:
                        branchlrt.write("\t".join([str(x) for x in [
                            data[xkey]['contig'],
                            len(genealign),
                            len(genealign[0]) * 3,
                            lrtscore] + [
                                pamlnull.get(y, -1) for y in fieldtags] + [
                                    pamltest.get(y, -1)
                                    for y in fieldtags] + [
                                        str(tree).rstrip()]]) + "\n")
            # Start a fresh per-window alignment (None is treated like an
            # empty alignment by the `if not genealign` checks below).
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif args.windowsize > 0:
                current_position += args.windowsize
            counts = Counter()
        # First allele set holds the amino acids, the next three the codon
        # positions.
        proteins = allelesets[0]
        codons = allelesets[1:4]
        # All four allele sets are a single character: the codon is added
        # to the alignments once per column and the site is not tested.
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if args.output_align is True:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for subalign in outputalign:
                        subalign.append(''.join(codons))
            if args.branch_lrt is not None:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for subalign in genealign:
                        subalign.append(''.join(codons))
            continue
        # Skip sites whose protein allele string uses the '+' encoding.
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if args.mincoverage is not None:
            if sum([int(x not in 'X-') for x in proteins]) < (
                    args.mincoverage):
                continue
        # Amino acids per species group, ignoring '-' and 'X'; every group
        # must keep at least one call.
        species_groups = [[proteins[i] for i in x
                           if proteins[i] not in '-X']
                          for x in speciesgroups]
        if any(not x for x in species_groups):
            continue
        # Rebuild one three-letter codon string per sample column.
        xcodons = [mvf.decode(x) for x in codons]
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        # Any amino acid differing from the first call of the first group
        # marks the codon as nonsynonymous-variable.
        if any(any(x != species_groups[0][0] for x in y)
               for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        totals.add('variable_codons',
                   val=int(sum([int(len(set(x) - set('X-')) > 1)
                                for x in xcodons]) > 0))
        # NOTE(review): tested with `is not None` here but `is True` in the
        # invariant branch above -- see the note at initialisation.
        if args.output_align is not None:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for j, subalign in enumerate(outputalign):
                    subalign.append(codons[j])
        if args.branch_lrt is not None:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for j, codon in enumerate(codons):
                    genealign[j].append(codon)
        nonsyn_change = False
        synon_change = False
        # Codon sets per allele group, dropping codons that contain '-' or
        # 'X'.
        codon_groups = [
            set(codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i])
            for x in groups]
        protein_groups = None
        # Groups holding any of the codes R/Y/W/K/M/S are passed through
        # hapgroup() (presumably to resolve ambiguity codes -- confirm).
        for i, grp in enumerate(codon_groups):
            if any(base in codon for base in 'RYWKMS' for codon in grp):
                codon_groups[i] = hapgroup(grp)
        # Only sites where all groups hold pairwise-disjoint codon sets are
        # classified.
        if all(grp1.isdisjoint(grp0) for grp0, grp1 in
               combinations(codon_groups, 2)):
            protein_groups = [
                set(MLIB.codon_tables['full'][''.join(x)]
                    for x in codon_groups[i])
                for i, grp in enumerate(codon_groups)]
            if all(grp1.isdisjoint(grp0) for grp0, grp1 in
                   combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0 for grp0, grp1 in combinations(
                    protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            if args.verbose is True:
                print('NON', contig, pos, allelesets, codon_groups,
                      protein_groups, groups, mvf.get_contig_labels(
                          ids=contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            if args.verbose is True:
                print('SYN', contig, pos, allelesets, codon_groups,
                      protein_groups, groups, mvf.get_contig_labels(
                          ids=contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    # NOTE(review): no window is stored after the loop ends, so the counts
    # of the last window never reach `data` -- confirm whether intended.
    args.totals = totals
    # WRITE OUTPUT
    headers = ["contig", "position", "nonsynonymous_changes",
               "synonymous_changes", "ns_ratio",
               "nonsynonymous_total", "synonymous_total",
               "pvalue", "total_codons", "annotation", "coordinates"]
    if args.windowsize == -1:
        headers.remove('position')
    if args.chi_test is None:
        headers.remove('pvalue')
    outfile = OutputFile(path=args.out, headers=headers)
    # Only windows with at least one nonsynonymous change are written,
    # highest ns_ratio first.
    sorted_entries = sorted([
        (data[k]['ns_ratio'], k)
        for k in data
        if data[k].get('nonsynonymous_changes', 0) > 0],
                            reverse=True)
    for _, k in sorted_entries:
        outfile.write_entry(data[k])
    with open(args.out + '.total', 'w') as totalfile:
        for entry in args.totals.iter_sorted():
            totalfile.write(entry)
    if args.output_align is not None:
        # One FASTA-style record per alignment column, labelled from the
        # MVF sample labels.
        with open(args.output_align, 'w') as alignfile:
            alignfile.write(
                "\n".join([">{}\n{}".format(mvf.metadata['labels'][i],
                                            ''.join(outputalign[i]))
                           for i in range(len(outputalign))]))
    return ''
def plot_chromoplot(args):
    """Draw one chromoplot per sample quartet of an MVF file.

    A quartet is every 3-way combination of the selected samples paired
    with each outgroup sample.  For every quartet the MVF entries of the
    chosen contigs are classified site by site, binned by
    ``pos // args.windowsize`` and handed to a ``Chromoplot``, which is
    then asked to write its image and its log.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``colors``, ``quiet``, ``mvf``, ``contig_ids``,
            ``contig_labels``, ``sample_indices``, ``sample_labels``,
            ``outgroup_indices``, ``outgroup_labels``, ``out_prefix``,
            ``windowsize``, ``majority``, ``info_track``, ``yscale``,
            ``xscale`` and ``plot_type``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: neither outgroup indices nor outgroup labels given.
        AssertionError: fewer than three samples or no outgroup sample.
    """
    pallette = Pallette()
    if args.colors is not None:
        pallette.basecolors = args.colors
    # Establish MVF and parse chromosome information
    if args.quiet is False:
        print("Reading MVF...")
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.quiet is False:
        print("Parsing headers...")
    if args.contig_ids is not None:
        contigids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contigids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contigids = mvf.get_contig_ids()
    if args.quiet is False:
        print("Plotting chromoplot for contigs: {}".format(
            ",".join(contigids)))
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    assert len(sample_indices) >= 3, "at least three samples are required"
    if args.outgroup_indices is not None:
        outgroup_indices = [
            int(x) for x in args.outgroup_indices[0].split(",")]
    elif args.outgroup_labels is not None:
        outgroup_indices = mvf.get_sample_indices(
            labels=args.outgroup_labels[0].split(","))
    else:
        # Previously this fell through and crashed on an unbound name.
        raise RuntimeError(
            "No outgroup specified: outgroup indices or outgroup labels "
            "are required")
    assert len(outgroup_indices) >= 1, "at least one outgroup is required"
    quartets = [(x, y, z, outgroup)
                for x, y, z in combinations(sample_indices, 3)
                for outgroup in outgroup_indices]
    # Begin iterations
    for quartet_indices in quartets:
        quartet_labels = [sample_labels[x] for x in quartet_indices]
        if args.quiet is False:
            print("Beginning quartet {}".format(",".join(quartet_labels)))
        # NOTE(review): when out_prefix is a non-empty string every quartet
        # gets the same outpath (out_prefix + ".png"); the quartet labels
        # are only used when out_prefix is None or empty. Confirm intended.
        out_stem = ((args.out_prefix if args.out_prefix is not None else '')
                    or '_'.join(quartet_labels))
        params = {
            'contigs': [[
                contigid,
                mvf.metadata['contigs'][contigid]['label'],
                mvf.metadata['contigs'][contigid]['length'],
            ] for contigid in contigids],
            'outpath': out_stem + ".png",
            'labels': quartet_labels,
            'indices': quartet_indices,
            'windowsize': args.windowsize,
            'majority': args.majority,
            'infotrack': args.info_track,
            'yscale': args.yscale,
            'xscale': args.xscale,
            'quiet': args.quiet,
            'plottype': args.plot_type,
        }
        chromoplot = Chromoplot(params=params, pallette=pallette)
        current_contig = ''
        for contig, pos, allelesets in mvf.iterentries(
                subset=quartet_indices, decode=True, contigs=contigids):
            if contig != current_contig:
                if args.quiet is False:
                    print("Starting contig {}".format(contig))
                current_contig = contig[:]
            # Decoded alleles of the quartet; index 3 is the outgroup.
            alleles = allelesets[0]
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                # Bit mask (8, 4, 2) of which ingroup samples differ from
                # the outgroup allele.
                site_code = sum(
                    2**(3 - j) * (alleles[j] != alleles[3])
                    for j in range(3))
            chromoplot.add_data(str(contig),
                                int(pos // args.windowsize),
                                site_code)
        if not args.quiet:
            print("Writing image...")
        chromoplot.plot_chromoplot()
        if not args.quiet:
            print("Writing log...")
        chromoplot.write_total_log()
    return ''
def filter_mvf(args):
    """Apply the filter/transform/location actions in ``args.actions``.

    Two modes are supported:

    * test mode (``args.test`` is a ``"location alleles"`` string): the
      actions are applied to that single entry, each step is reported on
      stdout and the process exits;
    * main mode: every entry of the input MVF is run through the actions
      and the surviving entries are written to the output MVF in blocks
      of ``args.line_buffer`` entries.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``more_help``, ``mvf``, ``out``, ``test``, ``test_nchar``,
            ``labels``, ``actions`` (rewritten in place when ``labels``
            is set), ``verbose``, ``overwrite`` and ``line_buffer``.

    Returns:
        An empty string (main mode only; the help and test modes call
        ``sys.exit()``).

    Raises:
        RuntimeError: no input or output file outside test mode, or
            label-based actions requested in test mode, where no MVF is
            opened to resolve the labels.
    """
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        mvf = None
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    # Create Actionset
    if args.labels:
        if mvf is None:
            # Previously an unbound-name crash: test mode opens no MVF.
            raise RuntimeError(
                "Label-based actions need sample labels from an input MVF, "
                "which is not opened in test mode")
        labels = mvf.get_sample_labels()[:]
        label_actions = ('columns', 'collapsepriority', 'collapsemerge',
                         'allelegroup', 'notmultigroup')
        for i, action in enumerate(args.actions):
            arr = action.split(':')
            if arr[0] in label_actions:
                # Replace every sample label by its column index.
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        [str(labels.index(x)) for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        parsed_loc = None
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                # Parse the location once; re-splitting the already parsed
                # list broke a second location action, and the list repr
                # leaked into the final output line.
                if parsed_loc is None:
                    parsed_loc = loc.split(':')
                    parsed_loc[1] = int(parsed_loc[1])
                if actionfunc(parsed_loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # reprocess header if actions are used that filter columns
    column_actions = ('columns', 'collapsepriority', 'collapsemerge')
    if any(x == y[0] for x in column_actions for y in actionset):
        if args.labels:
            labels = outmvf.metadata['labels'][:]
        else:
            labels = list(outmvf.metadata['samples'])
        for actionname, _, _, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                # Drop every collapsed column except the first listed one.
                labels = [
                    labels[x] for x in range(len(labels))
                    if x not in actionarg[0][1:]]
        if args.labels:
            oldindices = mvf.get_sample_indices(labels)
        else:
            oldindices = labels[:]
        outmvf.metadata['samples'] = {
            i: mvf.metadata['samples'][oldindices[i]]
            for i in range(len(labels))}
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    # End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for _, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if actionfunc([chrom, pos]) is False:
                    linefail = True
            if linefail:
                break
        if linefail is False and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                # An entry with nothing left is a failure; it used to be
                # flagged here but was still written out.
                linefail = True
        if linefail is False:
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles, )))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF into a protein or codon MVF.

    Without ``args.gff`` the entries of each contig are treated as already
    aligned in coding frame and passed to ``iter_codons``.  With a GFF,
    the whole MVF is loaded into memory and every codon coordinate triple
    returned by ``parse_gff_translate`` is looked up, complemented for
    '-' strand features, and translated.  Codons whose amino acids are all
    '-' or 'X' are dropped.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``mvf``, ``gff``, ``quiet``, ``out``, ``overwrite``,
            ``output_data`` and ``line_buffer``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: the input MVF is not of flavor ``dna``.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []

    def buffer_entry(contig, position, amino_acids, alleles):
        """Queue one output entry; write the block once it is full."""
        if args.output_data == 'protein':
            entry = (contig, position, (amino_acids, ))
        else:
            entry = (contig, position,
                     (amino_acids, alleles[0], alleles[1], alleles[2]))
        entrybuffer.append(entry)
        if len(entrybuffer) == args.line_buffer:
            # Hand over a copy so the shared buffer can be cleared in place.
            outmvf.write_entries(list(entrybuffer))
            del entrybuffer[:]

    def translate_buffer(codon_input, contig, position):
        """Translate the buffered entries of one contig."""
        # NOTE(review): every codon is recorded at `position` (the position
        # of the MVF entry being read when the buffer is flushed) and the
        # first value yielded by iter_codons is discarded -- confirm.
        for _, amino_acids, alleles in iter_codons(codon_input, mvf):
            if all(x in '-X' for x in amino_acids):
                continue
            buffer_entry(contig, position, amino_acids, alleles)

    if not args.gff:
        inputbuffer = []
        current_contig = ''
        pos = None
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # Contig changed: translate what was collected, start afresh.
            translate_buffer(inputbuffer, current_contig, pos)
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            translate_buffer(inputbuffer, current_contig, pos)
    else:
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            mvf_entries.setdefault(contigid, {})[pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            # A GFF contig without any MVF entry used to raise KeyError;
            # treat all of its positions as missing ('-') instead.
            contig_entries = mvf_entries.get(contigid, {})
            for coords in sorted(gff[contigname]):
                reverse_strand = coords[3] == '-'
                # coords[0:3] are the codon positions; read them in
                # reverse order on the '-' strand.
                codon_positions = (coords[2::-1] if reverse_strand
                                   else coords[0:3])
                alleles = [contig_entries.get(x, '-')
                           for x in codon_positions]
                if all(len(x) == 1 for x in alleles):
                    # Single-character entries need no decoding.
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    if reverse_strand:
                        decoded_alleles = [
                            [MLIB.complement_bases[y] for y in mvf.decode(x)]
                            for x in alleles]
                        alleles = [mvf.encode(''.join(x))
                                   for x in decoded_alleles]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    # Translate column-wise: one codon per sample.
                    amino_acids = [translate(''.join(x))
                                   for x in zip(*decoded_alleles)]
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all(x in '-X' for x in amino_acids):
                    continue
                buffer_entry(contigid, coords[0], amino_acids, alleles)
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
    return ''
def translate_mvf(args):
    """Translate a DNA-flavor MVF into a protein, codon or DNA MVF.

    Without ``args.gff`` the entries of each contig are treated as already
    aligned in coding frame and passed to ``iter_codons``.  With a GFF,
    the genes returned by ``parse_gff_exome`` become the contigs of the
    output header and each gene's CDS coordinates are read from the
    indexed MVF, complemented on the '-' strand and translated codon by
    codon.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``qprint``, ``gff``, ``mvf``, ``out``, ``overwrite``,
            ``output_data``, ``command_string``, ``line_buffer``,
            ``retain_contigs`` and ``retain_coords``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: the input MVF is not of flavor ``dna``.
    """
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    if args.gff:
        # The gene tables only exist when a GFF was parsed; doing this
        # unconditionally made the no-GFF mode fail on an unbound name.
        outmvf.contig_data = {
            i: {key: value for key, value in gff_genes[gene].items()
                if key not in ('cds', )}
            for i, gene in enumerate(gene_order)}
        outmvf.contig_indices = list(range(len(gene_order)))
        outmvf.contig_ids = [gff_genes[x]['id'] for x in gene_order]
        outmvf.contig_labels = [gff_genes[x]['label'] for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
                continue
            # Contig changed: translate what was collected, start afresh.
            # NOTE(review): entries are recorded at `pos` of the MVF entry
            # being read, not at the value yielded by iter_codons.
            for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                if all(x in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
            inputbuffer = [(pos, allelesets)]
            current_contig = contigid[:]
        if inputbuffer:
            # NOTE(review): the final flush passes outmvf to iter_codons
            # while the in-loop flush passes mvf -- kept as found.
            for _, amino_acids, alleles in iter_codons(inputbuffer, outmvf):
                if all(x in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            # Advance first so the index stays aligned with the header
            # position of the gene even when the gene is skipped.
            running_gene_index += 1
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(labels=xcontiglabel)
            if xcontig is None:
                print("Warning: contig {} not found".format(xcontiglabel))
                # Nothing can be read for an unknown contig; the code used
                # to carry on with xcontig=None after the warning.
                continue
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            # Flatten the CDS intervals (inclusive ends) into positions.
            coords = []
            for elem in gff_genes[gene]['cds']:
                coords.extend(range(elem[0], elem[1] + 1))
            if reverse_strand:
                coords = coords[::-1]
            out_contig = (xcontigid if args.retain_contigs
                          else running_gene_index)
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    # Pad a trailing partial codon with gaps.
                    alleles = alleles + ('-', ) * (3 - len(alleles))
                if all(len(x) == 1 for x in alleles):
                    # Single-character entries need no decoding.
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(
                            mvf.decode(x) for x in alleles)
                    # Translate column-wise: one codon per sample.
                    amino_acids = tuple(
                        translate_single_codon(''.join(x))
                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                out_pos = (coords[codoncoord] if args.retain_coords
                           else codoncoord)
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (out_contig, out_pos, (amino_acids, )))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        out_contig, out_pos,
                        (amino_acids, alleles[0], alleles[1], alleles[2])))
                elif args.output_data == 'dna':
                    # One entry per base; positions are elem + 1 unless
                    # the original coordinates are retained.
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            out_contig,
                            (coords[elem] if args.retain_coords
                             else elem + 1),
                            (alleles[j], )))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
def legacy_translate_mvf(args):
    """Translate a DNA-flavor MVF using the legacy GFF parser.

    Without ``args.gff`` the entries of each contig are treated as already
    aligned in coding frame and passed to ``iter_codons``.  With a GFF,
    every contig of the output header that appears in the parsed GFF is
    loaded from the indexed MVF and each codon coordinate record is
    looked up, complemented on the '-' strand and translated.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``qprint``, ``gff``, ``mvf``, ``parent_gene_pattern``, ``out``,
            ``overwrite``, ``output_data``, ``line_buffer`` and
            ``verbose``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: the input MVF is not of flavor ``dna``.
    """
    args.qprint("Running LegacyTranslateMVF")
    args.qprint("Reading and Indexing MVF." if args.gff else "Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args, parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    pending = []
    npending = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        codon_input = []
        active_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if active_contig == '':
                active_contig = contigid[:]
            if contigid == active_contig:
                codon_input.append((pos, allelesets))
                continue
            # A new contig starts: translate everything collected so far.
            for _, amino_acids, alleles in iter_codons(codon_input, mvf):
                if all(x in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    record = (active_contig, pos, (amino_acids,))
                else:
                    record = (active_contig, pos,
                              (amino_acids, alleles[0],
                               alleles[1], alleles[2]))
                pending.append(record)
                npending += 1
                if npending == args.line_buffer:
                    outmvf.write_entries(pending)
                    pending = []
                    npending = 0
            codon_input = [(pos, allelesets)]
            active_contig = contigid[:]
        if codon_input:
            # Remaining entries of the last contig (note: outmvf, not mvf,
            # is passed to iter_codons here).
            for _, amino_acids, alleles in iter_codons(codon_input, outmvf):
                if all(x in '-X' for x in amino_acids):
                    continue
                if args.output_data == 'protein':
                    record = (active_contig, pos, (amino_acids,))
                else:
                    record = (active_contig, pos,
                              (amino_acids, alleles[0],
                               alleles[1], alleles[2]))
                pending.append(record)
                npending += 1
                if npending == args.line_buffer:
                    outmvf.write_entries(pending)
                    pending = []
                    npending = 0
    else:
        args.qprint("Indexing GFF gene names.")
        for xcontig in outmvf.get_contig_indices():
            by_position = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print("No entries in GFF, skipping contig: "
                          "index:{} id:{} label:{}".format(
                              xcontig, xcontigid, xcontiglabel))
                continue
            if xcontig % 100 == 0:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for _, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                by_position[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                on_minus = coords[3] == '-'
                # coords[0:3] are the codon positions; reversed on '-'.
                if on_minus:
                    codon_positions = coords[2::-1]
                else:
                    codon_positions = coords[0:3]
                alleles = tuple(by_position.get(x, '-')
                                for x in codon_positions)
                if all(len(x) == 1 for x in alleles):
                    # Single-character entries need no decoding.
                    if on_minus:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if on_minus:
                        decoded = tuple(
                            tuple(MLIB.complement_bases[y]
                                  for y in mvf.decode(x))
                            for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded)
                    else:
                        decoded = tuple(mvf.decode(x) for x in alleles)
                    # Translate column-wise: one codon per sample.
                    per_sample = tuple(translate_single_codon(''.join(x))
                                       for x in zip(*decoded))
                    amino_acids = outmvf.encode(''.join(per_sample))
                # NOTE(review): protein entries carry the contig index
                # while codon entries carry the contig id -- confirm.
                if args.output_data == 'protein':
                    record = (xcontig, coords[0], (amino_acids,))
                else:
                    record = (xcontigid, coords[0],
                              (amino_acids, alleles[0],
                               alleles[1], alleles[2]))
                pending.append(record)
                npending += 1
                if npending >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(pending)
                    pending = []
                    npending = 0
    if pending:
        outmvf.write_entries(pending)
        pending = []
        npending = 0
    return ''
def mvf2fasta(args):
    """Write the selected regions of an MVF file as a FASTA file.

    One temporary file per selected sample collects that sample's
    sequence while the MVF is read; the temporary files are then
    concatenated into ``args.out`` and removed.

    Args:
        args: parsed command-line namespace.  Attributes read here:
            ``mvf``, ``output_data``, ``regions``, ``sample_indices``,
            ``sample_labels``, ``buffer`` (used both as the buffering
            argument of the temporary files and as the copy chunk size),
            ``quiet``, ``label_type``, ``out`` and ``temp_dir``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: ``args.output_data`` is incompatible with the
            flavor of the MVF.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.metadata['contigs'])
    sample_labels = mvf.get_sample_labels()
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Pair each selected column with its own label.  Zipping the selected
    # indices against the full label list mislabelled every subset that
    # was not a prefix of the samples.
    columns = [(col, sample_labels[col]) for col in sample_indices]
    # Temporary files live in temp_dir, the directory they are later
    # removed from (they used to be created in the working directory).
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    tmp_files = {}
    tmp_paths = {}
    skipcontig = ''
    try:
        for _, label in columns:
            if label in tmp_files:
                continue
            tmp_paths[label] = os.path.join(
                args.temp_dir,
                "{}-{}.tmp".format(label, randint(1000000, 9999999)))
            tmp_files[label] = open(tmp_paths[label], 'w+', args.buffer)
        labelwritten = dict.fromkeys(tmp_files, False)
        for contig, pos, allelesets in mvf.iterentries(
                contigs=list(max_region_coord),
                quiet=args.quiet, decode=True):
            if contig == skipcontig:
                continue
            if (contig not in max_region_coord) or (
                    max_region_coord[contig] is not None and
                    pos > max_region_coord[contig]):
                # Past the last requested coordinate: ignore the rest.
                skipcontig = contig
                continue
            inregion = any(
                contig == rcontig and
                (rstart is None or pos >= rstart) and
                (rstop is None or pos <= rstop)
                for rcontig, rstart, rstop, _ in regions)
            if not inregion:
                continue
            for col, label in columns:
                handle = tmp_files[label]
                if not labelwritten[label]:
                    if args.label_type == 'long':
                        xlabel = "{} region={}".format(label, regionlabel)
                    else:
                        xlabel = "{}".format(label)
                    handle.write(">{}\n".format(xlabel))
                    labelwritten[label] = True
                if mvf.flavor == 'dna':
                    handle.write("N" if allelesets[0][col] == 'X'
                                 else allelesets[0][col])
                elif (mvf.flavor in ('codon', 'prot') and
                      args.output_data == 'prot'):
                    handle.write(allelesets[0][col])
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    # Allele sets 1-3 hold the three codon positions.
                    codon = ["N" if allelesets[x][col] == 'X'
                             else allelesets[x][col]
                             for x in (1, 2, 3)]
                    handle.write(''.join(codon))
        with open(args.out, 'w') as outfile:
            for handle in tmp_files.values():
                handle.seek(0, 0)
                buff = handle.read(args.buffer)
                while buff:
                    outfile.write(buff)
                    buff = handle.read(args.buffer)
                outfile.write("\n")
    finally:
        # Always release and delete the temporary files, even on error.
        for label, handle in tmp_files.items():
            handle.close()
            os.remove(tmp_paths[label])
    return ''
def infer_window_tree(args):
    """Infer one RAxML tree per window of an MVF file.

    Entries are grouped into windows with ``same_window``; each finished
    window is turned into a tree entry by ``WindowData.maketree_raxml``
    and written to ``args.out``.  A ranked count of the topologies seen is
    written to ``args.out + '.counts'``.  The working directory is changed
    to ``args.temp_dir`` (created when missing).

    Args:
        args: parsed command-line namespace; the contig/sample selection,
            RAxML settings, window size and output options are read from
            it.

    Returns:
        An empty string.
    """
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    tree_headers = ['contig', 'windowstart', 'windowsize', 'tree',
                    'topology', 'topoid', 'alignlength', 'aligndepth',
                    'status']
    treefile = OutputFile(args.out, headers=tree_headers)
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            labels=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # All 'a' haplotype labels first, then all 'b' labels.
        main_labels = ([label + 'a' for label in main_labels] +
                       [label + 'b' for label in main_labels])
    if args.root_with is not None:
        rootwith = args.root_with.split(',')
    else:
        rootwith = None
    params = {
        'outgroups': args.raxml_outgroups or [],
        'rootwith': rootwith,
        'minsites': args.min_sites,
        'minseqcoverage': args.min_seq_coverage,
        'mindepth': args.min_depth,
        'raxmlpath': args.raxml_path,
        'raxmlopts': args.raxml_opts,
        'duplicateseq': args.duplicate_seq,
        'model': args.raxml_model,
        'bootstrap': args.bootstrap,
        'windowsize': args.windowsize,
        'chooseallele': args.choose_allele,
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
    }
    # WINDOW START ITERATION
    verify_raxml(params)
    current_contig = ''
    current_position = 0
    window_data = None
    skip_contig = False
    topo_ids = {}
    topo_counts = {}

    def record_tree(tree_entry):
        """Count the topology, assign its id and write the tree entry."""
        topo = tree_entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            topo_ids[topo] = max(topo_ids.values()) + 1 if topo_ids else 0
        tree_entry["topoid"] = topo_ids[topo]
        treefile.write_entry(tree_entry)

    for contig, pos, allelesets in mvf.iterentries(
            contigs=contig_ids, subset=sample_indices, quiet=args.quiet,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if current_contig == contig and skip_contig is True:
            continue
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            skip_contig = False
            if window_data is not None:
                entry = window_data.maketree_raxml(params)
                if entry['status'] == 'ok':
                    record_tree(entry)
                else:
                    if args.output_empty:
                        treefile.write_entry(entry)
                    if args.windowsize != -1:
                        # A rejected window skips the rest of the contig.
                        skip_contig = True
            if contig == current_contig and args.windowsize > 0:
                current_position = current_position + args.windowsize
            else:
                current_position = 0
            current_contig = contig[:]
            if args.output_contig_labels is not None:
                contigname = mvf.get_contig_labels(ids=current_contig)
            else:
                contigname = current_contig[:]
            if args.windowsize == -1:
                windowstart = '-1'
            else:
                windowstart = current_position + 0
            window_data = WindowData(window_params={
                'contigname': contigname,
                "windowstart": windowstart,
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0],
                                       mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] == 'ok':
            record_tree(entry)
        elif args.output_empty:
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    ranked = sorted(((count, topo) for topo, count in topo_counts.items()),
                    reverse=True)
    for rank, (count, topo) in enumerate(ranked):
        topofile.write_entry({'rank': rank, 'count': count,
                              'topology': topo})
    return ''
def infer_window_tree(args):
    """Infer one tree per window of an MVF file.

    Entries are grouped into windows with ``same_window``; each finished
    window is turned into a tree entry by ``WindowData.maketree_raxml``
    and written to ``args.out``.  A ranked count of the topologies seen is
    written to ``args.out + '.counts'``.  The working directory is changed
    to ``args.temp_dir`` (created when missing).  For the ``codon`` flavor
    allele sets 1-3 are appended to the window, for ``dna`` allele set 0.

    Args:
        args: parsed command-line namespace; the contig/sample selection,
            engine settings, window size and output options are read from
            it, and progress is reported through ``args.qprint``.

    Returns:
        An empty string.
    """
    args.qprint("Running InferTree")
    # ESTABLISH FILE OBJECTS
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Read MVF File: {}".format(args.mvf))
    # Set up contig ids
    if args.contig_ids is not None:
        contig_ids = args.contig_ids[0].split(",")
    elif args.contig_labels is not None:
        contig_ids = mvf.get_contig_ids(
            labels=args.contig_labels[0].split(","))
    else:
        contig_ids = mvf.get_contig_ids()
    tree_headers = ['contig', 'windowstart', 'windowsize', 'tree',
                    'topology', 'topoid', 'alignlength', 'aligndepth',
                    'status']
    treefile = OutputFile(args.out, headers=tree_headers)
    topofile = OutputFile(args.out + '.counts',
                          headers=['rank', 'topology', 'count'])
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    if not os.path.exists(args.temp_dir):
        os.mkdir(args.temp_dir)
    os.chdir(args.temp_dir)
    # SETUP PARAMS
    main_labels = mvf.get_sample_ids(sample_indices)
    if args.choose_allele in ['randomboth', 'majorminor']:
        # All 'a' haplotype labels first, then all 'b' labels.
        main_labels = ([label + 'a' for label in main_labels] +
                       [label + 'b' for label in main_labels])
    if args.root_with is not None:
        rootwith = args.root_with.split(',')
    else:
        rootwith = []
    params = {
        'bootstrap': args.bootstrap,
        'chooseallele': args.choose_allele,
        'collapse_polytomies': args.collapse_polytomies,
        'duplicateseq': args.duplicate_seq,
        'engine': args.engine,
        'engine_path': args.engine_path,
        'engine_opts': args.engine_opts,
        'mindepth': args.min_depth,
        'minseqcoverage': args.min_seq_coverage,
        'minsites': args.min_sites,
        'model': args.model,
        'outgroups': args.raxml_outgroups,
        'rootwith': rootwith,
        'tempdir': args.temp_dir,
        'tempprefix': args.temp_prefix,
        'windowsize': args.windowsize,
    }
    # DEFAULT MODEL
    if params['model'] is None:
        if params['engine'] == 'raxml':
            params['model'] = 'GTRGAMMA'
        elif params['engine'] == 'raxml-ng':
            params['model'] = "GTR+G"
    # WINDOW START ITERATION
    verify_raxml(params)
    args.qprint("RAxML Found.")
    current_contig = None
    current_position = 0
    window_data = None
    topo_ids = {}
    topo_counts = {}

    def record_tree(tree_entry):
        """Count the topology, assign its id and write the tree entry."""
        topo = tree_entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            topo_ids[topo] = max(topo_ids.values()) + 1 if topo_ids else 0
        tree_entry["topoid"] = topo_ids[topo]
        treefile.write_entry(tree_entry)

    args.qprint("Prcocessing Records")
    if args.windowsize == -1:
        windowsizename = "whole contig"
    elif args.windowsize == 0:
        windowsizename = "whole genome"
        # Whole-genome mode starts with a single pre-made window.
        window_data = WindowData(window_params={
            'contigname': 'all',
            "windowstart": 0,
            "windowsize": 0,
            "labels": main_labels[:]})
    else:
        windowsizename = "window size={}".format(args.windowsize)
    for contig, pos, allelesets in mvf.iterentries(
            contig_ids=contig_ids, subset=sample_indices,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if not same_window((current_contig, current_position),
                           (contig, pos), args.windowsize):
            if window_data is not None:
                args.qprint(("Making tree for {} "
                             "at contig {} position {}").format(
                                 windowsizename, current_contig,
                                 current_position))
                entry = window_data.maketree_raxml(params)
                if entry['status'] == 'ok':
                    args.qprint("Tree completed.")
                    record_tree(entry)
                else:
                    if args.output_empty:
                        treefile.write_entry(entry)
                    args.qprint(
                        "TREE REJECTED with error code: {} ({})".format(
                            entry['status'], entry.get('comment', "None")))
            if contig == current_contig and args.windowsize > 0:
                current_position = current_position + args.windowsize
            else:
                current_position = 0
            current_contig = contig[:]
            if args.output_contig_labels is not None:
                contigname = mvf.get_contig_labels(ids=current_contig)
            else:
                contigname = current_contig[:]
            if args.windowsize == -1:
                windowstart = '-1'
            else:
                windowstart = current_position + 0
            window_data = WindowData(window_params={
                'contigname': contigname,
                "windowstart": windowstart,
                "windowsize": args.windowsize,
                "labels": main_labels[:]})
        # ADD ALLELES
        if mvf.flavor == 'dna':
            if args.choose_allele != 'none':
                allelesets[0] = hapsplit(allelesets[0], args.choose_allele)
            window_data.append_alleles(allelesets[0],
                                       mindepth=args.min_depth)
        elif mvf.flavor == 'codon':
            for i in (1, 2, 3):
                if args.choose_allele != 'none':
                    allelesets[i] = hapsplit(allelesets[i],
                                             args.choose_allele)
                window_data.append_alleles(allelesets[i],
                                           mindepth=args.min_depth)
    # LAST LOOP
    if window_data:
        entry = window_data.maketree_raxml(params)
        if entry['status'] == 'ok':
            record_tree(entry)
        elif args.output_empty:
            treefile.write_entry(entry)
        window_data = None
    # END WINDOW ITERATION
    ranked = sorted(((count, topo) for topo, count in topo_counts.items()),
                    reverse=True)
    for rank, (count, topo) in enumerate(ranked):
        topofile.write_entry({'rank': rank, 'count': count,
                              'topology': topo})
    return ''
def mvf2fasta(args):
    """Write the selected regions and samples of an MVF file as FASTA.

    One temporary file is filled per selected sample while the MVF entries
    are streamed; the temporary files are then concatenated into
    ``args.out``, each followed by a newline.

    Attributes read from ``args``: ``mvf``, ``output_data``, ``regions``,
    ``sample_indices``, ``sample_labels``, ``label_type``, ``gene_mode``,
    ``out``, ``buffer`` and ``qprint``.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``args.output_data`` cannot be produced from the
            flavor of the input MVF (protein from dna/rna or the reverse).
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    if (mvf.flavor in ("dna", "rna") and args.output_data == "prot") or (
            mvf.flavor == "prot" and args.output_data in ("dna", "rna")):
        raise RuntimeError(
            "--output-data {} incompatiable with '{}' flavor mvf".format(
                args.output_data, mvf.flavor))
    regions, max_region_coord, regionlabel = parse_regions_arg(
        args.regions, mvf.contig_data)
    if args.sample_indices is not None:
        sample_indices = [int(x) for x in args.sample_indices[0].split(",")]
    elif args.sample_labels is not None:
        sample_indices = mvf.get_sample_indices(
            ids=args.sample_labels[0].split(","))
    else:
        sample_indices = mvf.get_sample_indices()
    # Each FASTA header must name the sample whose column is written under
    # it, so labels are derived from the selected columns instead of pairing
    # the selected columns with the first N labels of the file.
    # NOTE(review): assumes get_sample_ids() returns ids ordered by column
    # index -- confirm against MultiVariantFile.
    all_labels = mvf.get_sample_ids()
    sample_labels = [all_labels[col] for col in sample_indices]
    skipcontig = None
    labelwritten = dict.fromkeys(sample_labels, False)
    write_buffer = {}
    current_contig = None
    data_written = False
    tmp_files = {}
    try:
        for label in sample_labels:
            if label not in tmp_files:
                tmp_files[label] = tempfile.NamedTemporaryFile(
                    mode='w+', prefix=label)
        args.qprint("Regions determined. Reading entries.")
        for contig, pos, allelesets in mvf.iterentries(
                contig_indices=mvf.get_contig_indices(
                    ids=list(max_region_coord.keys())),
                decode=True):
            if contig == skipcontig:
                continue
            if (contig not in max_region_coord) or (
                    max_region_coord[contig] is not None and
                    pos > max_region_coord[contig]):
                # Past the last requested coordinate of this contig.
                skipcontig = contig
                continue
            inregion = any(
                contig == rcontig
                and (rstart is None or pos >= rstart)
                and (rstop is None or pos <= rstop)
                for rcontig, rstart, rstop, _ in regions[contig])
            if not inregion:
                continue
            if args.gene_mode and contig != current_contig:
                # Flush the finished contig BEFORE buffering the current
                # entry, so the first codon of the new contig is kept.
                if _flush_codon_buffers(
                        mvf, current_contig, write_buffer, tmp_files):
                    data_written = True
                write_buffer = {}
                current_contig = contig
            for col, label in zip(sample_indices, sample_labels):
                handle = tmp_files[label]
                if not labelwritten[label]:
                    if args.label_type == 'long':
                        xlabel = "{} region={}".format(label, regionlabel)
                    else:
                        # 'short' and any unrecognized label type
                        xlabel = "{}".format(label)
                    handle.write(">{}\n".format(xlabel))
                    labelwritten[label] = True
                if mvf.flavor == 'dna':
                    base = allelesets[0][col]
                    handle.write("N" if base == 'X' else base)
                    data_written = True
                elif mvf.flavor in ('codon', 'prot') and (
                        args.output_data == 'prot'):
                    handle.write(allelesets[0][col])
                    data_written = True
                elif mvf.flavor == 'codon' and args.output_data == 'dna':
                    # allelesets[1..3] hold the three codon positions
                    codon = ''.join(
                        "N" if allelesets[x][col] == 'X'
                        else allelesets[x][col]
                        for x in (1, 2, 3))
                    if args.gene_mode:
                        write_buffer.setdefault(label, []).append(codon)
                    else:
                        handle.write(codon)
                        data_written = True
        # Codons still buffered for the last contig (gene mode only).
        if _flush_codon_buffers(mvf, current_contig, write_buffer, tmp_files):
            data_written = True
        write_buffer = {}
        if data_written is False:
            print("ERROR NO DATA WRITTEN")
        with open(args.out, 'w') as outfile:
            for filehandler in tmp_files.values():
                filehandler.seek(0, 0)
                buff = filehandler.read(args.buffer)
                while buff:
                    outfile.write(buff)
                    buff = filehandler.read(args.buffer)
                outfile.write("\n")
    finally:
        # Close the temporary files even when conversion fails part-way.
        for filehandler in tmp_files.values():
            filehandler.close()
    return ''


def _flush_codon_buffers(mvf, contig, write_buffer, tmp_files):
    """Write buffered gene-mode codons of `contig`; True if any were written.

    When the contig metadata gives strand '-', the ORDER of the buffered
    codons is reversed before writing (the codons themselves are written
    unchanged, as in the original implementation).
    """
    if not write_buffer:
        return False
    reverse = mvf.metadata['contigs'][contig].get('strand', "+") == '-'
    for label, codons in write_buffer.items():
        tmp_files[label].write(''.join(codons[::-1] if reverse else codons))
    return True
def filter_mvf(args):
    """Apply the filter/transform/location actions in ``args.actions``.

    Two modes are supported:

    * test mode (``args.test`` set): the actions are applied to the single
      "location alleles" string in ``args.test``, each step is reported on
      stdout, and the process exits via ``sys.exit()``;
    * main mode: every entry of the MVF at ``args.mvf`` is run through the
      actions and the surviving entries are written to ``args.out`` in
      blocks of ``args.line_buffer`` entries.

    ``args.actions`` is rewritten in place: sample labels are replaced by
    column indices when ``args.labels`` is set, and indices are shifted to
    account for columns removed by earlier collapse actions.

    Returns:
        An empty string (main mode only; the other modes exit).

    Raises:
        RuntimeError: if neither ``--mvf`` nor ``--test`` is given, or if
            ``--out`` is missing outside test mode.
    """
    args.qprint("Running FilterMVF")
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    args.qprint("Input MVF read with {} columns.".format(ncol))
    collapse_actions = ('collapsepriority', 'collapsemerge')
    column_actions = ('columns', 'allelegroup', 'notmultigroup', 'reqsample')
    # Create Actionset
    if args.labels:
        # NOTE(review): `mvf` is only loaded when --test is not used, so
        # --labels together with --test fails here with a NameError.
        for i, action in enumerate(args.actions):
            arr = action.split(':')
            if arr[0] in collapse_actions:
                arr[1] = ','.join(
                    str(mvf.sample_id_to_index[x])
                    for x in arr[1].split(','))
            if arr[0] in column_actions:
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        str(mvf.sample_id_to_index[x])
                        for x in arr[j].split(','))
            args.actions[i] = ':'.join(arr)
    removed_columns = set()

    def shift_indices(field):
        """Lower each index by the count of removed columns before it."""
        return ','.join(
            str(int(x) - len([y for y in removed_columns if y < int(x)]))
            for x in field.split(','))

    for i, action in enumerate(args.actions):
        arr = action.split(':')
        if arr[0] in collapse_actions:
            collapsed = [int(x) for x in arr[1].split(',')]
            arr[1] = shift_indices(arr[1])
            # Every collapsed column except the first one is dropped; the
            # unshifted indices are the ones recorded as removed.
            removed_columns.update(collapsed[1:])
        if arr[0] in column_actions:
            for j in range(1, len(arr)):
                arr[j] = shift_indices(arr[j])
        args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    args.qprint("Actions established.")
    args.qprint(actionset)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        # Parsed form of `loc`, built once on the first location action so
        # that repeated location actions work and the final report still
        # shows the location as it was given.
        loc_fields = None
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if loc_fields is None:
                    loc_fields = loc.split(':')
                    loc_fields[1] = int(loc_fields[1])
                if actionfunc(loc_fields) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    # NOTE(review): nothing ever adds to this set, so the index shift in the
    # collapse branch below is currently a no-op (the indices were already
    # shifted while rewriting args.actions above).
    removed_indices = set()
    # reprocess header if actions are used that filter columns
    if any(action[0] in ('columns',) + collapse_actions
           for action in actionset):
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                if args.labels:
                    # NOTE(review): the labels were already converted to
                    # indices above; an int key lookup in an id->index map
                    # looks suspicious -- confirm.
                    oldindices = [outmvf.sample_id_to_index[int(x)]
                                  for x in actionarg[0]]
                else:
                    oldindices = [int(x) for x in actionarg[0]]
            elif actionname in collapse_actions:
                actionarg[0] = [
                    x - len([y for y in removed_indices if y < x])
                    for x in actionarg[0]]
                oldindices = [x for x in outmvf.sample_indices
                              if x not in actionarg[0][1:]]
            else:
                # Actions that keep every column must not re-apply the
                # (stale or undefined) selection of a previous action.
                continue
            outmvf.sample_ids = outmvf.get_sample_ids(oldindices)
            outmvf.sample_data = {
                newindex: outmvf.sample_data[oldindex]
                for newindex, oldindex in enumerate(oldindices)}
            if actionname in collapse_actions:
                if len(actionarg) == 2:
                    # Second argument renames the collapsed sample.
                    outmvf.sample_data[actionarg[0][0]]['id'] = (
                        actionarg[1][0])
                    outmvf.sample_ids[actionarg[0][0]] = actionarg[1][0]
            outmvf.sample_indices = list(range(len(oldindices)))
    outmvf.metadata['ncol'] = len(outmvf.sample_indices)
    outmvf.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    # End header editing
    linebuffer = []
    nbuffer = 0
    args.qprint("Processing Entries.")
    write_total = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {} ".format(alleles, linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if not linefail and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                # Nothing left after re-encoding: drop the entry instead of
                # writing an empty allele string.
                linefail = True
        if linefail:
            if args.verbose:
                sys.stdout.write("FAIL\n")
            continue
        nbuffer += 1
        linebuffer.append((chrom, pos, (alleles,)))
        if args.verbose:
            sys.stdout.write("{}\n".format(alleles))
        if nbuffer == args.line_buffer:
            write_total += args.line_buffer
            args.qprint("{} entries written. Total written: {}.".format(
                args.line_buffer, write_total))
            outmvf.write_entries(linebuffer)
            linebuffer = []
            nbuffer = 0
    if linebuffer:
        outmvf.write_entries(linebuffer)
        write_total += len(linebuffer)
        # Report the size of this final, partial block.
        args.qprint("{} entries written. Total written: {}.".format(
            len(linebuffer), write_total))
        linebuffer = []
    return ''