def smrna_txt(args, logger):
    """Shift tRNA coordinates of a whitespace-delimited table via a MOD file.

    Every non-blank input line is split on whitespace and mapped onto the
    column names listed in ``keys``.  ``trna_begin`` and ``trna_end`` are
    each offset by the value ``find_delta`` returns for that position and
    the record is written to the output as tab-separated columns.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    Records for which ``find_delta`` raises ``IndexError`` are dropped.
    """
    keys = ('name num_trna trna_begin trna_end isotype anticodon up_region '
            'down_region intron_begin intron_end cove hmm 2_str '
            'scanner').split()
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Both handles are context-managed so the output is flushed and closed
    # even if a malformed record raises part-way through the file.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            if not gene:
                # A blank line holds no record; without this guard it would
                # reach int(None) below and abort the whole conversion.
                continue
            data = dict(zip(keys, gene.split()))
            if data.get('name') != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data.get('name')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                begin = int(data.get('trna_begin'))
                start_delta = find_delta(positions, deltas, begin)
                end = int(data.get('trna_end'))
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # find_delta could not resolve the position (presumably no
                # mapping for this contig -- TODO confirm); drop the record.
                continue
            data['trna_begin'] = begin + start_delta
            data['trna_end'] = end + end_delta
            out.write('%s\n' % '\t'.join([str(data.get(k)) for k in keys]))
def npf(args, logger):
    """Shift chromStart/chromEnd of a tab-delimited peak table via a MOD file.

    The input is read as ten tab-separated columns named in ``keys``
    (presumably ENCODE narrowPeak -- TODO confirm).  Lines starting with
    ``chrom`` are treated as a header and are not copied to the output.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    Unlike the other text converters, a record for which ``find_delta``
    raises ``IndexError`` is still written, with its coordinates unshifted.
    """
    keys = 'chrom chromStart chromEnd name score strand a b c d'.split()
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if line.startswith('chrom'):
                continue
            gene = line.rstrip()
            if not gene:
                # Blank lines would otherwise crash on int(None) below.
                continue
            data = dict(zip(keys, gene.split('\t')))
            if data.get('chrom') != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('chromStart'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('chromEnd'))
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # No delta available: fall through and emit the record
                # with its original coordinates.
                pass
            else:
                data['chromStart'] = start + start_delta
                data['chromEnd'] = end + end_delta
            out.write('%s\n' % '\t'.join([str(data.get(k)) for k in keys]))
def gtf(args, logger):
    """Shift the start/end columns of a GTF file via a MOD file.

    Each line is split on tabs; column 0 is used as the contig name and
    columns 3 and 4 (start, end) are each offset by the value
    ``find_delta`` returns for that position.  All other columns are
    written back unchanged.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    Blank lines, ``#`` comment lines, lines with fewer than five columns and
    records for which ``find_delta`` raises ``IndexError`` are not written.
    """
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            if not gene or gene.startswith('#'):
                # These lines were already dropped (via IndexError on the
                # missing columns); skipping them up front avoids a bogus
                # "not in MOD File" warning and a needless contig reset.
                continue
            data = gene.split('\t')
            if data[0] != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data[0]
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data[3])
                start_delta = find_delta(positions, deltas, start)
                end = int(data[4])
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # Either the row is too short or find_delta could not
                # resolve the position; drop the record.
                continue
            data[3] = start + start_delta
            data[4] = end + end_delta
            out.write('%s\n' % '\t'.join([str(d) for d in data]))
def smrna_table_txt(args, logger):
    """Shift start/end of a whitespace-delimited smRNA table via a MOD file.

    Every non-blank input line is split on whitespace and mapped onto the
    column names listed in ``keys``.  ``start`` and ``end`` are each offset
    by the value ``find_delta`` returns for that position and the record is
    written to the output as tab-separated columns.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    Records for which ``find_delta`` raises ``IndexError`` are dropped.
    """
    keys = 'name chrom start end strand mature hairpin'.split()
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            if not gene:
                # A blank line holds no record; without this guard it would
                # reach int(None) below and abort the whole conversion.
                continue
            data = dict(zip(keys, gene.split()))
            if data.get('chrom') != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('start'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('end'))
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # find_delta could not resolve the position (presumably no
                # mapping for this contig -- TODO confirm); drop the record.
                continue
            data['start'] = start + start_delta
            data['end'] = end + end_delta
            out.write('%s\n' % '\t'.join([str(data.get(k)) for k in keys]))
def test_data_validity(self):
    """Check find_delta against a known delta at two adjacent positions."""
    expected_delta = -15328
    for position in (32624884, 32624885):
        observed = transform.find_delta(self.positions,
                                        self.deltas,
                                        position)
        self.assertEqual(observed, expected_delta)
def smrna_12_bed(args, logger):
    """Shift the coordinates of a 12-column BED file via a MOD file.

    The input is read as twelve tab-separated columns named in ``keys``.
    ``chromStart``, ``chromEnd``, ``thickStart`` and ``thickEnd`` are each
    offset by the value ``find_delta`` returns for that position.
    ``blockStart`` is parsed as a single integer offset from ``chromStart``
    and is adjusted so it still points at the same base after the shift.
    Lines starting with ``chrom`` are treated as a header and are not
    copied to the output.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    Records for which ``find_delta`` raises ``IndexError`` are dropped.
    """
    keys = ('chrom chromStart chromEnd name score strand thickStart '
            'thickEnd itemRgb blockCount blockSizes blockStart').split()
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if line.startswith('chrom'):
                continue
            gene = line.rstrip()
            if not gene:
                # Blank lines would otherwise crash on int(None) below.
                continue
            data = dict(zip(keys, gene.split('\t')))
            if data.get('chrom') != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                chrom_start = int(data.get('chromStart'))
                c_start_delta = find_delta(positions, deltas, chrom_start)
                chrom_end = int(data.get('chromEnd'))
                c_end_delta = find_delta(positions, deltas, chrom_end)
                thick_start = int(data.get('thickStart'))
                t_start_delta = find_delta(positions, deltas, thick_start)
                thick_end = int(data.get('thickEnd'))
                t_end_delta = find_delta(positions, deltas, thick_end)
                block_start = int(data.get('blockStart'))
                # Delta at the block's absolute position.
                block_delta = find_delta(positions, deltas,
                                         chrom_start + block_start)
            except IndexError:
                # find_delta could not resolve a position; drop the record.
                continue
            data['chromStart'] = chrom_start + c_start_delta
            data['chromEnd'] = chrom_end + c_end_delta
            data['thickStart'] = thick_start + t_start_delta
            data['thickEnd'] = thick_end + t_end_delta
            # blockStart is relative to chromStart, so its correction is the
            # difference between the delta at the block and the delta applied
            # to chromStart:
            #   (chrom_start + block_start + block_delta)
            #       - (chrom_start + c_start_delta)
            # The previous code subtracted the absolute chromStart instead,
            # which produced large negative offsets.
            data['blockStart'] = block_start + (block_delta - c_start_delta)
            out.write('%s\n' % '\t'.join([str(data.get(k)) for k in keys]))
def test_return_type(self):
    """find_delta must return an int for a randomly drawn position."""
    upper_bound = len(self.transform.get('chr19'))
    position = random.randint(0, upper_bound)
    delta = transform.find_delta(self.positions, self.deltas, position)
    self.assertIsInstance(delta, int)
def build_modification_index(positions, deltas, line, start_delta):
    """List the read indices at which the coordinate delta changes.

    Walks ``line.get_reference_positions()`` in order, looking up the delta
    of every reference position with ``find_delta``.  Whenever the delta
    differs from the last one seen (initially ``start_delta``) a tuple
    ``(index, previous_delta - new_delta)`` is recorded, where ``index`` is
    the position's index within that list.

    Returns:
        list of ``(index, delta_change)`` tuples, in read order.
    """
    previous_delta = start_delta
    changes = []
    for read_idx, ref_pos in enumerate(line.get_reference_positions()):
        current_delta = find_delta(positions, deltas, ref_pos)
        if current_delta == previous_delta:
            continue
        changes.append((read_idx, previous_delta - current_delta))
        previous_delta = current_delta
    return changes
def smrna_lib_fa(args, logger):
    """Shift the coordinates embedded in FASTA headers via a MOD file.

    Header lines are expected to look like ``>chrom:start-end(strand)``.
    ``start`` and ``end`` are each offset by the value ``find_delta``
    returns for that position and the header is rewritten in the same
    layout.  Sequence lines are copied through verbatim.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file, and headers that cannot be parsed.

    A record whose header makes ``find_delta`` raise ``IndexError`` is
    dropped together with its sequence lines.  A header that does not match
    the expected layout is written unchanged, with a warning.
    """
    keys = 'chrom start end strand'.split()
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # True while inside a record whose header was dropped, so that its
    # sequence lines are not glued onto the preceding record.
    skip_record = False
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if not line.startswith('>'):
                if not skip_record:
                    out.write(line)
                continue
            skip_record = False
            gene = line.rstrip()
            gene_list = match(r'>(.*):(\d+)-(\d+)\((.*)\)', gene)
            if gene_list is None:
                # No coordinates to transform; keep the record as-is rather
                # than crashing on None.groups().
                logger.warn(
                    "HEADER: '%s' has no coordinates; left unchanged." % gene)
                out.write(line)
                continue
            data = dict(zip(keys, gene_list.groups()))
            if data.get('chrom') != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('start'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('end'))
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # find_delta could not resolve the position; drop the whole
                # record (header and the sequence lines that follow).
                skip_record = True
                continue
            data['start'] = start + start_delta
            data['end'] = end + end_delta
            out.write('>%s:%s-%s(%s)\n' %
                      tuple([str(data.get(k)) for k in keys]))
def atac(args, logger):
    """Shift read start positions of a BAM file via a MOD file.

    For forward-strand reads ``reference_start`` is offset by the value
    ``find_delta`` returns for it.  For reverse-strand reads the delta is
    looked up at ``reference_end`` instead, and the new ``reference_start``
    is placed ``len(line.seq)`` bases before the shifted end.  The output
    header is produced by ``update_header`` from the input header and
    ``args.chrom_sizes``.

    Args:
        args: object exposing ``input``, ``output``, ``mod`` and
            ``chrom_sizes``.
        logger: logger handed to ``build_transform`` and
            ``get_positions_and_deltas``.

    Exits via ``exit()`` when ``args.chrom_sizes`` is not set.
    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    try:
        header = update_header(input_.header.as_dict(), args.chrom_sizes)
        output = AlignmentFile(args.output, 'wb', header=header)
        try:
            curr_chrom = ""
            for line in input_:
                chrom = input_.references[line.reference_id]
                if chrom != curr_chrom:
                    # New contig: reload its positions and deltas.
                    curr_chrom = chrom
                    positions, deltas = get_positions_and_deltas(
                        chrom_mods, curr_chrom, logger)

                if not line.is_reverse:
                    start_delta = find_delta(positions,
                                             deltas,
                                             int(line.reference_start))
                    line.reference_start = (int(line.reference_start)
                                            + start_delta)
                else:
                    end_delta = find_delta(positions,
                                           deltas,
                                           int(line.reference_end))
                    mapped_end = int(line.reference_end) + end_delta
                    # Uses the read length, not line.reference_length.
                    line.reference_start = mapped_end - len(line.seq)
                output.write(line)
        finally:
            # Close explicitly so the output BAM is finalised even when a
            # read fails part-way through.
            output.close()
    finally:
        input_.close()
def test_return_types(self):
    """build_modification_index must return a list of (int, int) tuples."""
    for read in self.lines:
        offset = find_delta(self.positions,
                            self.deltas,
                            int(read.reference_start))
        index = build_modification_index(self.positions,
                                         self.deltas,
                                         read,
                                         offset)
        self.assertIsInstance(index, list)
        for entry in index:
            self.assertIsInstance(entry, tuple)
            self.assertIsInstance(entry[0], int)
            self.assertIsInstance(entry[1], int)
def bam(args, logger):
    """Shift read start positions of a BAM file via a MOD file.

    Each read's ``reference_start`` is offset by the value ``find_delta``
    returns for it.  CIGAR strings are left untouched.  The output header
    is produced by ``update_header`` from the input header and
    ``args.chrom_sizes``.

    Args:
        args: object exposing ``input``, ``output``, ``mod`` and
            ``chrom_sizes``.
        logger: logger handed to ``build_transform`` and
            ``get_positions_and_deltas``.

    Reads that raise ``IndexError`` or ``TypeError`` during the lookup are
    printed to stdout and not written.  Exits via ``exit()`` when
    ``args.chrom_sizes`` is not set.
    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    try:
        header = update_header(input_.header.as_dict(), args.chrom_sizes)
        output = AlignmentFile(args.output, 'wb', header=header)
        try:
            curr_chrom = ""
            for line in input_:
                chrom = input_.references[line.reference_id]
                if chrom != curr_chrom:
                    # New contig: reload its positions and deltas.
                    curr_chrom = chrom
                    positions, deltas = get_positions_and_deltas(
                        chrom_mods, curr_chrom, logger)
                try:
                    start_delta = find_delta(positions,
                                             deltas,
                                             int(line.reference_start))
                    # The returned index is not used yet, but the call is
                    # kept: an IndexError/TypeError raised while walking the
                    # read's reference positions still causes the read to be
                    # skipped, as before.
                    build_modification_index(positions,
                                             deltas,
                                             line,
                                             start_delta)
                    line.reference_start = (int(line.reference_start)
                                            + start_delta)
                    output.write(line)
                except IndexError:
                    # Single-argument print works on Python 2 and 3 and
                    # emits the same text as the old print statement.
                    print("IndexError:  %s" % (line,))
                except TypeError:
                    print("TypeError: %s" % (line,))
        finally:
            # Close explicitly so the output BAM is finalised even when an
            # unexpected error interrupts the loop.
            output.close()
    finally:
        input_.close()
def wig(args, logger):
    """Shift the ``start=`` value of WIG declaration lines via a MOD file.

    Lines that do not contain the text ``chrom`` are copied through
    verbatim.  For every other line the contig is read from ``chrom=...``
    and the ``start=<n>`` value is offset by the value ``find_delta``
    returns for ``n``.

    Args:
        args: object exposing the ``input``, ``output`` and ``mod`` paths.
        logger: logger used to report whether each contig is present in
            the MOD file.

    NOTE(review): a line containing ``chrom`` but no ``chrom=``/``start=``
    field raises ``AttributeError``; only declarations carrying both fields
    are handled -- confirm that is the intended input.
    """
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    chrom_pattern = compile(r'chrom=(\S+)\s')
    start_pattern = compile(r'start=(\d+)')
    # Context-managed so the output is always flushed and closed.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if 'chrom' not in line:
                # Data line: values are position-independent, copy as-is.
                out.write(line)
                continue
            chrom = chrom_pattern.search(line).groups()[0]
            if chrom != curr_chrom:
                # New contig: reload its (position, delta) pairs.
                curr_chrom = chrom
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig not in MOD file.
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % curr_chrom)
                    positions, deltas = (), ()
            start_location = int(start_pattern.search(line).groups()[0])
            start_delta = find_delta(positions, deltas, start_location)
            new_line = sub('start=%d' % start_location,
                           'start=%d' % (start_location + start_delta),
                           line)
            out.write(new_line)