Beispiel #1
0
def smrna_txt(args, logger):
    """Shift tRNA begin/end coordinates in a whitespace-delimited text file.

    Every line of ``args.input`` is split on whitespace and its fields are
    paired, in order, with the column names listed in ``columns``.  The
    ``trna_begin`` and ``trna_end`` values are offset by the deltas that
    ``find_delta`` returns for the record's contig (the ``name`` column) and
    the record is written tab-separated to ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Records for which the coordinate lookup raises ``IndexError`` are left
    out of the output.  Columns missing from a line are written as the
    string ``None``.
    """
    chrom_mods = build_transform(args.mod, logger)
    # Split once instead of on every input line.
    columns = ('name num_trna trna_begin trna_end isotype anticodon up_region '
               'down_region intron_begin intron_end cove hmm 2_str '
               'scanner').split()
    curr_chrom = ""
    positions, deltas = (), ()
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            data = dict(zip(columns, line.rstrip().split()))
            if data.get('name') != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data.get('name')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                begin = int(data.get('trna_begin'))
                start_delta = find_delta(positions, deltas, begin)
                end = int(data.get('trna_end'))
                end_delta = find_delta(positions, deltas, end)
                data['trna_begin'] = begin + start_delta
                data['trna_end'] = end + end_delta
                out.write('%s\n' %
                          '\t'.join([str(data.get(k)) for k in columns]))
            except IndexError:
                # No usable delta for this record; it is omitted.
                pass
Beispiel #2
0
def npf(args, logger):
    """Shift chromStart/chromEnd in a tab-delimited, ten-column peak file.

    Lines of ``args.input`` that start with ``chrom`` are skipped.  Every
    other line is split on tabs and paired with the column names in
    ``columns``; ``chromStart`` and ``chromEnd`` are offset by the deltas
    that ``find_delta`` returns for the record's contig, and the record is
    written tab-separated to ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    If the delta lookup raises ``IndexError`` the record is written with its
    original coordinates.  Columns missing from a line are written as the
    string ``None``.
    """
    chrom_mods = build_transform(args.mod, logger)
    columns = 'chrom chromStart chromEnd name score strand a b c d'.split()
    curr_chrom = ""
    positions, deltas = (), ()
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if line.startswith('chrom'):
                continue
            data = dict(zip(columns, line.rstrip().split('\t')))
            if data.get('chrom') != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('chromStart'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('chromEnd'))
                end_delta = find_delta(positions, deltas, end)
            except IndexError:
                # No usable delta: fall through and emit the record as read.
                pass
            else:
                data['chromStart'] = start + start_delta
                data['chromEnd'] = end + end_delta
            out.write('%s\n' % '\t'.join([str(data.get(k)) for k in columns]))
Beispiel #3
0
def gtf(args, logger):
    """Shift the start and end columns of a tab-delimited GTF-style file.

    Each line of ``args.input`` is split on tabs.  Field 0 is used as the
    contig name; fields 3 and 4 are converted to ``int``, offset by the
    deltas that ``find_delta`` returns for that contig, and the line is
    written back tab-separated to ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Lines that raise ``IndexError`` during processing (fewer than five
    tab-separated fields, or an ``IndexError`` from ``find_delta``) are left
    out of the output.
    """
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    # Bound before the loop: a first line whose first field is empty equals
    # the initial curr_chrom and would otherwise reach find_delta with these
    # names undefined.
    positions, deltas = (), ()
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            data = line.rstrip().split('\t')
            if data[0] != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data[0]
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data[3])
                start_delta = find_delta(positions, deltas, start)
                end = int(data[4])
                end_delta = find_delta(positions, deltas, end)
                data[3] = start + start_delta
                data[4] = end + end_delta
                out.write('%s\n' % '\t'.join([str(d) for d in data]))
            except IndexError:
                # Short line or no usable delta; the line is omitted.
                pass
Beispiel #4
0
def smrna_table_txt(args, logger):
    """Shift start/end coordinates in a whitespace-delimited smRNA table.

    Every line of ``args.input`` is split on whitespace and its fields are
    paired, in order, with the column names in ``columns``.  The ``start``
    and ``end`` values are offset by the deltas that ``find_delta`` returns
    for the record's contig (the ``chrom`` column) and the record is written
    tab-separated to ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Records for which the coordinate lookup raises ``IndexError`` are left
    out of the output.  Columns missing from a line are written as the
    string ``None``.
    """
    chrom_mods = build_transform(args.mod, logger)
    # Split once instead of on every input line.
    columns = 'name chrom start end strand mature hairpin'.split()
    curr_chrom = ""
    positions, deltas = (), ()
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            data = dict(zip(columns, line.rstrip().split()))
            if data.get('chrom') != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('start'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('end'))
                end_delta = find_delta(positions, deltas, end)
                data['start'] = start + start_delta
                data['end'] = end + end_delta
                out.write('%s\n' %
                          '\t'.join([str(data.get(k)) for k in columns]))
            except IndexError:
                # No usable delta for this record; it is omitted.
                pass
    def test_data_validity(self):
        """Check find_delta against a known delta at two adjacent positions.

        """
        expected_delta = -15328
        for position in (32624884, 32624885):
            observed = transform.find_delta(
                self.positions, self.deltas, position)
            self.assertEqual(observed, expected_delta)
Beispiel #6
0
def smrna_12_bed(args, logger):
    """Shift the coordinate columns of a tab-delimited twelve-column BED file.

    Lines of ``args.input`` that start with ``chrom`` are skipped.  Every
    other line is split on tabs and paired with the column names in
    ``columns``.  ``chromStart``, ``chromEnd``, ``thickStart`` and
    ``thickEnd`` are each offset by the delta ``find_delta`` returns for
    their own position.  ``blockStart`` is read as a single integer offset
    from ``chromStart`` and is adjusted so it stays relative to the shifted
    ``chromStart``.  The record is written tab-separated to ``args.output``.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Records for which the coordinate lookup raises ``IndexError`` are left
    out of the output.  Columns missing from a line are written as the
    string ``None``.
    """
    chrom_mods = build_transform(args.mod, logger)
    columns = ('chrom chromStart chromEnd name score strand thickStart '
               'thickEnd itemRgb blockCount blockSizes blockStart').split()
    curr_chrom = ""
    positions, deltas = (), ()
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if line.startswith('chrom'):
                continue
            data = dict(zip(columns, line.rstrip().split('\t')))
            if data.get('chrom') != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                chrom_start = int(data.get('chromStart'))
                c_start_delta = find_delta(positions, deltas, chrom_start)
                chrom_end = int(data.get('chromEnd'))
                c_end_delta = find_delta(positions, deltas, chrom_end)
                thick_start = int(data.get('thickStart'))
                t_start_delta = find_delta(positions, deltas, thick_start)
                thick_end = int(data.get('thickEnd'))
                t_end_delta = find_delta(positions, deltas, thick_end)
                block_start = int(data.get('blockStart'))
                # Delta at the block's absolute position.
                block_delta = find_delta(positions, deltas,
                                         chrom_start + block_start)
                data['chromStart'] = chrom_start + c_start_delta
                data['chromEnd'] = chrom_end + c_end_delta
                data['thickStart'] = thick_start + t_start_delta
                data['thickEnd'] = thick_end + t_end_delta
                # blockStart is relative to chromStart, so only the
                # difference between the two deltas applies.  The previous
                # code subtracted the absolute chromStart from the delta.
                data['blockStart'] = block_start + (block_delta
                                                    - c_start_delta)
                out.write('%s\n' %
                          '\t'.join([str(data.get(k)) for k in columns]))
            except IndexError:
                # No usable delta for this record; it is omitted.
                pass
    def test_return_type(self):
        """Check that find_delta yields an int for a randomly drawn position.

        """
        # Upper bound is the number of entries stored for 'chr19'.
        upper_bound = len(self.transform.get('chr19'))
        position = random.randint(0, upper_bound)
        delta = transform.find_delta(self.positions, self.deltas, position)
        self.assertIsInstance(delta, int)
Beispiel #8
0
def build_modification_index(positions, deltas, line, start_delta):
    """Collect the points along a read where the coordinate delta changes.

    Walks ``line.get_reference_positions()`` in order, looking up the delta
    for each reference position with ``find_delta``.  Whenever that delta
    differs from the one seen last (initially ``start_delta``), a tuple
    ``(index, previous_delta - new_delta)`` is recorded, where ``index`` is
    the position's index in the walked sequence.

    Returns:
        list: The recorded ``(index, change)`` tuples, in walk order.
    """
    changes = []
    previous_delta = start_delta
    for idx, ref_pos in enumerate(line.get_reference_positions()):
        current_delta = find_delta(positions, deltas, ref_pos)
        if current_delta == previous_delta:
            continue
        changes.append((idx, previous_delta - current_delta))
        previous_delta = current_delta
    return changes
Beispiel #9
0
def smrna_lib_fa(args, logger):
    """Shift the coordinates embedded in FASTA headers.

    Header lines of ``args.input`` (those starting with ``>``) must look like
    ``>chrom:start-end(strand)``.  ``start`` and ``end`` are offset by the
    deltas that ``find_delta`` returns for the contig and the header is
    rewritten in the same layout to ``args.output``.  All other lines are
    copied through unchanged.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Raises:
        AttributeError: If a header line does not match the expected layout
            (``match`` returns ``None``).

    When the coordinate lookup for a header raises ``IndexError`` the whole
    record is dropped: the header and the lines that follow it up to the
    next header.  Previously only the header was dropped, which left the
    sequence lines attached to the preceding record.
    """
    chrom_mods = build_transform(args.mod, logger)
    fields = 'chrom start end strand'.split()
    curr_chrom = ""
    positions, deltas = (), ()
    skip_record = False
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if not line.startswith('>'):
                # Body line: keep it only if its header was kept.
                if not skip_record:
                    out.write(line)
                continue

            header = match(r'>(.*):(\d+)-(\d+)\((.*)\)', line.rstrip())
            data = dict(zip(fields, header.groups()))
            if data.get('chrom') != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            try:
                start = int(data.get('start'))
                start_delta = find_delta(positions, deltas, start)
                end = int(data.get('end'))
                end_delta = find_delta(positions, deltas, end)
                data['start'] = start + start_delta
                data['end'] = end + end_delta
                out.write('>%s:%s-%s(%s)\n' %
                          tuple([str(data.get(k)) for k in fields]))
                skip_record = False
            except IndexError:
                # No usable delta: drop this header and its sequence lines.
                skip_record = True
Beispiel #10
0
def atac(args, logger):
    """Shift read start positions of a BAM file, strand-aware.

    Reads alignments from ``args.input`` and writes them to ``args.output``
    with a header produced by ``update_header`` from the input header and
    ``args.chrom_sizes``.  For forward-strand reads the reference start is
    offset by the delta ``find_delta`` returns for it.  For reverse-strand
    reads the reference end is offset instead, and the new start is placed
    ``len(line.seq)`` bases before the shifted end.

    Args:
        args: Object exposing ``input``, ``output``, ``mod`` and
            ``chrom_sizes``.
        logger: Logger passed on to ``build_transform`` and
            ``get_positions_and_deltas``.

    Raises:
        SystemExit: If ``args.chrom_sizes`` is falsy.
    """
    if not args.chrom_sizes:
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    try:
        header = update_header(input_.header.as_dict(), args.chrom_sizes)
        output = AlignmentFile(args.output, 'wb', header=header)
        try:
            curr_chrom = ""
            for line in input_:
                chrom = input_.references[line.reference_id]
                if chrom != curr_chrom:
                    # New contig: refresh the position/delta lookup tables.
                    curr_chrom = chrom
                    positions, deltas = get_positions_and_deltas(
                        chrom_mods, curr_chrom, logger)

                if not line.is_reverse:
                    start_delta = find_delta(positions, deltas,
                                             int(line.reference_start))
                    line.reference_start = (int(line.reference_start)
                                            + start_delta)
                else:
                    end_delta = find_delta(positions, deltas,
                                           int(line.reference_end))
                    mapped_end = int(line.reference_end) + end_delta
                    # Uses the sequence length, not line.reference_length.
                    line.reference_start = mapped_end - len(line.seq)
                output.write(line)
        finally:
            # NOTE(review): presumably pysam; closing explicitly so the
            # written file is finalised even when a read fails.
            output.close()
    finally:
        input_.close()
Beispiel #11
0
    def test_return_types(self):
        """Check that the modification index is a list of (int, int) tuples.

        """
        for read in self.lines:
            read_start = int(read.reference_start)
            initial_delta = find_delta(self.positions, self.deltas, read_start)
            index = build_modification_index(
                self.positions, self.deltas, read, initial_delta)
            self.assertIsInstance(index, list)
            for entry in index:
                self.assertIsInstance(entry, tuple)
                for slot in (0, 1):
                    self.assertIsInstance(entry[slot], int)
Beispiel #12
0
def bam(args, logger):
    """Shift read start positions of a BAM file.

    Reads alignments from ``args.input`` and writes them to ``args.output``
    with a header produced by ``update_header`` from the input header and
    ``args.chrom_sizes``.  Each read's reference start is offset by the delta
    ``find_delta`` returns for it.

    Args:
        args: Object exposing ``input``, ``output``, ``mod`` and
            ``chrom_sizes``.
        logger: Logger passed on to ``build_transform`` and
            ``get_positions_and_deltas``.

    Raises:
        SystemExit: If ``args.chrom_sizes`` is falsy.

    Reads whose processing raises ``IndexError`` or ``TypeError`` are
    reported on standard output and not written.
    """
    if not args.chrom_sizes:
        # Same effect as the site-provided exit(), without depending on it.
        raise SystemExit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    try:
        header = update_header(input_.header.as_dict(), args.chrom_sizes)
        output = AlignmentFile(args.output, 'wb', header=header)
        try:
            curr_chrom = ""
            for line in input_:
                chrom = input_.references[line.reference_id]
                if chrom != curr_chrom:
                    # New contig: refresh the position/delta lookup tables.
                    curr_chrom = chrom
                    positions, deltas = get_positions_and_deltas(
                        chrom_mods, curr_chrom, logger)

                try:
                    start_delta = find_delta(positions, deltas,
                                             int(line.reference_start))
                    # The result is not used yet (CIGAR rewriting is not
                    # implemented), but the call is kept: an IndexError or
                    # TypeError raised here still causes the read to be
                    # skipped, as before.
                    build_modification_index(positions, deltas, line,
                                             start_delta)
                    line.reference_start = (int(line.reference_start)
                                            + start_delta)
                    output.write(line)
                except IndexError:
                    # Single-string form prints the same text under the
                    # Python 2 print statement and the print function.
                    print("IndexError:  %s" % (line,))
                except TypeError:
                    print("TypeError: %s" % (line,))
        finally:
            # NOTE(review): presumably pysam; closing explicitly so the
            # written file is finalised even when a read fails.
            output.close()
    finally:
        input_.close()
Beispiel #13
0
def wig(args, logger):
    """Shift the ``start=`` value on the declaration lines of a WIG file.

    Lines of ``args.input`` that do not contain the text ``chrom`` are copied
    to ``args.output`` unchanged.  On lines that do, the contig is read from
    ``chrom=<name>`` and the integer from ``start=<n>``; the start is offset
    by the delta ``find_delta`` returns for it and the line is written with
    the updated ``start=`` value.

    Args:
        args: Object exposing ``input`` and ``output`` (file paths) and
            ``mod`` (handed to ``build_transform``).
        logger: Logger used to report which contigs are, or are not, present
            in the MOD data.

    Raises:
        AttributeError: If a line containing ``chrom`` has no
            ``chrom=<name>`` followed by whitespace, or no ``start=<n>``
            (the regex search returns ``None``).

    NOTE(review): exceptions raised by ``find_delta`` are not caught here,
    unlike in the sibling converters -- confirm this is intended for contigs
    that are missing from the MOD data.
    """
    chrom_mods = build_transform(args.mod, logger)
    curr_chrom = ""
    positions, deltas = (), ()
    # Raw strings keep the regex escapes out of Python's string escaping.
    chrom_pattern = compile(r'chrom=(\S+)\s')
    start_pattern = compile(r'start=(\d+)')
    # Both handles are closed (and the output flushed) even if a line fails.
    with open(args.output, 'w') as out, open(args.input, 'r') as input_:
        for line in input_:
            if 'chrom' not in line:
                # Data line: positions are implied by the declaration line.
                out.write(line)
                continue

            chrom = chrom_pattern.search(line).group(1)
            if chrom != curr_chrom:
                # New contig: refresh the position/delta lookup tables.
                curr_chrom = chrom
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'", curr_chrom)
                except TypeError:
                    # chrom_mods.get() returned None: contig is unknown.
                    logger.warning("CONTIG: '%s' is not in MOD File.",
                                   curr_chrom)
                    positions, deltas = (), ()
            start_location = int(start_pattern.search(line).group(1))
            start_delta = find_delta(positions, deltas, start_location)
            new_line = sub('start=%d' % start_location,
                           'start=%d' % (start_location + start_delta),
                           line)
            out.write(new_line)