def setUpClass(self):
     self.mod_file = os.path.join(os.path.dirname(__file__),
                                  "../test_data/CAST_to_BL6_chr19.mod")
     self.logger = logger.build_logger()
     self.transform = transform.build_transform(self.mod_file, self.logger)
     self.positions, self.deltas = zip(*self.transform.get('chr19'))
     self.position = 0
     self.mod_file = os.path.join(os.path.dirname(__file__),
                                  "../test_data/CAST_to_BL6_chr19.mod")
     self.logger = logger.build_logger()
     self.for_trans = transform.build_transform(self.mod_file, self.logger)
     self.for_file = os.path.join(os.path.dirname(__file__),
                                  "../test_data/CAST_to_BL6_chr19.dat")
     self.for_data = pickle.load(open(self.for_file, 'r'))
Exemple #2
0
def npf(args, logger):

    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'chrom chromStart chromEnd name score strand a b c d'
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            if not line.startswith('chrom'):
                gene = line.rstrip()
                data = {k:g for k,g in zip(keys.split(), gene.split('\t'))}
                if data.get('chrom') != curr_chrom:
                    curr_chrom = data.get('chrom')
                    try:
                        positions, deltas = zip(*chrom_mods.get(curr_chrom))
                        logger.info("CONTIG: '%s'" % curr_chrom)
                    except TypeError:
                        logger.warn(
                            "CONTIG: '%s' is not in MOD File." % \
                            curr_chrom)
                        positions, deltas = (), ()
                try:
                    start_delta = find_delta(positions, 
                                             deltas, 
                                             int(data.get('chromStart')))
                    end_delta = find_delta(positions, 
                                           deltas,
                                           int(data.get('chromEnd')))
                    data['chromStart'] = int(data.get('chromStart')) + start_delta
                    data['chromEnd'] = int(data.get('chromEnd')) + end_delta
                    out.write('%s\n' % \
                              '\t'.join([str(data.get(k)) for k in keys.split()]))
                except IndexError:
                    out.write('%s\n' % \
                              '\t'.join([str(data.get(k)) for k in keys.split()]))
Exemple #3
0
def smrna_txt(args, logger):

    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'name num_trna trna_begin trna_end isotype anticodon up_region down_region intron_begin intron_end cove hmm 2_str scanner'
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            data = {k:g for k,g in zip(keys.split(), gene.split())}
            if data.get('name') != curr_chrom:
                curr_chrom = data.get('name')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % \
                        curr_chrom)
                    positions, deltas = (), ()
            try:
                start_delta = find_delta(positions, 
                                         deltas, 
                                         int(data.get('trna_begin')))
                end_delta = find_delta(positions, 
                                       deltas,
                                       int(data.get('trna_end')))
                data['trna_begin'] = int(data.get('trna_begin')) + start_delta
                data['trna_end'] = int(data.get('trna_end')) + end_delta
                out.write('%s\n' % \
                          '\t'.join([str(data.get(k)) for k in keys.split()]))
            except IndexError:
                pass
Exemple #4
0
def smrna_table_txt(args, logger):
    
    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'name chrom start end strand mature hairpin'
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            data = {k:g for k,g in zip(keys.split(), gene.split())}
            if data.get('chrom') != curr_chrom:
                curr_chrom = data.get('chrom')
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % \
                        curr_chrom)
                    positions, deltas = (), ()
            try:
                start_delta = find_delta(positions, 
                                         deltas, 
                                         int(data.get('start')))
                end_delta = find_delta(positions, 
                                       deltas,
                                       int(data.get('end')))
                data['start'] = int(data.get('start')) + start_delta
                data['end'] = int(data.get('end')) + end_delta
                out.write('%s\n' % \
                          '\t'.join([str(data.get(k)) for k in keys.split()]))
            except IndexError:
                pass
Exemple #5
0
def gtf(args, logger):
    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = "seqname source feature start end score strand frame attribute"
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            gene = line.rstrip()
            data = gene.split('\t')
            if data[0] != curr_chrom:
                curr_chrom = data[0]
                try:
                    positions, deltas = zip(*chrom_mods.get(curr_chrom))
                    logger.info("CONTIG: '%s'" % curr_chrom)
                except TypeError:
                    logger.warn(
                        "CONTIG: '%s' is not in MOD File." % \
                        curr_chrom)
                    positions, deltas = (), ()
            try:
                start_delta = find_delta(positions, 
                                         deltas, 
                                         int(data[3]))
                end_delta = find_delta(positions, 
                                       deltas,
                                       int(data[4]))
                data[3] = int(data[3]) + start_delta
                data[4] = int(data[4]) + end_delta
                out.write('%s\n' % \
                          '\t'.join([str(d) for d in data]))
            except IndexError:
                pass
Exemple #6
0
 def setUpClass(self):
     self.alignment_file = os.path.join(os.path.dirname(__file__),
                                        "../test_data/cast_ATAC_chr19.bam")
     self.input_ = pysam.AlignmentFile(self.alignment_file, 'rb')
     self.lines = random.sample([i for i in self.input_], 100)
     self.mod_file = os.path.join(
         os.path.dirname(__file__), "../test_data/CAST_to_BL6_chr19.mod")
     self.logger = build_logger()
     self.curr_chrom = 'chr19'
     self.chrom_mods = build_transform(self.mod_file, self.logger)
     self.positions, self.deltas = get_positions_and_deltas(self.chrom_mods,
                                                            self.curr_chrom,
                                                            self.logger)
     self.output = (self.positions, self.deltas)
Exemple #7
0
def smrna_12_bed(args, logger):

    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb blockCount blockSizes blockStart'
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            if not line.startswith('chrom'):
                gene = line.rstrip()
                data = {k:g for k,g in zip(keys.split(), gene.split('\t'))}
                if data.get('chrom') != curr_chrom:
                    curr_chrom = data.get('chrom')
                    try:
                        positions, deltas = zip(*chrom_mods.get(curr_chrom))
                        logger.info("CONTIG: '%s'" % curr_chrom)
                    except TypeError:
                        logger.warn(
                            "CONTIG: '%s' is not in MOD File." % \
                            curr_chrom)
                        positions, deltas = (), ()
                try:
                    c_start_delta = find_delta(positions, 
                                               deltas, 
                                               int(data.get('chromStart')))
                    c_end_delta = find_delta(positions, 
                                             deltas,
                                             int(data.get('chromEnd')))
                    t_start_delta = find_delta(positions, 
                                               deltas, 
                                               int(data.get('thickStart')))
                    t_end_delta = find_delta(positions, 
                                             deltas,
                                             int(data.get('thickEnd')))
                    b_tmp_data = find_delta(positions, 
                                             deltas, 
                                             (int(data.get('chromStart')) + \
                                              int(data.get('blockStart'))))
                    b_start_delta = b_tmp_data - int(data.get('chromStart'))
                    data['chromStart'] = int(data.get('chromStart')) + c_start_delta
                    data['chromEnd'] = int(data.get('chromEnd')) + c_end_delta
                    data['thickStart'] = int(data.get('thickStart')) + t_start_delta
                    data['thickEnd'] = int(data.get('thickEnd')) + t_end_delta
                    data['blockStart'] = int(data.get('blockStart')) + b_start_delta
                    out.write('%s\n' % \
                              '\t'.join([str(data.get(k)) for k in keys.split()]))
                except IndexError:
                    pass
Exemple #8
0
def bam(args, logger):
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")
        
    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')

    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)

    curr_chrom = ""
    for line in input_:

        if input_.references[line.reference_id] != curr_chrom:
            curr_chrom = input_.references[line.reference_id]
            positions, deltas = get_positions_and_deltas(chrom_mods,
                                                         curr_chrom,
                                                         logger)

        try:
            start_delta = find_delta(positions,
                                     deltas,
                                     int(line.reference_start))
            # end_delta = find_delta(positions,
            #                        deltas,
            #                        int(line.reference_end))
            mod_index = build_modification_index(positions,
                                                 deltas,
                                                 line,
                                                 start_delta)
            # new_cigar = update_cigar(mod_index, line.cigar)

            # if len(line.cigar) < len(new_cigar):
            #     line.cigar = new_cigar[-1*len(line.cigar):]
            # else:
            #     line.cigar = new_cigar
            line.reference_start = int(line.reference_start) + start_delta
            output.write(line)
        except IndexError:
            print "IndexError: ", line
            pass
        except TypeError:
            print "TypeError:", line
            pass
Exemple #9
0
def wig(args, logger):

    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'dataValue'
    curr_chrom = ""
    chrom_pattern = compile('chrom=(\S+)\s')
    start_pattern = compile('start=(\d+)')
    with open(args.input, 'r') as input_:
        for line in input_:
            if not 'chrom' in line:
                out.write(line)
                # location = line.rstrip()
                # data = {k:g for k,g in zip(keys.split(), location.split())}
                # try:
                #     start_delta = find_delta(positions, 
                #                              deltas, 
                #                              int(data.get('chromStart')))
                #     data['chromStart'] = int(data.get('chromStart')) + start_delta
                #     out.write('%s\n' % \
                #               '\t'.join([str(data.get(k)) for k in keys.split()]))
                # except IndexError:
                #     out.write('%s\n' % \
                #               '\t'.join([str(data.get(k)) for k in keys.split()]))
            else:
                chrom_search = chrom_pattern.search(line)
                if chrom_search.groups()[0] != curr_chrom:
                    curr_chrom = chrom_search.groups()[0]
                    try:
                        positions, deltas = zip(*chrom_mods.get(curr_chrom))
                        logger.info("CONTIG: '%s'" % curr_chrom)
                    except TypeError:
                        logger.warn(
                            "CONTIG: '%s' is not in MOD File." % \
                            curr_chrom)
                        positions, deltas = (), ()
                start_location = int(start_pattern.search(line).groups()[0])
                start_delta = find_delta(positions, deltas, start_location)
                new_line = sub('start=%d' % start_location,
                               'start=%d' % (start_location + start_delta),
                               line)
                out.write(new_line)
Exemple #10
0
def smrna_lib_fa(args, logger):

    out = open(args.output, 'w')
    chrom_mods = build_transform(args.mod, logger)
    keys = 'chrom start end strand'
    curr_chrom = ""
    with open(args.input, 'r') as input_:
        for line in input_:
            if line.startswith('>'):
                gene = line.rstrip()

                gene_list = match('>(.*):(\d+)-(\d+)\((.*)\)', gene)
                data = {k:g for k,g in zip(keys.split(), gene_list.groups())}
                if data.get('chrom') != curr_chrom:
                    curr_chrom = data.get('chrom')
                    try:
                        positions, deltas = zip(*chrom_mods.get(curr_chrom))
                        logger.info("CONTIG: '%s'" % curr_chrom)
                    except TypeError:
                        logger.warn(
                            "CONTIG: '%s' is not in MOD File." % \
                            curr_chrom)
                        positions, deltas = (), ()
                try:
                    start_delta = find_delta(positions, 
                                             deltas, 
                                             int(data.get('start')))
                    end_delta = find_delta(positions, 
                                           deltas,
                                           int(data.get('end')))
                    data['start'] = int(data.get('start')) + start_delta
                    data['end'] = int(data.get('end')) + end_delta
                    out.write('>%s:%s-%s(%s)\n' % \
                              tuple([str(data.get(k)) for k in keys.split()]))
                except IndexError:
                    pass
            else:
                out.write('%s' % line)
Exemple #11
0
def atac(args, logger):
    """

    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')

    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)

    curr_chrom = ""
    for line in input_:

        if input_.references[line.reference_id] != curr_chrom:
            curr_chrom = input_.references[line.reference_id]
            positions, deltas = get_positions_and_deltas(
                chrom_mods, curr_chrom, logger)
        # if line.is_reverse and (line.reference_length != len(line.seq)):
        #     print line
        #     print line.reference_length
        #     print line.cigar
        #     print len(line.seq)
        #     print len(line.get_reference_positions())
#        try:
        if not line.is_reverse:
            start_delta = find_delta(positions, deltas,
                                     int(line.reference_start))
            line.reference_start = int(line.reference_start) + start_delta
        else:
            end_delta = find_delta(positions, deltas, int(line.reference_end))
            mapped_end = int(line.reference_end) + end_delta
            line.reference_start = mapped_end - len(
                line.seq)  # line.reference_length
        output.write(line)