Example #1
def jsonify(database_path,
            name,
            sha1,
            output_root_directory,
            public_url,
            browser_url,
            extended=False):
    '''
    Make a JSON representation of the database.
    @param database_path : the path to the sqlite database
    @param name : the name of the track
    @param sha1 : the sha1 sum of the file
    @param public_url : the base URL from which the file can be fetched by external requests
    @param browser_url : the base URL from which the file can be fetched by internal requests
    @param output_root_directory : the base system path where the output is written
    @param extended : whether to use the ``extended`` format rather than ``basic``
    '''
    # configure outputs
    output_path = os.path.join(output_root_directory, sha1)
    out_public_url = os.path.join(public_url, sha1)
    out_browser_url = os.path.join(browser_url, sha1)
    os.mkdir(output_path)
    with track.load(database_path, 'sql', readonly=False) as t:
        for chr_name in t:
            chr_length = t.chrmeta[chr_name]['length']
            out = os.path.join(output_path, chr_name)
            os.mkdir(out)
            lazy_url = os.path.join(out_browser_url, chr_name,
                                    'lazyfeatures-{chunk}.json')
            _jsonify(t, name, chr_length, chr_name,
                     os.path.join(out_public_url, chr_name), lazy_url, out,
                     extended)
    return 1
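A minimal invocation sketch based on the docstring above; every path, URL, name, and sha1 value below is a placeholder rather than something taken from the original project:

jsonify(database_path='/data/tracks/sample.sql',
        name='Sample track',
        sha1='a94a8fe5ccb19ba61c4c0873d391e987982fbbd3',  # placeholder digest
        output_root_directory='/var/www/json',
        public_url='http://example.org/json',
        browser_url='http://localhost/json')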
Example #2
 def runTest(self):
     in_path = samples['small_features'][5]['sql']
     with track.load(in_path) as t:
         data = t.get_partial_score_vector('chrI', 10, 30)
         got = list(data)
     expected = [0.0] * 4 + [1.0] * 5 + [0.0] * 8 + [1.0] * 3
     self.assertEqual(got, expected)
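Judging by this test, get_partial_score_vector yields one score per base over the half-open interval [10, 30), so the expected list must hold exactly 20 values; a quick check of that arithmetic:

expected = [0.0] * 4 + [1.0] * 5 + [0.0] * 8 + [1.0] * 3
assert len(expected) == 30 - 10  # 4 + 5 + 8 + 3 = 20 scores, one per base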
Example #3
 def runTest(self):
     in_path = samples['small_features'][5]['sql']
     with track.load(in_path) as t:
         data = t.read('chrI', ('start', 'end', 'name', 'score'))
         got = tuple(data.next())
     expected = (14, 19, u'', 0.0)
     self.assertEqual(got, expected)
Example #4
 def setTrackChrGenes(**kwargs):
     fname = kwargs.get('fname', mousefile)
     num = kwargs.get('num', 1)
     t = track.load(fname)
     chromosome_data = t.read('chr{0}'.format(num))
     rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
     return rows
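Unlike most snippets on this page, the helper above never closes the track it loads. A sketch of the same logic using the context-manager form seen elsewhere (mousefile is assumed from the surrounding module):

def setTrackChrGenesClosed(**kwargs):
    fname = kwargs.get('fname', mousefile)
    num = kwargs.get('num', 1)
    with track.load(fname) as t:  # closes the underlying handle on exit
        chromosome_data = t.read('chr{0}'.format(num))
        return [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]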
Example #5
    def search(self, project_id, term, *args, **kw):
        project = DBSession.query(Project).filter(Project.id == project_id).first()
        sequence = project.sequence
        default = sequence.default_tracks
        if default is None or len(default) < 1:
            return {}
        t = default[0]
        chrs = {}
        with track.load(t.path, 'sql', readonly=True) as t:
            gene_name_alias = t.find_column_name(['name', 'gene_name', 'gene name', 'gname', 'Name', 'product'])
            try:
                for row in t.search({gene_name_alias: term}, [gene_name_alias, 'start', 'end']):
                    chr, name, start, stop = row
                    if chr not in chrs:
                        chrs[chr] = {}

                    names = chrs[chr]
                    if name in names:
                        old = names[name]
                        start = min(old[0], start)
                        stop = max(old[1], stop)
                    names[name] = [start, stop]
            except Exception:
                return {}

        #result[chr].append([name, start, stop])
        result = {}
        for chr, names in chrs.iteritems():
            result[chr] = []
            for k, v in names.iteritems():
                result[chr].append([k, v[0], v[1]])

        return result
Example #6
File: search.py Project: bow/track
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.search({'start':'2','name':'feature'}, exact_match=True)
         got = list(data)
     expected = []
     self.assertEqual(got, expected)
Example #7
def benchmark_access(file):
    with track.load(str(file['new_path'])) as t:
        with Timer() as timer:
            data = t.read()
            for entry in data:
                pass
    return timer.total_time
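The snippet assumes a Timer context manager that is not shown here; a minimal sketch of such a helper, assuming all it needs to expose is a total_time attribute:

import time

class Timer(object):
    '''Record elapsed wall-clock time in self.total_time.'''
    def __enter__(self):
        self.start = time.time()
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.total_time = time.time() - self.start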
Example #8
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.get_partial_score_vector('chrI', 0, 50)
         got = list(data)
     expected = [10.0] * 10 + [0.0] * 10 + [10.0] * 10 + [0.0] * 15 + [10.0] * 5
     self.assertEqual(got, expected)
Example #9
 def runTest(self):
     in_path = samples['small_features'][4]['sql']
     with track.load(in_path, readonly=True) as t:
         t.chrmeta = {}
         data = t.get_full_score_vector('chrI')
         got = list(data)
     expected = [0.0] * 10 + [1.0] * 10 + [0.0] * 10 + [2.0] * 10
     self.assertEqual(got, expected)
Example #10
File: write.py Project: bow/track
 def runTest(self):
     in_path = samples['small_signals'][7]['sql']
     out_path = temporary_path('.sql')
     with track.load(in_path) as i:
         with track.new(out_path) as o:
             for chrom in i: o.write(chrom, i.read(chrom))
             self.assertEqual(list(o.read('chrI')), list(i.read('chrI')))
     os.remove(out_path)
Example #11
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     out_path = temporary_path('.bed')
     with track.load(in_path) as t:
         result = complement(t)
         result.export(out_path)
     with track.load(out_path) as t:
         data = t.read('chrI')
         got = map(tuple, data)
     expected = [( 10,     20),
                 ( 30,     40),
                 ( 50,     60),
                 ( 80,     90),
                 (110,    120),
                 (135, 230208)]
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #12
File: write.py Project: bow/track
 def runTest(self):
     in_path = samples['small_signals'][7]['sql']
     out_path = temporary_path('.sql')
     with track.load(in_path) as i:
         with track.new(out_path) as o:
             for chrom in i:
                 o.write(chrom, i.read(chrom))
             self.assertEqual(list(o.read('chrI')), list(i.read('chrI')))
     os.remove(out_path)
Example #13
 def runTest(self):
     in_path = samples['small_features'][4]['sql']
     with track.load(in_path) as t:
         data = t.get_full_score_vector('chrI')
         got = list(data)
     expected = [0.0] * 10 + [1.0] * 10 + [0.0] * 10 + [2.0] * 10 + [0.0] * 230168
     self.assertEqual(got, expected)
Example #14
 def runTest(self):
     in_path = samples['gzip_tracks'][4]['gzip']
     with track.load(in_path) as t:
         data = t.read()
         got = list(data)
     expected = [('chr1', 10, 20, u'Lorem', 1.0, 1),
                 ('chr1', 30, 40, u'Ipsum', 2.0, 1),
                 ('chr2', 10, 20, u'Dolor', 3.0, 1)]
     self.assertEqual(got, expected)
Example #15
 def _get_annotation_from_bed(self, fname, offset):
     """
     Reads intervals from BED file
     """
     try:
         with track.load(fname) as ann:
             ann = ann.read(fields=['start', 'end'])
             intervals = self._intervals_to_interval_map(ann, offset)
     except Exception:
         intervals = self._intervals_to_interval_map([], 0)
     return intervals
Example #16
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.search({'start': '2', 'name': 'feature'})
         got = list([tuple(x) for x in data])
     expected = [(u'chrI', 2, 8, u'Validation feature 2', 0.0),
                 (u'chrI', 20, 30, u'Validation feature 3', 10.0),
                 (u'chrI', 25, 30, u'Validation feature 4', 0.0),
                 (u'chrI', 120, 130, u'Validation feature 11', 10.0),
                 (u'chrI', 125, 135, u'Validation feature 12', 5.0)]
     self.assertEqual(got, expected)
Example #17
 def runTest(self):
     in_path = samples['small_signals'][4]['sql']
     out_path = temporary_path('.sql')
     t = threshold(in_path, 8000.0)
     t.export(out_path)
     with track.load(out_path) as t:
         data = t.read('chrI')
         got = map(tuple, data)
     expected = [(120, 122, 9000.0)]
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #19
def ucsc_geneid_fix(in_gtf, out_gtf, remote=None, local=None):
    """Updates 'gene_id' entries in GTF files downloaded from UCSC
    Table Browser to contain gene IDs instead of transcript IDs.

    If the output GTF file name already exists, it will be overwritten.

    :param in_gtf: path to input GTF file
    :type in_gtf: str
    :param out_gtf: path to output GTF file
    :type out_gtf: str
    :param remote: UCSC database and annotation source to use
    :type remote: dict with 'db' and 'annot' keys (both str)
    :param local: name of a two-column file containing the transcript-gene
            mapping, used only when `remote` is None
    :type local: str
    :returns: None

    """
    # remote not defined
    if remote is None:
        # then local must be defined
        if local is None:
            raise ValueError("Missing `remote` or `local` arguments")
        mapping = get_local_transcript_gene_mapping(local)
    # remote defined
    else:
        # then local can not be defined
        if local is not None:
            raise ValueError("Only supply `remote` or `local` argument, " "not both.")
        # remote must have 'db'
        if "db" not in remote:
            raise ValueError("Missing remote database name")
        # and 'annot_src'
        if "annot" not in remote:
            raise ValueError("Missing remote annotation source name")

        db = remote["db"]
        annot = remote["annot"]
        if annot not in QUERIES.keys():
            raise ValueError("Invalid annotation source " "name: {0}".format(annot))

        mapping = get_ucsc_transcript_gene_mapping(annot, db, cred=CRED)

    # remove output file if it exists
    if os.path.exists(out_gtf):
        os.remove(out_gtf)

    with track.load(in_gtf, readonly=True) as in_track, track.new(out_gtf, format="gtf") as out_track:
        # since GTF has custom fields, need to set the out_track to use
        # in_track's fields
        out_track.fields = in_track.fields
        for chrom in in_track.chromosomes:
            chrom_rec = in_track.read(chrom)
            out_track.write(chrom, update_gene_id_attr(chrom_rec, mapping))
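A hypothetical call for each of the two modes the docstring describes; the database name, annotation source, and file names below are placeholders:

# Remote mode: fetch the transcript-gene mapping from UCSC.
ucsc_geneid_fix('in.gtf', 'out.gtf', remote={'db': 'hg19', 'annot': 'refGene'})

# Local mode: read the mapping from a two-column file instead.
ucsc_geneid_fix('in.gtf', 'out.gtf', local='tx2gene.tsv')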
Example #20
File: search.py Project: bow/track
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.search({'start':'2','name':'feature'})
         got = list([tuple(x) for x in data])
     expected = [(u'chrI', 2, 8, u'Validation feature 2', 0.0),
                 (u'chrI', 20, 30, u'Validation feature 3', 10.0),
                 (u'chrI', 25, 30, u'Validation feature 4', 0.0),
                 (u'chrI', 120, 130, u'Validation feature 11', 10.0),
                 (u'chrI', 125, 135, u'Validation feature 12', 5.0)]
     self.assertEqual(got, expected)
Example #21
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.search({'start': '2', 'name': 'feature'}, exact_match=True)
         got = list(data)
     expected = []
     self.assertEqual(got, expected)
Example #22
 def setTrackChrPromoters(**kwargs):
     fname = kwargs.get('fname', mousefile)
     num = kwargs.get('num', 1)
     t = track.load(fname)
     chromosome_data = t.read('chr{0}'.format(num))
     rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
     fwd_genes = [e for e in rows if e['strand'] == 1]
     fwd_starts = dict([(e['name'], e['start']) for e in fwd_genes])
     fwd_promoters = dict([(k, [v - 2000, v - 100])
                           for k, v in fwd_starts.iteritems()])
     return fwd_promoters
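The helper above only builds promoter windows for forward-strand genes. A sketch of the symmetric reverse-strand case, reusing rows from the helper, under the assumption that upstream of a reverse-strand gene means past its end coordinate:

rev_genes = [e for e in rows if e['strand'] == -1]
rev_ends = dict([(e['name'], e['end']) for e in rev_genes])
rev_promoters = dict([(k, [v + 100, v + 2000])
                      for k, v in rev_ends.iteritems()])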
Example #23
File: write.py Project: bow/track
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     out_path = temporary_path('.sql')
     chrom = 'chrI'
     with track.load(in_path) as i:
         with track.new(out_path) as o:
             o.fields = track.default_fields
             o.write(chrom, i.read(chrom, ('start','end')))
             got = tuple(o.read(chrom).next())
             expected = (0, 10, None, None, None)
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #24
File: write.py Project: bow/track
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     out_path = temporary_path('.sql')
     chrom = 'chrI'
     with track.load(in_path) as i:
         with track.new(out_path) as o:
             o.fields = track.default_fields
             o.write(chrom, i.read(chrom, ('start', 'end')))
             got = tuple(o.read(chrom).next())
             expected = (0, 10, None, None, None)
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #25
 def runTest(self):
     x_path = samples['small_signals'][4]['sql']
     y_path = samples['small_features'][4]['sql']
     out_path = temporary_path('.sql')
     t = mean_score_by_feature(x_path,y_path)
     t.export(out_path)
     with track.load(out_path) as t:
         data = t.read('chrI')
         got = map(tuple, data)
     expected = [(10, 20, 15.0, u'Lorem', 1),
                 (30, 40, 50.0, u'Ipsum', 1)]
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #26
def tosql(fileinfo, seq_name):
    """
    Transform an input file to an SQL one.
    """
    debug('Tosql', 3)
    track.convert(fileinfo.extension and (fileinfo.paths['upload_to'], fileinfo.extension) or fileinfo.paths['upload_to'], fileinfo.paths['store'])
    with track.load(fileinfo.paths['store'], 'sql', readonly=False) as t:
        t.assembly = seq_name

    # debug('tosql : btrack.convert("%s", "%s", chrmeta="%s")' % (fileinfo.paths['upload_to'], fileinfo.paths['store'], seq_name), 3)
    # btrack.convert(fileinfo.paths['upload_to'], fileinfo.paths['store'], chrmeta=seq_name)
    fileinfo.states['instore'] = True
    debug('done', 4)
    return fileinfo
Example #27
File: sql.py Project: bow/track
 def parse(self):
     # Core function #
     def read_whole_track(t):
         self.handler.defineFields(t.fields)
         self.handler.newTrack(t.info)
         if t.info.get('assembly'): self.handler.defineAssembly(t.info.get('assembly'))
         else: self.handler.defineChrmeta(t.chrmeta)
         for chrom in t:
             for feature in t.read(chrom):
                 self.handler.newFeature(chrom, feature)
     # Check param type #
     if isinstance(self.path, Track):
         read_whole_track(self.path)
     else:
         with load(self.path, 'sql', readonly=True) as t: read_whole_track(t)
Example #28
def tosql(fileinfo, seq_name):
    """
    Transform an input file to an SQL one.
    """
    debug('Tosql', 3)
    track.convert(
        fileinfo.extension and
        (fileinfo.paths['upload_to'], fileinfo.extension)
        or fileinfo.paths['upload_to'], fileinfo.paths['store'])
    with track.load(fileinfo.paths['store'], 'sql', readonly=False) as t:
        t.assembly = seq_name

    # debug('tosql : btrack.convert("%s", "%s", chrmeta="%s")' % (fileinfo.paths['upload_to'], fileinfo.paths['store'], seq_name), 3)
    # btrack.convert(fileinfo.paths['upload_to'], fileinfo.paths['store'], chrmeta=seq_name)
    fileinfo.states['instore'] = True
    debug('done', 4)
    return fileinfo
Example #29
 def runTest(self):
     for num, info in sorted(samples['small_features'].items()):
         # Prepare paths #
         orig_bed_path = info['bed']
         orig_sql_path = info['sql']
         test_sql_path = temporary_path('.sql')
         test_bed_path = temporary_path('.bed')
         # From BED to SQL #
         track.convert(orig_bed_path, test_sql_path, assembly='sacCer2')
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to BED #
         with track.load(test_sql_path) as t: t.roman_to_integer()
         track.convert(test_sql_path, test_bed_path)
         self.assertTrue(assert_file_equal(orig_bed_path, test_bed_path, start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_bed_path)
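The roundtrip tests on this page all follow one pattern: convert a text format to SQL, compare, convert back, compare again. A generic sketch of that pattern, with temporary_path and assert_file_equal assumed from the test helpers and the comparison offsets varying by format:

def roundtrip(orig_txt_path, orig_sql_path, extension, assembly='sacCer2'):
    test_sql_path = temporary_path('.sql')
    test_txt_path = temporary_path(extension)
    # From the text format to SQL #
    track.convert(orig_txt_path, test_sql_path, assembly=assembly)
    assert assert_file_equal(orig_sql_path, test_sql_path)
    # From SQL back to the text format #
    with track.load(test_sql_path) as t:
        t.roman_to_integer()
    track.convert(test_sql_path, test_txt_path)
    assert assert_file_equal(orig_txt_path, test_txt_path, start_b=1)
    # Clean up #
    os.remove(test_sql_path)
    os.remove(test_txt_path)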
Example #30
 def runTest(self):
     orig_path = samples['small_features'][2]['sql']
     test_path = temporary_path('.sql')
     shutil.copy(orig_path, test_path)
     with track.load(test_path) as t:
         t.delete_fields(['name', 'strand'])
         got = list(t.read())
     expected = [('chrI', 10, 20, 0.1), ('chrI', 30, 40, 0.2),
                 ('chrI', 50, 60, 0.1), ('chrI', 70, 80, 0.2),
                 ('chrI', 90, 100, 0.0), ('chrI', 110, 120, 0.4),
                 ('chrI', 130, 150, 0.4), ('chrI', 180, 190, 0.1),
                 ('chrI', 180, 200, 0.1), ('chrI', 210, 220, 0.2),
                 ('chrI', 230, 240, 0.1), ('chrI', 250, 260, 0.2),
                 ('chrI', 270, 280, 0.0), ('chrI', 290, 300, 0.7)]
     self.assertEqual(got, expected)
     # Clean up #
     os.remove(test_path)
Example #31
 def runTest(self):
     in_path = samples['small_features'][1]['sql']
     with track.load(in_path) as t:
         data = t.read()
         got = list(data)
     expected = [('chrI', 0, 10, u'Validation feature 1', 10.0),
                 ('chrI', 2, 8, u'Validation feature 2', 0.0),
                 ('chrI', 20, 30, u'Validation feature 3', 10.0),
                 ('chrI', 25, 30, u'Validation feature 4', 0.0),
                 ('chrI', 40, 45, u'Validation feature 5', 0.0),
                 ('chrI', 40, 50, u'Validation feature 6', 10.0),
                 ('chrI', 60, 70, u'Validation feature 7', 10.0),
                 ('chrI', 70, 80, u'Validation feature 8', 10.0),
                 ('chrI', 90, 100, u'Validation feature 9', 10.0),
                 ('chrI', 90, 110, u'Validation feature 10', 10.0),
                 ('chrI', 120, 130, u'Validation feature 11', 10.0),
                 ('chrI', 125, 135, u'Validation feature 12', 5.0)]
     self.assertEqual(got, expected)
Example #32
 def runTest(self):
     for num, info in sorted(samples["gff_tracks"].items()):
         # Prepare paths #
         orig_gff_path = info["gff"]
         orig_sql_path = info["sql"]
         test_sql_path = temporary_path(".sql")
         test_gff_path = temporary_path(".gff")
         # From GFF to SQL #
         track.convert(orig_gff_path, test_sql_path, assembly="sacCer2")
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to GFF #
         with track.load(test_sql_path) as t:
             t.roman_to_integer()
         track.convert(test_sql_path, test_gff_path)
         self.assertTrue(assert_file_equal(orig_gff_path, test_gff_path, start_a=1, start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_gff_path)
Example #33
    def parse(self):
        # Core function #
        def read_whole_track(t):
            self.handler.defineFields(t.fields)
            self.handler.newTrack(t.info)
            if t.info.get('assembly'):
                self.handler.defineAssembly(t.info.get('assembly'))
            else:
                self.handler.defineChrmeta(t.chrmeta)
            for chrom in t:
                for feature in t.read(chrom):
                    self.handler.newFeature(chrom, feature)

        # Check param type #
        if isinstance(self.path, Track):
            read_whole_track(self.path)
        else:
            with load(self.path, 'sql', readonly=True) as t:
                read_whole_track(t)
Example #34
 def runTest(self):
     for num, info in sorted(samples['small_signals'].items()):
         # Some files cannot be roundtriped #
         if num == 3 or num == 7: continue
         # Prepare paths #
         orig_wig_path = info['wig']
         orig_sql_path = info['sql']
         test_sql_path = temporary_path('.sql')
         test_wig_path = temporary_path('.wig')
         # From WIG to SQL #
         track.convert(orig_wig_path, test_sql_path, assembly='sacCer2')
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to WIG #
         with track.load(test_sql_path) as t: t.roman_to_integer()
         track.convert(test_sql_path, test_wig_path)
         self.assertTrue(assert_file_equal(orig_wig_path, test_wig_path, start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_wig_path)
Example #35
 def runTest(self):
     for num, info in sorted(samples['small_features'].items()):
         # Prepare paths #
         orig_bed_path = info['bed']
         orig_sql_path = info['sql']
         test_sql_path = temporary_path('.sql')
         test_bed_path = temporary_path('.bed')
         # From BED to SQL #
         track.convert(orig_bed_path, test_sql_path, assembly='sacCer2')
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to BED #
         with track.load(test_sql_path) as t:
             t.roman_to_integer()
         track.convert(test_sql_path, test_bed_path)
         self.assertTrue(
             assert_file_equal(orig_bed_path, test_bed_path, start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_bed_path)
Example #36
 def runTest(self):
     in_path = temporary_path('.sql')
     out_path = temporary_path('.sql')
     with track.new(in_path) as t:
         t.fields = ('start','end','score')
         t.assembly = 'sacCer2'
         t.write('chrI',[(0,2,10),(2,4,20),(6,8,10)])
         result = window_smoothing(t, 2)
         result.export(out_path)
     with track.load(out_path) as t:
         data = t.read('chrI')
         got = map(tuple, data)
     expected = [(0,  1,   8.0),
                 (1,  3,  12.0),
                 (3,  5,  10.0),
                 (5,  6,   8.0),
                 (6,  9,   4.0),
                 (9,  10,  2.0)]
     self.assertEqual(got, expected)
     os.remove(in_path)
     os.remove(out_path)
Example #37
 def runTest(self):
     for num, info in sorted(samples['small_signals'].items()):
         # Some files cannot be roundtriped #
         if num == 3 or num == 7: continue
         # Prepare paths #
         orig_wig_path = info['wig']
         orig_sql_path = info['sql']
         test_sql_path = temporary_path('.sql')
         test_wig_path = temporary_path('.wig')
         # From WIG to SQL #
         track.convert(orig_wig_path, test_sql_path, assembly='sacCer2')
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to WIG #
         with track.load(test_sql_path) as t:
             t.roman_to_integer()
         track.convert(test_sql_path, test_wig_path)
         self.assertTrue(
             assert_file_equal(orig_wig_path, test_wig_path, start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_wig_path)
Example #38
def pre_compute_sql_scores(database_path, sha1, output_dir):
    '''
    Pre-compute scores for a quantitative database.
    @param database_path : the path to the database
    @param sha1 : the sha1 sum hexdigest of the database
    @param output_dir : where the output files will be written
    '''
    out_path = os.path.join(output_dir, sha1)
    try:
        os.mkdir(out_path)
    except OSError:  # the directory may already exist
        pass

    #print 'prepare connection'

    with track.load(database_path, format='sql', readonly=True) as t:
        for chromosome in t:
            #       print 'doing chr %s' % chromosome
            max = get_last_feature_stop(t, chromosome)
            if max is not None:
                #          print 'generating score array'
                array = generate_array(
                    t.read(chromosome, ('start', 'end', 'score')), max, 100000)

                #         print 'doing for each zoom'
                for zoom in zooms:
                    #            print 'compute : zoom = %s' % zoom
                    gen = gen_tuples(array, max, zoom)

                    #           print 'prepare output'
                    output = os.path.join(out_path,
                                          '%s_%s.db' % (chromosome, zoom))
                    out_connection = sqlite3.connect(output)

                    #          print 'write'
                    write_tuples(out_connection, gen)
                    #     print 'end zooms'
                    #print 'end chr'
    return 1
Example #39
 def runTest(self):
     for num, info in sorted(samples['gff_tracks'].items()):
         # Prepare paths #
         orig_gff_path = info['gff']
         orig_sql_path = info['sql']
         test_sql_path = temporary_path('.sql')
         test_gff_path = temporary_path('.gff')
         # From GFF to SQL #
         track.convert(orig_gff_path, test_sql_path, assembly='sacCer2')
         self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
         # From SQL to GFF #
         with track.load(test_sql_path) as t:
             t.roman_to_integer()
         track.convert(test_sql_path, test_gff_path)
         self.assertTrue(
             assert_file_equal(orig_gff_path,
                               test_gff_path,
                               start_a=1,
                               start_b=1))
         # Clean up #
         os.remove(test_sql_path)
         os.remove(test_gff_path)
Example #40
def squish_bed(in_file, out_file):
    """Removes all overlapping regions in the input BED file, writing to the
    output BED file.
    
    :param in_file: path to input BED file
    :type in_file: str
    :param out_file: path to output BED file
    :type out_file: str
    
    """
    # check for input file presence, remove output file if it already exists
    assert os.path.exists(in_file), 'Required input file {0} does not ' \
        'exist'.format(in_file)
    if os.path.exists(out_file):
        os.unlink(out_file)

    with track.load(in_file, readonly=True) as in_track, \
            track.new(out_file, format='bed') as out_track:

        for chrom in in_track.chromosomes:
            chrom_rec = in_track.read(chrom)
            out_track.write(chrom, squish_track_records(chrom_rec))
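squish_track_records is not shown in this excerpt; a sketch of what the overlap removal could look like, assuming records arrive sorted by start and keeping only the start/end fields (the real helper may preserve more):

def squish_track_records(records):
    '''Merge overlapping (start, end, ...) records into disjoint intervals.'''
    current = None
    for rec in records:
        start, end = rec[0], rec[1]
        if current is None:
            current = [start, end]
        elif start <= current[1]:              # overlaps or touches the open run
            current[1] = max(current[1], end)  # extend the run
        else:
            yield tuple(current)
            current = [start, end]
    if current is not None:
        yield tuple(current)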
Example #41
 def runTest(self):
     in_paths = [samples['small_signals'][1]['sql'],
                 samples['small_signals'][2]['sql'],
                 samples['small_signals'][3]['sql']]
     out_path = temporary_path('.sql')
     t = merge_scores(in_paths)
     t.export(out_path)
     with track.load(out_path) as t:
         data = t.read('chrI')
         got = map(tuple, data)
     expected = [( 0,    5,    2.0 + 0.6666666666666666),
                 ( 5,   10,    4.0),
                 ( 20,  30,   10.0),
                 ( 30,  40,   30.0),
                 ( 40,  50,   26.0 + 0.666666666666666),
                 ( 50,  60,  120.0),
                 ( 60,  68,  100.0),
                 ( 68,  70,  200.0),
                 ( 70,  80,  100.0),
                 ( 90, 110,    3.0),
                 (120, 130,   10.0)]
     self.assertEqual(got, expected)
     os.remove(out_path)
Example #42
File: scores.py Project: bbcf/pygdv
def pre_compute_sql_scores(database_path, sha1, output_dir):
    '''
    Pre-compute scores for a quantitative database.
    @param database_path : the path to the database
    @param sha1 : the sha1 sum hexdigest of the database
    @param output_dir : where the output files will be written
    '''
    out_path = os.path.join(output_dir, sha1)
    try:
        os.mkdir(out_path)
    except OSError:  # the directory may already exist
        pass
    
    #print 'prepare connection'
    
    with track.load(database_path, format='sql', readonly=True) as t:
        for chromosome in t:
            #       print 'doing chr %s' % chromosome
            max = get_last_feature_stop(t, chromosome)
            if max is not None:
                #          print 'generating score array'
                array = generate_array(t.read(chromosome, ('start', 'end', 'score')), max, 100000)
    
                #         print 'doing for each zoom'            
                for zoom in zooms:
                    #            print 'compute : zoom = %s' % zoom
                    gen = gen_tuples(array, max, zoom)
                    
                    #           print 'prepare output'
                    output = os.path.join(out_path, '%s_%s.db' % (chromosome, zoom))
                    out_connection = sqlite3.connect(output)
                    
                    #          print 'write'
                    write_tuples(out_connection, gen)
                    #     print 'end zooms'
                    #print 'end chr'
    return 1
Example #43
 def runTest(self):
     orig_path = samples['small_features'][2]['sql']
     test_path = temporary_path('.sql')
     shutil.copy(orig_path, test_path)
     with track.load(test_path) as t:
         t.delete_fields(['name','strand'])
         got = list(t.read())
     expected = [('chrI',  10,  20, 0.1),
                 ('chrI',  30,  40, 0.2),
                 ('chrI',  50,  60, 0.1),
                 ('chrI',  70,  80, 0.2),
                 ('chrI',  90, 100, 0.0),
                 ('chrI', 110, 120, 0.4),
                 ('chrI', 130, 150, 0.4),
                 ('chrI', 180, 190, 0.1),
                 ('chrI', 180, 200, 0.1),
                 ('chrI', 210, 220, 0.2),
                 ('chrI', 230, 240, 0.1),
                 ('chrI', 250, 260, 0.2),
                 ('chrI', 270, 280, 0.0),
                 ('chrI', 290, 300, 0.7)]
     self.assertEqual(got, expected)
     # Clean up #
     os.remove(test_path)
Example #44
    def search(self, project_id, term, *args, **kw):
        project = DBSession.query(Project).filter(
            Project.id == project_id).first()
        sequence = project.sequence
        default = sequence.default_tracks
        if default is None or len(default) < 1:
            return {}
        t = default[0]
        chrs = {}
        with track.load(t.path, 'sql', readonly=True) as t:
            gene_name_alias = t.find_column_name(
                ['name', 'gene_name', 'gene name', 'gname', 'Name', 'product'])
            try:
                for row in t.search({gene_name_alias: term},
                                    [gene_name_alias, 'start', 'end']):
                    chr, name, start, stop = row
                    if chr not in chrs:
                        chrs[chr] = {}

                    names = chrs[chr]
                    if name in names:
                        old = names[name]
                        start = min(old[0], start)
                        stop = max(old[1], stop)
                    names[name] = [start, stop]
            except Exception:
                return {}

        #result[chr].append([name, start, stop])
        result = {}
        for chr, names in chrs.iteritems():
            result[chr] = []
            for k, v in names.iteritems():
                result[chr].append([k, v[0], v[1]])

        return result
Example #45
def jsonify(database_path, name, sha1, output_root_directory, public_url, browser_url, extended=False):
    '''
    Make a JSON representation of the database.
    @param database_path : the path to the sqlite database
    @param name : the name of the track
    @param sha1 : the sha1 sum of the file
    @param public_url : the base URL from which the file can be fetched by external requests
    @param browser_url : the base URL from which the file can be fetched by internal requests
    @param output_root_directory : the base system path where the output is written
    @param extended : whether to use the ``extended`` format rather than ``basic``
    '''
    # configure outputs
    output_path = os.path.join(output_root_directory, sha1)
    out_public_url = os.path.join(public_url, sha1)
    out_browser_url = os.path.join(browser_url, sha1)
    os.mkdir(output_path)
    with track.load(database_path, 'sql', readonly=False) as t:
        for chr_name in t:
            chr_length = t.chrmeta[chr_name]['length']
            out = os.path.join(output_path, chr_name)
            os.mkdir(out)
            lazy_url = os.path.join(out_browser_url, chr_name, 'lazyfeatures-{chunk}.json')
            _jsonify(t, name, chr_length, chr_name, os.path.join(out_public_url, chr_name), lazy_url, out, extended)
    return 1
Example #46
def process_csv(neuropil, dataset, cluster_type):
    analysis_time = datetime.datetime.now().isoformat()
    WITH_CHROMOSOME_LOCATION=True
    if WITH_CHROMOSOME_LOCATION:
        import track # install with "pip install track" (see http://bbcf.epfl.ch/bbcflib/tutorial_track.html)
        if dataset=='T1':
            dataset_track = track.load(os.path.join(BRAINCODE_PACKAGE_DIR,'VTs.bed'))
        elif dataset=='CB1':
            dataset_track = track.load(os.path.join(BRAINCODE_PACKAGE_DIR,'janeliaTiles.bed'))
        else:
            raise ValueError('unknown dataset %r' % dataset)

    filenames = get_filenames( dataset, neuropil, cluster_type )
    hs_name_raw = filenames['fragment_info_raw_csv']

    pre_csv_fname = get_fragment_pre_csv_fname(dataset=dataset,
        region=neuropil, cluster_type=cluster_type)
    original_metadata = get_csv_metadata(pre_csv_fname)

    if os.path.exists(hs_name_raw):
        derived_metadata = get_csv_metadata(hs_name_raw)
        if original_metadata['analysis_time_parsed'] < derived_metadata['analysis_time_parsed']:
            print('output %r exists but is newer, so not rewriting. skipping.' % hs_name_raw)
            return
        else:
            print('output %r exists but is older than input, so rewriting.' % hs_name_raw)

    ids_fname = filenames['id_driver_image_csv']
    print('reading %r'%ids_fname)
    id_driver_image_df = pd.read_csv(ids_fname, sep=';')
    driver_id_to_driver_name = {}
    for i,driver_image_row in id_driver_image_df.iterrows():
        driver_id_to_driver_name[ driver_image_row['id'] ] = driver_image_row['driver']

    print('reading %r' % pre_csv_fname )
    qq = pd.read_csv(pre_csv_fname, low_memory=False, comment='#')

    print('computing statistics')
    qq['expressing elsewhere_in_region'] = qq['expressing region'] - qq['expressing cluster']
    qq['total elsewhere_in_region'] = qq['total region'] - qq['total cluster']

    qq['fraction cluster'] = qq['expressing cluster']/qq['total cluster']
    qq['observed'] = qq['expressing cluster']
    if 0:
        # expected value is expression everywhere
        qq['expected freq'] = qq['expressing region']/qq['total region']
    else:
        # expected value is expression elsewhere
        qq['expected freq'] = qq['expressing elsewhere_in_region']/qq['total elsewhere_in_region']
    qq['expected'] = qq['expected freq'] * qq['total cluster']
    qq['fold enrichment'] = qq['observed']/qq['expected']
    qq['chi sq'] = (qq['observed'] - qq['expected'])**2 / qq['expected']
    qq['chi sq p'] = scipy.stats.chisqprob(qq['chi sq'].values, df=1 )
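    # note: scipy.stats.chisqprob was removed in SciPy 1.0; scipy.stats.chi2.sf gives the same tail probability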
    if 1:
        hypergeometric_p = []
        for i,qq_row in qq.iterrows():
            if i%10000==0:
                print('%d of %d'%(i,len(qq)))
            # Variable names correspond to the scipy docs at
            # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html
            # Comment names correspond to the wikipedia entry at
            # https://en.wikipedia.org/wiki/Hypergeometric_distribution
            M = qq_row['total region']      # wikipedia's N
            n = qq_row['expressing region'] # wikipedia's K
            N = qq_row['total cluster']      # wikipedia's n
            x = qq_row['expressing cluster'] # wikipedia's k
            rv = scipy.stats.hypergeom( M, n, N )
            hypergeometric_p.append( rv.pmf(x) )
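            # note: rv.pmf(x) is the point probability P(X == x); a one-sided tail p-value would be rv.sf(x - 1)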
        qq['hypergeometric p'] = hypergeometric_p
    qq['driver name'] = qq.apply( lambda row:
                                  driver_id_to_driver_name[ row['driver_id'] ],
                                  axis = 1 )
    print('done computing statistics')
    print('saving all %d rows'%(len(qq),))

    column_renames = collections.OrderedDict([
        # (old_name, new_name),
        ('cluster_id','cluster_id'),
        ('fraction cluster','fraction'),
        ('fold enrichment','fold enrichment'),
        ('driver name','driver name'),
        ('driver_id','driver_id'),
        ('hypergeometric p','hypergeometric p'),
        ('chi sq p','chi sq p'),
        ('expressing cluster','num positive voxels in cluster'),
        ('total cluster','num voxels in cluster'),
        ('expressing region','num positive voxels in region'),
        ('total region','num voxels in region'),
        ]
    )

    qq = qq[[old_name for old_name in column_renames]] # drop unused columns
    qq = qq.rename(columns=column_renames) # rename columns

    hs_sorted = qq.sort( ['cluster_id','fraction'], ascending=[True,False] )
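    # note: DataFrame.sort was removed in pandas 0.20; sort_values(['cluster_id', 'fraction'], ascending=[True, False]) is the modern equivalent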

    if WITH_CHROMOSOME_LOCATION:
        chrom = []
        chromStart = []
        chromEnd = []
        chromStrand = []
        ucsc_urls = []
        cached_sdata = {}

        for i,hs_row in hs_sorted.iterrows():
            if dataset=='T1':
                num = vt_name_to_num(hs_row['driver name'])
                name = 'VT%04d'%num
            else:
                assert dataset=='CB1'
                name = hs_row['driver name']
            sdata = cached_sdata.get(name,None)
            if sdata is None:
                # cache miss
                sdata = [r for r in dataset_track.search({'name':name},exact_match=True)]
                cached_sdata[name] = sdata
                if len(sdata)==0:
                    print("no entry for %r"%hs_row['driver name'])

            if len(sdata)==0:
                chrom.append(None)
                chromStart.append(None)
                chromEnd.append(None)
                chromStrand.append(None)
                ucsc_urls.append(None)
            else:
                if len(sdata)>1:
                    print('sdata')
                    print(sdata)
                    raise RuntimeError("more than one entry for %r"%hs_row['driver name'])
                trackrow = sdata[0]
                chrom.append(trackrow[0])
                chromStart.append(trackrow[1])
                chromEnd.append(trackrow[2])
                chromStrand.append(trackrow[5])
                ucsc_urls.append( 'http://genome.ucsc.edu/cgi-bin/hgTracks?db=dm3&position=' + chrom[-1] +'%3A' + str(chromStart[-1]) +'-' +str(chromEnd[-1]) )
        hs_sorted['chrom'] = chrom
        hs_sorted['chromStart'] = np.array(chromStart,dtype=object)  # don't let Pandas convert to float
        hs_sorted['chromEnd'] = np.array(chromEnd,dtype=object) # don't let Pandas convert to float
        hs_sorted['chromStrand'] = chromStrand
        hs_sorted['UCSC Genome Browser URL'] = ucsc_urls

    if dataset=='T1':
        hs_sorted['bbweb URL'] = hs_sorted.apply(lambda row: vt_name_to_bbweb_url(row['driver name']), axis=1)
        hs_sorted['VDRC URL'] = hs_sorted.apply(lambda row: vt_name_to_vdrc_url(row['driver name']), axis=1)
    else:
        assert dataset=='CB1'
        hs_sorted['FlyLight URL'] = hs_sorted.apply(lambda row: janelia_name_to_flylight_url(row['driver name']), axis=1)

    buf = StringIO()
    metadata = {
        'analysis_time':analysis_time,
        'url':'https://strawlab.org/braincode',
        'neuropil':neuropil,
        'dataset':dataset,
        'cluster_type':cluster_type,
    }
    comment_line = '# '+json.dumps( metadata ) + '\n'
    buf.write(comment_line)
    hs_sorted.to_csv(buf,index=False)
    with open(hs_name_raw,mode='w') as fd:
        fd.write(buf.getvalue())
    print('saved to %r'%hs_name_raw)
Example #47
# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1913601_0m_rep3.bedGraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/ATAC_0m_rep3.sql')
# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1913605_120m_rep3.bedGraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/ATAC_120m_rep3.sql')

# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1645121_RelA-2_120.bedgraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/RelA_120m_rep2.sql')
"""
Plot tracks around Ccl3 and Ccl4
"""

plt.clf()
fig, axes = plt.subplots(10, 1, figsize=(6, 1 * 10))
start_pos = 83460000
end_pos = 83480000

# p300 UT peaks
with track.load(
        '/scratch/PI/mcovert/dvanva/bedfiles/sql/p300.UT.peaks.sql') as t:
    data = t.read({'chr': 'chr11', 'start': 83460000, 'end': 83480000})
    for peak in data:
        axes[0].bar(peak[0], 1, width=peak[1] - peak[0], color='k')
        axes[0].set_xlim([start_pos, end_pos])
        axes[0].set_title('p300 Untreated')
        axes[0].set_xticks([])
        axes[0].set_yticks([])

# p300 LPS 2h peaks
with track.load(
        '/scratch/PI/mcovert/dvanva/bedfiles/sql/p300.LPS_2h.peaks.sql') as t:
    data = t.read({'chr': 'chr11', 'start': 83460000, 'end': 83480000})
    for peak in data:
        axes[1].bar(peak[0], 1, width=peak[1] - peak[0], color='k')
        axes[1].set_xlim([start_pos, end_pos])
Example #48
 def __call__(self, *args, **kwargs):
     """Check that all arguments are present
        and load all tracks that are given as paths
        instead of track objects. Also checks for
        direct calls with generators."""
     # Initialization #
     generator_call = False  # Special switch for direct generator calls
     found_args = {}  # Will contain a set of parameters extracted
     found_tracks = {}  # Will contain a set of track parameters extracted
     found_generators = {}  # Will contain a set of FeatureStream
     extra_args = {}  # Will contain a set of parameters computed
     all_tracks = []  # Will contain all single tracks sent
     tracks_to_close = []  # Will contain single tracks to close
     virtual_tracks = []  # Will contain the results tracks
      rest_of_fields = []  # Will contain variable output fields when required
     ### Parse arguments ###
     for p in self.input_args:
         if p['key'] in kwargs: value = kwargs[p['key']]
         elif len(args) >= p['position']: value = args[p['position'] - 1]
         elif 'default' in p: value = p['default']
         elif p.get('optional'): continue
          else: raise Exception("The argument '%s' is missing for the manipulation '%s'."
                                % (p['key'], self.short_name))
         # Cast it if it's not the right type #
         if not isinstance(value, p['type']): value = p['type'](value)
         # Add it to the dict #
         found_args[p['key']] = value
     ### Parse tracks ###
     for t in self.input_tracks:
         if t['key'] in kwargs: value = kwargs[t['key']]
         elif len(args) >= t['position']: value = args[t['position'] - 1]
         elif 'default' in t: value = t['default']
         elif t.get('optional'): continue
          else: raise Exception("The argument '%s' is missing for the manipulation '%s'."
                                % (t['key'], self.short_name))
         # Check is track collection #
         if t.get('kind') == 'many':
             if not is_list(value):
                 message = "The track collection '%s' for the manipulation '%s' is not a list: %s"
                 raise Exception(message %
                                 (t['key'], self.short_name, value))
         # Don't modify the input list #
         if t.get('kind') == 'many': value = value[:]
         # Check for generator case #
         if is_gen(value): generator_call = True
         if t.get('kind') == 'many' and is_gen(value[0]):
             generator_call = True
         if generator_call:
             found_tracks[t['key']] = value
             continue
         # Check is path #
         if isinstance(value, basestring):
             value = track.load(value, readonly=True)
             tracks_to_close.append(value)
         if t.get('kind') == 'many':
             for i, _ in enumerate(value):
                 if isinstance(value[i], basestring):
                     value[i] = track.load(value[i], readonly=True)
                     tracks_to_close.append(value[i])
         # Add to the list of all tracks #
         if t.get('kind') == 'many': all_tracks += [x for x in value]
         else: all_tracks += [value]
         # Track collection must be combined #
         if t.get('kind') == 'many':
             value = TrackCollection(value, self.fields_collapse,
                                     self.chroms_collapse)
         # Variable fields case (track collection must collapse fields) #
         if t['fields'][-1] == '...':
             first_fields = t['fields'][:-1]
             rest_of_fields = [
                 f for f in value.fields if f not in first_fields
             ]
             value.fields = first_fields + rest_of_fields
         # Specific fields case #
         else:
             value.fields = t['fields']
         # What about track SimpleTrack case #
         pass  #TODO
         # Add it to the dict #
         found_tracks[t['key']] = value
     # Check for generator case #
     if generator_call:
         return self.from_generator(found_tracks, found_args, args, kwargs)
     # Collapse chromosomes #
     if not self.chroms_collapse: chromosomes = all_tracks[0].chromosomes
     else:
         chromosomes = collapse(self.chroms_collapse,
                                [t.chromosomes for t in all_tracks])
     # Multiple output tracks disabled #
     t = self.output_tracks[0]
     # Make a new virtual track #
     vtrack = VirtualTrack()
     # Output chromosome metadata #
     for chrom in chromosomes:
         vtrack.chrmeta[chrom] = {
             'length': max([i.chrmeta[chrom]['length'] for i in all_tracks])
         }
     # Output attributes #
     if t.get('datatype'): vtrack.datatype = t['datatype']
     # Output name #
     vtrack.name = self.long_name + ' on ' + andify_strings(
         [i.name for i in all_tracks])
     ### Iterate on chromosomes ###
     for chrom in chromosomes:
         # Get special input arguments #
         for p in self.input_meta:
             if p['kind'] == 'chrom_len':
                 extra_args[p['key']] = vtrack.chrmeta[chrom]['length']
         # Call read on tracks #
         for k, input_track in found_tracks.items():
             if is_list(input_track):
                 found_generators[k] = [i.read(chrom) for i in input_track]
             else:
                 found_generators[k] = input_track.read(chrom)
         # What about track collapse and recursion #
         pass  #TODO
         # Final argument list #
         final_args = {}
         for d in (found_args, found_generators, extra_args):
             final_args.update(d)
         # Call generate #
         data = self.generate(**final_args)
         # Variable fields case #
         if t['fields'][-1] == '...':
             fields = t['fields'][:-1] + rest_of_fields
         else:
             fields = t['fields']
         # Make a FeatureStream #
         stream = FeatureStream(data, fields)
         # Add it to the virtual track #
         vtrack.write(chrom, stream)
     # Close tracks later #
     vtrack.tracks_to_close = tracks_to_close
     # Add it #
     virtual_tracks.append(vtrack)
      # Return one virtual track or a list of virtual tracks #
     return len(virtual_tracks) == 1 and virtual_tracks[0] or virtual_tracks
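The dispatcher above leans on is_list and is_gen helpers that the excerpt never defines; plausible sketches under the assumption that they do simple type checks:

import types

def is_list(value):
    # Track collections may arrive as either lists or tuples.
    return isinstance(value, (list, tuple))

def is_gen(value):
    # True for generator objects passed in a direct generator call.
    return isinstance(value, types.GeneratorType)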
Example #49
File: compare.py Project: bow/track
def benchmark_access(file):
    with track.load(str(file['new_path'])) as t:
        with Timer() as timer:
            data = t.read()
            for entry in data: pass
    return timer.total_time
Example #50
    def run(self, m1, p2, p1, m2, outputPrefix):
        # populate list of genes that show any uniqueness with regards to allelic ratios
        
        # make filehandles for each file
        fh_m1 = open(m1)

        # create GeneInfo objects
        for i in fh_m1:            
            # break line into array
            lineArray = i.strip().split("\t")
            
            currentGeneID = lineArray[3]
            currentChr = lineArray[0]

            # add gene to list if it does not yet exist            
            if not self._geneList.has_key(currentGeneID):
                geneInfoObj = GeneInfo(currentGeneID, currentChr)
                self._geneList[currentGeneID] = geneInfoObj

            # populate exon list
            currentStart = lineArray[1]
            currentEnd = lineArray[2] 
            self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd,
                                                        lineArray[5], 0, 0, 0)
        # close file handle
        fh_m1.close()
        
        fh_p2 = open(p2)
        for i in fh_p2:
            # break line into array
            lineArray = i.strip().split("\t")
            
            currentGeneID = lineArray[3]
            currentChr = lineArray[0]

            # add gene to list if it does not yet exist            
            if not self._geneList.has_key(currentGeneID):
                geneInfoObj = GeneInfo(currentGeneID, currentChr)
                self._geneList[currentGeneID] = geneInfoObj

            # populate exon list
            currentStart = lineArray[1]
            currentEnd = lineArray[2] 
            self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd,
                                                        0, lineArray[5], 0, 0)
        fh_p2.close()
 
        fh_p1 = open(p1)
        for i in fh_p1:
            # break line into array
            lineArray = i.strip().split("\t")
            
            currentGeneID = lineArray[3]
            currentChr = lineArray[0]

            # add gene to list if it does not yet exist            
            if not self._geneList.has_key(currentGeneID):
                geneInfoObj = GeneInfo(currentGeneID, currentChr)
                self._geneList[currentGeneID] = geneInfoObj

            # populate exon list
            currentStart = lineArray[1]
            currentEnd = lineArray[2] 
            self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd,
                                                        0, 0, lineArray[5], 0)
        fh_p1.close()
        
        fh_m2 = open(m2)
        for i in fh_m2:
            # break line into array
            lineArray = i.strip().split("\t")
            
            currentGeneID = lineArray[3]
            currentChr = lineArray[0]

            # add gene to list if it does not yet exist            
            if not self._geneList.has_key(currentGeneID):
                geneInfoObj = GeneInfo(currentGeneID, currentChr)
                self._geneList[currentGeneID] = geneInfoObj

            # populate exon list
            currentStart = lineArray[1]
            currentEnd = lineArray[2] 
            self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd,
                                                        0, 0, 0, lineArray[5])
        fh_m2.close()

        strain1_TEs = track.load(self._te1FileName)
        strain2_TEs = track.load(self._te2FileName)
        
        # stores all events to finally output
        # eventID is the geneName_TEtype_TEposition
        # eventID->fields->values
        events = {}
        
        # eventsFiltered = {}

        # genes with any evidence of allelic skew
        # look for TEs
        for i in self._geneList.keys():
            gene = self._geneList[i]
            coverageTotal = gene.getCoverage()
            if coverageTotal > self._coverageThreshold:
                ratio = gene.getTotalAllelicRatio()
                coverages = gene.getSummarizedCoveragesStr()
                
                # for each gene look with window size for TEs
                candidateTEs1 = strain1_TEs.read({'chr':gene.getChr(),
                                                  'start':(gene.getStart() - self._windowSize),
                                                  'end':(gene.getEnd() + self._windowSize)})
                
                # 'event_ID', 'TE_location', 'gene_location', 'distance', 'ratio', 'coverage_total', 'coverages'
                for event in candidateTEs1:
                    eventHash = {}
                    eventStart = event[0]
                    eventEnd = event[1]
                    eventType = event[2]
                    eventID = gene.getGeneID() + "_" + str(eventStart) + "_" + eventType
                    
                    eventHash["event_ID"] = eventID
                    eventHash["TE_name"] = eventType
                    eventHash["TE_location"] = gene.getChr() + ":" + str(eventStart) + "-" + str(eventEnd)
                    eventHash["gene_location"] = gene.getChr() + ":" + str(gene.getStart()) + "-" + str(gene.getEnd())
                    eventHash["distance"] = eventStart - gene.getStart()
                    eventHash["ratio"] = ratio
                    eventHash["coverage_total"] = coverageTotal
                    eventHash["coverages"] = coverages
                    eventHash["UCSC_gene"] = self._geneNames[gene.getGeneID()]
                    eventHash["exon_info"] = self._geneNames[gene.getExonInfo()]
                    
                    events[eventID] = eventHash
                
                candidateTEs2 = strain2_TEs.read({'chr':gene.getChr(),
                                                  'start':(gene.getStart() - self._windowSize),
                                                  'end':(gene.getEnd() + self._windowSize)})
                for event in candidateTEs2:
                    eventHash = {}
                    eventStart = event[0]
                    eventEnd = event[1]
                    eventType = event[2]
                    eventID = gene.getGeneID() + "_" + str(eventStart) + "_" + eventType
                    
                    eventHash["event_ID"] = eventID
                    eventHash["TE_name"] = eventType
                    eventHash["TE_location"] = gene.getChr() + ":" + str(eventStart) + "-" + str(eventEnd)
                    eventHash["gene_location"] = gene.getChr() + ":" + str(gene.getStart()) + "-" + str(gene.getEnd())
                    eventHash["distance"] = gene.getStart() - eventStart
                    eventHash["ratio"] = ratio
                    eventHash["coverage_total"] = coverageTotal
                    eventHash["coverages"] = coverages
                    eventHash["UCSC_gene"] = self._geneNames[gene.getGeneID()]
                    eventHash["exon_info"] = self._geneNames[gene.getExonInfo()]
                    
                    events[eventID] = eventHash                
        
        # output file
        out_fh = open(outputPrefix + '.tsv', 'w')
        out_fh.write(self._getHeader())
        
        for event in events.keys():
            out_fh.write(self._getLine(events[event]))
               
        # close file handles
        out_fh.close()
Example #51
 def __call__(self, *args, **kwargs):
     """Check that all arguments are present
        and load all tracks that are given as paths
        instead of track objects. Also checks for
        direct calls with generators."""
     # Initialization #
     generator_call   = False # Special switch for direct generator calls
     found_args       = {}    # Will contain a set of parameters extracted
     found_tracks     = {}    # Will contain a set of track parameters extracted
     found_generators = {}    # Will contain a set of FeatureStream
     extra_args       = {}    # Will contain a set of parameters computed
     all_tracks       = []    # Will contain all single tracks sent
     tracks_to_close  = []    # Will contain single tracks to close
     virtual_tracks   = []    # Will contain the results tracks
     rest_of_fields   = []    # Will contain variable output fields when required
     ### Parse arguments ###
     for p in self.input_args:
         if p['key'] in kwargs: value = kwargs[p['key']]
         elif len(args) >= p['position']: value = args[p['position']-1]
         elif 'default' in p: value = p['default']
         elif p.get('optional'): continue
         else: raise Exception("The argument '%s' is missing for the manipulation '%s'." \
                               % (p['key'], self.short_name))
         # Cast it if it's not the right type #
         if not isinstance(value, p['type']): value = p['type'](value)
         # Add it to the dict #
         found_args[p['key']] = value
     ### Parse tracks ###
     for t in self.input_tracks:
         if t['key'] in kwargs: value = kwargs[t['key']]
         elif len(args) >= t['position']: value = args[t['position']-1]
         elif 'default' in t: value = t['default']
         elif t.get('optional'): continue
         else: raise Exception("The argument '%s' is missing for the manipulation '%s'." \
                               % (t['key'], self.short_name))
         # Check is track collection #
         if t.get('kind') == 'many':
             if not is_list(value):
                 message = "The track collection '%s' for the manipulation '%s' is not a list: %s"
                 raise Exception(message % (t['key'], self.short_name, value))
         # Don't modify the input list #
         if t.get('kind') == 'many': value = value[:]
         # Check for generator case #
         if is_gen(value): generator_call = True
         if t.get('kind') == 'many' and is_gen(value[0]): generator_call = True
         if generator_call:
             found_tracks[t['key']] = value
             continue
         # Check is path #
         if isinstance(value, basestring):
             value = track.load(value, readonly=True)
             tracks_to_close.append(value)
         if t.get('kind') == 'many':
             for i,_ in enumerate(value):
                 if isinstance(value[i], basestring):
                     value[i] = track.load(value[i], readonly=True)
                     tracks_to_close.append(value[i])
         # Add to the list of all tracks #
         if t.get('kind') == 'many': all_tracks += [x for x in value]
         else:                       all_tracks += [value]
         # Track collection must be combined #
         if t.get('kind') == 'many':
             value = TrackCollection(value, self.fields_collapse, self.chroms_collapse)
         # Variable fields case (track collection must collapse fields) #
         if t['fields'][-1] == '...':
             first_fields = t['fields'][:-1]
             rest_of_fields = [f for f in value.fields if f not in first_fields]
             value.fields = first_fields + rest_of_fields
         # Specific fields case #
         else: value.fields = t['fields']
         # What about track SimpleTrack case #
         pass #TODO
         # Add it to the dict #
         found_tracks[t['key']] = value
     # Check for generator case #
     if generator_call: return self.from_generator(found_tracks, found_args, args, kwargs)
     # Collapse chromosomes #
     if not self.chroms_collapse: chromosomes = all_tracks[0].chromosomes
     else: chromosomes = collapse(self.chroms_collapse, [t.chromosomes for t in all_tracks])
     # Multiple output tracks disabled #
     t = self.output_tracks[0]
     # Make a new virtual track #
     vtrack = VirtualTrack()
     # Output chromosome metadata #
     for chrom in chromosomes:
         vtrack.chrmeta[chrom] = {'length': max([i.chrmeta[chrom]['length'] for i in all_tracks])}
     # Output attributes #
     if t.get('datatype'): vtrack.datatype = t['datatype']
     # Output name #
     vtrack.name = self.long_name + ' on ' + andify_strings([i.name for i in all_tracks])
     ### Iterate on chromosomes ###
     for chrom in chromosomes:
         # Get special input arguments #
         for p in self.input_meta:
             if p['kind'] == 'chrom_len':
                 extra_args[p['key']] = vtrack.chrmeta[chrom]['length']
         # Call read on tracks #
         for k,input_track in found_tracks.items():
             if is_list(input_track): found_generators[k] = [i.read(chrom) for i in input_track]
             else:                    found_generators[k] = input_track.read(chrom)
         # What about track collapse and recursion #
         pass #TODO
         # Final argument list #
         final_args = {}
         for d in (found_args, found_generators, extra_args): final_args.update(d)
         # Call generate #
         data = self.generate(**final_args)
         # Variable fields case #
         if t['fields'][-1] == '...': fields = t['fields'][:-1] + rest_of_fields
         else:                        fields = t['fields']
         # Make a FeatureStream #
         stream = FeatureStream(data, fields)
         # Add it to the virtual track #
         vtrack.write(chrom, stream)
     # Close tracks later #
     vtrack.tracks_to_close = tracks_to_close
     # Add it #
     virtual_tracks.append(vtrack)
      # Return one virtual track or a list of virtual tracks #
     return len(virtual_tracks) == 1 and virtual_tracks[0] or virtual_tracks
Example #52
File: url.py Project: bow/track
 def runTest(self):
     url = "http://salt.epfl.ch/BED/sinclair/genomic/ChIP.bedGraph"
     with track.load(url) as t:
         got = t.count('chrY')
         expected = 577
     self.assertEqual(got, expected)