def jsonify(database_path, name, sha1, output_root_directory, public_url, browser_url, extended=False):
    '''
    Make a JSON representation of the database.
    @param database_path : the path to the sqlite database
    @param name : the name of the track
    @param sha1 : the sha1 sum of the file
    @param public_url : the base url where the file can be fetched from an external request
    @param browser_url : the base url where the file can be fetched from an internal request
    @param output_root_directory : the base system path where to write the output
    @param extended : whether the format is ``basic`` or ``extended``
    '''
    # Configure outputs.
    output_path = os.path.join(output_root_directory, sha1)
    out_public_url = os.path.join(public_url, sha1)
    out_browser_url = os.path.join(browser_url, sha1)
    os.mkdir(output_path)
    with track.load(database_path, 'sql', readonly=False) as t:
        for chr_name in t:
            chr_length = t.chrmeta[chr_name]['length']
            out = os.path.join(output_path, chr_name)
            os.mkdir(out)
            lazy_url = os.path.join(out_browser_url, chr_name, 'lazyfeatures-{chunk}.json')
            _jsonify(t, name, chr_length, chr_name,
                     os.path.join(out_public_url, chr_name), lazy_url, out, extended)
    return 1
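# A minimal usage sketch for the jsonify() helper above. Every path, URL, name and
# sha1 value is a hypothetical placeholder, not a value from the source; the sqlite
# track and the output root directory are assumed to already exist.
if __name__ == '__main__':
    jsonify('/data/tracks/sample.sql',
            name='Sample track',
            sha1='0123456789abcdef',
            output_root_directory='/var/www/json',
            public_url='http://example.org/json',
            browser_url='http://localhost/json',
            extended=False)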
def runTest(self):
    in_path = samples['small_features'][5]['sql']
    with track.load(in_path) as t:
        data = t.get_partial_score_vector('chrI', 10, 30)
        got = list(data)
        expected = [0.0] * 4 + [1.0] * 5 + [0.0] * 8 + [1.0] * 3
        self.assertEqual(got, expected)
def runTest(self):
    in_path = samples['small_features'][5]['sql']
    with track.load(in_path) as t:
        data = t.read('chrI', ('start', 'end', 'name', 'score'))
        got = tuple(data.next())
        expected = (14, 19, u'', 0.0)
        self.assertEqual(got, expected)
def setTrackChrGenes(**kwargs):
    fname = kwargs.get('fname', mousefile)
    num = kwargs.get('num', 1)
    t = track.load(fname)
    chromosome_data = t.read('chr{0}'.format(num))
    rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
    return rows
def search(self, project_id, term, *args, **kw):
    project = DBSession.query(Project).filter(Project.id == project_id).first()
    sequence = project.sequence
    default = sequence.default_tracks
    if default is None or len(default) < 1:
        return {}
    t = default[0]
    chrs = {}
    with track.load(t.path, 'sql', readonly=True) as t:
        gene_name_alias = t.find_column_name(['name', 'gene_name', 'gene name', 'gname', 'Name', 'product'])
        try:
            for row in t.search({gene_name_alias: term}, [gene_name_alias, 'start', 'end']):
                chr, name, start, stop = row
                if chr not in chrs:
                    chrs[chr] = {}
                names = chrs[chr]
                if name in names:
                    old = names[name]
                    start = min(old[0], start)
                    stop = max(old[1], stop)
                names[name] = [start, stop]
        except Exception:
            return {}
    # result[chr].append([name, start, stop])
    result = {}
    for chr, names in chrs.iteritems():
        result[chr] = []
        for k, v in names.iteritems():
            result[chr].append([k, v[0], v[1]])
    return result
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    with track.load(in_path) as t:
        data = t.search({'start': '2', 'name': 'feature'}, exact_match=True)
        got = list(data)
        expected = []
        self.assertEqual(got, expected)
def benchmark_access(file):
    with track.load(str(file['new_path'])) as t:
        with Timer() as timer:
            data = t.read()
            for entry in data:
                pass
    return timer.total_time
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    with track.load(in_path) as t:
        data = t.get_partial_score_vector('chrI', 0, 50)
        got = list(data)
        expected = [10.0] * 10 + [0.0] * 10 + [10.0] * 10 + [0.0] * 15 + [10.0] * 5
        self.assertEqual(got, expected)
def runTest(self):
    in_path = samples['small_features'][4]['sql']
    with track.load(in_path, readonly=True) as t:
        t.chrmeta = {}
        data = t.get_full_score_vector('chrI')
        got = list(data)
        expected = [0.0] * 10 + [1.0] * 10 + [0.0] * 10 + [2.0] * 10
        self.assertEqual(got, expected)
def runTest(self):
    in_path = samples['small_signals'][7]['sql']
    out_path = temporary_path('.sql')
    with track.load(in_path) as i:
        with track.new(out_path) as o:
            for chrom in i:
                o.write(chrom, i.read(chrom))
            self.assertEqual(list(o.read('chrI')), list(i.read('chrI')))
    os.remove(out_path)
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    out_path = temporary_path('.bed')
    with track.load(in_path) as t:
        result = complement(t)
        result.export(out_path)
    with track.load(out_path) as t:
        data = t.read('chrI')
        got = map(tuple, data)
        expected = [(10, 20), (30, 40), (50, 60), (80, 90), (110, 120), (135, 230208)]
        self.assertEqual(got, expected)
    os.remove(out_path)
def runTest(self):
    in_path = samples['small_features'][4]['sql']
    with track.load(in_path) as t:
        data = t.get_full_score_vector('chrI')
        got = list(data)
        expected = [0.0] * 10 + [1.0] * 10 + [0.0] * 10 + [2.0] * 10 + [0.0] * 230168
        self.assertEqual(got, expected)
def runTest(self):
    in_path = samples['gzip_tracks'][4]['gzip']
    with track.load(in_path) as t:
        data = t.read()
        got = list(data)
        expected = [('chr1', 10, 20, u'Lorem', 1.0, 1),
                    ('chr1', 30, 40, u'Ipsum', 2.0, 1),
                    ('chr2', 10, 20, u'Dolor', 3.0, 1)]
        self.assertEqual(got, expected)
def _get_annotation_from_bed(self, fname, offset):
    """Reads intervals from a BED file."""
    try:
        with track.load(fname) as ann:
            ann = ann.read(fields=['start', 'end'])
            intervals = self._intervals_to_interval_map(ann, offset)
    except Exception:
        intervals = self._intervals_to_interval_map([], 0)
    return intervals
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    with track.load(in_path) as t:
        data = t.search({'start': '2', 'name': 'feature'})
        got = [tuple(x) for x in data]
        expected = [(u'chrI', 2, 8, u'Validation feature 2', 0.0),
                    (u'chrI', 20, 30, u'Validation feature 3', 10.0),
                    (u'chrI', 25, 30, u'Validation feature 4', 0.0),
                    (u'chrI', 120, 130, u'Validation feature 11', 10.0),
                    (u'chrI', 125, 135, u'Validation feature 12', 5.0)]
        self.assertEqual(got, expected)
def runTest(self):
    in_path = samples['small_signals'][4]['sql']
    out_path = temporary_path('.sql')
    t = threshold(in_path, 8000.0)
    t.export(out_path)
    with track.load(out_path) as t:
        data = t.read('chrI')
        got = map(tuple, data)
        expected = [(120, 122, 9000.0)]
        self.assertEqual(got, expected)
    os.remove(out_path)
def ucsc_geneid_fix(in_gtf, out_gtf, remote=None, local=None):
    """Updates 'gene_id' entries in GTF files downloaded from the UCSC Table
    Browser so that they contain gene IDs instead of transcript IDs.

    If the output GTF file already exists, it will be overwritten.

    :param in_gtf: path to input GTF file
    :type in_gtf: str
    :param out_gtf: path to output GTF file
    :type out_gtf: str
    :param remote: UCSC database and annotation source to use
    :type remote: dict('db': str, 'annot': str)
    :param local: name of a two-column file containing the transcript-gene
                  mapping, used only when `remote` is None
    :type local: str
    :returns: None

    """
    if remote is None:
        # Without a remote source, a local mapping file must be given.
        if local is None:
            raise ValueError("Missing `remote` or `local` arguments")
        mapping = get_local_transcript_gene_mapping(local)
    else:
        # A remote source and a local mapping file are mutually exclusive.
        if local is not None:
            raise ValueError("Only supply `remote` or `local` argument, "
                             "not both.")
        # The remote source must name both the database and the annotation source.
        if "db" not in remote:
            raise ValueError("Missing remote database name")
        if "annot" not in remote:
            raise ValueError("Missing remote annotation source name")
        db = remote["db"]
        annot = remote["annot"]
        if annot not in QUERIES:
            raise ValueError("Invalid annotation source "
                             "name: {0}".format(annot))
        mapping = get_ucsc_transcript_gene_mapping(annot, db, cred=CRED)
    # Remove the output file if it already exists.
    if os.path.exists(out_gtf):
        os.remove(out_gtf)
    with track.load(in_gtf, readonly=True) as in_track, \
            track.new(out_gtf, format="gtf") as out_track:
        # Since GTF has custom fields, the output track must use the input
        # track's fields.
        out_track.fields = in_track.fields
        for chrom in in_track.chromosomes:
            chrom_rec = in_track.read(chrom)
            out_track.write(chrom, update_gene_id_attr(chrom_rec, mapping))
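# A minimal usage sketch for ucsc_geneid_fix() above. The file names and the
# remote 'db'/'annot' values are hypothetical placeholders; valid annotation
# source names are whatever keys QUERIES defines in the surrounding module.
if __name__ == '__main__':
    # Remote mode: fetch the transcript-to-gene mapping from UCSC.
    ucsc_geneid_fix('refseq_hg19.gtf', 'refseq_hg19.fixed.gtf',
                    remote={'db': 'hg19', 'annot': 'refGene'})
    # Local mode: use a two-column transcript/gene mapping file instead.
    ucsc_geneid_fix('refseq_hg19.gtf', 'refseq_hg19.fixed.gtf',
                    local='transcript_gene_map.txt')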
def setTrackChrPromoters(**kwargs):
    fname = kwargs.get('fname', mousefile)
    num = kwargs.get('num', 1)
    t = track.load(fname)
    chromosome_data = t.read('chr{0}'.format(num))
    rows = [dict(zip(r.keys(), r.data)) for r in iter(chromosome_data)]
    fwd_genes = [e for e in rows if e['strand'] == 1]
    fwd_starts = dict([(e['name'], e['start']) for e in fwd_genes])
    fwd_promoters = dict([(k, [v - 2000, v - 100]) for k, v in fwd_starts.iteritems()])
    return fwd_promoters
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    out_path = temporary_path('.sql')
    chrom = 'chrI'
    with track.load(in_path) as i:
        with track.new(out_path) as o:
            o.fields = track.default_fields
            o.write(chrom, i.read(chrom, ('start', 'end')))
            got = tuple(o.read(chrom).next())
            expected = (0, 10, None, None, None)
            self.assertEqual(got, expected)
    os.remove(out_path)
def runTest(self):
    x_path = samples['small_signals'][4]['sql']
    y_path = samples['small_features'][4]['sql']
    out_path = temporary_path('.sql')
    t = mean_score_by_feature(x_path, y_path)
    t.export(out_path)
    with track.load(out_path) as t:
        data = t.read('chrI')
        got = map(tuple, data)
        expected = [(10, 20, 15.0, u'Lorem', 1),
                    (30, 40, 50.0, u'Ipsum', 1)]
        self.assertEqual(got, expected)
    os.remove(out_path)
def tosql(fileinfo, seq_name):
    """Transform an input file into an SQL one."""
    debug('Tosql', 3)
    track.convert(fileinfo.extension and (fileinfo.paths['upload_to'], fileinfo.extension)
                  or fileinfo.paths['upload_to'],
                  fileinfo.paths['store'])
    with track.load(fileinfo.paths['store'], 'sql', readonly=False) as t:
        t.assembly = seq_name
    # debug('tosql : btrack.convert("%s", "%s", chrmeta="%s")' % (fileinfo.paths['upload_to'], fileinfo.paths['store'], seq_name), 3)
    # btrack.convert(fileinfo.paths['upload_to'], fileinfo.paths['store'], chrmeta=seq_name)
    fileinfo.states['instore'] = True
    debug('done', 4)
    return fileinfo
def parse(self):
    # Core function #
    def read_whole_track(t):
        self.handler.defineFields(t.fields)
        self.handler.newTrack(t.info)
        if t.info.get('assembly'):
            self.handler.defineAssembly(t.info.get('assembly'))
        else:
            self.handler.defineChrmeta(t.chrmeta)
        for chrom in t:
            for feature in t.read(chrom):
                self.handler.newFeature(chrom, feature)
    # Check param type #
    if isinstance(self.path, Track):
        read_whole_track(self.path)
    else:
        with load(self.path, 'sql', readonly=True) as t:
            read_whole_track(t)
def runTest(self):
    for num, info in sorted(samples['small_features'].items()):
        # Prepare paths #
        orig_bed_path = info['bed']
        orig_sql_path = info['sql']
        test_sql_path = temporary_path('.sql')
        test_bed_path = temporary_path('.bed')
        # From BED to SQL #
        track.convert(orig_bed_path, test_sql_path, assembly='sacCer2')
        self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
        # From SQL to BED #
        with track.load(test_sql_path) as t:
            t.roman_to_integer()
        track.convert(test_sql_path, test_bed_path)
        self.assertTrue(assert_file_equal(orig_bed_path, test_bed_path, start_b=1))
        # Clean up #
        os.remove(test_sql_path)
        os.remove(test_bed_path)
def runTest(self):
    orig_path = samples['small_features'][2]['sql']
    test_path = temporary_path('.sql')
    shutil.copy(orig_path, test_path)
    with track.load(test_path) as t:
        t.delete_fields(['name', 'strand'])
        got = list(t.read())
        expected = [('chrI', 10, 20, 0.1),
                    ('chrI', 30, 40, 0.2),
                    ('chrI', 50, 60, 0.1),
                    ('chrI', 70, 80, 0.2),
                    ('chrI', 90, 100, 0.0),
                    ('chrI', 110, 120, 0.4),
                    ('chrI', 130, 150, 0.4),
                    ('chrI', 180, 190, 0.1),
                    ('chrI', 180, 200, 0.1),
                    ('chrI', 210, 220, 0.2),
                    ('chrI', 230, 240, 0.1),
                    ('chrI', 250, 260, 0.2),
                    ('chrI', 270, 280, 0.0),
                    ('chrI', 290, 300, 0.7)]
        self.assertEqual(got, expected)
    # Clean up #
    os.remove(test_path)
def runTest(self):
    in_path = samples['small_features'][1]['sql']
    with track.load(in_path) as t:
        data = t.read()
        got = list(data)
        expected = [('chrI', 0, 10, u'Validation feature 1', 10.0),
                    ('chrI', 2, 8, u'Validation feature 2', 0.0),
                    ('chrI', 20, 30, u'Validation feature 3', 10.0),
                    ('chrI', 25, 30, u'Validation feature 4', 0.0),
                    ('chrI', 40, 45, u'Validation feature 5', 0.0),
                    ('chrI', 40, 50, u'Validation feature 6', 10.0),
                    ('chrI', 60, 70, u'Validation feature 7', 10.0),
                    ('chrI', 70, 80, u'Validation feature 8', 10.0),
                    ('chrI', 90, 100, u'Validation feature 9', 10.0),
                    ('chrI', 90, 110, u'Validation feature 10', 10.0),
                    ('chrI', 120, 130, u'Validation feature 11', 10.0),
                    ('chrI', 125, 135, u'Validation feature 12', 5.0)]
        self.assertEqual(got, expected)
def runTest(self):
    for num, info in sorted(samples['gff_tracks'].items()):
        # Prepare paths #
        orig_gff_path = info['gff']
        orig_sql_path = info['sql']
        test_sql_path = temporary_path('.sql')
        test_gff_path = temporary_path('.gff')
        # From GFF to SQL #
        track.convert(orig_gff_path, test_sql_path, assembly='sacCer2')
        self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
        # From SQL to GFF #
        with track.load(test_sql_path) as t:
            t.roman_to_integer()
        track.convert(test_sql_path, test_gff_path)
        self.assertTrue(assert_file_equal(orig_gff_path, test_gff_path, start_a=1, start_b=1))
        # Clean up #
        os.remove(test_sql_path)
        os.remove(test_gff_path)
def runTest(self):
    for num, info in sorted(samples['small_signals'].items()):
        # Some files cannot be round-tripped #
        if num == 3 or num == 7:
            continue
        # Prepare paths #
        orig_wig_path = info['wig']
        orig_sql_path = info['sql']
        test_sql_path = temporary_path('.sql')
        test_wig_path = temporary_path('.wig')
        # From WIG to SQL #
        track.convert(orig_wig_path, test_sql_path, assembly='sacCer2')
        self.assertTrue(assert_file_equal(orig_sql_path, test_sql_path))
        # From SQL to WIG #
        with track.load(test_sql_path) as t:
            t.roman_to_integer()
        track.convert(test_sql_path, test_wig_path)
        self.assertTrue(assert_file_equal(orig_wig_path, test_wig_path, start_b=1))
        # Clean up #
        os.remove(test_sql_path)
        os.remove(test_wig_path)
def runTest(self):
    in_path = temporary_path('.sql')
    out_path = temporary_path('.sql')
    with track.new(in_path) as t:
        t.fields = ('start', 'end', 'score')
        t.assembly = 'sacCer2'
        t.write('chrI', [(0, 2, 10), (2, 4, 20), (6, 8, 10)])
        result = window_smoothing(t, 2)
        result.export(out_path)
    with track.load(out_path) as t:
        data = t.read('chrI')
        got = map(tuple, data)
        expected = [(0, 1, 8.0), (1, 3, 12.0), (3, 5, 10.0), (5, 6, 8.0), (6, 9, 4.0), (9, 10, 2.0)]
        self.assertEqual(got, expected)
    os.remove(in_path)
    os.remove(out_path)
def pre_compute_sql_scores(database_path, sha1, output_dir):
    '''
    Pre-compute scores for a quantitative database.
    @param database_path : the path to the database
    @param sha1 : the sha1 sum hexdigest of the database
    @param output_dir : where the output files will be written
    '''
    out_path = os.path.join(output_dir, sha1)
    try:
        os.mkdir(out_path)
    except OSError:
        # The output directory may already exist.
        pass
    with track.load(database_path, format='sql', readonly=True) as t:
        for chromosome in t:
            max = get_last_feature_stop(t, chromosome)
            if max is not None:
                # Build the score array for this chromosome.
                array = generate_array(t.read(chromosome, ('start', 'end', 'score')), max, 100000)
                # Write one output database per zoom level.
                for zoom in zooms:
                    gen = gen_tuples(array, max, zoom)
                    output = os.path.join(out_path, '%s_%s.db' % (chromosome, zoom))
                    out_connection = sqlite3.connect(output)
                    write_tuples(out_connection, gen)
    return 1
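# A minimal usage sketch for pre_compute_sql_scores() above. The database path,
# sha1 digest and output directory are hypothetical placeholders; `zooms` and the
# helper functions are assumed to come from the surrounding module.
if __name__ == '__main__':
    pre_compute_sql_scores('/data/tracks/signal.sql',
                           sha1='0123456789abcdef',
                           output_dir='/var/cache/scores')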
def squish_bed(in_file, out_file):
    """Removes all overlapping regions in the input BED file, writing to the
    output BED file.

    :param in_file: path to input BED file
    :type in_file: str
    :param out_file: path to output BED file
    :type out_file: str

    """
    # Check for input file presence; remove the output file if it already exists.
    assert os.path.exists(in_file), 'Required input file {0} does not ' \
        'exist'.format(in_file)
    if os.path.exists(out_file):
        os.unlink(out_file)
    with track.load(in_file, readonly=True) as in_track, \
            track.new(out_file, format='bed') as out_track:
        for chrom in in_track.chromosomes:
            chrom_rec = in_track.read(chrom)
            out_track.write(chrom, squish_track_records(chrom_rec))
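# A minimal usage sketch for squish_bed() above; both file names are hypothetical
# placeholders. The input BED must exist, and any previous output file is removed
# before the squished track is written.
if __name__ == '__main__':
    squish_bed('peaks.bed', 'peaks.squished.bed')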
def runTest(self):
    in_paths = [samples['small_signals'][1]['sql'],
                samples['small_signals'][2]['sql'],
                samples['small_signals'][3]['sql']]
    out_path = temporary_path('.sql')
    t = merge_scores(in_paths)
    t.export(out_path)
    with track.load(out_path) as t:
        data = t.read('chrI')
        got = map(tuple, data)
        expected = [(  0,   5, 2.0 + 0.6666666666666666),
                    (  5,  10, 4.0),
                    ( 20,  30, 10.0),
                    ( 30,  40, 30.0),
                    ( 40,  50, 26.0 + 0.666666666666666),
                    ( 50,  60, 120.0),
                    ( 60,  68, 100.0),
                    ( 68,  70, 200.0),
                    ( 70,  80, 100.0),
                    ( 90, 110, 3.0),
                    (120, 130, 10.0)]
        self.assertEqual(got, expected)
    os.remove(out_path)
def process_csv(neuropil, dataset, cluster_type):
    analysis_time = datetime.datetime.now().isoformat()
    WITH_CHROMOSOME_LOCATION = True
    if WITH_CHROMOSOME_LOCATION:
        import track  # install with "pip install track" (see http://bbcf.epfl.ch/bbcflib/tutorial_track.html)
        if dataset == 'T1':
            dataset_track = track.load(os.path.join(BRAINCODE_PACKAGE_DIR, 'VTs.bed'))
        elif dataset == 'CB1':
            dataset_track = track.load(os.path.join(BRAINCODE_PACKAGE_DIR, 'janeliaTiles.bed'))
        else:
            raise ValueError('unknown dataset %r' % dataset)
    filenames = get_filenames(dataset, neuropil, cluster_type)
    hs_name_raw = filenames['fragment_info_raw_csv']
    pre_csv_fname = get_fragment_pre_csv_fname(dataset=dataset, region=neuropil, cluster_type=cluster_type)
    original_metadata = get_csv_metadata(pre_csv_fname)
    if os.path.exists(hs_name_raw):
        derived_metadata = get_csv_metadata(hs_name_raw)
        if original_metadata['analysis_time_parsed'] < derived_metadata['analysis_time_parsed']:
            print('output %r exists but is newer, so not rewriting. skipping.' % hs_name_raw)
            return
        else:
            print('output %r exists but is older than input, so rewriting.' % hs_name_raw)
    ids_fname = filenames['id_driver_image_csv']
    print('reading %r' % ids_fname)
    id_driver_image_df = pd.read_csv(ids_fname, sep=';')
    driver_id_to_driver_name = {}
    for i, driver_image_row in id_driver_image_df.iterrows():
        driver_id_to_driver_name[driver_image_row['id']] = driver_image_row['driver']
    print('reading %r' % pre_csv_fname)
    qq = pd.read_csv(pre_csv_fname, low_memory=False, comment='#')
    print('computing statistics')
    qq['expressing elsewhere_in_region'] = qq['expressing region'] - qq['expressing cluster']
    qq['total elsewhere_in_region'] = qq['total region'] - qq['total cluster']
    qq['fraction cluster'] = qq['expressing cluster'] / qq['total cluster']
    qq['observed'] = qq['expressing cluster']
    if 0:
        # expected value is expression everywhere
        qq['expected freq'] = qq['expressing region'] / qq['total region']
    else:
        # expected value is expression elsewhere
        qq['expected freq'] = qq['expressing elsewhere_in_region'] / qq['total elsewhere_in_region']
    qq['expected'] = qq['expected freq'] * qq['total cluster']
    qq['fold enrichment'] = qq['observed'] / qq['expected']
    qq['chi sq'] = (qq['observed'] - qq['expected']) ** 2 / qq['expected']
    qq['chi sq p'] = scipy.stats.chisqprob(qq['chi sq'].values, df=1)
    if 1:
        hypergeometric_p = []
        for i, qq_row in qq.iterrows():
            if i % 10000 == 0:
                print('%d of %d' % (i, len(qq)))
            # Variable names correspond to the scipy docs at
            # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html
            # Comment names correspond to the wikipedia entry at
            # https://en.wikipedia.org/wiki/Hypergeometric_distribution
            M = qq_row['total region']        # wikipedia's N
            n = qq_row['expressing region']   # wikipedia's K
            N = qq_row['total cluster']       # wikipedia's n
            x = qq_row['expressing cluster']  # wikipedia's k
            rv = scipy.stats.hypergeom(M, n, N)
            hypergeometric_p.append(rv.pmf(x))
        qq['hypergeometric p'] = hypergeometric_p
    qq['driver name'] = qq.apply(lambda row: driver_id_to_driver_name[row['driver_id']], axis=1)
    print('done computing statistics')
    print('saving all %d rows' % (len(qq),))
    column_renames = collections.OrderedDict([
        # (old_name, new_name),
        ('cluster_id', 'cluster_id'),
        ('fraction cluster', 'fraction'),
        ('fold enrichment', 'fold enrichment'),
        ('driver name', 'driver name'),
        ('driver_id', 'driver_id'),
        ('hypergeometric p', 'hypergeometric p'),
        ('chi sq p', 'chi sq p'),
        ('expressing cluster', 'num positive voxels in cluster'),
        ('total cluster', 'num voxels in cluster'),
        ('expressing region', 'num positive voxels in region'),
        ('total region', 'num voxels in region'),
    ])
    qq = qq[[old_name for old_name in column_renames]]  # drop unused columns
    qq = qq.rename(columns=column_renames)  # rename columns
    hs_sorted = qq.sort(['cluster_id', 'fraction'], ascending=[True, False])
    if WITH_CHROMOSOME_LOCATION:
        chrom = []
        chromStart = []
        chromEnd = []
        chromStrand = []
        ucsc_urls = []
        cached_sdata = {}
        for i, hs_row in hs_sorted.iterrows():
            if dataset == 'T1':
                num = vt_name_to_num(hs_row['driver name'])
                name = 'VT%04d' % num
            else:
                assert dataset == 'CB1'
                name = hs_row['driver name']
            sdata = cached_sdata.get(name, None)
            if sdata is None:
                # cache miss
                sdata = [r for r in dataset_track.search({'name': name}, exact_match=True)]
                cached_sdata[name] = sdata
                if len(sdata) == 0:
                    print("no entry for %r" % hs_row['driver name'])
            if len(sdata) == 0:
                chrom.append(None)
                chromStart.append(None)
                chromEnd.append(None)
                chromStrand.append(None)
                ucsc_urls.append(None)
            else:
                if len(sdata) > 1:
                    print('sdata')
                    print(sdata)
                    raise RuntimeError("more than one entry for %r" % hs_row['driver name'])
                trackrow = sdata[0]
                chrom.append(trackrow[0])
                chromStart.append(trackrow[1])
                chromEnd.append(trackrow[2])
                chromStrand.append(trackrow[5])
                ucsc_urls.append('http://genome.ucsc.edu/cgi-bin/hgTracks?db=dm3&position=' +
                                 chrom[-1] + '%3A' + str(chromStart[-1]) + '-' + str(chromEnd[-1]))
        hs_sorted['chrom'] = chrom
        hs_sorted['chromStart'] = np.array(chromStart, dtype=object)  # don't let Pandas convert to float
        hs_sorted['chromEnd'] = np.array(chromEnd, dtype=object)      # don't let Pandas convert to float
        hs_sorted['chromStrand'] = chromStrand
        hs_sorted['UCSC Genome Browser URL'] = ucsc_urls
    if dataset == 'T1':
        hs_sorted['bbweb URL'] = hs_sorted.apply(lambda row: vt_name_to_bbweb_url(row['driver name']), axis=1)
        hs_sorted['VDRC URL'] = hs_sorted.apply(lambda row: vt_name_to_vdrc_url(row['driver name']), axis=1)
    else:
        assert dataset == 'CB1'
        hs_sorted['FlyLight URL'] = hs_sorted.apply(lambda row: janelia_name_to_flylight_url(row['driver name']), axis=1)
    buf = StringIO()
    metadata = {
        'analysis_time': analysis_time,
        'url': 'https://strawlab.org/braincode',
        'neuropil': neuropil,
        'dataset': dataset,
        'cluster_type': cluster_type,
    }
    comment_line = '# ' + json.dumps(metadata) + '\n'
    buf.write(comment_line)
    hs_sorted.to_csv(buf, index=False)
    with open(hs_name_raw, mode='w') as fd:
        fd.write(buf.getvalue())
    print('saved to %r' % hs_name_raw)
# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1913601_0m_rep3.bedGraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/ATAC_0m_rep3.sql')
# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1913605_120m_rep3.bedGraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/ATAC_120m_rep3.sql')
# track.convert('/scratch/PI/mcovert/dvanva/bedfiles/GSM1645121_RelA-2_120.bedgraph', '/scratch/PI/mcovert/dvanva/bedfiles/sql/RelA_120m_rep2.sql')

"""
Plot tracks around Ccl3 and Ccl4
"""
plt.clf()
fig, axes = plt.subplots(10, 1, figsize=(6, 1 * 10))

start_pos = 83460000
end_pos = 83480000

# p300 UT peaks
with track.load('/scratch/PI/mcovert/dvanva/bedfiles/sql/p300.UT.peaks.sql') as t:
    data = t.read({'chr': 'chr11', 'start': 83460000, 'end': 83480000})
    for peak in data:
        axes[0].bar(peak[0], 1, width=peak[1] - peak[0], color='k')
axes[0].set_xlim([start_pos, end_pos])
axes[0].set_title('p300 Untreated')
axes[0].set_xticks([])
axes[0].set_yticks([])

# p300 LPS 2h peaks
with track.load('/scratch/PI/mcovert/dvanva/bedfiles/sql/p300.LPS_2h.peaks.sql') as t:
    data = t.read({'chr': 'chr11', 'start': 83460000, 'end': 83480000})
    for peak in data:
        axes[1].bar(peak[0], 1, width=peak[1] - peak[0], color='k')
axes[1].set_xlim([start_pos, end_pos])
def __call__(self, *args, **kwargs):
    """Check that all arguments are present and load all tracks that are
    given as paths instead of track objects. Also checks for direct calls
    with generators."""
    # Initialization #
    generator_call   = False  # Special switch for direct generator calls
    found_args       = {}     # Will contain a set of parameters extracted
    found_tracks     = {}     # Will contain a set of track parameters extracted
    found_generators = {}     # Will contain a set of FeatureStream
    extra_args       = {}     # Will contain a set of parameters computed
    all_tracks       = []     # Will contain all single tracks sent
    tracks_to_close  = []     # Will contain single tracks to close
    virtual_tracks   = []     # Will contain the result tracks
    rest_of_fields   = []     # Will contain variable output fields when required
    ### Parse arguments ###
    for p in self.input_args:
        if p['key'] in kwargs:
            value = kwargs[p['key']]
        elif len(args) >= p['position']:
            value = args[p['position'] - 1]
        elif 'default' in p:
            value = p['default']
        elif p.get('optional'):
            continue
        else:
            raise Exception("The argument '%s' is missing for the manipulation '%s'."
                            % (p['key'], self.short_name))
        # Cast it if it's not the right type #
        if not isinstance(value, p['type']):
            value = p['type'](value)
        # Add it to the dict #
        found_args[p['key']] = value
    ### Parse tracks ###
    for t in self.input_tracks:
        if t['key'] in kwargs:
            value = kwargs[t['key']]
        elif len(args) >= t['position']:
            value = args[t['position'] - 1]
        elif 'default' in t:
            value = t['default']
        elif t.get('optional'):
            continue
        else:
            raise Exception("The argument '%s' is missing for the manipulation '%s'."
                            % (t['key'], self.short_name))
        # Check it is a track collection #
        if t.get('kind') == 'many':
            if not is_list(value):
                message = "The track collection '%s' for the manipulation '%s' is not a list: %s"
                raise Exception(message % (t['key'], self.short_name, value))
        # Don't modify the input list #
        if t.get('kind') == 'many':
            value = value[:]
        # Check for generator case #
        if is_gen(value):
            generator_call = True
        if t.get('kind') == 'many' and is_gen(value[0]):
            generator_call = True
        if generator_call:
            found_tracks[t['key']] = value
            continue
        # Check if it is a path #
        if isinstance(value, basestring):
            value = track.load(value, readonly=True)
            tracks_to_close.append(value)
        if t.get('kind') == 'many':
            for i, _ in enumerate(value):
                if isinstance(value[i], basestring):
                    value[i] = track.load(value[i], readonly=True)
                    tracks_to_close.append(value[i])
        # Add to the list of all tracks #
        if t.get('kind') == 'many':
            all_tracks += [x for x in value]
        else:
            all_tracks += [value]
        # Track collection must be combined #
        if t.get('kind') == 'many':
            value = TrackCollection(value, self.fields_collapse, self.chroms_collapse)
        # Variable fields case (track collection must collapse fields) #
        if t['fields'][-1] == '...':
            first_fields = t['fields'][:-1]
            rest_of_fields = [f for f in value.fields if f not in first_fields]
            value.fields = first_fields + rest_of_fields
        # Specific fields case #
        else:
            value.fields = t['fields']
        # What about the track SimpleTrack case #
        pass  # TODO
        # Add it to the dict #
        found_tracks[t['key']] = value
    # Check for generator case #
    if generator_call:
        return self.from_generator(found_tracks, found_args, args, kwargs)
    # Collapse chromosomes #
    if not self.chroms_collapse:
        chromosomes = all_tracks[0].chromosomes
    else:
        chromosomes = collapse(self.chroms_collapse, [t.chromosomes for t in all_tracks])
    # Multiple output tracks disabled #
    t = self.output_tracks[0]
    # Make a new virtual track #
    vtrack = VirtualTrack()
    # Output chromosome metadata #
    for chrom in chromosomes:
        vtrack.chrmeta[chrom] = {'length': max([i.chrmeta[chrom]['length'] for i in all_tracks])}
    # Output attributes #
    if t.get('datatype'):
        vtrack.datatype = t['datatype']
    # Output name #
    vtrack.name = self.long_name + ' on ' + andify_strings([i.name for i in all_tracks])
    ### Iterate on chromosomes ###
    for chrom in chromosomes:
        # Get special input arguments #
        for p in self.input_meta:
            if p['kind'] == 'chrom_len':
                extra_args[p['key']] = vtrack.chrmeta[chrom]['length']
        # Call read on tracks #
        for k, input_track in found_tracks.items():
            if is_list(input_track):
                found_generators[k] = [i.read(chrom) for i in input_track]
            else:
                found_generators[k] = input_track.read(chrom)
        # What about track collapse and recursion #
        pass  # TODO
        # Final argument list #
        final_args = {}
        for d in (found_args, found_generators, extra_args):
            final_args.update(d)
        # Call generate #
        data = self.generate(**final_args)
        # Variable fields case #
        if t['fields'][-1] == '...':
            fields = t['fields'][:-1] + rest_of_fields
        else:
            fields = t['fields']
        # Make a FeatureStream #
        stream = FeatureStream(data, fields)
        # Add it to the virtual track #
        vtrack.write(chrom, stream)
    # Close tracks later #
    vtrack.tracks_to_close = tracks_to_close
    # Add it #
    virtual_tracks.append(vtrack)
    # Return one virtual track or a list of virtual tracks #
    return len(virtual_tracks) == 1 and virtual_tracks[0] or virtual_tracks
def run(self, m1, p2, p1, m2, outputPrefix):
    # Populate the list of genes that show any uniqueness with regard to allelic ratios.
    # m1 file #
    fh_m1 = open(m1)
    for i in fh_m1:
        # break line into array
        lineArray = i.strip().split("\t")
        currentGeneID = lineArray[3]
        currentChr = lineArray[0]
        # add gene to list if it does not yet exist
        if not self._geneList.has_key(currentGeneID):
            geneInfoObj = GeneInfo(currentGeneID, currentChr)
            self._geneList[currentGeneID] = geneInfoObj
        # populate exon list
        currentStart = lineArray[1]
        currentEnd = lineArray[2]
        self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd, lineArray[5], 0, 0, 0)
    fh_m1.close()
    # p2 file #
    fh_p2 = open(p2)
    for i in fh_p2:
        lineArray = i.strip().split("\t")
        currentGeneID = lineArray[3]
        currentChr = lineArray[0]
        if not self._geneList.has_key(currentGeneID):
            geneInfoObj = GeneInfo(currentGeneID, currentChr)
            self._geneList[currentGeneID] = geneInfoObj
        currentStart = lineArray[1]
        currentEnd = lineArray[2]
        self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd, 0, lineArray[5], 0, 0)
    fh_p2.close()
    # p1 file #
    fh_p1 = open(p1)
    for i in fh_p1:
        lineArray = i.strip().split("\t")
        currentGeneID = lineArray[3]
        currentChr = lineArray[0]
        if not self._geneList.has_key(currentGeneID):
            geneInfoObj = GeneInfo(currentGeneID, currentChr)
            self._geneList[currentGeneID] = geneInfoObj
        currentStart = lineArray[1]
        currentEnd = lineArray[2]
        self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd, 0, 0, lineArray[5], 0)
    fh_p1.close()
    # m2 file #
    fh_m2 = open(m2)
    for i in fh_m2:
        lineArray = i.strip().split("\t")
        currentGeneID = lineArray[3]
        currentChr = lineArray[0]
        if not self._geneList.has_key(currentGeneID):
            geneInfoObj = GeneInfo(currentGeneID, currentChr)
            self._geneList[currentGeneID] = geneInfoObj
        currentStart = lineArray[1]
        currentEnd = lineArray[2]
        self._geneList[currentGeneID].addUpdateExon(currentStart, currentEnd, 0, 0, 0, lineArray[5])
    fh_m2.close()
    # Load the TE annotation tracks for both strains.
    strain1_TEs = track.load(self._te1FileName)
    strain2_TEs = track.load(self._te2FileName)
    # Stores all events to finally output.
    # eventID is the geneName_TEtype_TEposition
    # eventID -> fields -> values
    events = {}
    # eventsFiltered = {}
    # For genes with any evidence of allelic skew, look for TEs.
    for i in self._geneList.keys():
        gene = self._geneList[i]
        coverageTotal = gene.getCoverage()
        if coverageTotal > self._coverageThreshold:
            ratio = gene.getTotalAllelicRatio()
            coverages = gene.getSummarizedCoveragesStr()
            # For each gene, look within the window size for TEs.
            candidateTEs1 = strain1_TEs.read({'chr': gene.getChr(),
                                              'start': (gene.getStart() - self._windowSize),
                                              'end': (gene.getEnd() + self._windowSize)})
            # 'event_ID', 'TE_location', 'gene_location', 'distance', 'ratio', 'coverage_total', 'coverages'
            for event in candidateTEs1:
                eventHash = {}
                eventStart = event[0]
                eventEnd = event[1]
                eventType = event[2]
                eventID = gene.getGeneID() + "_" + str(eventStart) + "_" + eventType
                eventHash["event_ID"] = eventID
                eventHash["TE_name"] = eventType
                eventHash["TE_location"] = gene.getChr() + ":" + str(eventStart) + "-" + str(eventEnd)
                eventHash["gene_location"] = gene.getChr() + ":" + str(gene.getStart()) + "-" + str(gene.getEnd())
                eventHash["distance"] = eventStart - gene.getStart()
                eventHash["ratio"] = ratio
                eventHash["coverage_total"] = coverageTotal
                eventHash["coverages"] = coverages
                eventHash["UCSC_gene"] = self._geneNames[gene.getGeneID()]
                eventHash["exon_info"] = self._geneNames[gene.getExonInfo()]
                events[eventID] = eventHash
            candidateTEs2 = strain2_TEs.read({'chr': gene.getChr(),
                                              'start': (gene.getStart() - self._windowSize),
                                              'end': (gene.getEnd() + self._windowSize)})
            for event in candidateTEs2:
                eventHash = {}
                eventStart = event[0]
                eventEnd = event[1]
                eventType = event[2]
                eventID = gene.getGeneID() + "_" + str(eventStart) + "_" + eventType
                eventHash["event_ID"] = eventID
                eventHash["TE_name"] = eventType
                eventHash["TE_location"] = gene.getChr() + ":" + str(eventStart) + "-" + str(eventEnd)
                eventHash["gene_location"] = gene.getChr() + ":" + str(gene.getStart()) + "-" + str(gene.getEnd())
                eventHash["distance"] = gene.getStart() - eventStart
                eventHash["ratio"] = ratio
                eventHash["coverage_total"] = coverageTotal
                eventHash["coverages"] = coverages
                eventHash["UCSC_gene"] = self._geneNames[gene.getGeneID()]
                eventHash["exon_info"] = self._geneNames[gene.getExonInfo()]
                events[eventID] = eventHash
    # Output file #
    out_fh = open(outputPrefix + '.tsv', 'w')
    out_fh.write(self._getHeader())
    for event in events.keys():
        out_fh.write(self._getLine(events[event]))
    out_fh.close()
def runTest(self):
    url = "http://salt.epfl.ch/BED/sinclair/genomic/ChIP.bedGraph"
    with track.load(url) as t:
        got = t.count('chrY')
        expected = 577
        self.assertEqual(got, expected)