def adn(self, ass, chr, id, **kw):
    """Return one chunk of sequence from a named chromosome of an assembly.

    Parameters:
        ass: value handed to ``Assembly(...)`` to load the assembly.
        chr: chromosome name compared against each chromosome's ``'name'`` entry.
        id: chunk index; converted with ``int`` before use.
        **kw: accepted and ignored.

    Returns:
        Whatever ``GenRep.get_sequence`` returns for the first chromosome whose
        name equals ``chr``, queried over ``[[start, start + chunk]]`` with
        ``start = int(id) * chunk``; the empty string when no chromosome matches.

    ``chunk`` (the chunk size) is read from the enclosing module scope.
    """
    index = int(id)
    g = GenRep()
    # The previous version also called g.get_genrep_objects('chromosomes', ...)
    # here, but its result was overwritten by the loop variable below before it
    # was ever read, so that extra query was pure overhead and has been dropped.
    assembly = Assembly(ass)
    # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for chrid, info in assembly.chromosomes.items():
        if info['name'] == chr:
            start = index * chunk
            end = start + chunk
            # NOTE(review): assumes chromosome keys are sequences whose first
            # element is the chromosome id expected by get_sequence - confirm.
            return g.get_sequence(chrid[0], [[start, end]])
    return ''
def adn(self, ass, chr, id, **kw):
    """Return one chunk of sequence from a named chromosome of an assembly.

    Parameters:
        ass: value handed to ``Assembly(...)`` to load the assembly.
        chr: chromosome name compared against each chromosome's ``'name'`` entry.
        id: chunk index; converted with ``int`` before use.
        **kw: accepted and ignored.

    Returns:
        Whatever ``GenRep.get_sequence`` returns for the first chromosome whose
        name equals ``chr``, queried over ``[[start, start + chunk]]`` with
        ``start = int(id) * chunk``; the empty string when no chromosome matches.

    ``chunk`` (the chunk size) is read from the enclosing module scope.
    """
    index = int(id)
    g = GenRep()
    # The previous version also called g.get_genrep_objects('chromosomes', ...)
    # here, but its result was overwritten by the loop variable below before it
    # was ever read, so that extra query was pure overhead and has been dropped.
    assembly = Assembly(ass)
    # .items() works on both Python 2 and Python 3 (iteritems() is Python 2 only).
    for chrid, info in assembly.chromosomes.items():
        if info['name'] == chr:
            start = index * chunk
            end = start + chunk
            # NOTE(review): assumes chromosome keys are sequences whose first
            # element is the chromosome id expected by get_sequence - confirm.
            return g.get_sequence(chrid[0], [[start, end]])
    return ''
def setUp(self):
    """Prepare the fixture: a 'ce6' assembly and a chromosome table.

    Stores on ``self``:
      * ``assembly`` - ``Assembly('ce6')`` with its ``genrep`` attribute
        replaced by a ``GenRep`` built from an explicit url/root, and
        ``intype`` set to ``'0'``.
      * ``chromosomes`` - dict mapping 3-tuple keys to
        ``{'length': ..., 'name': ...}`` entries.
    """
    self.assembly = assembly = Assembly('ce6')
    assembly.genrep = GenRep(url='http://bbcftools.epfl.ch/genrep/',
                             root='/db/genrep')
    assembly.intype = '0'

    # (key, name, length) rows, kept in the same order as the literal dict
    # they replace.
    rows = [
        ((3066, u'NC_003279', 6), u'chrI', 15072421),
        ((3067, u'NC_003280', 7), u'chrII', 15279323),
        ((3068, u'NC_003281', 8), u'chrIII', 13783681),
        ((3069, u'NC_003282', 5), u'chrIV', 17493785),
        ((3070, u'NC_003283', 8), u'chrV', 20919568),
        ((3071, u'NC_003284', 7), u'chrX', 17718854),
        ((2948, u'NC_001328', 1), u'chrM', 13794),
    ]
    table = {}
    for key, name, length in rows:
        table[key] = {'length': length, 'name': name}
    self.chromosomes = table
""" import os, sys, gzip, tarfile from bbcflib.gfminer.stream import neighborhood, segment_features, score_by_feature from bbcflib.gfminer.common import fusion, sorted_stream from bbcflib.gfminer.figure import lineplot from bbcflib.common import set_file_descr, unique_filename_in, intersect_many_bed, cat, merge_sql from bbcflib.chipseq import add_macs_results from bbcflib.mapseq import merge_bam, index_bam from bbcflib.track import track, convert, FeatureStream from bbcflib.genrep import GenRep from bbcflib.motif import save_motif_profile from bein import program from bein.util import touch from numpy import vstack, zeros _gnrp = GenRep() _macs_flank = 300 _plot_flank = (50, 50) def macs_bedfiles(ex, chrmeta, tests, controls, names, macs_args, via, logfile): missing_beds = [k for k, t in enumerate(tests) if not t[0]] if not missing_beds: return tests genome_size = sum([x['length'] for x in chrmeta.values()]) logfile.write("Running MACS.\n") logfile.flush() _tts = [tests[k][1] for k in missing_beds] _nms = { 'tests': [names['tests'][k] for k in missing_beds], 'controls': names['controls']
def main(argv = None): """ Entry point when program start """ genrep = None assembly = None lims = None job = None config = None config_file = None background = "" matrix = "" original_sql_data = "" random_sql_data = "" track_filtered = "" track_scanned = "" project = "" username = "" identity_file = "" host = "" website = "" remote_path = "" result_path = "" track_regions_path = "" via = "" limspath = "" fdr = 0 runs = {} logging.basicConfig(filename='run_scanning.log',level=logging.INFO) if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt ( argv[1:],"hu:c:" , [ "help", "via = ", "host = " , "remote_path = " , "website = " , "minilims = ","config = " , "matrix = ", "username = "******"identity_file = ", "project = " ] ) except getopt.error, msg: raise Usage(msg) for option, value in opts: if option in ("-h", "--help"): print __doc__ print USAGE sys.exit(0) elif option == "--via": if value == "local": via = "local" elif value == "lsf": via = "lsf" else: raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." 
% (value,)) elif option == "--website": website = normalize_url(value) elif option == "--minilims": limspath = normcase(expanduser(value)) elif option == "--host": host = value elif option == "--identity_file": identity_file = value elif option == "--remote_path": remote_path = normcase(expanduser(value)) if not remote_path.endswith(sep): remote_path += sep elif option == "--matrix": matrix = {basename(value):normcase(expanduser(value))} elif option == "--username": username = value elif option == "--project": project = value elif option in ("-c", "--config"): config_file = normcase(expanduser(value)) else: raise Usage("Unhandled option: " + option) # read config file if config_file is None or not exists(config_file) or not isfile(config_file): raise Usage("Config file missing") else: job, config = parseConfig(normcase(expanduser(config_file))) if project == "": project = job.description if matrix == "": if "matrix" in job.options: path = normcase(expanduser(job.options["matrix"])) matrix = {basename(path): path} else: raise Usage("You need give value matrix file ") if limspath == "": if "minilims" in job.options: limspath = job.options["minilims"] else: raise Usage("You need give value minilims path/name") if via == "": if "via" in job.options: via = job.options["via"] else: via = "lsf" if host == "" and "host" in job.options: host = job.options["host"] if identity_file == "" and "identity_file" in job.options: identity_file = job.options["identity_file"] if remote_path == "" and "remote_path" in job.options: remote_path = job.options["remote_path"] if username == "" and "username" in job.options: username = job.options["username"] if website == "" and "website" in job.options: website = job.options["website"] genrep = GenRep(config = config) assembly = genrep.assembly(job.assembly_id) lims = MiniLIMS(limspath) json = create_gdv_project( config["gdv"]["key"], config["gdv"]["email"], project, assembly.nr_assembly_id, config["gdv"]["url"], public = True ) project_id 
= get_project_id( json ) # compute false discovery rate with execution(lims, description = job.description) as ex: background = genrep.statistics ( assembly, output = unique_filename_in(), frequency = True, matrix_format = True ) if len(job.groups) >2: raise ValueError("They are more than 2 group in config file") for group_number in job.groups: group = job.groups[group_number] for run_number in group["runs"]: run_iter = job.groups[group_number]["runs"][run_number] if "url" in run_iter: url = run_iter["url"] uri = "" if run_iter["run"] not in runs: runs[run_iter["run"]] = {"name":None, "control":None, "experimental":None} if url.startswith("http") or url.startswith("www."): url = normalize_url(url) # download data data = urllib2.urlopen(url) uri = unique_filename_in() with open(uri, "w") as opening_file: opening_file.write(data.read()) else: uri = normcase(expanduser(url)) if group["control"]: runs[run_iter["run"]]["control"] = uri runs[run_iter["run"]]["name"] = basename(uri) else: runs[run_iter["run"]]["experimental"] = uri for run in runs: current_run = runs[run] original_sql_data = unique_filename_in() random_sql_data = unique_filename_in() track_filtered = unique_filename_in() logging.info( "[%s]" % job.description ) logging.info( "alias %s => %s" % (current_run["experimental"], track_filtered) ) # convert data to sql with Track(current_run["experimental"], chrmeta = assembly.chromosomes) as track: # Get sqlite file if is not arleady in this format if track.format != "sql" or track.format != "db" or track.format != "sqlite": track.convert(original_sql_data, format = "sql") else: original_sql_data = current_run["experimental"] # Generate a random population from orginal if it is not give from config file if current_run["control"] is None: # create random track track.shuffle_track(random_sql_data, repeat_number = 5) else: with Track(current_run["control"], chrmeta = assembly.chromosomes) as track_random: # Get sqlite file if is not arleady in this format if 
track_random.format != "sql" or \ track_random.format != "db" or \ track_random.format != "sqlite": track_random.convert(random_sql_data, format = "sql") else: random_sql_data = current_run["control"] track_scanned, fdr, p_value = sqlite_to_false_discovery_rate( ex, matrix, background, genrep, assembly.chromosomes, original_sql_data, random_sql_data, threshold = -100, via = via, keep_max_only = False, alpha = 0.05, nb_sample = 5.0 ) # filter track with fdr as treshold with new(track_filtered, format = "sql", datatype = "qualitative") as track_out: chromosome_used = {} track_out.meta_track = {"source": basename(current_run["experimental"])} track_out.meta_track.update({"k":"v"}) with Track(track_scanned, format = "sql", chrmeta = assembly.chromosomes) as track_in: meta = dict([(v["name"], dict([("length", v["length"])])) for v in track_in.chrmeta.values()]) for chromosome in track_in.all_chrs: data_list = [] for data in track_in.read ( {"chr": chromosome, "score": (fdr, sys.maxsize)}, fields = Track.qualitative_fields ): data_list.append(data) chromosome_used[chromosome] = meta[chromosome] if len(data_list) > 0: track_out.write(chromosome, data_list) track_out.chrmeta = chromosome_used ex.add(track_filtered, "sql: filtred %s" % track_filtered) logging.info( "scanned: %s" % track_scanned ) logging.info( "score selected: %f with p: %.3f" % (fdr, p_value) ) logging.info( "filtred: %s" % track_filtered ) # fix track track_scanned_signal = fix_sqlite_db(track_scanned) logging.info( "scanned signal: %s" % track_scanned_signal ) ex.add(track_scanned_signal, description="%s: sql track signal %s" % (job.description, track_scanned_signal)) # send filtred track and scanned track to remote if host != "" and remote_path != "" and username != "": args = [] if identity_file != "": args = ["-i", normcase(expanduser(identity_file)), "-C" ] source_filtred = normcase(expanduser(track_filtered)) source_scanned = normcase(expanduser(track_scanned_signal)) result_destination = 
"%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered) result_path = "%s%s%s.db" % (website, sep, track_filtered) track_regions_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal) track_regions_path = "%s%s%s.db" % (website, sep, track_scanned_signal) scp(ex, source_filtred, result_destination, args = args) scp(ex, source_scanned, track_regions_destination, args = args) else: result_path = track_filtered # Send to GDV filtred track add_gdv_track ( config["gdv"]["key"], config["gdv"]["email"], project_id, result_path, name = "filtred_%s" % (splitext( basename( current_run["experimental"] ) )[0]), gdv_url = config["gdv"]["url"] ) # Send to GDV scanned track add_gdv_track ( config["gdv"]["key"], config["gdv"]["email"], project_id, track_regions_path, name = "regions_%s" % (splitext( basename( current_run["experimental"] ) )[0]), gdv_url = config["gdv"]["url"] ) logging.info( "++++++++++++") logging.info( "-------------------END--------------------")