def __call__(self, **kw):
    fasta_file = kw.get('fastafile')
    background = kw.get('background') or None
    assembly_id = kw.get('assembly') or None
    regions_file = kw.get('regions') or None
    motifs_list = kw.get('motifs')
    motif_add = kw.get('customMotif')
    threshold = float(kw.get('threshold') or 0)
    if motifs_list is None:
        motifs_list = []
    if isinstance(motifs_list, basestring):
        motifs_list = motifs_list.split("|")
    if not isinstance(motifs_list, list):
        motifs_list = [motifs_list]
    if background is None and assembly_id is None:
        # Build a flat background unless a fasta is given, in which case
        # use its actual nucleotide composition.
        background = self.temporary_path(fname='background.txt')
        stats = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        if fasta_file:
            fasta_file = os.path.abspath(fasta_file)
            with execution(None) as ex:
                stats = fasta_composition(ex, fasta_file, frequency=True)
        with open(background, "w") as bgr:
            bgr.write(" ".join(["1"] + [str(stats[n]) for n in 'ACGT']))
    if assembly_id is not None:
        assembly = genrep.Assembly(assembly_id)
        if regions_file is not None:
            regions_file = os.path.abspath(regions_file)
    else:
        if regions_file is not None:
            raise ValueError("Please specify an assembly if you specify regions.")
        assembly = None
    motifs = {}
    if motif_add is not None:
        mname = os.path.basename(os.path.splitext(motif_add)[0])
        if mname:
            motifs[mname] = os.path.abspath(motif_add)
    g = genrep.GenRep()  # assumption: `g` was undefined in this snippet; a GenRep client is assumed
    for mot in motifs_list:
        gid, mname = mot.split(' ')
        pwmfile = self.temporary_path()
        g.get_motif_PWM(int(gid), mname, output=pwmfile)
        motifs[mname] = pwmfile
    if len(motifs) == 0:
        raise ValueError("Please give at least one motif to scan for")
    track_output = self.temporary_path(fname='motif_scan', ext="bed")
    with execution(None) as ex:
        save_motif_profile(ex, motifs, assembly, regions_file, fasta_file,
                           background=background, threshold=threshold,
                           output=track_output, description=None, via='local')
    self.new_file(track_output, 'motif_track')
    return self.display_time()

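# Usage sketch for the motif-scan plugin above. Hedged: the plugin class
# name `MotifScan` and every argument value are illustrative assumptions,
# not part of the original source. Motifs are passed as
# "<genrep_id> <motif_name>" tokens joined by "|", matching the
# split("|") / split(' ') parsing in __call__ above.
plugin = MotifScan()  # hypothetical class name
plugin(fastafile="peaks.fa",                   # local fasta to scan
       motifs="1234 MA0139.1|5678 MA0003.2",   # two GenRep PWM ids (made up)
       threshold=0.0)
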
def test_clean_deseq_output(self):
    with execution(None) as ex:
        DE = DE_Analysis(ex, *self.args)
        clean = DE.clean_deseq_output(genes_diff)
        shutil.copy(clean, genes_diff_clean)
        with open(clean) as c:
            self.assertEqual(len(c.readlines()), 5)

def test_clean_before_deseq(self):
    with execution(None) as ex:
        DE = DE_Analysis(ex, *self.args)
        clean = DE.clean_before_deseq(gene_counts)
        shutil.copy(clean, gene_counts_clean)
        with open(clean) as c:
            self.assertEqual(len(c.readlines()), 4)

def __call__(self, **kw):
    filename = kw.get('sample')
    assert os.path.exists(str(filename)), "File not found: '%s'" % filename
    script_path = kw.get("script_path", default_path)
    tarname = kw.get('name') + "_domainogram.tar.gz"
    domainograms_tar = tarfile.open(tarname, "w:gz")
    if re.search(r'sql$', str(filename)):
        convert((str(filename), 'sql'), (str(filename) + '.bedGraph', 'bedGraph'))
        filename = str(filename) + '.bedGraph'
    with execution(None) as ex:
        res = c4seq.runDomainogram(ex, infile=filename, name=kw.get('name'),
                                   prefix=None, regCoord=kw.get('region'),
                                   wmaxDomainogram=str(kw.get('wmaxDomainogram')),
                                   wmax_BRICKS=str(kw.get('wmax_BRICKS')),
                                   script_path=script_path)
    start = False
    with open(res) as f:
        for s in f:
            s = s.strip()
            if re.search('####resfiles####', s):
                start = True
            elif start and not re.search("RData", s):
                domainograms_tar.add(s)
    domainograms_tar.close()
    self.new_file(tarname, 'domainograms_tar')
    return self.display_time()

def __call__(self, **kw):
    _tool = kw.pop('tool')
    try:
        selected_tool = all_tools[int(_tool)]
    except ValueError:
        selected_tool = str(_tool)
    _toolid = all_tools.index(selected_tool)
    # Keep only the arguments this tool understands.
    kw = dict((k, v) for k, v in kw.iteritems()
              if k in ['outfile'] + tools_map[_toolid])
    kw['outfile'] = self.temporary_path(fname=selected_tool + '.txt')
    if kw.get('useropts'):
        # Parse free-form "-flag value" options into keyword arguments.
        reo = re.search(r'([\w\s\-,.=]+)', kw.pop('useropts'))
        if reo:
            key = None
            for x in reo.groups()[0].split():
                if x.startswith('-'):
                    key = x
                    kw[key] = ''
                elif key:
                    kw[key] = str(x)
    if kw.get('labels'):
        kw['labels'] = kw['labels'].split(",")
    for x in all_params:
        if x[-5:] == "files" and kw.get(x):
            kw[x[1:]] = kw.pop(x)[x[1:]]
    # Drop empty values so the tool's own defaults apply.
    for k in kw.keys():
        if kw[k] in (None, '', u'', [], {}):
            kw.pop(k)
    with execution(None) as ex:
        output = eval(selected_tool)(ex, **kw)
    self.new_file(output, selected_tool + '_result')
    return self.display_time()

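# Standalone sketch of the `useropts` parsing used by the dispatcher above,
# extracted so its behavior is easy to check. The function name is ours;
# the regex and the loop mirror the original. A bare flag maps to '' and a
# flag followed by a value maps to that value.
import re

def parse_useropts(useropts):
    opts = {}
    reo = re.search(r'([\w\s\-,.=]+)', useropts or '')
    if reo:
        key = None
        for x in reo.groups()[0].split():
            if x.startswith('-'):
                key = x
                opts[key] = ''
            elif key:
                opts[key] = str(x)
    return opts

assert parse_useropts("-s -f 0.5") == {'-s': '', '-f': '0.5'}
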
def __call__(self, **kw):
    b2wargs = []
    control = None
    if kw.get('control'):
        control = kw['control']
        b2wargs = ["-c", str(control)]
    bamfile = track(kw['sample'], format='bam')
    nreads = int(kw.get('normalization') or -1)
    if nreads < 0:
        if control is None:
            nreads = len(set((t[4] for t in bamfile.read())))
        else:
            b2wargs += ["-r"]
    merge_strands = int(kw.get('merge_strands') or -1)
    if merge_strands >= 0:
        suffixes = ["merged"]
    else:
        suffixes = ["fwd", "rev"]
    read_extension = int(kw.get('read_extension') or -1)
    output = self.temporary_path(fname='density_')
    with execution(None) as ex:
        files = bam_to_density(ex, kw['sample'], output, nreads=nreads,
                               merge=merge_strands, read_extension=read_extension,
                               sql=True, args=b2wargs)
    for n, x in enumerate(files):
        tout = track(x, format='sql', fields=['start', 'end', 'score'],
                     chrmeta=bamfile.chrmeta, info={'datatype': 'quantitative'})
        tout.save()
        self.new_file(x, 'density_' + suffixes[n])
    return 1

def test_external_add_nh_flag(self):
    with execution(None) as ex:
        f = external_add_nh_flag(ex, os.path.join(path, "mapped.sam"))
        g = add_nh_flag(os.path.join(path, "mapped.sam"))
        m = md5sum(ex, f)
        m2 = md5sum(ex, g)
        self.assertEqual(m, m2)

def test_differential_analysis(self):
    with execution(None) as ex:
        DE = DE_Analysis(ex, *self.args)
        diffs = DE.differential_analysis(gene_counts_clean)
        for diff in diffs:
            shutil.copy(diff, genes_diff)
            with open(diff) as d:
                self.assertEqual(len(d.readlines()), 5)

def test_parallel_bowtie_lsf(self):
    with execution(None) as ex:
        bam = parallel_bowtie(ex,
                              os.path.join(path, "selected_transcripts"),
                              os.path.join(path, "reads.raw"),
                              n_lines=250, via="lsf")
        sam = bam_to_sam(ex, bam)
        new_sam = remove_lines_matching(ex, "@PG", sam)
        new_bam = sam_to_bam(ex, new_sam)
        m = md5sum(ex, new_bam)
        self.assertEqual(m, "find right md5sum")

def test_parallel_bowtie_local(self):
    with execution(None) as ex:
        bam = parallel_bowtie(ex,
                              os.path.join(path, "selected_transcripts"),
                              os.path.join(path, "reads.raw"),
                              n_lines=250, via="local")
        sam = bam_to_sam(ex, bam)
        new_sam = remove_lines_matching(ex, "@PG", sam)
        new_bam = sam_to_bam(ex, new_sam)
        self.assertEqual(md5sum(ex, new_bam), "2e6bd8ce814949075715b8ffddd1dcd5")

def test_parallel_bowtie_local_with_nh_flags(self):
    with execution(None) as ex:
        bam = parallel_bowtie(ex,
                              os.path.join(path, "selected_transcripts"),
                              os.path.join(path, "reads.raw"),
                              n_lines=250, add_nh_flags=True, via="local")
        sam = bam_to_sam(ex, bam)
        new_sam = remove_lines_matching(ex, "@PG", sam)
        new_bam = sam_to_bam(ex, new_sam)
        self.assertEqual(md5sum(ex, new_bam), "529cd218ec0a35d5d0a23fd7b842ee20")

def test_parallel_bowtie_lsf_with_nh_flags(self):
    with execution(None) as ex:
        bam = parallel_bowtie(ex,
                              os.path.join(path, "selected_transcripts"),
                              os.path.join(path, "reads.raw"),
                              n_lines=250, add_nh_flags=True, via="lsf")
        sam = bam_to_sam(ex, bam)
        new_sam = remove_lines_matching(ex, "@PG", sam)
        new_bam = sam_to_bam(ex, new_sam)
        m = md5sum(ex, new_bam)
        self.assertEqual(m, "7b7c270a3980492e82591a785d87538f")

def __call__(self, **kw):
    input_type = kw.get('input_type', 0)
    if str(input_type) in [str(x[0]) for x in input_types]:
        input_type = int(input_type)
    if input_type in input_types[0]:    # fasta
        fasta = kw.get('fastafile')
        name = os.path.splitext(os.path.basename(fasta))[0]
        assembly = genrep.Assembly(fasta=fasta)
        size = None
    elif input_type in input_types[1]:  # regions
        assembly = genrep.Assembly(kw.get('assembly'))
        regions_file = kw.get('regions') or ''
        if not os.path.exists(regions_file):
            raise ValueError("File not found: %s" % regions_file)
        regions = track(regions_file, chrmeta=assembly.chrmeta)
        name = regions.name
        gRef = assembly.fasta_by_chrom
        fasta = self.temporary_path(fname=regions.name + '.fa')
        (fasta, size) = assembly.fasta_from_regions(
            list(regions.read(fields=['chr', 'start', 'end'])),
            out=fasta, path_to_ref=gRef)
    else:
        raise ValueError("Input type not implemented: %s" % input_type)
    fasta = os.path.abspath(fasta)
    background = assembly.statistics(self.temporary_path(fname="background"),
                                     frequency=True)
    output = self.temporary_path(fname=name + "_meme.tgz")
    outdir = os.path.join(os.path.split(fasta)[0], name + "_meme")
    meme_args = kw.get("meme_args", [])
    nmotifs = kw.get('nmotifs') or _nm
    if '-nmotifs' not in meme_args:
        meme_args += ['-nmotifs', "%i" % int(nmotifs)]
    with execution(None) as ex:
        if size is None:
            size = sum(fasta_length(ex, fasta).values())
        meme_out = meme(ex, fasta, outdir, background,
                        maxsize=(size * 3) / 2, args=meme_args)
    tarf = tarfile.open(output, "w:gz")
    tarf.add(outdir, arcname=os.path.basename(outdir))
    tarf.add(fasta, arcname=os.path.basename(fasta))
    tarf.close()
    self.new_file(output, 'meme_archive')
    return self.display_time()

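# For reference: the `input_type in input_types[0]` membership tests above
# assume a structure along the following lines. This definition is an
# assumption -- `input_types` is defined outside these snippets -- but each
# entry must contain the accepted code(s) for one input kind, so that e.g.
# `0 in input_types[0]` selects the fasta branch.
#
#   input_types = [(0, 'fasta'), (1, 'regions')]
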
def __call__(self, **kw):
    input_type = kw.get('input_type', 0)
    ass = kw.get('assembly', '')
    regions_file = kw.get('regions') or ''
    if regions_file:
        regions_file = os.path.abspath(regions_file)
    fasta = kw.get('fastafile') or ''
    if fasta:
        name = os.path.splitext(os.path.basename(fasta))[0]
        fasta = os.path.abspath(fasta)
    else:
        if not os.path.exists(regions_file):
            raise ValueError("File not found: %s" % regions_file)
        name = track(regions_file).name
        fasta = os.path.abspath(self.temporary_path(fname=name + '.fa'))
    output = self.temporary_path(fname=name + "_meme.tgz")
    outdir = os.path.abspath(os.path.join(os.path.split(fasta)[0], name + "_meme"))
    bfile = self.temporary_path(fname="background")
    with execution(None) as ex:
        if str(input_type) in [str(x[0]) for x in input_types]:
            input_type = int(input_type)
        if input_type in input_types[0]:    # fasta
            if ass in [x[0] for x in genrep.GenRep().assemblies_available()]:
                assembly = genrep.Assembly(ass)
            else:
                assembly = genrep.Assembly(ex=ex, fasta=fasta)
            size = None
        elif input_type in input_types[1]:  # regions
            assembly = genrep.Assembly(ass)
            regions = track(regions_file, chrmeta=assembly.chrmeta)
            gRef = assembly.fasta_by_chrom
            (fasta, size) = assembly.fasta_from_regions(
                list(regions.read(fields=['chr', 'start', 'end'])),
                out=fasta, path_to_ref=gRef)
        else:
            raise ValueError("Input type not implemented: %s" % input_type)
        background = assembly.statistics(bfile, frequency=True)
        meme_args = kw.get("meme_args", [])
        nmotifs = kw.get('nmotifs') or _nm
        if '-nmotifs' not in meme_args:
            meme_args += ['-nmotifs', "%i" % int(nmotifs)]
        if size is None:
            size = sum(x['length'] for x in fasta_length(ex, fasta).values()) + 1000
        meme_out = meme(ex, fasta, outdir, background,
                        maxsize=size, args=meme_args)
    tarf = tarfile.open(output, "w:gz")
    tarf.add(outdir, arcname=os.path.basename(outdir))
    tarf.add(fasta, arcname=os.path.basename(fasta))
    tarf.close()
    self.new_file(output, 'meme_archive')
    return self.display_time()

def __call__(self, **kw):
    kw['outfile'] = self.temporary_path()
    # `useropts` may be absent: pop with a default to avoid a KeyError.
    reo = re.search(r'([\w\s\-,.=]+)', kw.pop('useropts', '') or '')
    if reo:
        key = None
        for x in reo.groups()[0].split():
            if x.startswith('-'):
                key = x
                kw[key] = ''
            elif key:
                kw[key] = str(x)
    if kw.get('labels'):
        kw['labels'] = kw['labels'].split(",")
    with execution(None) as ex:
        output = eval(all_tools[kw.pop('tool')])(ex, **kw)
    self.new_file(output, 'bedtools_result')
    return self.display_time()

def test_count_reads(self):
    with execution(None) as ex:
        C = Counter(ex, *self.args)
        bamfiles = [os.path.join(testpath, "gapdhKO.bam")] * 2
        gtf = os.path.join(testpath, "mm9_3genes_renamed.gtf")
        count_files = C.count_reads(bamfiles, gtf)
        with open(count_files["transcripts"]) as trans:
            self.assertEqual(len(trans.readlines()), 18)
        with open(count_files["genes"]) as genes:
            self.assertEqual(len(genes.readlines()), 4)
        # Edit the numbers so that DE analysis makes sense
        genes = [x.split('\t') for x in open(count_files["genes"]).readlines()]
        genes[0][1] = 'Count.KO.1'
        genes[0][2] = 'Count.WT.1'
        genes[1][1] = '2.0'
        genes[1][2] = '3.0'
        genes[3][1] = '800.0'
        genes[3][2] = '900.0'
        genes = ['\t'.join(x) for x in genes]
        with open(gene_counts, 'wb') as g:
            g.writelines(genes)

def __call__(self, **kw):
    b2wargs = []
    control = None
    sample = kw.get("sample")
    assert os.path.exists(str(sample)), "Bam file not found: '%s'." % sample
    if kw.get('control'):
        control = kw['control']
        b2wargs = ["-c", str(control)]
        assert os.path.exists(str(control)), "Control file not found: '%s'." % control
        control = os.path.abspath(control)
    sample = os.path.abspath(sample)
    nreads = int(kw.get('normalization') or -1)
    bamfile = track(sample, format='bam')
    if nreads < 0:
        if control is None:
            # Default normalization: number of distinct read names in the BAM.
            nreads = len(set((t[4] for t in bamfile.read())))
        else:
            b2wargs += ["-r"]
    merge_strands = int(kw.get('merge_strands') or -1)
    read_extension = int(kw.get('read_extension') or -1)
    output = self.temporary_path(fname='density_')
    format = kw.get("format", "sql")
    with execution(None) as ex:
        files = bam_to_density(ex, sample, output, nreads=nreads,
                               merge=merge_strands, read_extension=read_extension,
                               sql=True, args=b2wargs)
    if merge_strands >= 0:
        suffixes = ["merged"]
    else:
        suffixes = ["fwd", "rev"]
    for n, x in enumerate(files):
        tsql = track(x, format='sql', fields=['start', 'end', 'score'],
                     chrmeta=bamfile.chrmeta, info={'datatype': 'quantitative'})
        tsql.save()
        if format == "sql":
            outname = x
        else:
            outname = os.path.splitext(x)[0] + "." + format
            convert(x, outname, mode="overwrite")
        self.new_file(outname, 'density_' + suffixes[n])
    return self.display_time()

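# Standalone sketch of the default library-size computation above: when no
# normalization is given and there is no control, reads are counted once
# per distinct read name (field 4 of the track's read iterator), which
# collapses multiple alignments of one read. Hedged: the import path is an
# assumption (older bbcflib releases expose it as bbcflib.btrack) and
# "sample.bam" is an illustrative file name.
from bbcflib.track import track

bam = track("sample.bam", format='bam')
nreads = len(set(t[4] for t in bam.read()))
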
def main(argv=None):
    """Entry point when the program starts."""
    genrep = None
    assembly = None
    lims = None
    job = None
    config = None
    config_file = None
    background = ""
    matrix = ""
    original_sql_data = ""
    random_sql_data = ""
    track_filtered = ""
    track_scanned = ""
    project = ""
    username = ""
    identity_file = ""
    host = ""
    website = ""
    remote_path = ""
    result_path = ""
    track_regions_path = ""
    via = ""
    limspath = ""
    fdr = 0
    runs = {}
    logging.basicConfig(filename='run_scanning.log', level=logging.INFO)
    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(argv[1:], "hu:c:",
                                       ["help", "via=", "host=", "remote_path=",
                                        "website=", "minilims=", "config=",
                                        "matrix=", "username=", "identity_file=",
                                        "project="])
        except getopt.error, msg:
            raise Usage(msg)
        for option, value in opts:
            if option in ("-h", "--help"):
                print __doc__
                print USAGE
                sys.exit(0)
            elif option == "--via":
                if value == "local":
                    via = "local"
                elif value == "lsf":
                    via = "lsf"
                else:
                    raise Usage('Via (-u) can only be "local" or "lsf", got %s.' % (value,))
            elif option == "--website":
                website = normalize_url(value)
            elif option == "--minilims":
                limspath = normcase(expanduser(value))
            elif option == "--host":
                host = value
            elif option == "--identity_file":
                identity_file = value
            elif option == "--remote_path":
                remote_path = normcase(expanduser(value))
                if not remote_path.endswith(sep):
                    remote_path += sep
            elif option == "--matrix":
                matrix = {basename(value): normcase(expanduser(value))}
            elif option == "--username":
                username = value
            elif option == "--project":
                project = value
            elif option in ("-c", "--config"):
                config_file = normcase(expanduser(value))
            else:
                raise Usage("Unhandled option: " + option)

        # read config file
        if config_file is None or not exists(config_file) or not isfile(config_file):
            raise Usage("Config file missing")
        else:
            job, config = parseConfig(normcase(expanduser(config_file)))

        if project == "":
            project = job.description
        if matrix == "":
            if "matrix" in job.options:
                path = normcase(expanduser(job.options["matrix"]))
                matrix = {basename(path): path}
            else:
                raise Usage("You must give a matrix file.")
        if limspath == "":
            if "minilims" in job.options:
                limspath = job.options["minilims"]
            else:
                raise Usage("You must give a MiniLIMS path/name.")
        if via == "":
            if "via" in job.options:
                via = job.options["via"]
            else:
                via = "lsf"
        if host == "" and "host" in job.options:
            host = job.options["host"]
        if identity_file == "" and "identity_file" in job.options:
            identity_file = job.options["identity_file"]
        if remote_path == "" and "remote_path" in job.options:
            remote_path = job.options["remote_path"]
        if username == "" and "username" in job.options:
            username = job.options["username"]
        if website == "" and "website" in job.options:
            website = job.options["website"]

        genrep = GenRep(config=config)
        assembly = genrep.assembly(job.assembly_id)
        lims = MiniLIMS(limspath)
        json = create_gdv_project(config["gdv"]["key"], config["gdv"]["email"],
                                  project, assembly.nr_assembly_id,
                                  config["gdv"]["url"], public=True)
        project_id = get_project_id(json)

        # compute false discovery rate
        with execution(lims, description=job.description) as ex:
            background = genrep.statistics(assembly, output=unique_filename_in(),
                                           frequency=True, matrix_format=True)
            if len(job.groups) > 2:
                raise ValueError("There are more than 2 groups in the config file.")
            for group_number in job.groups:
                group = job.groups[group_number]
                for run_number in group["runs"]:
                    run_iter = job.groups[group_number]["runs"][run_number]
                    if "url" in run_iter:
                        url = run_iter["url"]
                        uri = ""
                        if run_iter["run"] not in runs:
                            runs[run_iter["run"]] = {"name": None,
                                                     "control": None,
                                                     "experimental": None}
                        if url.startswith("http") or url.startswith("www."):
                            url = normalize_url(url)
                            # download data
                            data = urllib2.urlopen(url)
                            uri = unique_filename_in()
                            with open(uri, "w") as opening_file:
                                opening_file.write(data.read())
                        else:
                            uri = normcase(expanduser(url))
                        if group["control"]:
                            runs[run_iter["run"]]["control"] = uri
                            runs[run_iter["run"]]["name"] = basename(uri)
                        else:
                            runs[run_iter["run"]]["experimental"] = uri

            for run in runs:
                current_run = runs[run]
                original_sql_data = unique_filename_in()
                random_sql_data = unique_filename_in()
                track_filtered = unique_filename_in()
                logging.info("[%s]" % job.description)
                logging.info("alias %s => %s" % (current_run["experimental"], track_filtered))

                # convert data to sql
                with Track(current_run["experimental"], chrmeta=assembly.chromosomes) as track:
                    # Get an sqlite file if it is not already in this format.
                    if track.format not in ("sql", "db", "sqlite"):
                        track.convert(original_sql_data, format="sql")
                    else:
                        original_sql_data = current_run["experimental"]
                    # Generate a random population from the original if none is given in the config file.
                    if current_run["control"] is None:
                        # create random track
                        track.shuffle_track(random_sql_data, repeat_number=5)
                    else:
                        with Track(current_run["control"], chrmeta=assembly.chromosomes) as track_random:
                            if track_random.format not in ("sql", "db", "sqlite"):
                                track_random.convert(random_sql_data, format="sql")
                            else:
                                random_sql_data = current_run["control"]

                track_scanned, fdr, p_value = sqlite_to_false_discovery_rate(
                    ex, matrix, background, genrep, assembly.chromosomes,
                    original_sql_data, random_sql_data,
                    threshold=-100, via=via, keep_max_only=False,
                    alpha=0.05, nb_sample=5.0)

                # filter the track using the fdr as threshold
                with new(track_filtered, format="sql", datatype="qualitative") as track_out:
                    chromosome_used = {}
                    track_out.meta_track = {"source": basename(current_run["experimental"])}
                    track_out.meta_track.update({"k": "v"})
                    with Track(track_scanned, format="sql", chrmeta=assembly.chromosomes) as track_in:
                        meta = dict([(v["name"], dict([("length", v["length"])]))
                                     for v in track_in.chrmeta.values()])
                        for chromosome in track_in.all_chrs:
                            data_list = []
                            for data in track_in.read({"chr": chromosome,
                                                       "score": (fdr, sys.maxsize)},
                                                      fields=Track.qualitative_fields):
                                data_list.append(data)
                                chromosome_used[chromosome] = meta[chromosome]
                            if len(data_list) > 0:
                                track_out.write(chromosome, data_list)
                        track_out.chrmeta = chromosome_used
                ex.add(track_filtered, "sql: filtered %s" % track_filtered)
                logging.info("scanned: %s" % track_scanned)
                logging.info("score selected: %f with p: %.3f" % (fdr, p_value))
                logging.info("filtered: %s" % track_filtered)

                # fix track
                track_scanned_signal = fix_sqlite_db(track_scanned)
                logging.info("scanned signal: %s" % track_scanned_signal)
                ex.add(track_scanned_signal,
                       description="%s: sql track signal %s" % (job.description, track_scanned_signal))

                # send the filtered track and the scanned track to the remote host
                if host != "" and remote_path != "" and username != "":
                    args = []
                    if identity_file != "":
                        args = ["-i", normcase(expanduser(identity_file)), "-C"]
                    source_filtered = normcase(expanduser(track_filtered))
                    source_scanned = normcase(expanduser(track_scanned_signal))
                    result_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered)
                    result_path = "%s%s%s.db" % (website, sep, track_filtered)
                    track_regions_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal)
                    track_regions_path = "%s%s%s.db" % (website, sep, track_scanned_signal)
                    scp(ex, source_filtered, result_destination, args=args)
                    scp(ex, source_scanned, track_regions_destination, args=args)
                else:
                    result_path = track_filtered

                # Send the filtered track to GDV
                add_gdv_track(config["gdv"]["key"], config["gdv"]["email"], project_id,
                              result_path,
                              name="filtered_%s" % splitext(basename(current_run["experimental"]))[0],
                              gdv_url=config["gdv"]["url"])
                # Send the scanned track to GDV
                add_gdv_track(config["gdv"]["key"], config["gdv"]["email"], project_id,
                              track_regions_path,
                              name="regions_%s" % splitext(basename(current_run["experimental"]))[0],
                              gdv_url=config["gdv"]["url"])
        logging.info("++++++++++++")
        logging.info("-------------------END--------------------")
    except Usage, err:
        # Assumption: the outer `try` had no visible handler in this snippet;
        # this mirrors the Usage handling of the other main() in this collection.
        print >>sys.stderr, err.msg
        print >>sys.stderr, USAGE
        return 2

def main():
    map_args = None
    # e.g. {'bwt_args': ["-n", str(3), "-p", str(4), "-d", str(50),
    #                    "--chunkmbs", str(1024), "-m", str(5)]}
    opts = (("-v", "--via", "Run executions using method 'via' (can be 'local' or 'lsf')",
             {'default': "lsf"}),
            ("-k", "--key", "Alphanumeric key of the new RNA-seq job",
             {'default': None}),
            ("-d", "--minilims", "MiniLIMS where RNAseq executions and files will be stored.",
             {'default': None}),
            ("-m", "--mapseq-minilims",
             "MiniLIMS where a previous Mapseq execution and files have been stored. "
             "Set it to None to align de novo from read files.",
             {'default': "/data/htsstation/mapseq/mapseq_minilims", 'dest': "ms_limspath"}),
            ("-w", "--working-directory", "Create execution working directories in wdir",
             {'default': os.getcwd(), 'dest': "wdir"}),
            ("-c", "--config", "Config file", {'default': None}),
            ("-p", "--pileup_level",
             "Target features, inside of quotes, separated by commas. "
             "E.g. 'genes,exons,transcripts'",
             {'default': "genes,exons,transcripts"}))
    try:
        usage = "run_rnaseq.py [OPTIONS]"
        desc = """A High-throughput RNA-seq analysis workflow. It returns a file
                  containing a column of transcript counts for each given BAM file,
                  normalized using DESeq's size factors. """
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
        (opt, args) = parser.parse_args()
        if os.path.exists(opt.wdir):
            os.chdir(opt.wdir)
        else:
            parser.error("Working directory '%s' does not exist." % opt.wdir)
        if not opt.minilims:
            parser.error("Must specify a MiniLIMS to attach to")

        # RNA-seq job configuration
        M = MiniLIMS(opt.minilims)
        pileup_level = opt.pileup_level.split(',')
        if opt.key:
            gl = use_pickle(M, "global variables")
            htss = frontend.Frontend(url=gl['hts_rnaseq']['url'])
            job = htss.job(opt.key)  # new *RNA-seq* job instance
            #h_pileup_level = {'0':'genes', '1':'exons', '2':'transcripts'}
            #pileup_level = [h_pileup_level[e] for e in job.options.get('pileup_level').split(',')]
            [M.delete_execution(x) for x in
             M.search_executions(with_description=opt.key, fails=True)]
            description = "Job run with mapseq key %s" % opt.key
        elif opt.config and os.path.exists(opt.config):
            (job, gl) = frontend.parseConfig(opt.config)
            description = "Job run with config file %s" % opt.config
        else:
            raise ValueError("Need either a job key (-k) or a configuration file (-c).")

        job.options['ucsc_bigwig'] = job.options.get('ucsc_bigwig', True)
        job.options['gdv_project'] = job.options.get('gdv_project') or False
        job.options['discard_pcr_duplicates'] = job.options.get('discard_pcr_duplicates') or False
        assembly_id = job.assembly_id
        # intype selects the mapping target: genome (0), exons (1) or transcriptome (2)
        g_rep = genrep.GenRep(gl['genrep_url'], gl.get('bwt_root'), intype=1)
        assembly = g_rep.assembly(assembly_id)

        # Retrieve mapseq output
        mapseq_url = None
        if 'hts_mapseq' in gl:
            mapseq_url = gl['hts_mapseq']['url']

        # Program body #
        with execution(M, description=description, remote_working_directory=opt.wdir) as ex:
            if opt.ms_limspath == "None":
                print "Alignment..."
                job = mapseq.get_fastq_files(job, ex.working_directory)
                fastq_root = os.path.abspath(ex.working_directory)
                bam_files = mapseq.map_groups(ex, job, fastq_root,
                                              assembly_or_dict=assembly, map_args=map_args)
                print "Reads aligned."
            else:
                print "Loading BAM files..."
                (bam_files, job) = mapseq.get_bam_wig_files(
                    ex, job, minilims=opt.ms_limspath, hts_url=mapseq_url,
                    script_path=gl.get('script_path') or '', via=opt.via)
                print "Loaded."
            assert bam_files, "Bam files not found."
            print "Current working directory:", ex.working_directory
            rnaseq.rnaseq_workflow(ex, job, assembly, bam_files,
                                   pileup_level=pileup_level, via=opt.via)
        # End of program body #

        # GDV
        allfiles = common.get_files(ex.id, M)
        if 'gdv_project' in job.options and 'sql' in allfiles:
            allfiles['url'] = {job.options['gdv_project']['public_url']: 'GDV view'}
            download_url = gl['hts_rnaseq']['download']
            [gdv.add_gdv_track(gl['gdv']['key'], gl['gdv']['email'],
                               job.options['gdv_project']['project_id'],
                               url=download_url + str(k),
                               name=re.sub('\.sql', '', str(f)),
                               gdv_url=gl['gdv']['url'])
             for k, f in allfiles['sql'].iteritems()]
        print json.dumps(allfiles)

        # E-mail
        if 'email' in gl:
            r = email.EmailReport(sender=gl['email']['sender'], to=str(job.email),
                                  subject="RNA-seq job " + str(job.description),
                                  smtp_server=gl['email']['smtp'])
            r.appendBody('''Your RNA-seq job is finished.
The description was: ''' + str(job.description) + ''' and its unique key is ''' + opt.key + '''.
You can retrieve the results at this url: '''
                         + gl['hts_rnaseq']['url'] + "jobs/" + opt.key + "/get_results")
            r.send()
        sys.exit(0)
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2

def __call__(self, opts):
    self.opts = opts
    if os.path.exists(self.opts.wdir):
        os.chdir(self.opts.wdir)
    else:
        raise Usage("Working directory '%s' does not exist." % self.opts.wdir)

    ##### Connect to Minilims, recover global variables, fetch job info
    self.minilims = os.path.join(self.opts.basepath, self.name + "_minilims")
    M = MiniLIMS(self.minilims)
    if not (self.opts.key or (self.opts.config and os.path.exists(self.opts.config))):
        raise Usage("Need a job key or a configuration file")
    if self.opts.key:
        self.globals = use_pickle(M, "global variables")
        htss = frontend.Frontend(url=self.globals['hts_mapseq']['url'])
        self.job = htss.job(self.opts.key)
        [M.delete_execution(x) for x in
         M.search_executions(with_description=self.opts.key, fails=True)]
        if self.job.options.get("config_file"):
            if os.path.exists(self.job.options["config_file"]):
                self.opts.config = os.path.abspath(self.job.options["config_file"])
        elif os.path.exists("config.txt"):
            self.opts.config = os.path.abspath("config.txt")
        if self.opts.config and os.path.exists(self.opts.config):
            (self.job, self.globals) = frontend.parseConfig(self.opts.config,
                                                            self.job, self.globals)
    elif os.path.exists(self.opts.config):
        (self.job, self.globals) = frontend.parseConfig(self.opts.config)
        self.opts.key = self.job.description
    else:
        raise Usage("Need either a job key (-k) or a configuration file (-c).")

    ##### Genrep instance
    if 'fasta_file' in self.job.options:
        if os.path.exists(self.job.options['fasta_file']):
            self.job.options['fasta_file'] = os.path.abspath(self.job.options['fasta_file'])
        else:
            for ext in (".fa", ".fa.gz", ".tar.gz"):
                if os.path.exists("ref_sequence" + ext):
                    self.job.options['fasta_file'] = os.path.abspath("ref_sequence" + ext)
        if not os.path.exists(self.job.options['fasta_file']):
            raise Usage("Don't know where to find fasta file %s." % self.job.options["fasta_file"])
    g_rep = genrep.GenRep(url=self.globals.get("genrep_url"),
                          root=self.globals.get("bwt_root"))

    ##### Configure facility LIMS
    if 'lims' in self.globals:
        from bbcflib import daflims
        self.job.dafl = dict((loc, daflims.DAFLIMS(username=self.globals['lims']['user'],
                                                   password=pwd))
                             for loc, pwd in self.globals['lims']['passwd'].iteritems())

    ########################################################################
    ########################## EXECUTION ###################################
    ########################################################################
    ##### Logging
    logfile_name = os.path.abspath(self.opts.key + ".log")
    debugfile_name = os.path.abspath(self.opts.key + ".debug")
    self.logfile = open(logfile_name, 'w')
    self.debugfile = open(debugfile_name, 'w')
    self.debug_write(json.dumps(self.globals) + "\n")
    with execution(M, description=self.opts.key,
                   remote_working_directory=self.opts.wdir) as ex:
        self.log_write("Enter execution. Current working directory: %s" % ex.working_directory)
        self.job.assembly = genrep.Assembly(assembly=self.job.assembly_id,
                                            genrep=g_rep,
                                            fasta=self.job.options.get('fasta_file'),
                                            annot=self.job.options.get('annot_file'),
                                            intype=self.job.options.get('input_type_id', 0),
                                            ex=ex, via=self.opts.via,
                                            bowtie2=self.job.options.get("bowtie2", True))
        ##### Check all the options
        if not self.check_options():
            raise Usage("Problem with options %s" % self.opts)
        self.debug_write(json.dumps(self.job.options))
        self.init_files(ex)
        ##### Run workflow
        self.log_write("Starting workflow.")
        self.main_func(ex, **self.main_args)
        ##### Add logs to the LIMS in admin mode
        self.logfile.flush()
        self.debugfile.flush()
        log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin")
        debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin")
        ex.add(logfile_name, description=log_desc)
        ex.add(debugfile_name, description=debug_desc)
        ##### Create GDV project
        if self.job.options['create_gdv_project']:
            self.gdv_create(ex)

    ########################################################################
    ######################## POSTPROCESSING ################################
    ########################################################################
    allfiles = get_files(ex.id, M)
    if self.job.options['create_gdv_project'] and \
            self.job.options['gdv_project'].get('project', {}).get('id', 0) > 0:
        allfiles['url'] = self.gdv_upload(allfiles.get('sql', {}))
    self.logfile.close()
    self.debugfile.close()
    print json.dumps(allfiles)
    with open(self.opts.key + ".done", 'w') as done:
        json.dump(allfiles, done)
    self.send_email()
    return 0

def test_internal_add_nh_flag(self):
    with execution(None) as ex:
        f = add_nh_flag(os.path.join(path, "mapped.sam"))
        m = md5sum(ex, f)
        self.assertEqual(m, "50798b19517575533b8ccae5b1369a3e")

def __call__(self, **kw):
    b2wargs = []
    control = None
    samples = kw.get('BamMulti', {}).get('sample', [])
    if not isinstance(samples, list):
        samples = [samples]
    samples = [os.path.abspath(s) for s in samples if os.path.exists(s)]
    if kw.get('control'):
        control = kw['control']
        b2wargs = ["-c", str(control)]
        assert os.path.exists(str(control)), "Control file not found: '%s'." % control
        control = os.path.abspath(control)
    try:
        nreads = int(kw.get('normalization'))
    except (ValueError, TypeError):
        nreads = -1
    bamfiles = [track(s, format='bam') for s in samples]
    if nreads < 0:
        _nreads = [0] * len(samples)
        if control is not None:
            b2wargs += ["-r"]
    else:
        _nreads = [nreads for s in samples]
    try:
        merge_strands = int(kw.get('merge_strands'))
    except (ValueError, TypeError):
        merge_strands = -1
    try:
        read_extension = int(kw.get('read_extension'))
    except (ValueError, TypeError):
        read_extension = -1
    output = [self.temporary_path(fname=b.name + '_density_') for b in bamfiles]
    format = kw.get("format", "sql")
    with execution(None) as ex:
        files = [bam_to_density(ex, s, output[n], nreads=_nreads[n],
                                merge=merge_strands, read_extension=read_extension,
                                sql=True, args=b2wargs)
                 for n, s in enumerate(samples)]
    info = {'datatype': 'quantitative', 'read_extension': read_extension}
    if merge_strands >= 0:
        suffixes = ["merged"]
        info['shift'] = merge_strands
    else:
        suffixes = ["fwd", "rev"]
    chrmeta = bamfiles[0].chrmeta
    for suf in suffixes:
        all_s_files = [x for y in files for x in y if x.endswith(suf + ".sql")]
        if len(all_s_files) > 1:
            # Average the densities of all samples into a single track.
            x = self.temporary_path(fname="Density_average_" + suf + ".sql")
            tsql = track(x, fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info={'datatype': 'quantitative'})
            insql = []
            for f in all_s_files:
                t = track(f, format='sql', chrmeta=chrmeta)
                t.save()
                insql.append(t)
            for c in tsql.chrmeta:
                tsql.write(merge_scores([t.read(c) for t in insql]), chrom=c)
        else:
            x = all_s_files[0]
            tsql = track(x, format='sql', fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info=info)
            tsql.save()
        if format in [None, "sql"]:
            outname = x
        else:
            outname = os.path.splitext(x)[0] + "." + format
            convert(x, outname, mode="overwrite")
        self.new_file(outname, 'density_' + suf)
    return self.display_time()

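# Hypothetical invocation of the multi-sample density plugin above. The
# class name `BamDensity` and the file names are illustrative assumptions,
# not from the original source. With merge_strands >= 0 both strands are
# merged (shifted by that many bases), and several BAMs are averaged into
# a single quantitative track before optional format conversion.
plugin = BamDensity()  # hypothetical class name
plugin(BamMulti={'sample': ['rep1.bam', 'rep2.bam']},  # averaged together
       merge_strands=0,   # merge fwd/rev with no shift
       format='bigWig')   # convert the sql track on output
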