Example #1
 def __call__(self, **kw):
     # Set assembly
     assembly_id = kw.get('assembly')
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
     # Set features track
     features = track(kw['features'], chrmeta=chrmeta or None)
     chrmeta = features.chrmeta
     # Set filter track
     filter = track(kw.get('filter'), chrmeta=chrmeta or None)
     # Main
     format = kw.get('format', features.format)
     output = self.temporary_path(fname=features.name + '_filtered.' +
                                  format)
     tout = track(output,
                  format,
                  fields=filter.fields,
                  chrmeta=chrmeta,
                  info={'datatype': 'qualitative'})
     for chrom in chrmeta:
         tout.write(overlap(features.read(chrom), filter.read(chrom)),
                    chrom=chrom,
                    clip=True)
     tout.close()
     self.new_file(output, 'filtered')
     return self.display_time()
Example #2
 def test_annotate_snps(self):
     assembly = genrep.Assembly('sacCer2')
     filedict = {'chrV':path+'chrV'}
     self.outall, self.outexons = annotate_snps(filedict, ["s1","s2"], assembly)
     with open(self.outall,'r') as f: print "\nAll SNPs ('outall'):\n",f.read()
     with open(self.outexons,'r') as g: print "\nExonic SNPs ('outexons'):\n",g.read()
     raise IOError("Error raised voluntarily to print test outputs.")
Example #3
def _get_chrmeta(**kw):
    chrmeta = "guess"
    assembly_id = kw.get('assembly')
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
    return chrmeta
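
For context, a minimal (hypothetical) call site for the helper above, following the pattern of Example #1; 'sacCer2' and 'peaks.bed' are only illustrative:

from bbcflib import genrep          # needed by _get_chrmeta above
from bbcflib.track import track

chrmeta = _get_chrmeta(assembly='sacCer2')   # {chr: {'length': ...}} dict for a known assembly
default = _get_chrmeta()                     # no assembly -> the string "guess"
features = track('peaks.bed', chrmeta=chrmeta or None)   # hypothetical input file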
Example #4
 def __call__(self, **kw):
     input_type = kw.get('input_type', 0)
     if str(input_type) in [str(x[0]) for x in input_types]:
         input_type = int(input_type)
     if input_type in input_types[0]:  #fasta
         fasta = kw.get('fastafile')
         name = os.path.splitext(os.path.basename(fasta))[0]
         assembly = genrep.Assembly(fasta=fasta)
         size = None
     elif input_type in input_types[1]:  #regions
         assembly = genrep.Assembly(kw.get('assembly'))
         regions_file = kw.get('regions') or ''
         if not os.path.exists(regions_file):
             raise ValueError("File not found: %s" % regions_file)
         regions = track(regions_file, chrmeta=assembly.chrmeta)
         name = regions.name
         gRef = assembly.fasta_by_chrom
         fasta = self.temporary_path(fname=regions.name + '.fa')
         (fasta, size) = assembly.fasta_from_regions(
             list(regions.read(fields=['chr', 'start', 'end'])),
             out=fasta, path_to_ref=gRef)
     else:
         raise ValueError("Input type not implemented: %s" % input_type)
     fasta = os.path.abspath(fasta)
     background = assembly.statistics(
         self.temporary_path(fname="background"), frequency=True)
     output = self.temporary_path(fname=name + "_meme.tgz")
     outdir = os.path.join(os.path.split(fasta)[0], name + "_meme")
     meme_args = kw.get("meme_args", [])
     nmotifs = kw.get('nmotifs') or _nm
     if '-nmotifs' not in meme_args:
         meme_args += ['-nmotifs', "%i" % int(nmotifs)]
     with execution(None) as ex:
         if size is None: size = sum(fasta_length(ex, fasta).values())
         meme_out = meme(ex,
                         fasta,
                         outdir,
                         background,
                         maxsize=(size * 3) / 2,
                         args=meme_args)
     tarf = tarfile.open(output, "w:gz")
     tarf.add(outdir, arcname=os.path.basename(outdir))
     tarf.add(fasta, arcname=os.path.basename(fasta))
     tarf.close()
     self.new_file(output, 'meme_archive')
     return self.display_time()
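
The 'regions' branch above boils down to the following sketch; the file names are hypothetical, while the keyword arguments are taken verbatim from the example:

from bbcflib import genrep
from bbcflib.track import track

assembly = genrep.Assembly('sacCer2')                      # illustrative assembly
regions = track('regions.bed', chrmeta=assembly.chrmeta)   # hypothetical input file
fasta, size = assembly.fasta_from_regions(
    list(regions.read(fields=['chr', 'start', 'end'])),
    out='regions.fa',                                      # hypothetical output file
    path_to_ref=assembly.fasta_by_chrom)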
Example #5
 def __call__(self, **kw):
     assembly = kw.get('assembly') or 'guess'
     signals_plus = kw.get('SigMultiP', {}).get('signals_plus', [])
     if not isinstance(signals_plus, list): signals_plus = [signals_plus]
     signals_minus = kw.get('SigMultiM', {}).get('signals_minus', [])
     if not isinstance(signals_minus, list): signals_minus = [signals_minus]
     features = kw.get('FeatMulti', {}).get('features', [])
     if not isinstance(features, list): features = [features]
     sptracks = [
         track(sig, chrmeta=assembly) for sig in signals_plus
         if os.path.exists(sig)
     ]
     smtracks = [
         track(sig, chrmeta=assembly) for sig in signals_minus
         if os.path.exists(sig)
     ]
     ftracks = [
         track(feat, chrmeta=assembly) for feat in features
         if os.path.exists(feat)
     ]
     snames = [t.name for t in sptracks + smtracks + ftracks]
     if len(sptracks) > 0:
         chrmeta = sptracks[0].chrmeta
     elif len(smtracks) > 0:
         chrmeta = smtracks[0].chrmeta
     elif len(ftracks) > 0:
         chrmeta = ftracks[0].chrmeta
     else:
         raise ValueError("No data provided")
     if assembly in [x[0] for x in genrep.GenRep().assemblies_available()]:
         chrnames = genrep.Assembly(assembly).chrnames
     else:
         chrnames = [c for _, c in sorted(((v['length'], c) for c, v in chrmeta.iteritems()),
                                          reverse=True)]
     pdf = self.temporary_path(fname='genome_graph.pdf')
     _fs = ['chr', 'start', 'end', 'score']
     _ff = ['chr', 'start', 'end', 'name']
     genomeGraph([(c, chrmeta[c]['length']) for c in chrnames],
                 [sig.read(fields=_fs) for sig in sptracks],
                 [sig.read(fields=_fs) for sig in smtracks],
                 [feat.read(fields=_ff) for feat in ftracks],
                 output=pdf,
                 new=True,
                 last=True,
                 legend=snames)
     self.new_file(pdf, 'genome_graph')
     return self.display_time()
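
The chromosome-ordering fallback used above (when the assembly name is unknown to GenRep) is plain Python and can be tried on a hand-made chrmeta dict:

chrmeta = {'chrI': {'length': 230218}, 'chrIV': {'length': 1531933}}   # illustrative lengths
chrnames = [c for _, c in sorted(((v['length'], c) for c, v in chrmeta.items()),
                                 reverse=True)]
# chrnames == ['chrIV', 'chrI']  (longest chromosome first)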
Example #6
    def __call__(self, **kw):
        fasta_file = kw.get('fastafile')
        background = kw.get('background') or None
        assembly_id = kw.get('assembly') or None
        regions_file = kw.get('regions') or None
        motifs_list = kw.get('motifs')
        motif_add = kw.get('customMotif')
        threshold = float(kw.get('threshold') or 0)

        if motifs_list is None: motifs_list = []
        if isinstance(motifs_list, basestring): motifs_list = motifs_list.split("|")
        if not isinstance(motifs_list, list): motifs_list = [motifs_list]

        if background is None and assembly_id is None:
            background = self.temporary_path(fname='background.txt')
            stats = {'A': 0.25,'C': 0.25, 'G': 0.25, 'T': 0.25}
            if fasta_file:
                with execution(None) as ex:
                    stats = fasta_composition(ex,fasta_file,frequency=True)
            with open(background,"w") as bgr:
                bgr.write(" ".join(["1"]+[str(stats[n]) for n in 'ACGT']))
        if regions_file is not None:
            regions_file = os.path.abspath(regions_file)
        if assembly_id is not None:
            assembly = genrep.Assembly(assembly_id)
        else:
            if regions_file is not None:
                raise ValueError("Please specify an assembly if you specify regions.")
            assembly = None

        motifs = {}
        if motif_add is not None:
            mname = os.path.basename(os.path.splitext(motif_add)[0])
            if mname: motifs[mname] = os.path.abspath(motif_add)
        g = genrep.GenRep()  # GenRep instance used to fetch PWMs ('g' was not defined in this snippet)
        for mot in motifs_list:
            gid, mname = mot.split(' ')
            pwmfile = self.temporary_path()
            g.get_motif_PWM(int(gid), mname, output=pwmfile)
            motifs[mname] = pwmfile

        if len(motifs) == 0:
            raise ValueError("Please give at least one motif to scan for")

        track_output = self.temporary_path(fname='motif_scan', ext="bed")
        with execution(None) as ex:
            save_motif_profile( ex, motifs, assembly, regions_file, fasta_file,
                                background=background, threshold=threshold,
                                output=track_output,
                                description=None, via='local' )
        self.new_file(track_output, 'motif_track')
        return self.display_time()
Example #7
 def _get_chrmeta(self, chrmeta=None):
     """:param chrmeta: (str or dict) assembly name, or dict of the type {chr: {'length': 1234}}."""
     if isinstance(chrmeta, dict):
         return chrmeta
     if isinstance(chrmeta, basestring) and not (str(chrmeta) == "guess"):
         self.assembly = chrmeta
     if self.assembly is None:
         return {}
     from bbcflib import genrep
     if genrep.GenRep().assemblies_available(self.assembly):
         self.assembly = genrep.Assembly(self.assembly)
         return self.assembly.chrmeta
     else:
         self.assembly = None
         return {}
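
As the docstring above states, chrmeta can be given either as an assembly name or as an explicit dict; a short hypothetical example of each form (file name illustrative):

from bbcflib.track import track

t1 = track('signal.bedGraph', chrmeta='sacCer2')                     # resolved through GenRep
t2 = track('signal.bedGraph', chrmeta={'chrI': {'length': 230218}})  # explicit metadata dict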
Example #8
    def test_map_chromosomes(self):
        stream = fstream([('chrIV', 1), ('IV', 2), (2780, 3),
                          ('NC_001136.9', 4), ('sth', 5)],
                         fields=['chr', 'start'])
        assembly = genrep.Assembly('sacCer2')
        res = list(map_chromosomes(stream, assembly.chromosomes, keep=True))
        expected = [('chrIV', 1), ('chrIV', 2), ('chrIV', 3), ('chrIV', 4),
                    ('sth', 5)]
        self.assertListEqual(res, expected)

        # keep=False
        stream = fstream([('chrIV', 1), ('IV', 2), (2780, 3),
                          ('NC_001136.9', 4), ('sth', 5)],
                         fields=['chr', 'start'])
        res = list(map_chromosomes(stream, assembly.chromosomes, keep=False))
        self.assertListEqual(res, expected[:-1])
Example #9
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary'])<2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None,None,None,None]

    if not isinstance(assembly_or_fasta,genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'],k) for k,v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  #assembly_or_fasta.untar_genome_fasta()

    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'],  params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1])>0:
            time.sleep(60)
            touch(ex,libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1],chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile,chrom+".bed") for chrom in chrnames]
    cat(bedchrom,out=resfile+".bed")
    gzipfile(ex,[resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
Example #10
    def __call__(self, **kw):
        assembly = genrep.Assembly(kw.get('assembly'))
        chrmeta = assembly.chrmeta or "guess"

        with open(kw['table'], "rb") as f:
            h = f.readline().strip().replace('#', '').split('\t')

        colnames = []
        for i in kw['id_columns'].split(','):
            indice = int(i) - 1
            if 2 < indice < len(h):  # columns 0,1,2 are for chr,start,end
                colnames.append(h[indice])

        t = track(kw['table'], chrmeta=chrmeta, fields=h)
        (filepath, filename) = os.path.split(kw['table'])
        (shortname, extension) = os.path.splitext(filename)

        outfiles = []
        for _f in colnames:
            output_name = self.temporary_path(fname=shortname + '_' + _f,
                                              ext=kw.get('format', "bedGraph"))
            out_track = track(output_name, chrmeta=chrmeta)
            s = t.read(fields=['chr', 'start', 'end', _f])
            s.fields[3] = "score"
            out_track.write(s, mode='write')
            out_track.close()
            outfiles.append(output_name)

#        print outfiles
        if len(outfiles) > 1:
            tar_name = self.temporary_path(fname=shortname + "_out.tgz")
            tar = tarfile.open(tar_name, "w:gz")
            [tar.add(f, arcname=os.path.basename(f)) for f in outfiles]
            tar.close()
            self.new_file(tar_name, 'output_tar')
        else:
            self.new_file(outfiles[0], 'output')

        return self.display_time()
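
The core of the per-column extraction above, reduced to a sketch (file and column names are hypothetical):

from bbcflib.track import track

t = track('table.txt', chrmeta='sacCer2',
          fields=['chr', 'start', 'end', 'sample1'])
s = t.read(fields=['chr', 'start', 'end', 'sample1'])
s.fields[3] = 'score'                                    # rename the selected column
track('sample1.bedGraph', chrmeta='sacCer2').write(s, mode='write')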
Example #11
    def __call__(self, **kw):
        assembly = genrep.Assembly(kw.get('assembly'))
        format = kw['format']
        if kw['feature_type'] == 'genes':
            map = assembly.get_gene_mapping()
            get_info = self.genes_annot
        elif kw['feature_type'] == 'exons':
            map = assembly.get_exon_mapping()
            get_info = self.exons_annot
        elif kw['feature_type'] == 'transcripts':
            map = assembly.get_transcript_mapping()
            get_info = self.trans_annot
        else:
            raise ValueError("Unknown feature type: '%s'" % kw['feature_type'])

        def _annotate(ids_list):
            with open(ids_list) as ids_file:
                for id in ids_file:
                    id = id.strip()
                    if map.get(id):
                        yield get_info(id, map.get(id))
                    else:
                        yield ('NA', '0', '0', id, 0.0, '0')

        ids_list = kw.get('ids_list')
        fields = ['chr', 'start', 'end', 'name', 'score', 'strand']
        if ids_list:
            assert os.path.exists(str(ids_list)), "File not found: '%s'" % ids_list
            fulltrack = FeatureStream(_annotate(ids_list), fields=fields)
            fname = os.path.splitext(os.path.basename(ids_list))[0]
        else:
            fulltrack = FeatureStream((get_info(g, map[g]) for g in map),
                                      fields=fields)
            fname = kw['feature_type']
        output = self.temporary_path(fname=fname + '.' + format)
        out = track(output, chrmeta=assembly)
        out.write(fulltrack)
        self.new_file(output, 'fulltrack')
        return self.display_time()
Example #12
    def test_intersect(self):
        # Test from the snp workflow.
        expected = ('chr', 91143, 91144, ('C', '*A', '0',
                                          '|EBMYCG00000002479|Rv0083', 1, 0))
        a = genrep.Assembly('mycoTube_H37RV')
        c = concat_fields(a.annot_track('CDS', 'chr'),
                          infields=['name', 'strand', 'frame'],
                          as_tuple=True)
        feat = fstream([('chr', 91143, 91144, ('C', '*A', '0'))],
                       fields=['chr', 'start', 'end', 'rest'])
        g = intersect([feat, c], win_size=10000)
        self.assertEqual(g.next(), expected)

        fields = ['chr', 'start', 'end', 'name', 'strand', 'score']
        s1 = fstream([('chr', 0, 20, 'a1', 1, 6.),
                      ('chr', 40, 60, 'b', 1, 3.)],
                     fields=fields)
        s2 = fstream([('chr', 10, 30, 'a2', 1, 8.),
                      ('chr', 50, 70, 'b', -1, 4.)],
                     fields=fields)
        res = list(intersect([s1, s2]))
        expected = [('chr', 10, 20, 'a1|a2', 1, 14.),
                    ('chr', 50, 60, 'b|b', 0, 7.)]
        self.assertListEqual(res, expected)
Example #13
 def setUp(self):
     self.assembly = genrep.Assembly('mm9')
     self.job = fakejob(self.assembly)
     stranded = False
     self.args = ("local", self.job, self.assembly, ["KO.1", "KO.2"],
                  sys.stderr, sys.stdout, "genes", False, stranded)
Example #14
 def setUp(self):
     self.a = genrep.Assembly('sacCer2')
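
A quick sketch of the Assembly attributes that the tests in this collection exercise elsewhere (the comments only point back to those examples):

from bbcflib import genrep

a = genrep.Assembly('sacCer2')
names = a.chrnames                    # list of chromosome names (see Example #9)
length = a.chrmeta['chrI']['length']  # chromosome lengths (see Examples #5 and #21)
chrom_map = a.chromosomes             # passed to map_chromosomes() in Example #8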
Example #15
 def setUp(self):
     self.assembly = genrep.Assembly('ce6')
     """
Example #16
def main():
    try:
        # Parse args
        parser = optparse.OptionParser(usage=usage, description=descr)
        for opt in opts:
            parser.add_option(opt[0],opt[1],help=opt[2],**opt[3])

        # Get variables
        (opt, args) = parser.parse_args()
        if opt.assembly:
            assembly_id = re.search('([._\-\w]+)', str(opt.assembly)).groups()[0]
        genrep_root = os.path.abspath(opt.root)
        genrep_url = normalize_url(opt.url)
        if opt.output:
            fout = open(re.search('([._\-\w]+)', str(opt.output)).groups()[0], 'w')
        else:
            fout = sys.stdout
        regions = None
        if opt.regions:
            if os.path.exists(opt.regions):
                regions = opt.regions
            else:
                regions = []
                for x in str(opt.regions).split(","):
                    chrom,start,end = re.search('(\S+):(\d+)\-(\d+)',x).groups()[0:3]
                    regions.append([chrom,int(start),int(end)])

        # Program body
        g_rep = genrep.GenRep(url=genrep_url, root=genrep_root)
        if opt.assembly:
            assembly = genrep.Assembly(assembly=assembly_id,genrep=g_rep,intype=opt.intype)
        if opt.list:
            if opt.assembly:
                table = ["\t".join((v['ac'],k,str(v['length'])))
                         for k,v in assembly.chrmeta.iteritems()]
                fout.write("\n".join(table)+"\n")
            else:
                fout.write("\n".join(v[1] for v in g_rep.assemblies_available())+"\n")
            return 0
        if not(opt.assembly):
            parser.print_help()
            return 0
        if regions:
            seq = assembly.fasta_from_regions(regions=regions, out=fout)[0]
        if opt.bowtie:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" bowtie index prefix\n")
            fout.write(assembly.index_path+"\n")
        if opt.bowtie2:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" bowtie2 index prefix\n")
            fout.write(re.sub(r'bowtie/','bowtie2/',assembly.index_path)+"\n")
        if opt.fasta:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" fasta file\n")
            fout.write(assembly.fasta_path()+"\n")
        if opt.db:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" sqlite file\n")
            fout.write(assembly.sqlite_path+"\n")
        if opt.genes:
            if os.path.exists(opt.genes):
                glist = _parse_list(opt.genes)
            else:
                glist = opt.genes.split(",")
            for gcoord in assembly.gene_coordinates(glist):
                fout.write("\t".join([str(x) for x in gcoord])+"\n")
        if opt.all:
            from bbcflib.track import track
            if opt.intype == 1:
                feats = assembly.exon_track()
            elif opt.intype == 2:
                feats = assembly.transcript_track()
            else:
                feats = assembly.gene_track()
            with track(fout,format='bed',fields=['strand']) as _tfeat:
                _tfeat.write(feats)
        if opt.stats:
            stats = assembly.statistics(frequency=True)
            bases = ["A","C","G","T"]
            fout.write("#Assembly: %s\n" % assembly.name)
            [fout.write("%s\t%s\n" % (x,stats[x])) for x in bases]
            fout.write("#N\t%s\n" % stats["N"] )
            [[fout.write("%s\t%s\n" % (x+y,stats[x+y])) for y in bases] 
             for x in bases]
        fout.close()
        if opt.convert:
            if not(os.path.exists(opt.convert)):
                raise Usage("No such file: %s."%opt.convert)
            if not(opt.output):
                raise Usage("Need an output file name.")
            import pysam
            infile = pysam.Samfile( opt.convert )
            header = infile.header
            chromosomes = dict((v['ac'],k) for k,v in assembly.chrmeta.iteritems())
            for h in header["SQ"]:
                if h["SN"] in chromosomes:
                    h["SN"] = chromosomes[h["SN"]]
            outfile = pysam.Samfile(re.search('([._\-\w]+)', str(opt.output)).groups()[0], 'wb', header=header )
            for read in infile:
                outfile.write(read)
            outfile.close()
            infile.close()

        return 0
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2
Example #17
 def __call__(self, **kw):
     feature_type = int(kw.get('feature_type') or 0)
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif feature_type != 3:
         raise ValueError("Please specify an assembly")
     signals = kw.get('SigMulti', {}).get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     snames = [sig.name for sig in signals]
     if feature_type == 0:  #bodies
         features = genes
     elif feature_type == 1:  #promoters
         prom_pars = {
             'before_start': int(kw.get('upstream') or prom_up_def),
             'after_start': int(kw.get('downstream') or prom_down_def),
             'on_strand': True
         }
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2:  #exons
         features = exons
     elif feature_type == 3:  #custom track
         _t = track(kw.get('features'), chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
         raise ValueError("Feature type not known: %i" % feature_type)
     highlights = kw.get('HiMulti', {}).get('highlights', [])
     if not isinstance(highlights, list): highlights = [highlights]
     if highlights is not None:
         highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
         hinames = [t.name for t in highlights]
     pdf = self.temporary_path(fname='plot_pairs.pdf')
     narr = None
     set_index = []
     set_labels = []
     if int(kw['mode']) == 0:  #correl
         cormax = int(kw.get('cormax') or _cormax)
         xarr = array(range(-cormax, cormax + 1))
         srtdchrom = sorted(chrmeta.keys())
         features = [
             x[:3] for chrom in srtdchrom
             for x in sorted_stream(features(chrom))
         ]
         _f = ['chr', 'start', 'end', 'score']
         narr = correlation([s.read(fields=_f) for s in signals], features,
                            (-cormax, cormax), True)
     elif int(kw['mode']) == 1:  #density
         xarr = None
         for chrom in chrmeta:
             feat = features(chrom)
             if 'name' not in feat.fields:
                 feat = add_name_field(feat)
             means = score_by_feature([s.read(chrom) for s in signals],
                                      feat)
             mf = means.fields[len(feat.fields):]
             _n, _l = score_array(means, mf)
             if _n.size == 0: continue
             if narr is None: narr = _n
             else: narr = vstack((narr, _n))
         set_index = [narr.shape[0]]
         for hitrack in highlights:
             for chrom in chrmeta:
                 hiread = hitrack.read(chrom)
                 if 'name' not in hiread.fields:
                     hiread = add_name_field(hiread)
                 means = score_by_feature([s.read(chrom) for s in signals],
                                          hiread)
                 mf = means.fields[len(hiread.fields):]
                 _n, _l = score_array(means, mf)
                 if _n.size == 0: continue
                 narr = vstack((narr, _n))
                 set_labels.extend(_l)
             set_index.append(narr.shape[0])
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     if narr is None:
         raise ValueError("No data")
     pairs(narr,
           xarr,
           labels=snames,
           output=pdf,
           highlights=[set_index, set_labels])
     self.new_file(pdf, 'plot_pairs')
     return self.display_time()
Example #18
    def __call__(self, **kw):
        if kw.get('input_type') == 'Table':
            filename = kw.get('table')
            assert os.path.exists(str(filename)), "File not found: '%s'" % filename
            robjects.r("""
Mdata = read.delim('%s',row.names=1)
conds = sapply(strsplit(colnames(Mdata),".",fixed=T),"[[",1)
""" % filename)
            conds = robjects.r("conds").rx()
        else:
            from QuantifyTable import QuantifyTablePlugin
            assembly = genrep.Assembly(kw.get('assembly'))
            chrmeta = assembly.chrmeta or "guess"
            kw['score_op'] = 'sum'
            signals1 = kw['Group1']['signals1']
            signals2 = kw['Group2']['signals2']
            if not isinstance(signals1, (list, tuple)): signals1 = [signals1]
            if not isinstance(signals2, (list, tuple)): signals2 = [signals2]
            signals = signals1 + signals2
            kw['SigMulti'] = {
                'signals': signals
            }  # to pass it to QuantifyTable plugin
            table = QuantifyTablePlugin().quantify(**kw)
            stracks = []
            norm_factors = []
            for sig in signals:
                assert os.path.exists(str(sig)), "Signal file not found: '%s'." % sig
                _t = track(sig, chrmeta=chrmeta)
                if 'normalization' in _t.info:
                    _nf = float(_t.info['normalization'])
                elif 'nreads' in _t.info:
                    _nf = float(_t.info['nreads']) * 1e-7 / float(
                        _t.info.get('read_extension', 1))
                else:
                    _nf = 1
                stracks.append(_t)
                norm_factors.append(_nf)
            t = track(table,chrmeta=chrmeta,fields=['chr','start','end','name']+ \
                                                   ['score%d'%x for x in range(len(signals))])
            _f = [f for f in t.fields if f.startswith('score')]
            de_list = list(t.read(fields=['name'] + _f))
            t.close()
            os.remove(table)
            # Turn all scores into integers
            de_matrix = numpy.asarray(
                [[int(float(s) * norm_factors[k] + .5) for k, s in enumerate(x[1:])]
                 for x in de_list],
                dtype=numpy.float)
            rownames = numpy.asarray([x[0] for x in de_list])
            colnames = numpy.asarray([s.name for s in stracks])
            # if all prefixes are identical within a group, keep this prefix as group identifier.
            if len(list(set( [x.split('.')[0] for x in colnames[:len(signals1)]] ))) == 1 \
                    and len(list(set( [x.split('.')[0] for x in colnames[len(signals1):]] ))) == 1:
                group1 = colnames[0].split('.')[0]
                group2 = colnames[-1].split('.')[0]
            else:
                group1 = "Group1"
                group2 = "Group2"
            conds = [group1] * len(signals1) + [group2] * len(signals2)
            robjects.r.assign('Mdata', numpy2ri(de_matrix))
            robjects.r.assign('row_names', robjects.StrVector(rownames))
            robjects.r.assign('col_names', robjects.StrVector(colnames))
            robjects.r.assign('conds', robjects.StrVector(conds))
            robjects.r("""
Mdata = as.data.frame(Mdata,row.names=row_names)
colnames(Mdata) = col_names
""")

        robjects.r("""
library(DESeq)
if (all(table(conds)>=3)){        # if >3 replicates in all conditions
    method = 'per-condition'        # for each group estimate the variance from its replicates
    sharingMode = 'gene-est-only'   # use the per-gene variance estimates only
} else if (any(table(conds)>1)){ # if few replicates
    method = 'pooled'               # use all groups with replicates to estimate the variance
    sharingMode = 'maximum'         # use the max of the GLM fit and the estimated variance
} else {                         # if no replicates
    method = 'blind'                # pools all groups together to estimate the variance
    sharingMode='fit-only'          # use only the GLM fit across the pooled variance
}
cds = newCountDataSet(Mdata, conds)
cds = estimateSizeFactors(cds)
test = try({
    cds = estimateDispersions(cds, method=method, fitType='parametric', sharingMode=sharingMode)
})
if(class(test) == "try-error") {
    cds = estimateDispersions(cds, method=method, fitType='local', sharingMode=sharingMode)
}
""")

        groups = list(set(conds))
        couples = itertools.combinations(groups, 2)
        output = self.temporary_path(fname='DE')
        for c in couples:
            out = "%s_%s-%s.txt" % ((output, ) + tuple(c))
            robjects.r("""
res = nbinomTest(cds, '%s', '%s')
write.table(res[order(res[,8]),], '%s', row.names=F, quote=F, sep='\t')
            """ % (c[0], c[1], out))
            if kw.get('complete') is None:
                clean = self.clean_deseq_output(out, c)
                shutil.move(clean, out)
            self.new_file(out, 'differential_expression')
        return self.display_time()
Example #19
    def __call__(self,opts):
        self.opts = opts
        if os.path.exists(self.opts.wdir):
            os.chdir(self.opts.wdir)
        else:
            raise Usage("Working directory '%s' does not exist." %self.opts.wdir)

##### Connect to Minilims, recover global variables, fetch job info
        self.minilims = os.path.join(self.opts.basepath,self.name+"_minilims")
        M = MiniLIMS(self.minilims)
        if not (self.opts.key is not None or (self.opts.config and os.path.exists(self.opts.config))):
            raise Usage("Need a job key or a configuration file")
        if self.opts.key:
            self.globals = use_pickle(M, "global variables")
            htss = frontend.Frontend( url=self.globals['hts_mapseq']['url'] )
            self.job = htss.job( self.opts.key )
            [M.delete_execution(x) for x in \
                 M.search_executions(with_description=self.opts.key,fails=True)]
            if self.job.options.get("config_file"):
                if os.path.exists(self.job.options["config_file"]):
                    self.opts.config = os.path.abspath(self.job.options["config_file"])
                elif os.path.exists("config.txt"):
                    self.opts.config = os.path.abspath("config.txt")
            if self.opts.config and os.path.exists(self.opts.config):
                (self.job,self.globals) = frontend.parseConfig( self.opts.config, self.job, self.globals )
        elif os.path.exists(self.opts.config):
            (self.job,self.globals) = frontend.parseConfig( self.opts.config )
            self.opts.key = self.job.description
        else:
            raise Usage("Need either a job key (-k) or a configuration file (-c).")
##### Genrep instance
        if 'fasta_file' in self.job.options:
            if os.path.exists(self.job.options['fasta_file']):
                self.job.options['fasta_file'] = os.path.abspath(self.job.options['fasta_file'])
            else:
                for ext in (".fa",".fa.gz",".tar.gz"):
                    if os.path.exists("ref_sequence"+ext):
                        self.job.options['fasta_file'] = os.path.abspath("ref_sequence"+ext)
            if not os.path.exists(self.job.options['fasta_file']):
                raise Usage("Don't know where to find fasta file %s." %self.job.options["fasta_file"])
        g_rep = genrep.GenRep( url=self.globals.get("genrep_url"),
                               root=self.globals.get("bwt_root") )
##### Configure facility LIMS
        if 'lims' in self.globals:
            from bbcflib import daflims
            self.job.dafl = dict((loc,daflims.DAFLIMS( username=self.globals['lims']['user'],
                                                       password=pwd ))
                                 for loc,pwd in self.globals['lims']['passwd'].iteritems())
########################################################################
##########################  EXECUTION  #################################
########################################################################
##### Logging
        logfile_name = os.path.abspath(self.opts.key+".log")
        debugfile_name = os.path.abspath(self.opts.key+".debug")
        self.logfile = open(logfile_name,'w')
        self.debugfile = open(debugfile_name,'w')
        self.debug_write(json.dumps(self.globals)+"\n")
        with execution( M, description=self.opts.key,
                        remote_working_directory=self.opts.wdir ) as ex:
            self.log_write("Enter execution. Current working directory: %s" %ex.working_directory)
            self.job.assembly = genrep.Assembly( assembly=self.job.assembly_id,
                                                 genrep=g_rep,
                                                 fasta=self.job.options.get('fasta_file'),
                                                 annot=self.job.options.get('annot_file'),
                                                 intype=self.job.options.get('input_type_id',0),
                                                 ex=ex, via=self.opts.via,
                                                 bowtie2=self.job.options.get("bowtie2",True) )
##### Check all the options
            if not self.check_options():
                raise Usage("Problem with options %s" %self.opts)
            self.debug_write(json.dumps(self.job.options))
            self.init_files( ex )
##### Run workflow
            self.log_write("Starting workflow.")
            self.main_func(ex,**self.main_args)
##### Add logs to the LIMS in admin mode
            self.logfile.flush()
            self.debugfile.flush()
            log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin")
            debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin")
            ex.add(os.path.join(logfile_name), description=log_desc)
            ex.add(os.path.join(debugfile_name), description=debug_desc)
##### Create GDV project
            if self.job.options['create_gdv_project']: self.gdv_create(ex)

########################################################################
########################  POSTPROCESSING  ##############################
########################################################################
        allfiles = get_files( ex.id, M )
        if self.job.options['create_gdv_project'] and \
                self.job.options['gdv_project'].get('project',{}).get('id',0)>0:
            allfiles['url'] = self.gdv_upload(allfiles.get('sql',{}))
        self.logfile.close()
        self.debugfile.close()
        print json.dumps(allfiles)
        with open(self.opts.key+".done",'w') as done: json.dump(allfiles,done)
        self.send_email()
        return 0
Example #20
#!/usr/bin/env python

from bbcflib import genrep
import os, getopt, sys

opts = dict(getopt.getopt(sys.argv[1:], "d:", [])[0])
basepath = opts.get('-d') or "/data/epfl/bbcf/genrep/nr_assemblies"
basepath += "/%s"
for _a, info in genrep.GenRep().assemblies_available():
    for n in range(100):
        assembly = genrep.Assembly(_a)
        gtf_path = os.path.join(basepath % "gtf",
                                "%s_%i.gtf.gz" % (assembly.md5, n))
        if not (assembly.bbcf_valid and os.path.exists(gtf_path)): break
        sql_path = os.path.join(basepath % "annot_tracks",
                                "%s_%i.sql" % (assembly.md5, n))
        if os.path.exists(sql_path): continue
        print info, gtf_path, sql_path
        assembly.gtf_to_sql(gtf_path=gtf_path, sql_path=sql_path)
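
Related to the loop above, a minimal sketch that just lists what GenRep reports as available, assuming the same (identifier, name) tuple layout used above and in Example #16:

from bbcflib import genrep

for a_id, name in genrep.GenRep().assemblies_available():
    print("%s\t%s" % (a_id, name))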
Example #21
def run(**kwargs):
    """
    Wrapper function to execute any operation contained in this package, directly from
    file inputs. Arguments are:

    :param operation: (str) the name of the function to be called.
    :param output: (str) a filename or a directory to write the results into.
    :param assembly: (str) a genome assembly identifier if needed.
    :param chromosome: (str) a chromosome name if the operation must be restricted to a single chromosome.
    :param ...: additional parameters passed to `operation`.

    Example::

        run(operation="score_by_feature",
            output="score_output.bed", chromosome="chr1",
            trackScores="density_file.sql", trackFeatures="genes.sql")
    """
    from bbcflib import genrep

    def _map(fct):
        for module in _module_list:
            __import__(_here + module)
            smod = sys.modules[_here + module]
            if hasattr(getattr(smod, module)(), fct): return module
        return None

    funct = kwargs.pop("operation", 'None')
    module = _map(funct)
    if module is None:
        raise ValueError("No such operation %s." % funct)
    output = kwargs.pop("output", "./") or "./"
    if os.path.isdir(output):
        output = os.path.join(output, unique_filename_in(output) + ".sql")
        format = "sql"
    else:
        format = os.path.splitext(output)[1][1:] or "sql"
    if format in ['gz', 'gzip']:
        format = os.path.splitext(
            output.strip("." + format))[1][1:] + "." + format
    smod = sys.modules[_here + module]
    trackSet = {}
    for targ in getattr(smod, module)().loadable(funct):
        trackSet[targ] = [track(t) for t in kwargs[targ].split(",")]
    assembly = None
    if 'assembly' in kwargs:
        assembly = kwargs.pop('assembly')
    if assembly:
        chrmeta = genrep.Assembly(assembly).chrmeta
    else:
        chrmeta = trackSet[targ][0].chrmeta
        if 'chromosome' in kwargs:
            chrom = kwargs.pop('chromosome')
            chrmeta = {chrom: chrmeta.get(chrom, {})}
    chr = chrmeta.keys()[0]
    info = None
    if 'datatype' in kwargs: info = {'datatype': kwargs.pop('datatype')}
    files = None
    for targ in getattr(smod, module)().loadable(funct):
        kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
    funct_output = getattr(smod, funct)(**kwargs)
    if isinstance(funct_output, list):
        files = []
        for n, stream in enumerate(funct_output):
            outf = "%s_%i.%s" % (output.strip(format), n, format)
            files.append(outf)
            fields = stream.fields
            track(outf, chrmeta=chrmeta, fields=fields,
                  info=info).write(stream, chrom=chr)
        for chr in chrmeta.keys()[1:]:
            for targ in getattr(smod, module)().loadable(funct):
                kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
            funct_output = getattr(smod, funct)(**kwargs)
            for n, stream in enumerate(funct_output):
                track(files[n], chrmeta=chrmeta).write(stream,
                                                       chrom=chr,
                                                       mode='append')
    else:
        files = output
        fields = funct_output.fields
        track(files, chrmeta=chrmeta, fields=fields,
              info=info).write(funct_output, chrom=chr)
        for chr in chrmeta.keys()[1:]:
            for targ in getattr(smod, module)().loadable(funct):
                kwargs[targ] = [t.read(selection=chr) for t in trackSet[targ]]
            funct_output = getattr(smod, funct)(**kwargs)
            track(files, chrmeta=chrmeta).write(funct_output,
                                                chrom=chr,
                                                mode='append')
    return files
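
Building on the docstring's example above, a hypothetical variant that resolves chromosome metadata from a named assembly rather than from the input tracks (all file names are illustrative):

files = run(operation="score_by_feature",
            output="results/",                 # existing directory -> .sql output
            assembly="sacCer2",                # chrmeta resolved via genrep.Assembly
            trackScores="density_file.sql",
            trackFeatures="genes.sql")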