Example 1
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        outfile, infile = tempfile.mkstemp()

        for x, s in enumerate(sequences):
            os.write(outfile, ">%i\n%s\n" % (x, s))

        os.close(outfile)

        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        result = [
            x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))
        ]

        os.remove(infile)

        return result
Example 2
def executewait(dbhandle, statement, error, retry=False, wait=5):
    '''execute sql statement.

    Retry on error, if retry is True.
    Returns a cursor object.
    '''

    cc = dbhandle.cursor()
    i = 20
    while i > 0:
        try:
            cc.execute(statement)
            return cc
        except sqlite3.OperationalError as e:
            msg = e.message
            E.warn("import failed: msg=%s, statement=\n  %s" %
                   (msg, statement))
            # TODO: check for database locked msg
            if not retry:
                raise error, msg
            if not re.search("locked", str(msg)):
                raise error, msg
            time.sleep(wait)
            i -= 1
            continue
        break
    raise sqlite3.OperationalError("Database locked and too many retries")
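A usage sketch for executewait, assuming an sqlite3 connection and the E logging module used above; the database file and SQL statement are made up:

import sqlite3

dbhandle = sqlite3.connect("csvdb")  # hypothetical database file
# retry while a concurrent writer holds the lock, sleeping 5s between attempts
cc = executewait(dbhandle,
                 "CREATE TABLE IF NOT EXISTS counts (track TEXT, n INT)",
                 sqlite3.OperationalError,
                 retry=True,
                 wait=5)
dbhandle.commit()
cc.close()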
Example 3
def substituteParameters( **kwargs ):
    '''return a local PARAMS dictionary.

    Options in ``**kwargs`` substitute default
    values in PARAMS.

    Finally, task specific configuration values 
    are inserted.
    '''

    # build parameter dictionary
    # note the order of addition to make sure that kwargs takes precedence
    local_params = dict(PARAMS.items() + kwargs.items())

    if "outfile" in local_params:
        # replace specific parameters with task (outfile) specific parameters
        outfile = local_params["outfile"]
        for k in local_params.keys():
            if k.startswith(outfile):
                p = k[len(outfile)+1:]
                if p not in local_params:
                    raise KeyError( "task specific parameter '%s' does not exist for '%s' " % (p,k))
                E.debug( "substituting task specific parameter for %s: %s = %s" % (outfile,p,local_params[k] ) )
                local_params[p] = local_params[k]

    return local_params
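A minimal sketch of the outfile-prefix override convention implemented above, assuming PARAMS is the module-level dictionary the function reads; all keys and values here are hypothetical:

PARAMS = {"threads": 1,
          "result.tsv_threads": 8}

local_params = substituteParameters(outfile="result.tsv")
# "result.tsv_threads" matches the outfile prefix, so the generic value
# is overridden: local_params["threads"] == 8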
Example 4
def main():
    
    try:

        Experiment.initLogging(os.environ.has_key("GATEWAY_INTERFACE"))
        
        if os.environ.has_key("GATEWAY_INTERFACE"):
            # CGI
            (odict, args) = getCGIOptions()
        else:
            # Command line.  All the output is still CGI-ish, though.  Sorry.
            (odict, args) = getOptions()
        (odict, args) = processOptions(odict, args)
            
        data = newCreature(odict, odict["e"], odict.get("p"))
        data = Location.getJsonModule().dumps(data, indent=2)
        
        print "Content-type: application/json"
        print "Content-length: %s" % (len(data))
        print
        print data

    except:
        msg = string.join(apply( traceback.format_exception, sys.exc_info() ), "")
        if (msg[-1] == "\n"):
            msg = msg[:-1]
        logging.getLogger().warning(msg)
        data = "Huh?\n%s" % (msg)
        print "Status: 500 Internal Server Error"
        print "Content-type: text/plain"
        print "Content-length: %s" % (len(data))
        print
        print data
Example 7
def _copy(src, dest):
    dest = os.path.abspath(os.path.join(PARAMS["web_dir"], dest))
    if os.path.exists(dest):
        shutil.rmtree(dest)
    if not os.path.exists(src):
        E.warn("%s does not exist - skipped" % src)
        return
    shutil.copytree(os.path.abspath(src), dest)
Example 8
    def importdata(self):

        # Data[0] = Voltage, Data[1] = Current, Data[2] = Time

        self.trainData, self.testData = loading.Loader().dataload()

        self.myExp = Experiment('Experiment 1', .1)

        # index traces by position; the trace arrays themselves are not valid keys
        for i, n in enumerate(self.trainData):
            print "Trials"
            print n
            self.myExp.addTrainingSetTrace(n[0],
                                           self.V_units,
                                           n[1],
                                           self.I_units,
                                           np.size(n[2]) / 10,
                                           FILETYPE='Array')
            self.myExp.trainingset_traces[i].setROI([[1000, 120000.0]])

        for i, n in enumerate(self.testData):
            self.myExp.addTestSetTrace(n[0],
                                       self.V_units, n[1],
                                       self.I_units,
                                       np.size(n[2]) / 10,
                                       FILETYPE='Array')
            self.myExp.testset_traces[i].setROI([[1000, 20000]])

        self.fitaec(self.myExp)
Example 10
def main():
    
    try:

        Experiment.initLogging(os.environ.has_key("GATEWAY_INTERFACE"))
        
        if os.environ.has_key("GATEWAY_INTERFACE"):
            # CGI
            (odict, args) = getCGIOptions()
        else:
            # Command line.  All the output is still CGI-ish, though.  Sorry.
            (odict, args) = getOptions()
        (odict, args) = processOptions(odict, args)
            
        if odict.has_key("e") or not odict.has_key("c"):
            data = getGallery(odict, odict.get("e"), odict.get("c"))
        else:
            data = getPage(odict, odict["c"], odict.get("p"), odict.get("n"))
        
        print "Content-type: text/html"
        print "Content-length: %s" % (len(data))
        print
        print data

    except:
        msg = string.join(apply( traceback.format_exception, sys.exc_info() ), "")
        if (msg[-1] == "\n"):
            msg = msg[:-1]
        logging.getLogger().warning(msg)
        data = "Huh?\n%s" % (msg)
        print "Status: 500 Internal Server Error"
        print "Content-type: text/plain"
        print "Content-length: %s" % (len(data))
        print
        print data
Example 11
def run_experiment(experiment_var, random_seed):

    evals_at_targets_df = pd.DataFrame()

    for i, dim in enumerate(experiment_var):

        a = 1
        b = -1
        # random initial solution with elements between -1 and 1
        theta0 = (b - a) * np.random.rand(dim + 1, 1) + a

        # allow more iterations in higher dimensions
        parms.max_iterations = parms.max_iterations * dim

        error_list, sample_evals = ex.run_problem(
            dim, sample_size, num_targets, num_subintervals, cost_function,
            theta0, balance, noise, parms, random_seed)

        ############# benchmark optimization run

        target_values = ex.create_targets(error_list, num_targets)

        benchmarker = bm.Benchmark(sample_evals, target_values, error_list)

        evals_at_targets = benchmarker.benchmark()
        evals_at_targets_df[i] = evals_at_targets

    return evals_at_targets_df
Example 12
    def maskSequence(self, peptide_sequence):
        """mask peptide sequence
        """

        Masker.__init__(self)

        outfile, filename_peptide = tempfile.mkstemp()
        os.write(outfile, ">test\n%s\n" % (peptide_sequence))
        os.close(outfile)

        infile = filename_peptide
        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))

        os.remove(filename_peptide)

        masked_sequence = re.sub("\s", "",
                                 string.join(out.split("\n")[1:], ""))

        return masked_sequence
Example 14
    def validate(self, samples):

        # check segment lengths
        l = [x[1] - x[0] for x in self.segments]
        values_input = min(l), max(l), numpy.mean(l), numpy.std(l)

        fail = False

        for i, sample in enumerate(samples):
            l = [x[1] - x[0] for x in sample]
            values_sample = min(l), max(l), numpy.mean(l), numpy.std(l)

            for val, inp, samp in zip(("min", "max", "mean", "std"),
                                      values_input,
                                      values_sample):
                d = abs(inp - samp) / float(inp)

                # segment length distribution fails
                if d >= self.stringency_level:
                    fail = True
                    E.warn("segment length distribution in sample %i: expected %s (%f) != observed %s (%f)" %
                           (i, val, inp, val, samp))

                    break

            if fail:
                break
        else:
            fail = False

        return "\t".join(("%i" % (not fail)), )
Example 15
def getStdoutStderr( stdout_path, stderr_path, tries=5 ):
    '''get stdout/stderr, allowing for some lag.

    Try at most *tries* times. If unsuccessful, throw PipelineError.

    Removes the files once they are read.

    Returns tuple of stdout and stderr.
    '''
    x = tries
    while x >= 0:
        if os.path.exists( stdout_path ): break
        time.sleep(1)
        x -= 1
            
    x = tries
    while x >= 0:
        if os.path.exists( stderr_path ): break
        time.sleep(1)
        x -= 1

    try:
        stdout = open( stdout_path, "r" ).readlines()
    except IOError, msg:
        E.warn( "could not open stdout: %s" % msg )
        stdout = []

    # stderr is read the same way
    try:
        stderr = open( stderr_path, "r" ).readlines()
    except IOError, msg:
        E.warn( "could not open stderr: %s" % msg )
        stderr = []

    # remove the files once they are read
    try:
        os.unlink( stdout_path )
        os.unlink( stderr_path )
    except OSError, msg:
        pass

    return stdout, stderr
Example 16
    def startUp(self):

        self.mHeaders = ("nid", "node", "parent", "level", "start", "end")

        if not self.isComplete():
            self.mOutfile = self.openOutputStream(self.mFilenameSegments)
            self.min_domain_size = self.mConfig.get('adda', 'min_domain_size',
                                                    30)
            self.min_segment_size = self.mConfig.get('segments',
                                                     'min_segment_size', 30)
            self.min_distance_border = self.mConfig.get(
                'segments', 'min_distance_border', 0)
            self.resolution = self.mConfig.get('segments', 'resolution', 10.0)

            E.debug( "splitting parameters: resolution=%f, min_domain_size=%i, min_distance_border=%i" % \
                         (self.resolution,
                          self.min_domain_size,
                          self.min_distance_border ) )

            # rescale
            self.r_min_domain_size = int(
                float(self.min_domain_size) / self.resolution)
            self.r_min_distance_border = int(
                float(self.min_distance_border) / self.resolution)

            if self.mContinueAt == None:
                self.mOutfile.write("\t".join(self.mHeaders) + "\n")
                self.mOutfile.flush()
Example 17
    def _iterate(self):
        """iterate over muliple files."""
        def _iter(infile):

            identifier = None

            for line in infile:
                if line.startswith("#"): continue
                if line.startswith(">"):

                    if self.regexIdentifier:
                        try:
                            identifier = re.search(self.regexIdentifier,
                                                   line[1:-1]).groups()[0]
                        except AttributeError:
                            raise ValueError(
                                "could not parse identifier from line %s - check the input"
                                % line[1:-1])
                    else:
                        identifier = re.split("\s", line[1:-1])[0]

                else:
                    if not identifier:
                        raise ValueError(
                            "refusing to emit sequence without identifier - check the input"
                        )
                    yield identifier, line.strip()

        for filename in self.filenames:
            if self.format == "tar.gz" or self.format == "tar" or (
                    self.format == "auto" and filename.endswith("tar.gz")):
                if filename == "-":
                    tf = tarfile.open(fileobj=sys.stdin, mode="r|*")
                else:
                    tf = tarfile.open(filename, mode="r")
                for f in tf:
                    b, ext = os.path.splitext(f.name)
                    if ext.lower() in (".fasta", ".fa"):
                        E.info("extracting %s" % f.name)
                        infile = tf.extractfile(f)
                        for x in _iter(infile):
                            yield x
                    else:
                        E.info("skipping %s" % f.name)

                if tf != sys.stdin: tf.close()
                continue
            elif self.format == "fasta.gz" or (self.format == "auto"
                                               and filename.endswith(".gz")):
                infile = gzip.open(filename, "r")
            elif filename == "-":
                infile = sys.stdin
            else:
                infile = open(filename, "r")

            for x in _iter(infile):
                yield x
            if filename != "-": infile.close()

        raise StopIteration
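The default identifier parsing in _iter keeps everything up to the first whitespace of the title line; a tiny self-contained illustration (the FASTA line is invented):

import re

line = ">chr1 assembled from contigs\n"
# strip '>' and the trailing newline, then take the first whitespace-delimited token
identifier = re.split("\s", line[1:-1])[0]
assert identifier == "chr1"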
Example 18
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    parser.add_option("-p", "--proc", dest="processors", type="int",
                      help="use # processors [%default]")

    parser.set_defaults(
        processors=1)

    options, args = E.Start(parser, argv=argv)

    t1 = Test(RunnerGat,
              small_test_segmented_workspaces(),
              [ValidatorNumSamples,
               ValidatorSegmentDistribution])

    t1.run(options.stdout,
           processors=options.processors)

    E.Stop()
Example 20
def buildPFAMDomains( infiles, outfile ):
    '''map PFAM domains onto current sequence collection. 
    The mapping is done by ID lookup.'''
    
    infile = infiles[0]
    with IOTools.openFile( "nrdb50.fasta.tsv") as inf:

        reader = csv.DictReader( inf, dialect='excel-tab' )
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']
    
    rx = re.compile( "(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);" )

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    with IOTools.openFile( infile ) as inf:
        for entry in FastaIterator.iterate( inf ):
            c.input += 1
            pid, start, end, pfam_id, description = rx.match( entry.title ).groups()
            try:
                outf.write( "%s\t%i\t%i\t%s\n" % (map_id2nid[pid], int(start)-1, int(end), pfam_id ) )
            except KeyError:
                c.missed += 1
                continue
            c.output += 1

    outf.close()
    E.info( c )
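For reference, an illustration of what the regular expression in buildPFAMDomains pulls out of a Pfam-style FASTA title; the title string is invented:

import re

rx = re.compile("(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")
pid, start, end, pfam_id, description = rx.match(
    "Q9XYZ0/5-120 PF00001;7tm_1 GPCR family;").groups()
# pid == 'Q9XYZ0', start == '5', end == '120',
# pfam_id == 'PF00001', description == '7tm_1 GPCR family'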
Example 21
def main():
    min_unit, T_slot, a, b, c = 68, 1, 0.88652179221, 0.25726495726, 0.0073070866
    exp_times = 1  # number of experiment runs
    record_num = cfg.record_num
    # send_time_list = []
    # compute the number of pieces
    if min_unit == 68:
        piece_num = record_num
    else:
        piece_num = 68 * record_num // min_unit + (1 if (68 * record_num %
                                                         min_unit) else 0)

    # array of how many pieces to send in each round
    # pieces_each_round = 4  # initial value
    # with open("TimeOFpiecesEachRound.csv", "w", newline="", encoding="utf-8") as datacsv:
    #     csvwriter = csv.writer(datacsv, dialect="excel")
    #     csvwriter.writerow(["pieces_each_round", "number of decoded pieces"])
    #
    #     while pieces_each_round <= piece_num:  # termination condition
    #         print(f"\n--------------------------------------------\n")
    #         print(f"\n|        pieces_each_round = %4d           |\n" %(pieces_each_round, ))
    #         print(f"\n--------------------------------------------\n")
    #         send_time = Experiment.run(min_unit, T_slot, a, b, c, pieces_each_round)
    #         send_time_list.append(send_time)
    #         print(send_time_list)
    #         # save
    #         csvwriter.writerow([pieces_each_round, send_time])
    #         print("total send time across all rounds:  ", send_time)
    #         pieces_each_round += 2  # increment per round

    pieces_each_round = piece_num  # send everything in a single round !!!!!
    print("Starting the experiment...")
    if len(sys.argv) == 1:
        print("Automated mode")
        for exp_index in range(1, exp_times + 1):
            print(f"\nStarting experiment run {exp_index}...")
            send_time = Experiment.run(min_unit, T_slot, a, b, c,
                                       pieces_each_round, exp_index, -1)
            print(
                f"Run {exp_index} took {send_time}\n---------------------------")
    elif len(sys.argv) == 2 and sys.argv[1] == 'source':
        print("Source node running standalone")
        send_time = Experiment.run(min_unit, T_slot, a, b, c,
                                   pieces_each_round, 1, 0)
        print(f"Source node finished; the run took {send_time}\n---------------------------")
    elif len(sys.argv) == 3 and sys.argv[1] == 'forward':
        forward_index = int(sys.argv[2])
        if forward_index > len(cfg.Dest_ADDR):
            print("Forwarding-node index exceeds the upper bound")
        print(f"Forwarding node {forward_index} running standalone")
        Experiment.run(min_unit, T_slot, a, b, c, pieces_each_round, 1,
                       forward_index)
        print("Forwarding layer finished.")
    else:
        print("""Invalid arguments!
                 Automated: python exp_main.py
                 Source:    python exp_main.py source
                 Forwarder: python exp_main.py forward 2 (the number picks the node and must match the addresses in config)
        """)
Example 22
def compress(infile):
    '''gzip infile'''

    statement = "gzip -f %(infile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    return E.run(statement)
Example 23
def main():
    
    Experiment.initLogging(os.environ.has_key("GATEWAY_INTERFACE"))
    
    (odict, args) = getOptions()
    (odict, args) = processOptions(odict, args)
        
    data = doit(odict, odict["e"])
Example 25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id$",
                                   usage=globals()["__doc__"])

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("please supply two BAM files.")

    samfile = pysam.Samfile(args[0], "rb")

    readone = set()
    readtwo = set()
    removeone = set()
    removetwo = set()

    for read in samfile.fetch():
        if read.is_read1:
            if read.qname in readone:
                removeone.add(read.qname)
            readone.add(read.qname)
        else:
            if read.qname in readtwo:
                removetwo.add(read.qname)
            readtwo.add(read.qname)

    discarded = 0
    samout = pysam.Samfile(args[1], mode='wb', template=samfile)
    for read in samfile.fetch():
        if (read.qname in removeone) and read.is_read1:
            discarded += 1
        elif (read.qname in removetwo) and read.is_read2:
            discarded += 1
        else:
            samout.write(read)
    samfile.close()
    samout.close()

    E.info( "%s of %s first reads removed; %s of %s second reads; %s of %s multi mapped reads which fell into %s positions (average of %s positions per read)" % \
                ( len(removeone),
                  len(readone),
                  len(removetwo),
                  len(readtwo),
                  len(removeone)+len(removetwo),
                  len(readone)+len(readtwo),
                  discarded,
                  float(discarded)/(len(removeone)+len(removetwo))) )
    ## write footer and output benchmark information.
    E.Stop()
Example 26
def run_experiments():

    try:
        experiment_file = args['file']
    except KeyError as e:
        log.error("run_experiments requires argument %s" % e.args[0])
        return
    import Experiment
    Experiment.run_experiments(experiment_file)
Example 28
def open_file(filename, trigger, *args):
    binfile = filename[0:-4] + "_data.bin"
    if len(args) == 0:
        exp = Experiment.Experiment(binfile, filename, trigger)
    elif len(args) == 1:
        exp = Experiment.Experiment(binfile,
                                    filename,
                                    trigger,
                                    stim_length=args[0])
    experiments[filename[0:-4]] = exp
    names.update(exp.stim_names)
Example 29
def stats():
  congestion = (1.0*Experiment.drop_count)/Experiment.packet_count
  overall_rtt = Experiment.rtt()
  single_rtt = Experiment.single_rtt()
  double_rtt = Experiment.double_rtt()
  if(len(Experiment.ribdeltas) != 0):
    ribdeltas = reduce(lambda x, y: x+y, Experiment.ribdeltas, 0.0)/len(Experiment.ribdeltas)
  else:
    ribdeltas = 0

  return " ".join(map(str,[congestion, overall_rtt[0], overall_rtt[1], Experiment.packet_count, Experiment.drop_count, Experiment.probe_count, Experiment.revmatch, Experiment.cycles, ribdeltas]))
Example 31
def bamToBed(infile, outfile):
    '''convert bam to bed with bedtools.'''

    statement = "bamToBed -i %(infile)s > %(outfile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    retcode = subprocess.call(statement, cwd=os.getcwd(), shell=True)
    if retcode < 0:
        raise OSError("Child was terminated by signal %i: \n%s\n" %
                      (-retcode, statement))

    return outfile
Example 32
def calculateFalsePositiveRate(infiles, outfile):
    '''
    calculate the false positive rate in taxonomic
    abundances
    '''

    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    true_file = infiles[0]
    true_set = set()
    estimate_set = set()
    for estimate_file in infiles[1:]:
        if os.path.basename(estimate_file)[
                len("metaphlan_"):] == os.path.basename(true_file):
            tablenames = [
                P.toTable(os.path.basename(true_file)),
                P.toTable(os.path.basename(estimate_file))
            ]

            for species in cc.execute("""SELECT species_name FROM %s""" %
                                      tablenames[0]).fetchall():
                true_set.add(species[0])
            for species in cc.execute(
                    """SELECT taxon FROM %s WHERE taxon_level == 'species'""" %
                    tablenames[1]).fetchall():
                if species[0].find("_unclassified") != -1: continue
                estimate_set.add(species[0])

    total_estimate = len(estimate_set)
    total_true = len(true_set)

    E.info("counting false positives and false negatives")
    print(estimate_set.difference(true_set))
    nfp = len(estimate_set.difference(true_set))
    nfn = len(true_set.difference(estimate_set))
    ntp = len(estimate_set.intersection(true_set))

    E.info("writing results")
    track = P.snip(os.path.basename(true_file), ".load")
    outf = open(outfile, "w")
    outf.write("track\ttp_rate\tfp_rate\tfn_rate\n")
    outf.write("\t".join(
        map(str, [
            track,
            float(ntp) / total_estimate,
            float(nfp) / total_estimate,
            float(nfn) / total_true
        ])) + "\n")
    outf.close()
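The false positive/negative counting above is plain set arithmetic; a minimal sketch with toy species sets:

true_set = set(["E_coli", "B_subtilis", "S_aureus"])
estimate_set = set(["E_coli", "B_subtilis", "L_lactis"])

nfp = len(estimate_set.difference(true_set))    # 1: L_lactis called but absent
nfn = len(true_set.difference(estimate_set))    # 1: S_aureus missed
ntp = len(estimate_set.intersection(true_set))  # 2: correctly called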
Example 33
def run(infile, options):

    options.tablename = quoteTableName(options.tablename,
                                       backend=options.backend)

    if options.map:
        m = {}
        for x in options.map:
            f, t = x.split(":")
            m[f] = t
        options.map = m
    else:
        options.map = {}

    existing_tables = None

    if options.backend == "pg":
        import pgdb
        dbhandle = pgdb.connect(options.psql_connection)
        error = pgdb.DatabaseError
        options.null = "NULL"
        options.string_value = "'%s'"
        if options.insert_quick:
            raise ValueError("quick import not implemented.")

    elif options.backend == "sqlite":
        import sqlite3
        dbhandle = sqlite3.connect(options.database)
        try:
            os.chmod(options.database, 0664)
        except OSError, msg:
            E.warn("could not change permissions of database: %s" % msg)

        # Avoid the following error:
        # sqlite3.ProgrammingError: You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings
        # Note: might be better to make csv2db unicode aware.
        dbhandle.text_factory = str

        error = sqlite3.OperationalError
        options.insert_many = True  # False
        options.null = None  # "NULL"
        options.string_value = "%s"  # "'%s'"

        statement = "SELECT name FROM sqlite_master WHERE type='table'"
        cc = executewait(dbhandle, statement, error, options.retry)
        existing_tables = set([x[0] for x in cc])
        cc.close()

        quick_import_statement = "sqlite3 -header -csv -separator '\t' %s '.import %%s %s'" % (
            options.database, options.tablename)
Example 35
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = open(args[0], 'r')
    genes_list = []
    header = None

    for line in infile:
        if line.startswith("#"): continue
        if line.startswith("gene_id"):
            header = line.rstrip('\n')
            num_fields = len(header.split('\t')) - 2
            total_reads = [0] * num_fields
            continue

        la = line.rstrip('\n').split('\t')
        if len(la) < 3:
            continue
        genes_list.append(la)
        total_reads = map(lambda x, y: float(x) + float(y), total_reads,
                          la[2::])

    total_reads = map(lambda x: float(x) / 1000000, total_reads)

    print header

    for gene in genes_list:
        my_str_list = gene[0:2]
        vals = map(
            lambda x, y: float(x) / float(y) / (float(gene[1]) / 1000.0),
            gene[2::], total_reads)
        my_str_list.extend(map(str, vals))
        print "\t".join(my_str_list)

    ## write footer and output benchmark information.
    E.Stop()
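The per-gene arithmetic above is an RPKM-style normalisation: counts scaled by millions of mapped reads and by gene length in kilobases. A standalone sketch with made-up numbers:

counts = 500.0        # reads mapped to the gene (hypothetical)
total_reads = 20.0e6  # library size
length_bp = 2000.0    # gene length

rpkm = counts / (total_reads / 1e6) / (length_bp / 1000.0)
assert rpkm == 12.5   # 500 reads / 20 million reads / 2 kb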
Example 36
def bamToMEDIPS( infile, outfile ):
    '''convert bam to medips format

    contig, start, end, strand

    Start is 1-based.
    '''

    statement = '''bamToBed -i %(infile)s | awk '{printf("%%s\\t%%i\\t%%i\\t%%s\\n", $1,$2+1,$3,$6)}' > %(outfile)s''' % locals()

    E.debug( "executing statement '%s'" % statement )

    E.run( statement )

    return outfile
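A note on the escaping above: the awk program passes through Python %-interpolation first, so %%s must stand in for %s, while \\t survives as a literal \t for awk to interpret. A minimal sketch of the same two-stage expansion (paths hypothetical):

infile, outfile = "in.bam", "out.medips"
statement = '''bamToBed -i %(infile)s | awk '{printf("%%s\\t%%i\\t%%i\\t%%s\\n", $1,$2+1,$3,$6)}' > %(outfile)s''' % locals()
# after interpolation the shell receives:
#   bamToBed -i in.bam | awk '{printf("%s\t%i\t%i\t%s\n", $1,$2+1,$3,$6)}' > out.medips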
Example 37
def iterator_test( infile, report_step = 100000 ):
    '''only output parseable lines from infile.'''
    
    ninput, noutput, nerrors = 0, 0, 0

    while 1:
        try:
            x = infile.next()
        except ParsingError, msg:
            nerrors += 1
            ninput += 1
            E.warn( str(msg) )
            continue
        except StopIteration:
            break
Example 39
def adience_poisson_experiment(dataset, tau_mode, tau=1.):
    """Runs a complete experiment (training and test) with the poisson configuration on the Adience dataset.

    :param dataset: Dataset to run the experiment on.
    :param tau_mode: How the tau parameter is handled in the experiment (kept constant or learned).
    :param tau: Value of the tau parameter; the initial value if it is to be learned, otherwise the constant value.
    """
    #    RUN ADIENCE POISSON
    assert tau_mode in ["non_learnable", "sigm_learnable"]

    #Create net architecture
    poisson_resnet = Resnet_2x4_poisson(tau_mode)

    model = Model(inputs=poisson_resnet.inputs,
                  outputs=poisson_resnet.get_net())

    #Create experiment
    experiment = Experiment.Experiment(dataset, model)

    #Train
    experiment.train('adience_poisson_t=' + tau_mode,
                     '/TFG/ordinal_unimodal_mio/src/logs/')

    #Test
    experiment.test()
Example 40
def adience_baseline_experiment_sgd(dataset):
    """Runs a complete experiment (training and test) with the baseline configuration and the Nesterov SGD optimizer on the Adience dataset.

    :param dataset: Dataset to run the experiment on.
    """
    #    RUN ADIENCE BASELINE

    #Create net architecture
    baseline_resnet = Resnet_2x4()

    model = Model(inputs=baseline_resnet.inputs,
                  outputs=baseline_resnet.get_net())

    #Create experiment
    experiment = Experiment.Experiment(dataset, model)

    arguments = {
        'epochs': 100,
        'optimizer': SGD,
        'learning_rate': 1e-2,
        'momentum': 0.9,
        'loss_fn': 'categorical_crossentropy',
        'metrics': ['accuracy']
    }

    callbacks = ['ModelCheckpoint']

    #Train
    experiment.train('adience_baseline',
                     '/TFG/ordinal_unimodal_mio/src/logs/',
                     arguments=arguments,
                     callbacks=callbacks)

    #Test
    experiment.test()
Example 41
def suggest_analysis_layout(solid_runs):
    """Generate a bash script to build the analysis directory scheme

    Given a set of SolidRuns, print a set of script commands for running the
    build_analysis_dir.py program to create and populate the analysis directories.

    The script can be edited before being executed by the user.

    Arguments:
      solid_runs: a list of SolidRun objects.
    """
    print "#!/bin/sh\n#\n# Script commands to build analysis directory structure"
    for run in solid_runs:
        build_analysis_dir_cmd = 'build_analysis_dir.py'
        top_dir = os.path.abspath(
            os.path.join(os.getcwd(), os.path.basename(run.run_dir)))
        for sample in run.samples:
            for project in sample.projects:
                # Create one experiment per project
                cmd_line = []
                expt = Experiment.Experiment()
                expt.name = project.getProjectName()
                expt.type = "expt"
                expt.sample = project.getSample().name
                expt.library = project.getLibraryNamePattern()
                # Print the arguments for the layout
                cmd_line.extend(
                    (build_analysis_dir_cmd, "--top-dir=%s_analysis" % top_dir,
                     "--link=absolute", "--naming-scheme=partial"))
                cmd_line.append(expt.describe())
                cmd_line.append(run.run_dir)
                print "#\n%s" % (' \\\n').join(cmd_line)
Example 42
  def test_t4(self):
    r1 = Experiment.largest([3,2,3,4]) 
    self.assertEqual(r1, 4)

  #def test_t5(self):
  #  r1 = Experiment.largest([3,-2,3,-1,4])
  #  self.assertEqual(r1, 4)
Example 43
def main():
    min_unit, T_slot, a, b, c = 68, 1, 0.88652179221, 0.25726495726, 0.0073070866
    exp_times = 1  # number of experiment runs
    record_num = cfg.record_num
    send_time_list = []
    # compute the number of pieces
    if min_unit == 68:
        piece_num = record_num
    else:
        piece_num = 68 * record_num // min_unit + (1 if (68 * record_num %
                                                         min_unit) else 0)

    # array of how many pieces to send in each round
    pieces_each_round = 4  # initial value
    with open("TimeOFpiecesEachRound.csv", "w", newline="",
              encoding="utf-8") as datacsv:
        csvwriter = csv.writer(datacsv, dialect="excel")
        csvwriter.writerow(["pieces_each_round", "pieces sent by the source"])

        while pieces_each_round <= piece_num:  # termination condition
            print(f"\n--------------------------------------------\n")
            print(f"\n|        pieces_each_round = %4d           |\n" %
                  (pieces_each_round, ))
            print(f"\n--------------------------------------------\n")
            send_num = Experiment.run(min_unit, T_slot, a, b, c,
                                      pieces_each_round, 1, -1)
            send_time_list.append(send_num)
            print(send_time_list)
            # save
            csvwriter.writerow([pieces_each_round, send_num])
            print("total send time across all rounds:  ", send_num)
            pieces_each_round += 2  # increment per round
Example 45
def bedToMEDIPS( infile, outfile ):
    '''convert bed to medips format

    contig, start, end, strand

    Start is 1-based.
    '''

    if infile.endswith( ".gz" ): cat = "zcat"
    else: cat = "cat"

    statement = '''%(cat)s %(infile)s | awk '{printf("%%s\\t%%i\\t%%i\\t%%s\\n", $1,$2+1,$3,$6)}' > %(outfile)s''' % locals()

    E.run( statement )

    return outfile
Example 46
def experimentCounting(ranger):
    #print("Probabilistic Counting")
    #print("\nTo use hash functions:")

    hashes = list(hashlib.algorithms_guaranteed)

    # filter rather than mutating the list while iterating over it
    hashes = [h for h in hashes if not h.lower().startswith("shake_")]

    #print(hashlib.algorithms_guaranteed)
    #print(hashes)

    setups = Experiment.getSetup(["distinct", "hashes"])

    distincts = setups.get("various numbers of distinct elements")
    numHashes = setups.get("number of hashes")

    #for i in range(len(distincts)):
    #    distinct = distincts[i][0]
    #    calcCounting(distinct,3,hashes)

    for i in range(len(numHashes)):
        numHash = numHashes[i]
        calcCounting(ranger, numHash, hashes)
Example 47
def experiment(args, logger, dataProcessor):
    exp = Experiment.Experiment()

    model = dataProcessor.loadNetwork(args, 0)
    #PolicyValueFn.PolicyValueFn(args).to(args.device)
    data = exp.evaluationWithDifferentMinMaxSearchAgent(model)
#    data = exp.evaluationForNetworkWithFourRollout(model,start=10,end=50,step=10,random_cnt=1,numOfEvaluations=1)
    logger.info(data)
Example 48
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species level analysis from metaphlan.
    The gi_taxid_nucl table is huge, and therefore this function
    takes an age to run - can think of optimising this somehow
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(iotools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.items():
        rel_abundance[gi] = float(ab) / total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    result = collections.defaultdict(float)
    for gi in list(rel_abundance.keys()):
        E.info("processing gi %s" % gi)
        taxid = cc.execute(
            """SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" %
            gi).fetchone()[0]
        species_id = cc.execute(
            """SELECT species_id FROM categories WHERE taxid == '%s'""" %
            taxid).fetchone()[0]
        species_name = cc.execute(
            """SELECT taxname FROM names WHERE taxid == '%s' AND description == 'scientific name'"""
            % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
               (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.items():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()
Example 50
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-o", "--output-prefix", dest="output_prefix", type="string",
                      help="output filename prefix [default=%default]."  )

    parser.add_option("-c", "--chromosome-table", dest="filename_chromosome_table", type="string",
                      help="filename with tab separated list of chromosome names [default=%default]."  )

    parser.add_option( "--action", dest="action", type="choice",
                       choices=("plot", "run"),
                       help="action to perform [default=%default]")
    parser.add_option( "-s", "--signal-value", dest="signal_value", type="string", 
                       help="use either p.value or sig.value as ranking measure [default=%default]" )
    
    parser.set_defaults(
        action = "plot",
        output_prefix = "output",
        half_width = None,
        overlap_ratio = 0,
        is_broadpeak = False,
        signal_value = "signal.value",
        filename_chromosome_table = "genome_table.txt",
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if options.action == "plot":
        plotIDR( options.output_prefix + ".pdf", args)
    elif options.action == "run":
        if len(args) != 2:
            raise ValueError("require exactly two replicates")
        runIDR( options, args[0], args[1])
        
    ## write footer and output benchmark information.
    E.Stop()
Example 51
def _outputHistogram(counts, bins, section):
    outf = E.openOutputFile("%s.table" % section)
    outf.write("%s\tcounts\tfrequency\tcumulative\n" % section)
    t, cc = sum(counts), 0
    for bin, c in zip(bins[:-1], counts):
        cc += c
        outf.write("\t".join((str(bin), str(c),
                              "%6.4f" % (100.0 * c / t),
                              "%6.4f" % (100.0 * cc / t))) + "\n")
Example 52
def doAmazing(odict, args):
    
    if args:
        exp_names = [args[0]]
    else:
        exp_names = Experiment.getAllExperiments()
        
    for exp_name in exp_names:
        exp = Experiment.Experiment(exp_name)
        exp.regenHTML()
Example 53
def checkBlastRuns( infiles, outfile ):
    '''check if output files are complete.
    '''
    
    outf = IOTools.openFile( outfile, "w" )

    outf.write( "chunkid\tquery_first\tquery_last\tfound_first\tfound_last\tfound_total\tfound_results\thas_finished\tattempts\t%s\n" %\
                    "\t".join(Logfile.RuntimeInformation._fields))

    for infile in infiles:
        E.debug( "processing %s" % infile)
        chunkid = P.snip( os.path.basename( infile ), ".blast.gz" )
        logfile = infile + ".log"
        chunkfile = P.snip( infile, ".blast.gz" ) + ".fasta"

        with IOTools.openFile( infile ) as inf:
            l = inf.readline()
            ids = set()
            total_results = 0
            for l in inf:
                if l.startswith("#//"): continue
                ids.add( int(l.split("\t")[0] ) )
                total_results += 1
            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

        l = IOTools.getFirstLine( chunkfile )
        query_first = l[1:-1]
        l2 = IOTools.getLastLine( chunkfile, nlines = 2).split("\n")
        query_last = l2[0][1:]

        logresults = Logfile.parse( logfile )
        
        outf.write( "\t".join( map(str, (\
                        chunkid, query_first, query_last,
                        found_first, found_last,
                        found_total, total_results,
                        logresults[-1].has_finished,
                        len(logresults),
                        "\t".join( map(str, logresults[-1]) ) ) ) ) + "\n" )
        
    outf.close()
Example 54
	def __init__(self, measurementSizeX, measurementSizeY, measurementSizeZ, psfSizeX, psfSizeY, psfSizeZ, numChannels, homeDirectory, ops):	
		Experiment.__init__(self,measurementSizeX, measurementSizeY, measurementSizeZ, psfSizeX, psfSizeY, psfSizeZ, numChannels, homeDirectory, ops)
	
		# parameters of the top sphere
		self.spherePositionX=self.objectSizeX / 2
		self.spherePositionY=self.objectSizeY / 2
		self.spherePositionZ=self.objectSizeZ / 2 - 27
		self.sphereRadius=20
		self.sphereIntensity=10000

		# parameters of the bottom sphere
		self.spherePosition2X=self.objectSizeX / 2
		self.spherePosition2Y=self.objectSizeY / 2
		self.spherePosition2Z=self.objectSizeZ / 2 
		self.sphereRadius2=5
		self.sphereIntensity2=10000

		self.background=0.000001

		self.directory=homeDirectory+"/SpheresHighIntensity/"
Example 55
def main():
    print "DEBUG: Entering Driver.main()"

    expGUI = ExperimentGUI() # create new ExperimentGUI object using default values    

    res = Results() # create new Results object using default value
    res.writeToFile(expGUI.window.get_title() + " Experiment\n")
    res.writeToFile("Experiment started at: " + str(datetime.now()) + "\n\n")

    gtk.main()

   # http://www.pygtk.org/dist/pygtk2-tut.pdf
    
    catA = ImageCategory("A", ["A0.jpg", "A1.jpg", "A2.jpg", "A3.jpg", "A4.jpg", "A5.jpg"])
    catB = ImageCategory("B", ["B0.jpg", "B1.jpg", "B2.jpg", "B3.jpg", "B4.jpg"])

    categories = [catA, catB]
    
    lb1 = LearningBlock(["A0.jpg", "A1.jpg", "B2.jpg"], 10.0)
    lb2 = LearningBlock(["B3.jpg", "B4.jpg", "A1.jpg"], 15.0)
    tb1 = TestingBlock(["A0.jpg", "A1.jpg", "B2.jpg"])
    tb2 = TestingBlock(["A3.jpg", "B4.jpg"])

    lblockList = [lb1, lb2]
    tblockList = [tb1, tb2]
    
    lp = LearningPhase(lblockList)
    tp = TestingPhase(tblockList)
    phaseList = [lp, tp]
    exp = Experiment(phaseList)
    exp.runPhases(categories, res)
    
    print "DEBUG: Entering gtk.main()"
    
    #gtk.main()
    
    print "DEBUG: Exiting gtk.main()"

    res.writeToFile("Experiment ended at: " + str(datetime.now()))

    print "DEBUG: Exiting Driver.main()"
Example 56
def bigwig( infile, contig_sizes ):
    '''convert infile to bigwig file'''

    if infile.endswith( ".wig"):
        outfile = infile[:-4] + ".bigwig"
    else:
        outfile = infile + ".bigwig"
        
    tmp, filename_sizes = tempfile.mkstemp() 

    os.write( tmp, "\n".join( [ "\t".join(map(str,x)) for x in contig_sizes.iteritems() ] ) )
    os.close( tmp )

    statement = "wigToBigWig -clip %(infile)s %(filename_sizes)s %(outfile)s " % locals() 

    E.debug( "executing statement '%s'" % statement )

    if E.run( statement ):
        os.unlink( infile )

    os.unlink( filename_sizes )
Example 58
    def run( self, outfile, processors = 1 ):

        tasks = []

        manager = multiprocessing.Manager()
        lock = manager.Lock()

        for segmentor in self.test_generator:
            headers = segmentor.headers
            tasks.append( (lock, outfile, segmentor, self.runner, self.validators) )

        for v in self.validators: headers.extend( v.headers )
        outfile.write( "%s\n" % "\t".join( headers) )
            
        E.info( "created %i tasks for %i workers" % (len(tasks), processors ) )
        
        if processors > 1:
            pool = multiprocessing.Pool( processors )
            pool.map( runSimulation, tasks ) 
        else:
            for task in tasks:
                runSimulation( task )
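The dispatch pattern in run above, a worker mapped over task tuples with a serial fallback for a single processor, in minimal form; the worker here is a stand-in for any picklable top-level function such as runSimulation:

import multiprocessing

def work(task):
    # placeholder worker; the real tasks carry (lock, outfile, segmentor, ...)
    return sum(task)

if __name__ == "__main__":
    tasks = [(1, 2), (3, 4)]
    processors = 2
    if processors > 1:
        pool = multiprocessing.Pool(processors)
        results = pool.map(work, tasks)
    else:
        results = [work(t) for t in tasks]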