def saveDf(data, filename):
    '''
    Save a DataFrame to a *.df pickle file.

    data : DataFrame
    filename : destination path for the pickle
    '''
    # DataFrame.save pickles the frame and returns None, so report the
    # destination path instead of the (empty) return value
    data.save(filename)
    print 'saved', filename
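
A minimal usage sketch (the frame contents and output path are hypothetical; DataFrame.save is the pickle-based API from pandas releases before 0.12, later replaced by to_pickle):

from pandas import DataFrame

df = DataFrame({'a': [1, 2, 3]})  # hypothetical example data
saveDf(df, 'example.df')          # writes the pickle to example.df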
Example #2
def mapNewData(working_bucket, data, meta_data, anno_data, syn_file, agilent_file, network_table):
    """
    Given local file locations for source data, meta data, annotations data,
        synonyms file and the agilent (probe->gene) file,
    Creates a new dataframe, containing only gene information for genes
        present in the network table, indexed by gene name, columns are sample ids
    Returns dataframe pickle file location and dataframe
    """
    anno = pandas.io.parsers.read_table(anno_data)
    data = pandas.io.parsers.read_table(data)
    metadata = pandas.io.parsers.read_table(meta_data)
    agl = pandas.io.parsers.read_table(agilent_file)
    
    #get rid of control probes

    data.index = anno['ProbeName']
    control_probe_names = anno['ProbeName'][anno['ControlType'] != 0]
    data = data.drop(control_probe_names)

    #drop probes with no gene symbol, then index by probe id
    #(set_index returns a new frame, so keep the result)
    agl2 = agl[agl['GeneSymbol'].notnull()]
    agl2 = agl2.set_index('ProbeID')

    #map probes to genes from network

    #gene symbols that actually appear on the agilent array
    agilent_symbols = set(agl2['GeneSymbol'].tolist())
    table = Table(network_table)
    temp_nets = table.scan()
    network_genes = []
    for net in temp_nets:
        #gene_ids is a string with a 6-character prefix followed by
        #'~:~'-separated gene names
        network_genes += net['gene_ids'][6:].split('~:~')
    network_genes_set = set(network_genes)


    #mm maps a source gene symbol -> list of matching network gene names
    mm = {}
    with open(syn_file,'r') as synonyms:
        for line in synonyms:
            parsed = line.split()
            try:
                temp = []
                for p in parsed[:5]:
                    tt = p.split('|')
                    for t in tt:
                        if len(t) > 2 and t in network_genes_set and parsed[2] in agilent_symbols:
                            temp.append(t)
                if len(temp) > 0:
                    if parsed[2] not in mm:
                        mm[parsed[2]] = []
                    for t in temp:
                        if t not in mm[parsed[2]]:
                            mm[parsed[2]].append(t)
                
            except IndexError:
                pass
    #ng2p maps a network gene name -> list of agilent probe ids
    ng2p = {}
    with open(agilent_file, 'r') as gl:
        for line in gl:
            parsed = line.split()
            try:
                if parsed[2] in mm: #mouse gene is mapped to network gene
                    for ng in mm[parsed[2]]:
                        if ng not in ng2p:
                            ng2p[ng] = []
                        if parsed[0] not in ng2p[ng]:
                            ng2p[ng].append(parsed[0])
            except IndexError:
                pass
    #create newly trimmed and annotated data frame
    #save pickle locally

    #one row per network gene: a gene's expression is the median of its
    #probes' values in each sample
    df = DataFrame(np.zeros((len(ng2p), len(data.columns))),
                   index=ng2p.keys(), columns=data.columns)
    for k, v in ng2p.iteritems():
        df.ix[k] = data.ix[v].median()
    saved = os.path.join(os.path.split(agilent_file)[0],'trimmed_dataframe.pandas')
    df.save(saved)
    
    #send pickled dataframe to working bucket
    conn = boto.connect_s3()
    b = conn.get_bucket(working_bucket)
    k = Key(b)
    k.key = 'trimmed_dataframe.pandas'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(saved)

    k.key = 'metadata.txt'
    k.storage_class = 'REDUCED_REDUNDANCY'
    k.set_contents_from_filename(meta_data)

    return saved, df
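
A hedged invocation sketch (every path, the bucket, and the table name below are hypothetical; the function itself assumes pandas, numpy as np, os, boto, boto's Key, and a DynamoDB-style Table class are already imported at module level):

saved_path, trimmed_df = mapNewData(
    'my-working-bucket',          # hypothetical S3 bucket
    '/data/expression.txt',       # tab-separated expression matrix
    '/data/metadata.txt',         # sample metadata
    '/data/annotations.txt',      # probe annotations (ProbeName, ControlType)
    '/data/gene_synonyms.txt',    # whitespace-delimited synonyms file
    '/data/agilent_probes.txt',   # ProbeID -> GeneSymbol mapping
    'network-table')              # hypothetical network table name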
Example #3
def profile_comparative(benchmarks):

    from vbench.api import BenchmarkRunner
    from vbench.db import BenchmarkDB
    from vbench.git import GitRepo
    from suite import BUILD, DB_PATH, PREPARE, dependencies
    TMP_DIR = tempfile.mkdtemp()

    try:

        prprint("Opening DB at '%s'...\n" % DB_PATH)
        db = BenchmarkDB(DB_PATH)

        prprint("Initializing Runner...")

        # all in a good cause...
        GitRepo._parse_commit_log = _parse_wrapper(args.base_commit)

        runner = BenchmarkRunner(
            benchmarks, REPO_PATH, REPO_PATH, BUILD, DB_PATH,
            TMP_DIR, PREPARE, always_clean=True,
            # run_option='eod', start_date=START_DATE,
            module_dependencies=dependencies)

        repo = runner.repo  # (steal the parsed git repo used by runner)
        h_head = args.target_commit or repo.shas[-1]
        h_baseline = args.base_commit

        # ARGH. reparse the repo, without discarding any commits,
        # then overwrite the previous parse results
        # prprint ("Slaughtering kittens..." )
        (repo.shas, repo.messages,
         repo.timestamps, repo.authors) = _parse_commit_log(None, REPO_PATH,
                                                            args.base_commit)

        prprint('Target [%s] : %s\n' % (h_head, repo.messages.get(h_head, "")))
        prprint('Baseline [%s] : %s\n' % (h_baseline,
                repo.messages.get(h_baseline, "")))

        prprint("Removing any previous measurements for the commits.")
        db.delete_rev_results(h_baseline)
        db.delete_rev_results(h_head)

        # TODO: we could skip this, but we need to make sure all
        # results are in the DB, which is a little tricky with
        # start dates and so on.
        prprint("Running benchmarks for baseline [%s]" % h_baseline)
        runner._run_and_write_results(h_baseline)

        prprint("Running benchmarks for target [%s]" % h_head)
        runner._run_and_write_results(h_head)

        prprint('Processing results...')

        head_res = get_results_df(db, h_head)
        baseline_res = get_results_df(db, h_baseline)
        ratio = head_res['timing'] / baseline_res['timing']
        totals = DataFrame({HEAD_COL: head_res['timing'],
                            BASE_COL: baseline_res['timing'],
                            'ratio': ratio,
                            'name': baseline_res.name},
                           columns=[HEAD_COL, BASE_COL, "ratio", "name"])
        # ignore benchmarks below the duration threshold
        totals = totals.ix[totals[HEAD_COL] > args.min_duration]
        # sort by ratio in ascending order
        totals = totals.dropna().sort("ratio").set_index('name')

        h_msg = repo.messages.get(h_head, "")
        b_msg = repo.messages.get(h_baseline, "")

        print_report(totals,h_head=h_head,h_msg=h_msg,
                     h_baseline=h_baseline,b_msg=b_msg)

        if args.outdf:
            totals.save(args.outdf)
            prprint("The results DataFrame was written to '%s'\n" % args.outdf)
    finally:
        # always clean up the temporary checkout directory
        shutil.rmtree(TMP_DIR)
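
profile_comparative reads several module-level names (args, REPO_PATH, HEAD_COL, BASE_COL, prprint, _parse_wrapper, _parse_commit_log, get_results_df, print_report), so it only runs inside its driver script. A hedged sketch of such a driver, with all flag names assumed rather than confirmed by this snippet:

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-commit')           # stored as args.base_commit
    parser.add_argument('--target-commit', default=None)
    parser.add_argument('--min-duration', type=float, default=0.01)
    parser.add_argument('--outdf', default=None)
    args = parser.parse_args()                     # module global, read by profile_comparative

    # assumption: the suite module also exposes the collected benchmark list
    from suite import benchmarks
    profile_comparative(benchmarks)
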
def prepare_data(path, smoothstr, smooth, exp_genotype, ctrl_genotype):

    df2 = DataFrame()
    if not os.path.exists(path + "/Speed_calculations/"):
        os.makedirs(path + "/Speed_calculations/")
    for csvfile in sorted(glob.glob(path + "/*.csv")):
        csvfilefn = os.path.basename(csvfile)
        try:
            experimentID, date, time = csvfilefn.split("_", 2)
            genotype, laser, repID = experimentID.split("-", 2)
            repID = repID + "_" + date
            print "processing: ", experimentID
        except ValueError:
            print "invalid filename:", csvfilefn
            continue
        df = pd.read_csv(csvfile, index_col=0)

        if not df.index.is_unique:
            raise Exception("CORRUPT CSV. INDEX (NANOSECONDS SINCE EPOCH) MUST BE UNIQUE")


        #resample to 10ms (mean) and set a proper time index on the df
        df = flymad_analysis.fixup_index_and_resample(df, '10L')

        #smooth the positions, and recalculate the velocities based on this
        dt = flymad_analysis.kalman_smooth_dataframe(df, arena, smooth)

        df['laser_state'] = df['laser_state'].fillna(value=0)

        #re-zero the time axis on the first laser-on sample
        lasermask = df[df['laser_state'] == 1]
        df['tracked_t'] = df['tracked_t'] - np.min(lasermask['tracked_t'].values)

        #MAXIMUM SPEED = 300: treat anything faster as invalid
        df['v'][df['v'] >= 300] = np.nan
        
        #the resampling above used the default rule of 'mean', so any bin in
        #which the laser was on at all has a fractional value > 0 (e.g. a bin
        #half-covered by laser-on samples resamples to 0.5); clip back to 0/1
        df['laser_state'][df['laser_state'] > 0] = 1
            
        df['Genotype'] = genotype
        df['lasergroup'] = laser
        df['RepID'] = repID

        #combine 60s trials together into df2:
        #a laser-on event is a 0 -> 1 transition of laser_state
        dfshift = df.shift()
        laserons = df[(df['laser_state'] - dfshift['laser_state']) == 1]
        #SILLY BUG: laserons sometimes contains spurious timepoints. To fix
        #this, compare each laser-on time to the previous one and discard it
        #if the gap is shorter than the experimental looping time.
        prev = pd.DatetimeIndex([datetime.datetime(1986, 5, 27)])[0]  #arbitrary initial time, far in the past
        for ix,row in laserons.iterrows():
            before = ix - DateOffset(seconds=9.95)
            after = ix + DateOffset(seconds=59.95)
            if (ix - prev).total_seconds() <= 59.95:
                continue
            else:
                print "ix:", ix, "\t span:", ix - prev
                prev = ix
                dftemp = df.ix[before:after][['Genotype', 'lasergroup','v', 'laser_state']]
                dftemp['align'] = np.linspace(0,(after-before).total_seconds(),len(dftemp))
                df2 = pd.concat([df2, dftemp])

    expdf = df2[df2['Genotype'] == exp_genotype]
    ctrldf = df2[df2['Genotype'] == ctrl_genotype]

    #we no longer need to group by genotype, and lasergroup is always the same here
    #so just drop it. 
    assert len(expdf['lasergroup'].unique()) == 1, "only one lasergroup handled"

    expmean = expdf.groupby(['align'], as_index=False).mean().astype(float)
    ctrlmean = ctrldf.groupby(['align'], as_index=False).mean().astype(float)

    expstd = expdf.groupby(['align'], as_index=False).std().astype(float)
    ctrlstd = ctrldf.groupby(['align'], as_index=False).std().astype(float)

    expn = expdf.groupby(['align'], as_index=False).count().astype(float)
    ctrln = ctrldf.groupby(['align'], as_index=False).count().astype(float)

    #TODO: refactor the repetitive save calls below
    df2.save(path + "/df2" + smoothstr + ".df")
    expmean.save(path + "/expmean" + smoothstr + ".df")
    ctrlmean.save(path + "/ctrlmean" + smoothstr + ".df")
    expstd.save(path + "/expstd" + smoothstr + ".df")
    ctrlstd.save(path + "/ctrlstd" + smoothstr + ".df")
    expn.save(path + "/expn" + smoothstr + ".df")
    ctrln.save(path + "/ctrln" + smoothstr + ".df")

    return expmean, ctrlmean, expstd, ctrlstd, expn, ctrln, df2
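
A hedged call sketch (directory, smoothing label, and genotype strings are all hypothetical; the function expects CSVs named like <genotype>-<laser>-<rep>_<date>_<time>.csv under path, and relies on pd, np, glob, os, datetime, DateOffset, DataFrame, flymad_analysis and arena being available at module level):

(expmean, ctrlmean, expstd, ctrlstd,
 expn, ctrln, df2) = prepare_data('/data/flymad_run', 'smooth', True,
                                  'exp-GAL4', 'ctrl-GAL4')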