Code Example #1
File: gt.py Project: wrbutton/foo
import os
import shutil

import gt  # project-internal helpers (wrbutton/foo)


def delete_inf(path):
    """ deletes all INF files and *_final folders (except finalqc) under the given path """
    filelist = gt.globit(path, '*INF*')
    folderlist = gt.globit(path, '*_final*')
    folderlist = [x for x in folderlist if 'finalqc' not in x]
    for f in filelist:
        print(f)
        os.remove(f)
    for f in folderlist:
        print(f)
        shutil.rmtree(f)
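
A minimal usage sketch, with a hypothetical path; the function prints each matching item before deleting it:

delete_inf('/path/to/processed/plates')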
Code Example #2
import os
import pickle

import pandas as pd

import gt   # project-internal helpers (wrbutton/foo)
import gct


def predict_cells(input, save=False):
    """ accepts a directory (looping through its files) or one dataframe at a time;
    uses v1.0 of the SVM classifier to consolidate reps to a consensus and return a prediction.
    when save is True the resulting dataframe is saved """

    with open('/Users/WRB/Dropbox/bin/python/celllineclassifier.p',
              'rb') as file:
        clf = pickle.load(file)
    if isinstance(input, str):
        if os.path.isdir(input):
            vlist = gt.globit(input, '*_Qctrl_n*')
            if len(vlist) == 0:
                vlist = gt.globit(input, '*QNORM*')
        else:
            vlist = [input]
    elif isinstance(input, pd.Series):
        try:
            res = clf.predict([input])[0]
        except Exception:
            print('error with series prediction')
            res = None
        return res
    else:
        vlist = input
    res_table = pd.DataFrame()
    for f in vlist:
        try:
            d, h = gct.extractgct(f)
        except Exception:
            # assume the list already holds an extracted (data, header) pair
            d, h = vlist[0], vlist[1]
        ds, hs = gt.dsub(d, h, {'type': 'vehicle'})
        if len(ds) == 0:
            print('error, maybe using ZS file? use QNORM instead')
            return None
        for b in hs.batch.unique():
            dsb, hsb = gt.dsub(ds, hs, {'batch': b})
            med = dsb.median(axis=1).values
            shn = gt.get_shn(f) + '-' + b
            res = clf.predict([med])[0]
            res_table.loc[shn, 'cell'] = res
            print(f'{shn} - {res}')
    if save is True:
        res_table.to_csv(gt.dflt_outpath(fn='cell_predictions.csv'), sep='\t')
    return res_table
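
A hedged usage sketch: a directory argument is globbed for *_Qctrl_n* (or *QNORM*) files, while a pandas Series is classified directly; the path and variable names here are hypothetical:

preds = predict_cells('/path/to/qnorm_dir', save=True)  # DataFrame of per-batch cell predictions
label = predict_cells(median_profile)                   # single prediction for one Series, or None on error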
Code Example #3
import os

import pandas as pd

import gt   # project-internal helpers (wrbutton/foo)
import gct


def get_vehicle_matrix(path='dflt', batch='all', delim=':', getcells=False):
    """ for the given path, load all files and collapse vehicles into a matrix for plotting.
    batch can be 'all', or 'A' to take only the first batch. getcells will re-predict cell lines """
    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    flv = gt.globit(path, '*Qctrl*')
    if len(flv) == 0:
        flv = gt.globit(path, '*_QNORM_*')

    # should put in a check to extract from regular qnorms
    dlist, hlist = [], []
    for f in flv:
        d, h = gct.extractgct(f)
        h['plate'] = h['plate'].apply(lambda x: x[:6])
        d, h = gt.dsub(d, h, {'type': 'vehicle'})
        if batch == 'all':
            for b in h.batch.unique():
                ds, hs = gt.dsub(d, h, {'batch': b})
                med = ds.median(axis=1)
                hs = gt.gen_label(hs, 'pb', delim=delim)
                dlist.append(med)
                hlist.append(hs.iloc[0])
        elif batch == 'A':
            ds, hs = gt.dsub(d, h, {'batch': 'A'})
            med = ds.median(axis=1)
            hs = gt.gen_label(hs, 'pb', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])
        else:
            med = d.median(axis=1)
            # label the full header here, since no batch subset was taken
            hs = gt.gen_label(h, 'p', delim=delim)
            dlist.append(med)
            hlist.append(hs.iloc[0])

    vdf = pd.concat(dlist, axis=1)
    vh = pd.DataFrame(hlist)
    vdf.columns = vh.label
    if getcells is True:
        vh['cell2'] = vh.label.apply(lambda x: predict_cells(vdf[x]))
        vh['label'] = vh.label + delim + vh.cell2
    vdf.columns = vh.label
    return vdf, vh
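
A usage sketch assuming the default newQC desktop folder holds Qctrl/QNORM gct files:

vdf, vh = get_vehicle_matrix(batch='all', getcells=True)
print(vdf.shape)        # one consensus (median) vehicle column per plate/batch
print(vh.label.head())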
Code Example #4
File: qc.py Project: wrbutton/foo
import pandas as pd

import gt   # project-internal helpers (wrbutton/foo)
import gct


def assemble_ref_dat(path):
    """ gathers together all reference RNA wells within the given path """
    fl = gt.globit(path, '*_ref_n*')
    dl, hl = [], []
    for f in fl:
        dr, hr = gct.extractgct(f)
        dr, hr = gt.dsub(dr, hr, {'well':['A02','B02']})
        dr = round(dr, 2)
        dl.append(dr)
        hl.append(hr)  # headers are collected but only the data is returned
    alldata = pd.concat(dl, axis=1)
    return alldata
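
A brief usage sketch, with a hypothetical path containing *_ref_n* files:

ref = assemble_ref_dat('/path/to/ref_files')
ref.to_csv('reference_wells.txt', sep='\t')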
Code Example #5
File: cust_processing.py Project: wrbutton/foo
import os
import sys

import pandas as pd

import gt   # project-internal helpers (wrbutton/foo)
import gct


def run_plate_analysis(mode='ind', cats='nd', path='dflt'):
    """ runs standard analysis on each plate individually ('ind') or all together ('comb').
    most useful for plates with doses. the default path is newQC on the desktop """

    path = gt.check_dfltarg(path, os.path.join(gt.check_desktop(), 'newQC'))

    fl = gt.globit(path, '*ZSVCQNORM*')

    print(fl)

    if mode == 'comb':
        dl, hl = [], []
        for i, f in enumerate(fl):
            d, h = gct.extractgct(f)
            if i == 0:
                try:
                    pname = d.name + '+'
                except AttributeError:
                    # fall back to deriving the plate name from the first well address
                    pname = h.addr[0].split(':')[0] + '+'
            if len(h.batch.unique()) > 1:
                # fix sample labels for plate/batch
                h.plate = h.plate + h.batch
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)
            dl.append(d)
            hl.append(h)
        try:
            d = pd.concat(dl, axis=1)
            d.name = pname
        except ValueError:
            sys.exit('no gct file plates to analyze')
        h = pd.concat(hl, axis=0)

        analyze_plate(d, h, cats)

    elif mode == 'ind':
        for f in fl:
            d, h = gct.extractgct(f)
            # define labels (should I add plate?)
            h = gt.gen_label(h, cats)

            analyze_plate(d, h, cats)
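
A usage sketch; with the default path it globs ~/Desktop/newQC for ZSVCQNORM files:

run_plate_analysis(mode='ind', cats='nd')   # analyze each plate on its own
run_plate_analysis(mode='comb', cats='nd')  # concatenate all plates, then analyze together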
Code Example #6
File: qc.py Project: wrbutton/foo
import errno
import os
import shutil

import gt   # project-internal helpers (wrbutton/foo)


def distribute_qc(path='dflt'):
    if path == 'dflt':
        inpath = gt.dflt_outpath(fldr_name='newQC')
        outpath = gt.dflt_outpath(fldr_name='QCprocessing')

    folders = ['calibs', 'flogps', 'escore', 'cellid-nolabel', 'cellid-label', 'euclidean']
    folders = [os.path.join(outpath, x) for x in folders]
    srch_terms = ['finalqc/*calibplot', 'finalqc/*FLOGP', 'escore_summary*/', '*cell_line/*cellid_nolabel/*-*cellid_circle',
                  '*cell_line/*-*cellid_circle', '-*euclidean']

    for term, fold in zip(srch_terms, folders):
        try:
            os.makedirs(fold)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
        srch = f'*{term}*.png'  # wrap the search term in wildcards
        for file in gt.globit(inpath, srch):
            shutil.copy(file, fold)
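
A minimal usage sketch, relying on the default newQC and QCprocessing folders:

distribute_qc()  # copies QC .png images into per-category processing folders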
Code Example #7
File: qc.py Project: wrbutton/foo
import os

import pandas as pd

import gt   # project-internal helpers (wrbutton/foo)


def combine_fails(path='dflt', ret=False, summ=False, sep=False, thresh=1):
    if path == 'dflt':
        path = gt.dflt_outpath(fldr_name='newQC')
    fl = gt.globit(path, '*QC_fail*')
    # earlier shell-based approach, kept for reference:
    # files = ' '.join(fl)
    # cmd_str = 'cat ' + files + ' > ' + os.path.join(path, 'QC_fail.txt')
    # subprocess.run(cmd_str, shell=True)
    datlist = []
    for f in fl:
        dat = pd.read_csv(f, sep='\t', skiprows=1)
        dropcols = [x for x in dat.columns if 'Unnamed' in x]
        dat = dat.drop(dropcols, axis=1)
        dat.dropna(inplace=True)
        try:
            dat = dat[dat['Batch'] != ' ']
        except KeyError:
            pass
        if not sep:
            try:
                # drop repeated header rows carried over from concatenated files
                dat = dat[dat['Batch'] != 'Batch']
            except KeyError:
                pass
        datlist.append(dat)
    data = pd.concat(datlist, axis=0)
    data.to_csv(os.path.join(path, 'QCfail_summary.txt'), sep='\t')

    if summ is True:
        gbname = data.groupby('PERT_DESC').size()
        print(gbname[gbname > thresh])
        gbbatch = data.groupby('Batch').size()
        print(gbbatch[gbbatch > thresh])

        # this subsets down to show how many doses totally fail (3 reps each) per name
        # g = f.groupby(['PERT_DESC', 'DOSE']).size()
        # res = g[g > 2].groupby('PERT_DESC').size().sort_values(ascending=False)

    if ret is True:
        return data
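
A usage sketch; this returns the combined failure table and prints perts/batches failing more often than the threshold:

fails = combine_fails(ret=True, summ=True, thresh=2)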
Code Example #8
File: qc.py Project: wrbutton/foo
import os
import shutil
import subprocess

import boto3

import gt   # project-internal helpers (wrbutton/foo)


def dl_data(sc='q', src='dflt', dest='dflt', search=None, excl=None, ext=None, p=False):
    """ download data from s3. 'sc' is a shortcut: 'q' for qc, 'g' for gct, 'z' for zscore,
    'e' for enrichment, 'f' for final, 'i' for ZSVCINF """
    if dest == 'dflt':
        dest = gt.dflt_outpath(fldr_name='newQC')
    elif dest == 'foo':
        dest = gt.dflt_outpath(fldr_name='foo')
    else:
        if '/' in dest or '\\' in dest:
            os.makedirs(dest, exist_ok=True)
        else:
            dest = gt.dflt_outpath(fldr_name=dest)

    tempdest = gt.dflt_outpath(fldr_name='temp_copy_transfer')

    s3c = boto3.client('s3')
    pref = '_review/'
    if src == 'dflt':
        items = s3c.list_objects(Bucket='genometry', Prefix=pref, Delimiter='/')
        folds = sorted([list(x.values())[0].replace(pref, '').strip('/') for x in items['CommonPrefixes']])
        if len(folds) == 0:
            print('hm, zero folders in review list')
            return  # nothing to download
        # take the most recent review folder (folds[-1] is also the only one when len == 1)
        fold = folds[-1]
        print('downloading from ', fold)
        src = 's3://genometry/' + pref + fold
        # grab PCA ppt and latest coordinates txt
        s3c.download_file('genometry', 'PCA_analysis/PCA2.pptx', os.path.join(dest, 'PCA.pptx'))
        coords = s3c.list_objects(Bucket='genometry', Prefix='PCA_analysis/')
        coord = sorted([x['Key'] for x in coords['Contents']])[-1]
        s3c.download_file('genometry', coord, os.path.join(tempdest, 'PCA_coords.txt'))
    else:
        src = 's3://genometry/' + src

    search_args = []

    # parse shortcut
    if 'q' in sc:
        search_args.append(('*_qc/*', ''))
    if 'g' in sc:
        search_args.append((['*_fullqnorm_*', '*_QNORM_sorted*', '*_ref_n*', '*_Qctrl_n*'], ''))
        ext = '.gct'
    if 'z' in sc:
        search_args.append(('*_ZSVCQNORM_*', ''))
        ext = '.gct'
    if 'e' in sc:
        search_args.append(('*_escore/*', '*.gct'))
    if 'f' in sc:
        search_args.append(('*_final/*', ''))
        dest = gt.dflt_outpath(fldr_name='finaldata')
        ext = ['.gct', '.txt']
    if 'i' in sc:
        search_args.append(('*_ZSVCINF_*', ''))
        ext = '.gct'

    if search is not None:
        if '*' not in search:
            search = '*' + search + '*'
        search_args.append((search, ''))

    if excl is not None:
        if '*' not in excl:
            excl = '*' + excl + '*'
        search_args.append(('', excl))

    for search, excl in search_args:
        cmd_str = f'aws s3 cp --recursive {src} {tempdest} --exclude "*"'
        if isinstance(search, str):
            search = [search]
        # quote the patterns so the shell passes them to aws intact
        for st in search:
            cmd_str += f' --include "{st}"'
        if excl != '':
            cmd_str += f' --exclude "{excl}"'

        print(cmd_str)

        # run each assembled copy command in turn
        subprocess.run(cmd_str, shell=True)

    if ext is not None:
        if isinstance(ext, str):
            ext = [ext]
        # pull files with matching extensions up to the top of tempdest
        for ex in ext:
            fl = gt.globit(tempdest, f'*{ex}')
            for f in fl:
                file_dest = os.path.join(tempdest, os.path.basename(f))
                shutil.move(f, file_dest)
        # then remove the now-redundant subdirectories
        subdirs = [x[0] for x in os.walk(tempdest)][1:]
        for sd in subdirs:
            try:
                shutil.rmtree(sd)
            except OSError:
                pass
    # do final copy from temp to main destination, erase temp
    for f in gt.get_flist(tempdest):
        file_dest = os.path.join(dest, os.path.basename(f))
        shutil.move(f, file_dest)
    shutil.rmtree(tempdest)

    if p is True:
        try:
            process_qc()
        except Exception:
            print('error processing qc')
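
A hedged usage sketch; it assumes configured AWS credentials for the genometry bucket, and the search token is hypothetical:

dl_data(sc='qg', search='PROJ123', p=True)  # fetch qc and gct files matching *PROJ123*, then run process_qc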