def extract_values_from_img(img, atlas, df, sid, itp, roi_key=None, scale=None, comp='mean'):
    """Extract one summary value per atlas ROI (mean intensity, or voxel count if
    comp='count') from a 3D image and store it in df at row sid, one column per
    ROI, prefixed by itp. ROI labels come from roi_key (a dict of label -> name)
    if given; otherwise labels 1..scale are used."""

    print 'extracting values from %s'%(img)
    if not roi_key:
        for i in range(1,(scale+1)):
            print 'working on roi %s'%(i)
            cde = wr.codegen(6)
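            # isolate label i from the atlas as a binary mask, then restrict img to it;
            # temporary files are tagged with the random code cde and removed below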
            os.system('fslmaths %s -thr %s -uthr %s %s_msk'%(atlas,i,i,cde))
            os.system('fslmaths %s -mas %s_msk.nii.gz %s_valz'%(img,cde,cde))
            if comp == 'count':
                # fslstats -V prints "<voxels> <volume>"; keep only the voxel count
                val = subprocess.check_output('fslstats %s_valz.nii.gz -V'%(cde),shell = True)
                val = val.split()[0]
            else:
                val = subprocess.check_output('fslstats %s_valz.nii.gz -M'%(cde),shell = True)
            os.system('rm %s_*'%(cde))
            df.ix[sid, '%s_%s'%(itp,i)] = float(val)

    else:
        for i,roi in roi_key.iteritems():
            print 'working on roi %s'%(roi)
            cde = wr.codegen(6)
            os.system('fslmaths %s -thr %s -uthr %s %s_msk'%(atlas,i,i,cde))
            os.system('fslmaths %s -mas %s_msk.nii.gz %s_valz'%(img,cde,cde))
            val = subprocess.check_output('fslstats %s_valz.nii.gz -M'%(cde),shell=True)
            os.system('rm %s_*'%(cde))
            df.ix[sid, '%s_%s'%(itp,roi)] = float(val)

    return df
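
# A minimal usage sketch (hypothetical paths and subject ID; assumes pandas and the
# other module-level imports used above are available): extract the mean value of
# each of 7 atlas ROIs from one subject's map into a one-row dataframe.
def _demo_extract_values():
    df = pandas.DataFrame(index=['sub01'])
    return extract_values_from_img('sub01_gm.nii.gz', 'atlas_scale7.nii.gz', df,
                                   'sub01', 'gm', scale=7, comp='mean')
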
def import_atlas(wdir, atlas, mask=False):
    """Load an atlas volume as an array. If a mask image is given, binarize it
    when necessary and return the atlas as a numpy masked array."""
    mtx = ni.load(atlas).get_data()
    cde = codegen(6)

    if mask:
        msk = ni.load(mask).get_data()
        # binarize the mask only if it contains values other than 0 and 1
        binar = bool(np.any((msk != 0) & (msk != 1)))
        if binar:
            oldpth = pthswp(wdir)
            os.system('fslmaths %s -bin %s_binmsk' % (mask, cde))
            msk = ni.load('%s_binmsk.nii.gz' % (cde)).get_data()
        mtx = np.ma.masked_array(mtx, msk.astype(bool))

    return mtx
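
# A minimal usage sketch (hypothetical working directory and paths): load an atlas
# restricted to a grey-matter mask, binarizing the mask if needed.
def _demo_import_atlas():
    return import_atlas('/tmp', 'atlas_scale7.nii.gz', mask='gm_mask.nii.gz')
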
def spatial_correlation_searchlight_from_NIAK_GLMs(indir,
                                                   cov_img,
                                                   outdir,
                                                   contrast,
                                                   templ_str,
                                                   scalestr='scale',
                                                   norm=False,
                                                   eff='t',
                                                   poly=1,
                                                   taskid='',
                                                   save=False):
    '''
    Given a directory containing a) NIAK glm.mat files at multiple scales, b) atlases at
    the same resolutions, and a covariate image, this function will do the
    following:
        1) Convert covariate image to resolution of each atlas
        2) Perform spatial correlations between a) the average connectivity map
        seeded from every region at every scale and b) the covariate image
        3) Output a dictionary summarizing the results, where key = resolution
        and value = a dataframe containing statistics for each test


    indir = path to directory containing NIAK glm.mat files and atlases.

    cov_img = path to covariate image

    outdir = path to desired output directory

    contrast = label of the desired contrast to extract average connectivity
    values from glm.mat file

    templ_str = search string to locate atlases

    scalestr = the string preceding the number indicating the atlas resolution in
    the glm and atlas paths

    norm = If set to a path, will normalize cov_img to target image that path
    points to. Uses flirt with nearest neighbour interpolation.  If False, no
    normalization will occur

    eff = if set to 'r', will take values within the 'eff' structure of the
    glm.mat file. If set to 't' will take values within the 'ttest' structure
    of the glm.mat file

    poly = If int > 1, results will include test statistics modeled with an
    n-th order polynomial, where n = poly

    taskid = used to keep track of id in the case of multiple bootstrap samples

    save = If set to a string, will write results from each resolution to a spreadsheet
    with a file name indicated by string input


    Outputs a dict where the key is scale and the value is a dataframe
    containing results at that scale
    '''
    save = check_bool(save)
    if save and '_' in save:
        # underscores are stripped, presumably because '_' is used as a field
        # separator in the output filenames below
        print('WARNING: no _ allowed in save. Removing...')
        save = save.replace('_', '')

    cde = wr.codegen(6)

    if norm:
        print 'normalizing tmap to fmri space'
        os.system(
            'flirt -interp nearestneighbour -in %s -ref %s -out %s/%s_rtmap' %
            (cov_img, norm, outdir, cde))
        cov_img = os.path.join(outdir, '%s_rtmap.nii.gz' % (cde))

    glmz = glob(os.path.join(indir, 'glm*.mat'))
    dfz = {}
    for glm in glmz:
        # parse the atlas resolution (scale) out of the glm filename, then find
        # the atlas file matching that scale
        scale = int(
            os.path.split(glm)[1].rsplit('_%s' % (scalestr))[1].rsplit('.')[0])
        scale_templ = glob(os.path.join(indir,
                                        '%s*%s.*' % (templ_str, scale)))[0]
        df, rdf, scalar = wr.get_parcelwise_correlation_map_from_glm_file(
            outdir,
            glm,
            scale_templ,
            scale,
            contrast,
            eff=eff,
            cov_msk='',
            cov_img=cov_img,
            conndf='',
            poly=poly)
        dfz.update({scale: rdf})
        if save:
            rdf.to_csv(
                os.path.join(outdir,
                             '%s_scl%s_res%s.csv' % (save, scale, taskid)))

    resdf = pandas.DataFrame(
        columns=['scale', 'parcel', 'measure', 'value', 'pvalue'])
    os.system('rm %s' % (os.path.join(outdir, '%s_*' % (cde))))

    return dfz
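
# A minimal usage sketch (hypothetical directories, contrast label and template
# search string; the heavy lifting is done by the project's wr helper module):
# run the searchlight against one covariate map, saving one spreadsheet per scale.
def _demo_searchlight():
    return spatial_correlation_searchlight_from_NIAK_GLMs(
        indir='glms/', cov_img='tmap.nii.gz', outdir='results/',
        contrast='ctrlvspatient', templ_str='brain_parcellation',
        norm=False, eff='t', poly=1, save='searchlight')
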
def voxelwise_analysis(scans,
                       pv_vals,
                       outfl,
                       outdir,
                       out_tp='r',
                       nonpar=False,
                       taskid='',
                       parallel=False,
                       parin='',
                       indata=False,
                       intermed=False):
    """Given a list of 3D volumes and a corresponding list of values for a
    predictor variable, run voxelwise correlations.

    scans = a list of paths corresponding to 3D nifti volumes

    pv_vals = a list of values corresponding to subject values, in the same
    order as scans

    outfl = string to be used to identify the output map (NOTE: taskid will
    be automatically appended to this string)

    outdir = the path to the directory where the outfiles will be written to

    out_tp = the type of map to be generated. 'r' will generate a voxelwise
    rmap, whereas 't' will generate a voxelwise tmap

    nonpar = Set to True to switch the voxelwise analysis from Pearson to
    Spearman correlation

    indata = In case 4D data is already available in an existing variable, the data
    can be specified here. Or, if an intermediate 4D file already exists, simply
    add its path here.

    taskid = used to keep track of id in the case of multiple bootstrap samples

    parallel = if true, script will copy scans into working directory to allow
    for multiple concurrent processes. Will also make script compatible for
    command line based parallelization.

    parin = input path for parallelization

    intermed = if true, script will not delete the 4D volume used to run the
    voxelwise analysis


    Outputs a path pointing to the newly created tmap


    WARNING: As of now, this script is extremely computationally intensive for
    a local machine. Running a subsample of 135 subjects required 5 GB memory,
    used 100% of my CPU at times, and took well over 10 minutes...

    NOTE: As of now, script does not regress out confounding variables or mask
    analysis
    """

    nonpar = check_bool(nonpar)
    parallel = check_bool(parallel)
    indata = check_bool(indata)
    intermed = check_bool(intermed)

    cde = wr.codegen(6)

    if parallel:
        scans, pv_vals = parallel_in('va', outdir, parin, cde)

    if indata:
        if type(indata) != str:
            data = indata
        else:
            #if not parallel:
            print 'loading data...'
            data = ni.load(indata).get_data()
    else:
        if intermed:
            intfl = 'intfile%s' % (taskid)
        else:
            intfl = '%s_intfile' % (cde)

        # create 4D volume
        cmd = 'fslmerge -t %s' % (os.path.join(outdir, intfl))
        for scn in scans:
            cmd = cmd + ' %s' % (scn)

        #if not parallel:
        print 'creating input 4D volume...'
        os.system(cmd)

        #if not parallel:
        print 'loading data'
        data = ni.load(os.path.join(outdir, '%s.nii.gz' % (intfl))).get_data()

    # run voxelwise analysis
    #if not parallel:
    print 'beginning analysis...'
    x, y, z, t_dim = data.shape
    results = np.zeros((x, y, z))
    aff = ni.load(scans[0]).get_affine()

    for xind in range(x):
        for yind in range(y):
            for zind in range(z):
                # skip voxels that are zero for every volume in the 4D stack
                if not np.any(data[xind, yind, zind, :]):
                    continue
                else:
                    dv_vals = data[xind, yind, zind, :]
                    if nonpar:
                        r, p = st.spearmanr(dv_vals, pv_vals)
                    else:
                        r, p = st.pearsonr(dv_vals, pv_vals)
                    if out_tp == 't':
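                        # convert_r2t presumably applies the standard r-to-t conversion,
                        # t = r * sqrt((n - 2) / (1 - r**2)) with n = t_dim (assumption;
                        # the helper itself is not shown in this module)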
                        r = convert_r2t(r, t_dim)
                    results[xind, yind, zind] = r
        #if not parallel:
        print 'finished %s/%s job clusters' % (xind + 1, x)

    # write image (give nibabel the .nii extension so it can pick the output format)
    outstr = '%s%s.nii' % (os.path.join(outdir, outfl), taskid)
    #if not parallel:
    print 'writing image to %s' % (outstr)
    nimg = ni.Nifti1Image(results, aff)
    ni.save(nimg, outstr)

    # clean up
    data = None
    os.system('rm %s' % (os.path.join(outdir, '%s_*' % (cde))))

    return outstr
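
# A minimal usage sketch (hypothetical scan paths and predictor values; requires FSL
# and the module-level imports used above): build a voxelwise Pearson r-map relating
# each voxel's values across subjects to age.
def _demo_voxelwise():
    scans = ['sub01_gm.nii.gz', 'sub02_gm.nii.gz', 'sub03_gm.nii.gz']
    ages = [71.2, 68.5, 74.0]
    return voxelwise_analysis(scans, ages, 'age_rmap', 'results/', out_tp='r')
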
def define_bootstrap_sample(ss,
                            subcol,
                            subpth,
                            pv,
                            outpth,
                            sample_perc=0.5,
                            num_gps=3,
                            par=False,
                            rand=False,
                            taskid=''):
    """takes an existing sample of subjects from a spreadsheet and creates a
    subsample, balanced by a variable. Outputs a subsamble membership txt file and
    a list of paths corresponding to subsample member scans)

    ss = a spreadsheet containing at least a list of subjects and values for a
    predictor variable

    subcol = string corresponding to the label of the column containing the
    subject IDs. Note these IDs should be consistent with existing file names
    in subpth. If the IDs are the spreadsheet index, set to 'index'

    subpth = a path pointing to the directory containing subject scans

    outpth = desired output directory

    pv = string corresponding to the label of the column containing values for the
    predictor variable

    sample_perc = the percent of the total sample to be used in each subsample.
    Default is 0.5 for 50%. Must be float >0 and <=1

    num_gps = the number of groups used to balance the subsample on the predictor
    variable. Default is 3. Value must be an int

    par = if True, makes compatible for command line based parallelization

    taskid = used to keep track of id in the case of multiple bootstrap samples

    rand = Determines how the pseudorandom generator is seeded for the randomization
    of the sample. Leave as False to use random.shuffle with a random seed,
    which will create unreproducible samples and is not recommended for
    parallelization. Set as an int to use that int as the seed for the mrg322ka
    PRNG (recommended for parallelization or reproducible results)

    Outputs a list of paths and a vector containing the values for the
    predictor variable

    ***IMPORTANT NOTE***
    Script has the following assumptions:
    1) scans in the subpth directory have ids from subcol in them, but the filename
    of the scan does not /start/ with the id.
    2) There is no more than one scan per ID within the subpth directory
    """

    # prep spreadsheet

    par = check_bool(par)
    rand = check_bool(rand)

    # temporary workaround: force rand off until the issue on Guillimin is resolved
    if rand:
        rand = False

    if type(num_gps) != int:
        raise IOError('num_gps must be an integer')

    if sample_perc > 1 or sample_perc <= 0:
        raise IOError('sample_perc must be a float between 0 and 1')

    if ss[-3:] == 'xls' or ss[-4:] == 'xlsx':
        subdf = pandas.ExcelFile(ss).parse('Sheet1')
    elif ss[-3:] == 'csv':
        subdf = pandas.read_csv(ss)
    else:
        raise IOError(
            'input spreadsheet filetype not recognized. Please use .xls, .xlsx or .csv'
        )

    if subcol != 'index':
        if subcol in subdf.columns.tolist():
            subdf.index = subdf[:][subcol].tolist()
        else:
            raise IOError('there is no column in the spreadsheet called %s' %
                          (subcol))

    for sub in subdf.index.tolist():
        if len(glob(os.path.join(subpth, '*%s*' % (sub)))) < 1:
            print 'no scan was found for subject %s. Removing from analysis.' % (
                sub)
            #this was not compatible with guillimin's version of pandas
            #subdf.drop(sub,axis = 0, inplace=True)
            subdf = subdf.drop(sub, axis=0)

    if pv not in subdf.columns.tolist():
        raise IOError('there is no column in the spreadsheet called %s' % (pv))
    else:
        subdf = subdf.sort(pv)
    allsubs = subdf.index.tolist()

    # extract subsample
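    # split the predictor-sorted subject list into num_gps contiguous strata, then
    # draw sample_perc of each stratum so the subsample stays balanced on pv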

    subsamp_n = len(allsubs) / num_gps
    subsamp_dict = {}
    for i in range(1, (num_gps + 1)):
        if i == 1:
            subsamp_dict.update({i: allsubs[:subsamp_n]})
        else:
            subsamp_dict.update(
                {i: allsubs[(subsamp_n * (i - 1)):(subsamp_n * (i))]})

    subsamp = []
    n_per_gp = int(subsamp_n * sample_perc)
    for j, dictees in enumerate(subsamp_dict.iteritems()):
        gp = dictees[0]
        lst = dictees[1]
        if not rand:
            random.shuffle(lst)
        else:
            if not taskid:
                lst = randomize_4_montecarlo(rand, random.randint(1, 1000000),
                                             ((j + 1) * 10000), lst)
            else:
                lst = randomize_4_montecarlo(rand, taskid, ((j + 1) * 10000),
                                             lst)
        for i in range(n_per_gp):
            subsamp.append(lst[i])

    # collect outputs
    scans = []
    for sub in subsamp:
        scn = glob(os.path.join(subpth, '*%s*' % (sub)))[0]
        scans.append(scn)

    pv_vals = []
    for sub in subsamp:
        pv_vals.append(subdf.ix[sub, pv])

    # make membership record
    ndf = pandas.DataFrame(index=subsamp)
    if len(taskid) > 0:
        ndf.to_csv(os.path.join(outpth, '%s_subsample_membership' % (taskid)))
    else:
        cde = wr.codegen(6)
        ndf.to_csv(os.path.join(outpth, '%s_subsample_membership' % (cde)))

    if par:
        flpth = parallel_out('dbc', outpth, scans, pv_vals, taskid)
        return scans, pv_vals, flpth
    else:
        return scans, pv_vals
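
# A minimal usage sketch (hypothetical spreadsheet, column names and directories):
# draw one balanced 50% subsample and feed it straight into voxelwise_analysis.
def _demo_bootstrap_sample():
    scans, pv_vals = define_bootstrap_sample('subjects.xls', 'subid', 'scans/',
                                             'age', 'results/', sample_perc=0.5,
                                             num_gps=3)
    return voxelwise_analysis(scans, pv_vals, 'age_rmap', 'results/', out_tp='r')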