def findFilesAlreadyInSynapse():
    """Determine the files already stored in Synapse"""
    allFiles = synapseHelpers.query2df(syn.chunkedQuery("select * from file where benefactorId=='syn2351328'"), False)
    print 'Found', len(allFiles), 'files in Synapse. Fetching URLs...'

    def get(id):
        print id
        return syn.get(id, downloadFile=False)

    # Fetch entity metadata (without downloading the files) in parallel;
    # `p` is a thread pool defined at module level.
    entities = p.map(get, allFiles.id)
    return entities
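
# Minimal usage sketch (not part of the original fragment): the function above
# relies on module-level `syn`, `synapseHelpers`, and a thread pool `p`.
# multiprocessing.dummy mirrors the Pool API with threads; 8 workers is an
# arbitrary choice.
import multiprocessing.dummy

import synapseclient
import synapseHelpers

syn = synapseclient.login(silent=True)
p = multiprocessing.dummy.Pool(8)
entities = findFilesAlreadyInSynapse()
print 'Fetched metadata for', len(entities), 'entities'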
Example #2
    def get_annotations(self):

        _q_params = dict(cols=",".join(self._COLS),
                         id=self._INPUT_BENEFACTOR_ID)

        _query = "select %(cols)s from file where benefactorId=='%(id)s'" % _q_params

        logger.debug(_query)

        # This is a large query, so it has to be chunked; wrap the rows in a
        # dict so the result looks like a regular (non-chunked) query result
        # that query2df can consume.
        qr = self.syn.chunkedQuery(_query)
        annot_qry = dict(results=map(self.fixRow, qr))

        self.annots = synapseHelpers.query2df(annot_qry,
                                              filterSynapseFields=False)

        return self.annots
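
    # fixRow is referenced above but not defined in this fragment.  A purely
    # illustrative stand-in (an assumption, not the original implementation):
    # Synapse query rows return annotation values as single-element lists, so
    # unwrap those to scalars before the rows are handed to query2df.
    def fixRow(self, row):
        return dict((k, v[0] if isinstance(v, list) and len(v) == 1 else v)
                    for k, v in row.items())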
Example #3
TISSUEABRMAP = {  # earlier entries are cut off in this fragment
              'BM4-PCG':	['Precentral Gyrus', 'PCG'],
              'BM44-IFG':	['Inferior Frontal Gyrus', 'IFG'],
              'BM46-PFC':	['Dorsolateral Prefrontal Cortex', 'PFC'],
              'BM7-SPL':	['Superior Parietal Lobule', 'SPL'],
              'BM8-FC': 	['Prefrontal Cortex', 'FC'],
              'BMa-AMYG':	['Amygdala','AMYG'],
              'BMb-CD': 	['Caudate Nucleus','CD'],
              'BMc-HIPP':	['Hippocampus', 'HIPP'],
              'BMd-NAc':	['Nucleus Accumbens','NAc'],
              'Bme-PT': 	['Putamen','PT']}

PLATFORM_MAP = {'133AB': 'AffymetrixU133AB',
                'Plus2': 'AffymetrixU133Plus2'}

query = 'select id, name from entity where parentId=="%s"' % OLDPARENTID
df = synapseHelpers.query2df(syn.chunkedQuery(query))
for i in range(1, df.shape[0]):  # note: row 0 of the query result is skipped
    row = df.ix[i, :]
    ent = syn.get(row.id)
    fStudy, fTissue, fPlatform, fDatatype, fRest = ent.name.split('_')
    name = 'AMP-AD_MSBB_MSSM_%s_%s_%s' % (PLATFORM_MAP[fPlatform],
                                          TISSUEABRMAP[fTissue][0], fRest)
    print name
    os.rename(ent.path, name)

    f = File(name, parentId=NEWPARENTID, name=name[7:])
    f.consortium = 'AMP-AD'
    f.study = 'MSBB'
    f.center = 'MSSM'
    f.dataType = 'mRNA'
    f.disease = 'Alzheimers Disease'
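
# Worked example of the renaming scheme above, using a made-up entity name
# (the real file names are not shown in this fragment):
#   'MSBB_BMa-AMYG_Plus2_mRNA_counts.txt'
#       PLATFORM_MAP['Plus2']         -> 'AffymetrixU133Plus2'
#       TISSUEABRMAP['BMa-AMYG'][0]   -> 'Amygdala'
#       new name -> 'AMP-AD_MSBB_MSSM_AffymetrixU133Plus2_Amygdala_counts.txt'
# name[7:] then drops the leading 'AMP-AD_' to form the Synapse display name.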
Example #4
import synapseclient
from synapseclient import File
import synapseHelpers
import pandas as pd
import os, urllib, urlparse
import multiprocessing.dummy as mp
from collections import Counter

QUERY = "select * from file where projectId=='syn2351328' and dataType=='DNA' and call_type=='somatic'"

syn = synapseclient.Synapse(skip_checks=True)
syn.login(silent=True)


if __name__ == '__main__':
    df = synapseHelpers.query2df(syn.chunkedQuery(QUERY), True, ['name', 'id', 'parentId'])
    df = df[[x in ('snv_mnv', 'sv', 'indel', 'cnv') for x in df.dataSubType]]

    # Prettify source names
    df['source'] = [c.split('_')[0].upper() if isinstance(c, basestring) else '' for c in df.center]
    df.loc[df.workflow_name == 'SangerPancancerCgpCnIndelSnvStr', 'source'] = 'Sanger'
    
    # Summarize the number of samples per source/workflow and dataSubType
    counts = pd.pivot_table(df, 'sample_id',
                            rows=['source', 'workflow_name'],
                            cols=['dataSubType'],
                            aggfunc=lambda x: len(set(x)))

    # Display the number of samples
    #counts.plot(kind='bar')
    # The bare plot drops bars for missing combinations; see the sketch below.
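
    # A minimal sketch of the plot hinted at above (an assumption, not the
    # original code): filling the absent source/workflow/dataSubType
    # combinations with 0 keeps the bar chart from dropping those bars.
    import matplotlib.pyplot as plt
    counts.fillna(0).plot(kind='bar')
    plt.ylabel('number of samples')
    plt.tight_layout()
    plt.show()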
Example #5
def isUptodate(name, files, parentId):
    """Return True if the stored entity's provenance already reflects the current file versions."""
    id = syn._findEntityIdByNameAndParent(name, parentId)
    if id is None:
        return False
    activity = syn.getProvenance(id)
    used = set([
        '%s.%s' %
        (x['reference']['targetId'], x['reference']['targetVersionNumber'])
        for x in activity['used'] if x['wasExecuted'] == False
    ])
    currentVersions = set(['%s.%s' % (x.id, x.versionNumber) for x in files])
    return currentVersions == used


mp = Pool(8)  # thread pool (e.g. multiprocessing.dummy.Pool) for parallel fetches
syn = synapseclient.login(silent=True)
allFiles = query2df(syn.chunkedQuery(query_str))
for platform, dataSubType, name in platforms:
    print platform, dataSubType,
    filteredMeta = allFiles[(allFiles.platform == platform)
                            & (allFiles.dataSubType == dataSubType) &
                            (allFiles.acronym != 'PANCAN')]
    files = mp.map(syn.get, filteredMeta.id)
    if isUptodate(name, files, args.parentId):
        print ' is up to date'
        continue
    # assumes every file in this filtered group shares the same fileType
    if list(set(filteredMeta.fileType))[0] in ['seg', 'bed']:
        dfs = mp.map(lambda f: pd.read_csv(f.path, sep='\t'), files)
        df = pd.concat(dfs, axis=0)
        df.to_csv(args.filepath + name, sep='\t', index=False)
        nSamples = len(set(df.Sample))
        nFeatures = 0
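
        # The fragment is cut off here.  A sketch of a plausible next step,
        # mirroring the provenance pattern checked by isUptodate() above (an
        # assumption, not the original code; assumes `from synapseclient
        # import File` at the top of the script):
        combined = File(args.filepath + name, parent=args.parentId, name=name)
        combined.nSamples = nSamples
        combined = syn.store(combined, used=files)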
Example #6
    metadata['nFeatures'] = nFeatures
    metadata['samples'] = samples
    metadata['patient_barcode'] = [x[:12] for x in metadata.samples]
    metadata.drop(['tissue', u'md5', u'assembly'], axis=1, inplace=True)
    metadata.nFeatures = metadata.nFeatures.astype('int')
    cols = syn.tableQuery('select * from %s limit 1' % tableId).asDataFrame().columns

    # Update the corresponding rows in the table
    print 'adding', metadata.shape[0], 'rows'
    t = syn.store(Table(tableId, metadata[cols]))
    return metadata



if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=('Updates a Synapse Table with '
                                                  'sample count information'))
    parser.add_argument('-t', '--table', dest='tableId', default='syn3281840',
            help='Table where results are stored (e.g. syn3281840)')
    parser.add_argument('-p', '--project', dest='projectId', default='syn2812961',
            help='Project (benefactorId) where output files are stored (e.g. syn2812961)')
    args = parser.parse_args()



    files = synapseHelpers.query2df(syn.chunkedQuery(FILEQUERY % args.projectId),
                                    savedSynapseFields=('id', 'name', 'versionNumber'))
    updatedFiles = findUpdates(files, args.tableId)
    print 'NEED TO UPDATE:', updatedFiles.shape[0], 'FILES'
    deleteAffectedRows(updatedFiles, args.tableId)
    dfs = [countAndUpdateTable(row, tableId=args.tableId) for row in updatedFiles.iterrows()]
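
# deleteAffectedRows and findUpdates are used above but not defined in this
# fragment.  A minimal sketch of one way deleteAffectedRows could work (an
# assumption about the original helper; in the real script it would be defined
# before the __main__ block, and it assumes the table has an 'id' column
# holding the Synapse file id):
def deleteAffectedRows(updatedFiles, tableId):
    """Delete the table rows belonging to files that are about to be re-counted."""
    for fileId in updatedFiles.id:
        rows = syn.tableQuery("select * from %s where id='%s'" % (tableId, fileId))
        syn.delete(rows)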
Example #8
def getChangeSet(version, platform):
    """Load the whitelist at the given version and return the set of aliquot
    barcodes flagged Do_not_use for the given platform."""
    old_whitelist = syn.get(WHITELISTID, version=version)
    whitelist = pd.read_csv(old_whitelist.path, sep='\t')
    oldToRemove = set(whitelist.ix[whitelist.Do_not_use & (whitelist.platform == platform),
                                   'aliquot_barcode'])
    return oldToRemove
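
# getFileIdFromName is used below but not defined in this fragment.  A minimal
# sketch of what it presumably does, reusing the name-and-parent lookup seen in
# the earlier examples (the call site below would also need to supply the parent
# folder of the whitelisted output files):
def getFileIdFromName(name, parentId=None):
    """Return the Synapse id of the entity with this name under parentId, or None."""
    return syn._findEntityIdByNameAndParent(name, parentId)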
    

#mp = Pool(8)
syn = synapseclient.login(silent=True)

whitelistEntity = syn.get(WHITELISTID)
whitelist = pd.read_csv(whitelistEntity.path, sep='\t')
inputFiles = synapseHelpers.query2df(syn.chunkedQuery(QUERY_STR))

# Reference to this script stored in Synapse, used as the "executed" provenance below
code = synapseHelpers.thisCodeInSynapse(parentId='syn1774100')
for i, row in inputFiles.iterrows():
    print row.id, row['name'],
    inputFileEntity = syn.get(row.id)
    outFileName = row['name'][:-4]+'_whitelisted'+row['name'][-4:]
    
    toRemove = set(whitelist.ix[whitelist.Do_not_use & (whitelist.platform == row['platform']), 
                                'aliquot_barcode'])

    if isUptodate(outFileName, [inputFileEntity], toRemove, row['platform']):
        print ' is up to date - but update provenance'
        e = syn.get(getFileIdFromName(outFileName), downloadFile=False)
        syn.store(e, used=[inputFileEntity, whitelistEntity], executed=code)
        continue