def storeFile(syn, fileName, parentId, center, fileFormat, dataSubType,
              platform=None, cBioFileFormat=None, used=None):
    """Annotate a file as GENIE staging data and store it through ``syn``.

    Args:
        syn: Client object whose ``store`` method is called with the entity
            (presumably a logged-in ``synapseclient.Synapse`` -- confirm
            against the caller).
        fileName: Path handed to ``File`` as the file to store.
        parentId: Passed to ``File`` as ``parent``.
        center: Value for the ``center`` annotation.
        fileFormat: Value for the ``fileFormat`` annotation.
        dataSubType: Value for the ``dataSubType`` annotation.
        platform: Optional ``platform`` annotation; omitted when ``None``.
        cBioFileFormat: Optional ``cBioFileFormat`` annotation; omitted
            when ``None``.
        used: Forwarded unchanged to ``syn.store(..., used=used)``.

    Returns:
        The value returned by ``syn.store``.
    """
    logger.info("STORING FILES")
    fileEnt = File(fileName, parent=parentId)

    # Fixed annotations shared by every file stored through this helper.
    fileEnt.center = center
    fileEnt.species = "Human"
    fileEnt.consortium = 'GENIE'
    fileEnt.dataType = "genomicVariants"
    fileEnt.fundingAgency = "AACR"
    fileEnt.assay = 'targetGeneSeq'
    fileEnt.fileFormat = fileFormat
    fileEnt.dataSubType = dataSubType
    fileEnt.fileStage = "staging"

    # Optional annotations are attached only when a value was supplied.
    # (Previously ``platform`` was also assigned unconditionally, which
    # defeated this guard and set the annotation to None.)
    if platform is not None:
        fileEnt.platform = platform
    if cBioFileFormat is not None:
        fileEnt.cBioFileFormat = cBioFileFormat

    return syn.store(fileEnt, used=used)
def upload(args, syn):
    """Store ``args.input`` in the folder matching ``args.dataType``.

    Args:
        args: Object exposing ``dataType``, ``workflow`` and ``input``
            attributes (presumably an argparse namespace -- confirm).
        syn: Client object providing ``get`` and ``store``.

    Returns:
        The ``id`` attribute of the entity returned by ``syn.store``.

    Raises:
        ValueError: If ``args.dataType`` is not one of the supported keys.
    """
    # dataType flag -> (destination parent, pipeline entity, dataType annotation)
    destinations = {
        "rnaseq": ("syn6034916", "syn6126122", "RNASeq"),
        "dnaseq": ("syn6034751", "syn6126123", "TargDNASeq"),
        "snparray": ("syn6038475", "syn6126121", "SNParray"),
        "exparray": ("syn6038915", "syn6126120", "expression_microarray"),
        # NOTE(review): exome has no pipeline entity; the empty string is
        # what gets passed on to syn.get / syn.store below.
        "exome": ("syn6115597", "", "exome"),
    }
    if args.dataType not in destinations:
        raise ValueError("dataType needs to be rnaseq/dnaseq/snparray/exparray/exome")
    parentId, pipeline, dataType = destinations[args.dataType]

    if args.workflow is not None:
        # Point the existing pipeline entity at the supplied workflow file,
        # store it, and use the stored entity's id as provenance.
        workflow = syn.get(pipeline, downloadFile=False)
        workflow.path = args.workflow
        workflow.name = os.path.basename(args.workflow)
        pipeline = syn.store(workflow).id

    fileEnt = File(args.input, parent=parentId)
    fileEnt.dataType = dataType
    # NOTE(review): sampleId is not defined in this function; it must come
    # from module scope -- confirm it exists.
    fileEnt.sampleId = sampleId
    stored = syn.store(fileEnt, used=pipeline)
    return stored.id
def uploadToSynapse(f):
    """Given a filepath extracts metadata and uploads to Synapse

    The center is taken from marker substrings in the path or, failing that,
    from the fifth '/'-separated component. The remaining metadata is parsed
    from the '.'-separated parts of the file name, using a layout that
    depends on the center. Files from unrecognised centers are reported and
    skipped.

    Relies on the module-level names ``URLBASE``, ``DIRS``, ``File`` and
    ``syn``.

    Args:
        f: Path string; appended to ``URLBASE`` to form the stored URL.

    Returns:
        None. The entity is stored via ``syn.store`` as a side effect.
    """
    sample_id, workflow_name, date, call_type = [''] * 4

    if 'OICR_BL' in f:
        center = 'oicr_bl'
    elif 'CRG/clindel/somatic' in f:
        center = 'crg_clindel'
    else:
        center = f.split('/')[4]
    filename = f.split('/')[-1]

    if center in ('yale', 'wustl', 'LOHcomplete'):
        # Layout: <sample_id>.<dataType>.<...>.<fileType>[.gz]
        if filename == 'bd829214-f230-4331-b234-def10bbe7938CNV.vcf.gz':
            # This one file lacks the '.' between sample id and data type.
            sample_id, dataType, fileType = 'bd829214-f230-4331-b234-def10bbe7938', 'cnv', 'vcf'
        else:
            sample_id, dataType = filename.lower().split('.')[:2]
            fileType = [i for i in filename.split('.')[2:] if i != 'gz'][-1]
    elif center in ('broad', 'BSC', 'oicr_sga', 'mda_kchen', 'MDA_HGSC',
                    'mcgill_popsv', 'sfu', 'UCSC', 'oicr_bl',
                    'Synteka_pgm21', 'crg_clindel'):
        # Layout: <sample_id>.<workflow>.<date>.<call_type>.<dataType>.<...>.<fileType>[.gz]
        # Fix: the original had a misplaced parenthesis that passed
        # split('.')[:5] as a third argument to str.replace.
        sample_id, workflow_name, date, call_type, dataType = (
            filename.replace('indels', 'indel').split('.')[:5])
        fileType = [i for i in filename.split('.')[5:] if i != 'gz'][-1]
    else:
        # Single-argument print emits the same text under Python 2 and 3.
        print('Not uploading: %s' % f)
        return

    print(' '.join([center, workflow_name, date, call_type, dataType, fileType]))

    url = URLBASE + f
    # synapseStore=False is passed so the entity refers to the URL
    # (presumably stored as an external link -- confirm in synapseclient docs).
    entity = File(url, parentId=DIRS[center], synapseStore=False)
    entity.center = center.lower()
    entity.sample_id = sample_id
    entity.workflow_name = workflow_name
    entity.date = date
    entity.call_type = call_type
    entity.dataType = 'DNA'
    entity.disease = 'Cancer'
    entity.dataSubType = dataType
    entity.fileType = fileType
    #entity.analysis_id_tumor = ?????
    syn.store(entity, forceVersion=False)
def storeFile(fileName, stagingID, used, center, annotations, meta=False):
    """Annotate a file as GENIE staging data and store it in Synapse.

    Uses the module-level ``syn`` client and ``File`` class.

    Args:
        fileName: Path handed to ``File`` as the file to store.
        stagingID: Passed to ``File`` as ``parent``.
        used: Provenance forwarded to ``syn.store(..., used=used)``.
        center: Value for the ``center`` annotation.
        annotations: Mapping read with ``.get`` for the keys ``dataSubType``,
            ``dataType``, ``fileType``, ``platform`` and ``tissueSource``;
            a missing key yields an empty string.
        meta: When true, ``fileType`` and ``dataType`` are overridden with
            ``"txt"`` and ``"meta"``.

    Returns:
        The value returned by ``syn.store``.
    """
    print("STORING FILES")
    fileEnt = File(fileName, parent=stagingID)
    fileEnt.center = center
    fileEnt.dataSubType = annotations.get("dataSubType", '')
    fileEnt.dataType = annotations.get("dataType", '')
    fileEnt.disease = 'cancer'
    fileEnt.fileType = annotations.get("fileType", '')
    # Fix: the literal was corrupted to 'H**o Sapiens' in the original.
    fileEnt.organism = 'Homo Sapiens'
    fileEnt.platform = annotations.get("platform", '')
    fileEnt.tissueSource = annotations.get("tissueSource", '')
    fileEnt.consortium = 'GENIE'
    if meta:
        # Metadata files always use these two values, whatever the mapping says.
        fileEnt.fileType = "txt"
        fileEnt.dataType = "meta"
    fileEnt.fileStage = "staging"
    # Fix: provenance was passed as ``annotations=used``, so the ``used``
    # list never reached the store call's provenance argument.
    ent = syn.store(fileEnt, used=used)
    return ent
def upload(args, syn):
    """Store ``args.input`` under the parent chosen by ``args.dataType``.

    Args:
        args: Object exposing ``dataType``, ``workflow`` and ``input``
            attributes (presumably an argparse namespace -- confirm).
        syn: Client object providing ``get`` and ``store``.

    Returns:
        The ``id`` attribute of the stored file entity.

    Raises:
        ValueError: If ``args.dataType`` is not a supported key.
    """
    # Each row: (parent folder, pipeline entity, dataType annotation).
    routing = {
        "rnaseq": ("syn6034916", "syn6126122", "RNASeq"),
        "dnaseq": ("syn6034751", "syn6126123", "TargDNASeq"),
        "snparray": ("syn6038475", "syn6126121", "SNParray"),
        "exparray": ("syn6038915", "syn6126120", "expression_microarray"),
        # NOTE(review): exome carries an empty pipeline id -- confirm intended.
        "exome": ("syn6115597", "", "exome"),
    }
    route = routing.get(args.dataType)
    if route is None:
        raise ValueError(
            "dataType needs to be rnaseq/dnaseq/snparray/exparray/exome")
    parentId, pipeline, dataType = route

    if args.workflow is not None:
        # Re-point the pipeline entity at the given workflow file and use
        # the id of the stored result as provenance for the upload.
        workflow = syn.get(pipeline, downloadFile=False)
        workflow.path = args.workflow
        workflow.name = os.path.basename(args.workflow)
        workflow = syn.store(workflow)
        pipeline = workflow.id

    fileEnt = File(args.input, parent=parentId)
    fileEnt.dataType = dataType
    # NOTE(review): sampleId is not bound anywhere in this function; it is
    # resolved from module scope at call time -- confirm it is defined.
    fileEnt.sampleId = sampleId
    return syn.store(fileEnt, used=pipeline).id
'tissueType': ['Frontal Pole', 'Superior Temporal Gyrus', 'Parahippocampal Gyrus'], 'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name': 'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_normalized-sex-race-age-RIN-PMI-batch-site.corrected.csv' }, } for id, v in toMove.items(): ent = syn.get(id) print v['name'] os.rename(ent.path, v['name']) f = File(v['name'], parentId=v['parentId'], name=v['name'][7:-4]) print f.name f.consortium, f.study, f.center, f.disease = consortium, study, center, disease f.dataType = v['dataType'] f.platfrom = v['platform'] if 'tissueTypeAbrv' in v: f.tissueTypeAbrv = v['tissueTypeAbrv'] f.tissueType = v['tissueType'] f.fileType = fileType f.organism = organism f = syn.store( f, used=[id], executed=[ 'https://github.com/Sage-Bionetworks/ampAdScripts/blob/e71bbde262625e6999ea9defd98e10fce8f3c542/Mount-Sinai/migrateMSBBMetaAndRNASeq.py' ], activityName='Data migration')
# Short platform token found in the old file names -> full platform name.
PLATFORM_MAP = {'133AB': 'AffymetrixU133AB',
                'Plus2': 'AffymetrixU133Plus2'}

# List the children of the old parent folder and migrate each one.
query = 'select id, name from entity where parentId=="%s"' %OLDPARENTID
df = synapseHelpers.query2df(syn.chunkedQuery(query))
# NOTE(review): the range starts at 1, so the row labelled 0 is never
# processed. Kept as in the original -- confirm this skip is deliberate.
for i in range(1,df.shape[0]):
    row = df.ix[i, :]
    ent = syn.get(row.id)
    # Old names must consist of exactly five '_'-separated parts; any other
    # count makes this unpacking raise ValueError.
    fStudy, fTissue, fPlatform, fDatatype, fRest = ent.name.split('_')
    name = 'AMP-AD_MSBB_MSSM_%s_%s_%s' % (PLATFORM_MAP[fPlatform],
                                          TISSUEABRMAP[fTissue][0],
                                          fRest)
    print(name)
    os.rename(ent.path, name)
    # The entity name drops the leading 'AMP-AD_' (7 characters).
    f = File(name, parentId=NEWPARENTID, name=name[7:])
    f.consortium = 'AMP-AD'
    f.study = 'MSBB'
    f.center = 'MSSM'
    f.dataType = 'mRNA'
    f.disease = 'Alzheimers Disease'
    # Fix: the attribute was misspelled 'platfrom', which created an
    # annotation under the wrong key.
    f.platform = PLATFORM_MAP[fPlatform]
    f.tissueTypeAbrv = TISSUEABRMAP[fTissue][1]
    f.tissueType = TISSUEABRMAP[fTissue][0]
    f.dataSubType = 'geneExp'
    f.fileType = 'genomicMatrix'
    f.organism = 'human'
    f = syn.store(f, used = [ent],
                  executed=['https://github.com/Sage-Bionetworks/ampAdScripts/blob/4d7d6b78b1e73058483354a1a18bff7422966a4b/Mount-Sinai/migrateMSBBExpression.py'],
                  activityName='Data migration')
meta_data = json.loads(handle.read())
DST_FOLDER = 'syn3079564' #test upload folder

# os is needed for basename below; imported here because the file's import
# block is not visible from this fragment.
import os

#Create Provenance log
# Fixes: the keywords were misspelled ('desciption', 'exectuted') and a comma
# was missing between the last two arguments, which was a syntax error.
# NOTE(review): 'executed' is fed from meta_data['used'], as in the original;
# confirm whether a separate 'executed' key was intended.
provenance = Activity(name=meta_data['activity'],
                      description=meta_data['description'],
                      used=meta_data['used'],
                      executed=meta_data['used'])
#prov = syn.store(prov)
# Fix: 'of.path.basename' was a typo for os.path.basename.
name = os.path.basename(input_path)

#Add metadata to files to be uploaded
f = File(input_path, name = name, parentId=DST_FOLDER)
f.dataType = meta_data['dataType']
# NOTE(review): fileType is also read from the 'dataType' key, as in the
# original; confirm whether a 'fileType' key was intended.
f.fileType = meta_data['dataType']
f.variant_workflow = meta_data['workflow']
f.variant_workflow_version = meta_data['workflowVersion']
f.call_type = call_type
f.reference_build = meta_data['referenceBuild']
f.center_name = meta_data['center_name']
f.file_md5 = synapseclient.utils.md5_for_file(input_path)
f.study = 'PCAWG 2.0'
f.submitter_donor_id = meta_data['donor_id']
f.alignment_workflow_name='Workflow_Bundle_BWA (UCSC Implementation)'
f.alignment_workflow_source_url='https://github.com/kellrott/tcga_realign'
f.alignment_workflow_version='2.6.0'

#Store metadata and file to Synapse
'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name' :'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_rawCounts.tsv'}, 'syn2920161':{'parentId' :'syn3157743', #'normalized.sex_race_age_RIN_PMI_batch_site.corrected.csv' 'dataType': 'mRNA', 'platform': 'IlluminaHiSeq2500', 'tissueType':['Frontal Pole', 'Superior Temporal Gyrus','Parahippocampal Gyrus'], 'tissueTypeAbrv': ['FP', 'STG', 'PHG'], 'name' :'AMP-AD_MSBB_MSSM_IlluminaHiSeq2500_mRNA_normalized-sex-race-age-RIN-PMI-batch-site.corrected.csv'}, } for id, v in toMove.items(): ent = syn.get(id) print v['name'] os.rename(ent.path, v['name']) f = File(v['name'], parentId=v['parentId'], name=v['name'][7:-4]) print f.name f.consortium, f.study, f.center, f.disease = consortium, study, center, disease f.dataType = v['dataType'] f.platfrom = v['platform'] if 'tissueTypeAbrv' in v: f.tissueTypeAbrv = v['tissueTypeAbrv'] f.tissueType = v['tissueType'] f.fileType = fileType f.organism = organism f = syn.store(f, used = [id], executed=['https://github.com/Sage-Bionetworks/ampAdScripts/blob/e71bbde262625e6999ea9defd98e10fce8f3c542/Mount-Sinai/migrateMSBBMetaAndRNASeq.py'], activityName='Data migration')