def createLinkVersion(syn, genie_version, caseListEntities, genePanelEntities, databaseSynIdMappingDf):
    versioning = genie_version.split(".")
    logger.info(genie_version)
    main = versioning[0]
    releaseSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'release'].values[0]
    publicSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'public'].values[0]
    # second = ".".join(versioning[1:])
    releases = synapseutils.walk(syn, releaseSynId)
    mainReleaseFolders = next(releases)[1]
    releaseFolderSynId = [synId for folderName, synId in mainReleaseFolders
                          if folderName == "Release %s" % main]
    if len(releaseFolderSynId) > 0:
        secondRelease = synapseutils.walk(syn, releaseFolderSynId[0])
        secondReleaseFolders = next(secondRelease)[1]
        secondReleaseFolderSynIdList = [synId for folderName, synId in secondReleaseFolders
                                        if folderName == genie_version]
        if len(secondReleaseFolderSynIdList) > 0:
            secondReleaseFolderSynId = secondReleaseFolderSynIdList[0]
        else:
            secondReleaseFolderSynId = syn.store(
                synapseclient.Folder(genie_version, parent=releaseFolderSynId[0])).id
    else:
        mainReleaseFolderId = syn.store(
            synapseclient.Folder("Release %s" % main, parent=releaseSynId)).id
        secondReleaseFolderSynId = syn.store(
            synapseclient.Folder(genie_version, parent=mainReleaseFolderId)).id
    caselistId = db_to_staging.find_caselistid(syn, secondReleaseFolderSynId)
    publicRelease = syn.getChildren(publicSynId)
    [syn.store(synapseclient.Link(ents['id'],
                                  parent=secondReleaseFolderSynId,
                                  targetVersion=ents['versionNumber']))
     for ents in publicRelease
     if ents['type'] != "org.sagebionetworks.repo.model.Folder"
     and ents['name'] != "data_clinical.txt"
     and not ents['name'].startswith("data_gene_panel")]
    [syn.store(synapseclient.Link(ents.id, parent=caselistId, targetVersion=ents.versionNumber))
     for ents in caseListEntities]
    # Store gene panels
    [syn.store(synapseclient.Link(ents.id, parent=secondReleaseFolderSynId, targetVersion=ents.versionNumber))
     for ents in genePanelEntities]
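# A minimal sketch (not from the original source) of the synapseutils.walk() idiom
# used throughout these functions: walk() yields os.walk-style triples, except the
# directory is a (name, synapseId) tuple and the folder/file lists hold
# (name, synapseId) tuples, so next(walk(...))[1] is the list of immediate
# subfolders of the starting container. "syn123" is a placeholder ID.
import synapseclient
import synapseutils

syn = synapseclient.login()
walker = synapseutils.walk(syn, "syn123")
(dirname, dir_synid), subfolders, files = next(walker)
print(subfolders)  # e.g. [("Release 5", "syn456"), ...]
print(files)       # e.g. [("data_clinical.txt", "syn789"), ...]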
def test_walk():
    walked = []
    firstfile = utils.make_bogus_data_file()
    schedule_for_cleanup(firstfile)
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(second_folder.id)
    file_entity = syn.store(File(firstfile, parent=project_entity))
    schedule_for_cleanup(file_entity.id)
    walked.append(((project_entity.name, project_entity.id),
                   [(folder_entity.name, folder_entity.id),
                    (second_folder.name, second_folder.id)],
                   [(file_entity.name, file_entity.id)]))
    nested_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=folder_entity))
    schedule_for_cleanup(nested_folder.id)
    secondfile = utils.make_bogus_data_file()
    schedule_for_cleanup(secondfile)
    second_file = syn.store(File(secondfile, parent=nested_folder))
    schedule_for_cleanup(second_file.id)
    thirdfile = utils.make_bogus_data_file()
    schedule_for_cleanup(thirdfile)
    third_file = syn.store(File(thirdfile, parent=second_folder))
    schedule_for_cleanup(third_file.id)
    walked.append(((os.path.join(project_entity.name, folder_entity.name), folder_entity.id),
                   [(nested_folder.name, nested_folder.id)],
                   []))
    walked.append(((os.path.join(project_entity.name, folder_entity.name, nested_folder.name),
                    nested_folder.id),
                   [],
                   [(second_file.name, second_file.id)]))
    walked.append(((os.path.join(project_entity.name, second_folder.name), second_folder.id),
                   [],
                   [(third_file.name, third_file.id)]))
    temp = synapseutils.walk(syn, project_entity.id)
    temp = list(temp)
    # Must sort the lists inside each tuple, because order matters for the assert:
    # folders are returned in an ordering that depends on their (random uuid) names
    for i in walked:
        for x in i:
            if type(x) == list:
                x.sort()
    for i in temp:
        for x in i:
            if type(x) == list:
                x.sort()
        assert i in walked
    print("CHECK: synapseutils.walk on a file should return an empty generator")
    temp = synapseutils.walk(syn, second_file.id)
    assert list(temp) == []
def test_walk():
    walked = []
    firstfile = utils.make_bogus_data_file()
    schedule_for_cleanup(firstfile)
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(second_folder.id)
    file_entity = syn.store(File(firstfile, parent=project_entity))
    schedule_for_cleanup(file_entity.id)
    walked.append(((project_entity.name, project_entity.id),
                   [(folder_entity.name, folder_entity.id),
                    (second_folder.name, second_folder.id)],
                   [(file_entity.name, file_entity.id)]))
    nested_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=folder_entity))
    schedule_for_cleanup(nested_folder.id)
    secondfile = utils.make_bogus_data_file()
    schedule_for_cleanup(secondfile)
    second_file = syn.store(File(secondfile, parent=nested_folder))
    schedule_for_cleanup(second_file.id)
    thirdfile = utils.make_bogus_data_file()
    schedule_for_cleanup(thirdfile)
    third_file = syn.store(File(thirdfile, parent=second_folder))
    schedule_for_cleanup(third_file.id)
    walked.append(((os.path.join(project_entity.name, folder_entity.name), folder_entity.id),
                   [(nested_folder.name, nested_folder.id)],
                   []))
    walked.append(((os.path.join(project_entity.name, folder_entity.name, nested_folder.name),
                    nested_folder.id),
                   [],
                   [(second_file.name, second_file.id)]))
    walked.append(((os.path.join(project_entity.name, second_folder.name), second_folder.id),
                   [],
                   [(third_file.name, third_file.id)]))
    temp = synu.walk(syn, project_entity.id)
    temp = list(temp)
    # Must sort the lists inside each tuple, because order matters for the assert:
    # folders are returned in an ordering that depends on their (random uuid) names.
    # (The original only sorted `walked`; the walked results must be sorted too,
    # or the membership assert can fail spuriously.)
    for i in walked:
        for x in i:
            if type(x) == list:
                x.sort()
    for i in temp:
        for x in i:
            if type(x) == list:
                x.sort()
        assert i in walked
    print("CHECK: synu.walk on a file returns an empty generator")
    temp = synu.walk(syn, second_file.id)
    assert list(temp) == []
def dockerValidate(submission, syn, user, password):
    submissionJson = json.loads(submission['entityBundleJSON'])
    assert submissionJson['entity'].get('repositoryName') is not None, \
        "Must submit a docker container"
    dockerRepo = submissionJson['entity']['repositoryName'].replace("docker.synapse.org/", "")
    # assert dockerRepo.startswith("docker.synapse.org")
    assert submission.get('dockerDigest') is not None, \
        "Must submit a docker container with a docker sha digest"
    dockerDigest = submission['dockerDigest']
    index_endpoint = 'https://docker.synapse.org'
    # dockerImage = dockerRepo + "@" + dockerDigest
    # Check that the docker image can be pulled
    dockerRequestURL = '{0}/v2/{1}/manifests/{2}'.format(index_endpoint, dockerRepo, dockerDigest)
    token = getAuthToken(dockerRequestURL, user, password)
    resp = requests.get(dockerRequestURL,
                        headers={'Authorization': 'Bearer %s' % token})
    assert resp.status_code == 200, "Docker image + sha digest must exist"
    # Check the docker image size in the Synapse docker registry
    dockerSize = sum([layer['size'] for layer in resp.json()['layers']])
    assert dockerSize / 1000000000.0 < 1000, "Docker image must be less than a terabyte"
    # Send email to me if harddrive is full
    # should be stateless, if there needs to be code changes to the docker agent
    preds = synu.walk(syn, CHALLENGE_PREDICTION_FOLDER)
    predFolders = preds.next()[1]
    predSynId = [synId for name, synId in predFolders if str(submission.id) == name]
    logs = synu.walk(syn, CHALLENGE_LOG_FOLDER)
    logsFolders = logs.next()[1]
    logsSynId = [synId for name, synId in logsFolders if str(submission.id) == name]
    if len(predSynId) == 0:
        predFolder = syn.store(Folder(submission.id, parent=CHALLENGE_PREDICTION_FOLDER))
        predFolder = predFolder.id
    else:
        predFolder = predSynId[0]
    if len(logsSynId) == 0:
        logFolder = syn.store(Folder(submission.id, parent=CHALLENGE_LOG_FOLDER))
        logFolder = logFolder.id
        for participant in submission.contributors:
            if participant['principalId'] in ADMIN_USER_IDS:
                access = ['CREATE', 'READ', 'DOWNLOAD', 'UPDATE', 'DELETE',
                          'CHANGE_PERMISSIONS', 'MODERATE', 'CHANGE_SETTINGS']
            else:
                access = ['READ', 'DOWNLOAD']
            # Comment setPermissions out if you don't want to allow participants
            # to see the prediction files
            # syn.setPermissions(predFolder, principalId=participant['principalId'], accessType=access)
            syn.setPermissions(logFolder, principalId=participant['principalId'], accessType=access)
    else:
        logFolder = logsSynId[0]
    # Add more to the message if you want to return the prediction files
    return (True,
            "Your submission has been validated!  While your submission is being run, "
            "please go here: https://www.synapse.org/#!Synapse:%s to check on your log file." % logFolder)
def annotate_proteomics():
    ### PROTEOMICS ###
    directory = synu.walk(syn, "syn6101352")
    for dirpath, dirname, filename in directory:
        annots = {"topic": "proteomicsExperiment",
                  "disease": "NF1",
                  "diseaseSubtype": "dermalNF",
                  "tissueType": "neurofibroma",
                  "tissueSubtype": "dermalNeurofibroma",
                  "nf1Genotype": "-/-"}
        if os.path.basename(dirpath[0]) == "Original Calls":
            annots.update({"dataType": "proteomicsData",
                           "dataSubtype": "peptideIdentification",
                           "fileFormat": "text",
                           "formatSubtype": "tsv"})
        else:
            annots.update({"dataType": "proteomicsData",
                           "dataSubtype": "proteinMatrix",
                           "fileFormat": "text",
                           "formatSubtype": "tsv"})
        proteomics = dict(dermalNF.items() + annots.items())
        for i in filename:
            temp = syn.get(i[1], downloadFile=False)
            temp.annotations.update(proteomics)
            # Collapse single-element annotation lists down to scalars
            for i in temp.annotations:
                if type(temp[i]) == list and len(temp[i]) == 1:
                    temp[i] = temp[i][0]
            syn.store(temp, forceVersion=False)
def updateAnnoByDict(syn, synId, annoDict):
    """
    Update annotations by giving a dict

    :param syn:      A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:    A Synapse ID of Project, Folder, or File OR a list of Synapse IDs of File
    :param annoDict: A dict of annotations

    Example:
        updateAnnoByDict(syn,"syn12345",{"dataType":"testing","projectName":"foo"})
        OR
        updateAnnoByDict(syn,["syn1","syn2"],{"dataType":"testing","projectName":"foo"})
    """
    if type(synId) is list:
        print "Input is a list of Synapse IDs \n"
        for synID in synId:
            print "Getting File %s ..." % synID
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateAnnoByDict(syn, temp, annoDict)
    else:
        print "Input is a Synapse ID \n"
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print "%s is a File \n" % synId
            _helperUpdateAnnoByDict(syn, starting, annoDict)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print "Getting File %s ..." % i[1]
                    _helperUpdateAnnoByDict(syn, temp, annoDict)
def getCenterInputFiles(syn, synId, center, process="main"):
    """
    This function walks through each center's input directory
    and validates every file
    """
    ################################################################
    # If a file has not changed, then it does not need to be processed!
    ################################################################
    logger.info("GETTING %s INPUT FILES" % center)
    CLINICAL_PAIR_NAME = ["data_clinical_supp_sample_%s.txt" % center,
                          "data_clinical_supp_patient_%s.txt" % center]
    walked = synu.walk(syn, synId)
    clinicalpair = []
    allFiles = []
    for dirpath, dirname, filenames in walked:
        for name, synid in filenames:
            logger.info(name)
            paired = False
            if name in CLINICAL_PAIR_NAME:
                paired = True
                clinicalpair.append(synid)
            if len(clinicalpair) == 2:
                syns = [i for i in clinicalpair]
                paths = [reNameFile(syn, i) for i in clinicalpair]
                clinicalpair = []
                allFiles.append((syns, paths))
            elif not paired:
                if process == "vcf":
                    allFiles.append(([synid], [reNameFile(syn, synid)]))
                elif not name.endswith(".vcf"):
                    allFiles.append(([synid], [reNameFile(syn, synid)]))
    return allFiles
def annotate_snp():
    directory = synu.walk(syn, "syn5004874")
    for dirpath, dirname, filename in directory:
        annots = {"topic": "geneticVariation",
                  "subTopic": "dnaPolymorphism",
                  "disease": "NF1",
                  "diseaseSubtype": "dermalNF",
                  "tissueType": "neurofibroma",
                  "tissueSubtype": "dermalNeurofibroma",
                  "nf1Genotype": "-/-"}
        final = dict(dermalNF.items() + annots.items())
        for i in filename:
            temp = syn.get(i[1], downloadFile=False)
            temp.annotations.update(final)
            if i[0].startswith("3096"):
                temp.dataType = "report"
                temp.dataSubtype = "snpArrayReport"
                temp.fileType = "text"
                temp.fileSubtype = "csv"
            else:
                temp.dataType = "annotation"
                temp.dataSubtype = "snpAnnotation"
                temp.fileType = "text"
                temp.fileSubtype = "csv"
            # Collapse single-element annotation lists down to scalars
            for i in temp.annotations:
                if type(temp[i]) == list and len(temp[i]) == 1:
                    temp[i] = temp[i][0]
            # print(temp.annotations)
            syn.store(temp, forceVersion=False)
def curate_raw_data(syn):
    raw_data_folders = [SHIMMER_BACK, SHIMMER_LEFT_ANKLE, SHIMMER_LEFT_WRIST,
                        SHIMMER_RIGHT_ANKLE, SHIMMER_RIGHT_WRIST]
    raw_data_locations = ["Back", "LeftAnkle", "LeftWrist", "RightAnkle", "RightWrist"]
    data_cols = ["subject_id", "device", "device_position", "participant_day",
                 "timestamp_start", "timestamp_end", "source_file", "data_file_handle_id"]
    raw_data = pd.DataFrame(columns=data_cols)
    for folder, device_location in zip(raw_data_folders, raw_data_locations):
        w = su.walk(syn, folder)
        parent, folders, _ = next(w)
        records = []
        for folder_name, folder_id in folders:
            subject, _, subject_files = next(w)
            subject_num = int(re.search(r"\d+", folder_name).group())
            subject_loc = "BOS"
            subject_id = "{}_{}".format(subject_num, subject_loc)
            for file_name, file_id in subject_files:
                file_day = int(re.search(r"\d+", file_name).group())
                syn_file = syn.get(file_id)
                df = pd.read_table(syn_file.path)
                timestamp_start = min(df.timestamp)
                timestamp_end = max(df.timestamp)
                fhid = syn_file['dataFileHandleId']
                records.append([subject_id, "Shimmer", device_location, file_day,
                                timestamp_start, timestamp_end, file_id, fhid])
        raw_data_table = pd.DataFrame(records, columns=data_cols)
        fhids_to_copy = raw_data_table['data_file_handle_id'].tolist()
        source_files = raw_data_table["source_file"].tolist()
        # Copy file handles in batches of 100
        new_fhids = []
        for i in range(0, len(fhids_to_copy), 100):
            fhids_subset = fhids_to_copy[i:i + 100]
            source_files_subset = source_files[i:i + 100]
            new_fhids_subset = su.copyFileHandles(
                syn=syn,
                fileHandles=fhids_subset,
                associateObjectTypes=["FileEntity"] * len(fhids_subset),
                associateObjectIds=source_files_subset,
                contentTypes=["text/tab-separated-values"] * len(fhids_subset),
                fileNames=[None] * len(fhids_subset))
            new_fhids_subset = [int(i['newFileHandle']['id'])
                                for i in new_fhids_subset['copyResults']]
            new_fhids = new_fhids + new_fhids_subset
        fhid_mapping = {k: v for k, v in zip(fhids_to_copy, new_fhids)}
        raw_data_table["data_file_handle_id"] = \
            raw_data_table["data_file_handle_id"].map(fhid_mapping)
        raw_data = raw_data.append(raw_data_table, ignore_index=True, sort=False)
    return raw_data
def download_relevant_children(syn, parent, filtering_prefix, diary, sensor,
                               download_in_parallel=False):
    """
    Returns
    -------
    dict with key synapse_id (str) and values
    synapse_file (File), measurement_ids (list)
    """
    _, _, entity_info = list(su.walk(syn, parent))[0]
    entity_info = [(i, j) for i, j in entity_info if i.startswith(filtering_prefix)]
    relevant_entities = {}
    for fname, syn_id in entity_info:
        relevant_entries = find_relevant_diary_entries(fname, diary, sensor)
        if len(relevant_entries):
            relevant_entities[syn_id] = {
                "synapse_file": None,
                "measurement_ids": relevant_entries.measurement_id}
    ordered_synapse_ids = list(relevant_entities.keys())
    if download_in_parallel:
        mp = multiprocessing.dummy.Pool(4)
        children = mp.map(syn.get, ordered_synapse_ids)
    else:
        children = list(map(syn.get, ordered_synapse_ids))
    for syn_id, f in zip(ordered_synapse_ids, children):
        relevant_entities[syn_id]["synapse_file"] = f
    return relevant_entities
def annotate_WGS():
    directory = synu.walk(syn, "syn4984626")
    for dirpath, dirname, filename in directory:
        annots = {"topic": "geneticVariation",
                  "subTopic": "dnaMutation",
                  "disease": "NF1",
                  "diseaseSubtype": "dermalNF",
                  "tissueType": "neurofibroma",
                  "tissueSubtype": "dermalNeurofibroma",
                  "nf1Genotype": "-/-"}
        final = dict(dermalNF.items() + annots.items())
        for i in filename:
            temp = syn.get(i[1], downloadFile=False)
            temp.annotations.update(final)
            if i[0].endswith(".vcf.gz") or i[0].endswith(".vcf"):
                temp.fileFormat = "typedText"
                temp.formatSubtype = "VCF"
            elif i[0].endswith(".maf.gz") or i[0].endswith(".maf"):
                temp.fileFormat = "typedText"
                temp.formatSubtype = "MAF"
            # Collapse single-element annotation lists down to scalars
            for i in temp.annotations:
                if type(temp[i]) == list and len(temp[i]) == 1:
                    temp[i] = temp[i][0]
            # print(temp.annotations)
            syn.store(temp, forceVersion=False)
def updateFormatTypeByFileName(syn, synId, annoKey, annoDict):
    """
    Update entity file type annotations based on file names

    :param syn:      A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:    A Synapse ID of Project, Folder, or File OR a list of Synapse IDs of File
    :param annoKey:  The annotation key for file type (i.e. "fileType", "fileFormat", or "formatType")
    :param annoDict: A dict where key is the extension of the filename,
                     value is the corresponding file type value in entity annotations

    Example:
        updateFormatTypeByFileName(syn,"syn12345","fileType",{".bam":"bam", ".doc":"word", "bw":"bigwig"})
        OR
        updateFormatTypeByFileName(syn,["syn1","syn2"],"fileType",{".bam":"bam", ".doc":"word", "bw":"bigwig"})
    """
    if type(synId) is list:
        print "Input is a list of Synapse IDs \n"
        for synID in synId:
            print "Getting File %s ..." % synID
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateFormatTypeByFileName(syn, temp, annoKey, annoDict)
    else:
        print "Input is a Synapse ID \n"
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print "%s is a File \n" % synId
            _helperUpdateFormatTypeByFileName(syn, starting, annoKey, annoDict)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print "Getting File %s ..." % i[1]
                    _helperUpdateFormatTypeByFileName(syn, temp, annoKey, annoDict)
def get_public_to_consortium_synid_mapping(syn, releaseSynId, test=False):
    """
    Gets the mapping between public version name and its Synapse ids
    (Can probably be replaced with a folder view)
    """
    temp = synapseutils.walk(syn, releaseSynId)
    officialPublic = dict()
    for dirpath, dirnames, filenames in temp:
        release = os.path.basename(dirpath[0])
        # checkRelease = release.split(".")
        # Split on "." and then "-" so a name like "5.3-consortium"
        # becomes ["5", "3", "consortium"]
        final = [i.split("-") for i in release.split(".")]
        checkRelease = []
        for i in final:
            checkRelease.extend(i)
        if test:
            officialPublic["TESTpublic"] = "syn12299959"
        else:
            if len(checkRelease) == 3 and checkRelease[0] != "0":
                if int(checkRelease[1]) > 0:
                    if checkRelease[0] in ["1", "2"]:
                        public_release_name = str(int(checkRelease[0]) + 1) + ".0.0"
                    else:
                        public_release_name = str(int(checkRelease[0])) + ".0-public"
                    officialPublic[public_release_name] = dirpath[1]
    return officialPublic
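# A hypothetical walk-through (not in the original source) of the mapping logic
# above, assuming consortium release folders named like "<major>.<minor>-consortium":
#   "1.1-consortium" -> checkRelease == ["1", "1", "consortium"] -> public name "2.0.0"
#   "2.3-consortium" -> checkRelease == ["2", "3", "consortium"] -> public name "3.0.0"
#   "8.1-consortium" -> checkRelease == ["8", "1", "consortium"] -> public name "8.0-public"
# i.e. majors 1 and 2 used the legacy "<major+1>.0.0" public naming, later majors
# use "<major>.0-public", and folders whose minor version is 0 are skipped.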
def updateAnnoByMetadata(syn, synId, metaDf, refCol, cols2Add, fileExts):
    """
    Update entity annotations using metadata

    :param syn:      A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:    A Synapse ID of Project, Folder, or File OR a list of Synapse IDs of File
    :param metaDf:   A pandas data frame of entity metadata
    :param refCol:   A name of the column in metaDf that matches one of the entity attributes
    :param cols2Add: A list of columns in metaDf to be added as entity annotations
    :param fileExts: A list of all file extensions (PsychENCODE ONLY!!!)

    Example:
        updateAnnoByMetadata(syn,"syn12345",metadata,"id",["dataType","tester"],[".bam",".csv"])
    """
    if type(synId) is list:
        print "Input is a list of Synapse IDs \n"
        for synID in synId:
            print "Getting File %s ..." % synID
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateAnnoByMetadata(syn, temp, metaDf, refCol, cols2Add, fileExts)
    else:
        print "Input is a Synapse ID \n"
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print "%s is a File \n" % synId
            _helperUpdateAnnoByMetadata(syn, starting, metaDf, refCol, cols2Add, fileExts)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print "Getting File %s ..." % i[1]
                    _helperUpdateAnnoByMetadata(syn, temp, metaDf, refCol, cols2Add, fileExts)
def searchInMetadata(syn, synId, metaDf, refCol, col2Check, values2Check, fileExts):
    """
    Search for a list of Synapse IDs with a given column and expected values

    :param syn:          A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:        A Synapse ID of Project or Folder
    :param metaDf:       A pandas data frame of entity metadata
    :param refCol:       A name of the column in metaDf that matches one of the entity attributes
    :param col2Check:    A name of the column in metaDf you are searching
    :param values2Check: A list of values you are searching for
    :param fileExts:     A list of all file extensions (PsychENCODE ONLY!!!)

    Return:
        A list of Synapse IDs

    Example:
        IDs = searchInMetadata(syn,"syn123",metadata,"id","tester",["foo","bar"],[".bam",".csv"])
    """
    synapseIds = []
    print "Search in metadata for key: %s \n" % col2Check
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print "ERROR: %s is not a Synapse ID of Project or Folder" % synId
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print "Getting File %s ..." % i[1]
                _helperSearchInMetadata(syn, temp, metaDf, refCol, col2Check,
                                        values2Check, fileExts, synapseIds)
    return synapseIds
def _getSynapseDir(syn, synapse_id, local_root, dir_list):
    """
    1. Walk through the Synapse parent location hierarchy.
    2. Update folders in Synapse to match the local dir.
    3. Get key-value pairs of dirname and Synapse id.

    :param syn:
    :param synapse_id:
    :param local_root:
    :param dir_list:
    :return:
    """
    synapse_dir = {}
    synapse_root = syn.get(synapse_id)
    for (dirpath, dirpath_id), _, _ in synapseutils.walk(syn, synapse_id):
        # Rebase the Synapse path onto the local root
        dirpath = dirpath.replace(synapse_root.name, os.path.abspath(local_root))
        synapse_dir[dirpath] = dirpath_id
    for directory in dir_list:
        if not synapse_dir.has_key(directory):
            new_folder = synapseclient.Folder(
                os.path.basename(directory),
                synapse_dir[os.path.dirname(directory)])
            new_folder = syn.store(new_folder)
            synapse_dir[directory] = new_folder.id
    return synapse_dir
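# A hypothetical walk-through (not in the original source) of the mapping that
# _getSynapseDir builds: with synapse_root.name == "MyProject" and
# local_root == "/data/proj", a walked Synapse folder path "MyProject/raw" keys
# the dict as "/data/proj/raw":
#   {"/data/proj": "syn100", "/data/proj/raw": "syn101", ...}
# Any path in dir_list still missing from the mapping then gets a new Synapse
# Folder created under its parent's ID.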
def annotate_rna():
    directory = synu.walk(syn, "syn6035832")
    for dirpath, dirname, filename in directory:
        annots = {"topic": "alignment",
                  "subTopic": "RNAseq"}
        if "ENCODE controls" in dirpath[0]:
            annots.update({"disease": "normal",
                           "nf1Genotype": "+/+",
                           "tissueType": "skin"})
        else:
            annots.update({"disease": "NF1",
                           "diseaseSubtype": "dermalNF",
                           "tissueType": "neurofibroma",
                           "tissueSubtype": "dermalNeurofibroma",
                           "nf1Genotype": "-/-"})
        if os.path.basename(dirpath[0]) == "Cufflinks Quantitation":
            annots.update({"dataType": "geneExpressionData",
                           "dataSubtype": "geneCountsFile",
                           "fileType": "text"})
        elif os.path.basename(dirpath[0]) == "FeatureCounts Quantitation":
            annots.update({"dataType": "geneExpressionData",
                           "dataSubtype": "geneCountsFile",
                           "fileType": "text"})
        elif os.path.basename(dirpath[0]) == "Gene matrices":
            annots.update({"dataType": "geneExpressionData",
                           "dataSubtype": "geneExpressionMatrix",
                           "fileType": "text",
                           "fileSubtype": "csv"})
        elif os.path.basename(dirpath[0]) == "Dermal NF tumor alignments":
            annots.update({"dataType": "sequenceAlignment",
                           "dataSubtype": "rnaSequenceAlignment",
                           "fileType": "binary"})
        elif os.path.basename(dirpath[0]) == "BAM files":
            annots.update({"dataType": "sequenceAlignment",
                           "dataSubtype": "rnaSequenceAlignment",
                           "fileType": "binary"})
        elif os.path.basename(dirpath[0]) == "Pre-aligned quantitation":
            annots.update({"dataType": "geneExpressionData",
                           "dataSubtype": "geneCountsFile",
                           "fileType": "text",
                           "fileSubtype": "tsv"})
        final = dict(dermalNF.items() + annots.items())
        for i in filename:
            temp = syn.get(i[1], downloadFile=False)
            temp.annotations.update(final)
            if i[0].endswith(".gtf"):
                temp.fileSubtype = "gtf"
            elif i[0].endswith(".bam"):
                temp.fileSubtype = "BAM"
            elif i[0].endswith(".bai"):
                temp.fileSubtype = "BAI"
            elif temp.get('fileSubtype', None) is None:
                temp.fileSubtype = ''
            # Collapse single-element annotation lists down to scalars
            for i in temp.annotations:
                if type(temp[i]) == list and len(temp[i]) == 1:
                    temp[i] = temp[i][0]
            # print(temp.annotations)
            syn.store(temp, forceVersion=False)
def synwalk_to_df(syn, synId):
    res = []
    for root, dirs, entities in su.walk(syn, synId):
        if len(entities) > 0:
            path_names = [root[0]] * len(entities)
            path_ids = [root[1]] * len(entities)
            entity_names = map(lambda x: x[0], entities)
            entity_ids = map(lambda x: x[1], entities)
            res.extend(zip(path_names, path_ids, entity_names, entity_ids))
    return pd.DataFrame(
        res, columns=['folderPath', 'folderId', 'entityName', 'entityId'])
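# A small usage sketch (assumes a valid login; "syn123" is a placeholder container
# ID). Each row of the returned frame pairs one file with the folder it lives under.
import synapseclient
import synapseutils as su
import pandas as pd

syn = synapseclient.login()
df = synwalk_to_df(syn, "syn123")
print(df.columns.tolist())  # ['folderPath', 'folderId', 'entityName', 'entityId']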
def validateFiles(synId, center):
    """
    This function walks through each center's input directory
    and validates every file
    """
    syn = synapseclient.login()
    walked = synu.walk(syn, synId)
    clinicalpair = []
    invalid = []
    validFiles = []
    for dirpath, dirname, filenames in walked:
        for name, synid in filenames:
            file = []
            if name.endswith(".vcf") and VALIDATE_FILENAME['vcf'] % center in name:
                fileType = "vcf"
            elif name.endswith(".bed") and VALIDATE_FILENAME['bed'] % center in name:
                fileType = "bed"
            elif VALIDATE_FILENAME['maf'] % center == name:
                fileType = "maf"
            elif VALIDATE_FILENAME['cnv'] % center == name:
                fileType = "cnv"
            elif VALIDATE_FILENAME['fusions'] % center == name:
                fileType = "fusions"
            elif VALIDATE_FILENAME['seg'] % center == name:
                fileType = "seg"
            elif VALIDATE_FILENAME['clinical'][0] % center == name:
                fileType = "clinical"
            elif name in VALIDATE_FILENAME['clinical'][1:]:
                clinicalpair.append(synid)
                if len(clinicalpair) == 2:
                    fileType = "clinical"
                else:
                    fileType = None
            else:
                fileType = None
            if fileType is not None:
                if len(clinicalpair) == 2:
                    # Need to pass in both filepath and synid for processing files
                    file = [(syn.get(i).path, i) for i in clinicalpair]
                    clinicalpair = []
                else:
                    file = [(syn.get(synid).path, synid)]
                # Validation only takes in a list of filepaths
                paths = [i[0] for i in file]
                message, valid = validate_genie.main(fileType, paths, center)
            else:
                print("%s: Cannot be processed" % name)
                valid = False
            if not valid:
                invalid.append(name)
            else:
                validFiles.extend(file)
    print(", ".join(invalid) + " can't be processed!")
    return validFiles
def _accumulate_remotes(synapse_parent_id, syn):
    """Retrieve references to all remote directories and files.
    """
    remotes = {}
    s_base_folder = syn.get(synapse_parent_id)
    for (s_dirpath, s_dirpath_id), _, s_filenames in synapseutils.walk(syn, synapse_parent_id):
        remotes[s_dirpath] = s_dirpath_id
        if s_filenames:
            for s_filename, s_filename_id in s_filenames:
                remotes[os.path.join(s_dirpath, s_filename)] = s_filename_id
    return s_base_folder, remotes
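# Shape of the mapping returned by _accumulate_remotes, sketched with placeholder
# paths and IDs: every folder path and every file path maps to its Synapse ID.
# remotes = {
#     "MyProject":                 "syn100",
#     "MyProject/data":            "syn101",
#     "MyProject/data/counts.tsv": "syn102",
# }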
def get_center_input_files(syn, synid, center, process="main", downloadFile=True):
    """
    This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name
        process: Process type includes main, vcf, maf and mafSP.
            Defaults to main, in which case vcf files are skipped.

    Returns:
        List of entities with the correct format to pass into validation
    """
    logger.info("GETTING {center} INPUT FILES".format(center=center))
    clinical_pair_name = [
        "data_clinical_supp_sample_{center}.txt".format(center=center),
        "data_clinical_supp_patient_{center}.txt".format(center=center),
    ]
    center_files = synapseutils.walk(syn, synid)
    clinicalpair_entities = []
    prepared_center_file_list = []
    for _, _, entities in center_files:
        for name, ent_synid in entities:
            # This is to remove vcfs from being validated during main
            # processing. Often there are too many vcf files, and it is
            # not necessary for them to be run everytime.
            if name.endswith(".vcf") and process != "mutation":
                continue
            ent = syn.get(ent_synid, downloadFile=downloadFile)
            # Clinical file can come as two files.
            # The two files need to be merged together which is
            # why there is this format
            if name in clinical_pair_name:
                clinicalpair_entities.append(ent)
                continue
            prepared_center_file_list.append([ent])
    if clinicalpair_entities:
        # clinicalpair_entities = [x for x in clinicalpair]
        prepared_center_file_list.append(clinicalpair_entities)
    return prepared_center_file_list
def auditAgainstMetadata(syn, synId, metaDf, refCol, cols2Check, fileExts):
    """
    Audit entity annotations against metadata

    :param syn:        A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:      A Synapse ID of Project, Folder, or File
    :param metaDf:     A pandas data frame of entity metadata
    :param refCol:     A name of the column in metaDf that matches one of the entity attributes
    :param cols2Check: A list of columns in metaDf to be audited against entity annotations
    :param fileExts:   A list of all file extensions (PsychENCODE ONLY!!!)

    Return:
        If synId is an ID of a Project/Folder
            entityMissMetadata: A list of Synapse IDs that have no matching metadata
            incorrectAnnotated: A dict where key is the annotation key and value is a list of
                                entities that were annotated incorrectly (i.e. {"sampleId":["syn1","syn2"]})
            missedAnno:         A dict where key is the annotation key and value is a list of
                                entities that were missing the annotation (i.e. {"dataType":["syn3","syn4"]})
        If synId is an ID of a File
            entityMissMetadata: A boolean. True when the entity has no matching metadata
            incorrectAnnotated: A list of keys that were annotated incorrectly
            missedAnno:         A list of keys that were not annotated

    Example:
        entityMissMetadata, incorrectAnnotated, missingAnno = \
            auditAgainstMetadata(syn,"syn12345",metadata,"id",["dataType","tester"],[".bam",".csv"])
    """
    entityMissMetadata = []
    incorrectAnnotated = {}
    missingAnno = {}
    print "Check annotations against metadata.\n"
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print "%s is a File \n" % synId
        _helperAuditMetadata(syn, starting, metaDf, refCol, cols2Check, fileExts,
                             entityMissMetadata, incorrectAnnotated, missingAnno)
        noMeta = False
        if len(entityMissMetadata):
            noMeta = True
        return noMeta, incorrectAnnotated.keys(), missingAnno.keys()
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print "Getting File %s ..." % i[1]
                _helperAuditMetadata(syn, temp, metaDf, refCol, cols2Check, fileExts,
                                     entityMissMetadata, incorrectAnnotated, missingAnno)
        return entityMissMetadata, incorrectAnnotated, missingAnno
def level_zero_folder(syn, folder_id, tbl, dry_run=False):
    """Assume everything in a Data folder that doesn't have a name like
    'MDD_Level.\\..*' is level 0.
    """
    w = synapseutils.walk(syn, folder_id)
    folders = set([x[0][1].lstrip('syn') for x in w])
    q = ("SELECT * FROM {} WHERE (parentId IN ({}) "
         "AND (name NOT LIKE 'MDD_%Level_.%') "
         "AND (Level is NULL) "
         "AND (DataType IS NULL OR DataType <> 'Metadata'))").format(tbl, ",".join(folders))
    logger.debug(q)
    res = syn.tableQuery(q).asDataFrame()
    res['Level'] = 0
    logger.info("{} rows to change for level 0.".format(res.shape[0]))
    if not dry_run and not res.empty:
        new_tbl = store_table_and_reindex(syn, tbl, res)
        return new_tbl
def getFilesInStorageDataset(self,
                             datasetId: str,
                             fileNames: List = None,
                             fullpath: bool = True) -> List[Tuple[str, str]]:
    """Gets all files in a given dataset folder.

    Args:
        datasetId: synapse ID of a storage dataset.
        fileNames: get a list of files with particular names; defaults to None
            in which case all dataset files are returned (except bookkeeping
            files, e.g. metadata manifests); if fileNames is not None, all files
            matching the names in the fileNames list are returned if present.
        fullpath: if True, return the full path as part of this filename;
            otherwise return just the base filename.

    Returns:
        A list of files; the list consists of tuples (fileId, fileName).

    Raises:
        ValueError: Dataset ID not found.
    """
    # select all files within a given storage dataset folder (top level folder in
    # a Synapse storage project or folder marked with contentType = 'dataset')
    walked_path = synapseutils.walk(self.syn, datasetId)
    file_list = []
    # iterate over all results
    for dirpath, dirname, filenames in walked_path:
        # iterate over all files in a folder
        for filename in filenames:
            # don't add the manifest to the list of files unless it is specified
            # in fileNames; return all found files except the manifest if no
            # fileNames have been specified
            # TODO: refactor for clarity/maintainability
            if ("manifest" not in filename[0] and not fileNames) or \
                    (fileNames is not None and filename[0] in fileNames):
                if fullpath:
                    # append directory path to filename
                    filename = (dirpath[0] + "/" + filename[0], filename[1])
                # add (id, name) tuple, rearranged so that the id comes first
                file_list.append(filename[::-1])
    return file_list
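# Hypothetical usage, assuming `store` is an instance of the class this method
# belongs to and "syn123" is a dataset folder ID; tuples come back as
# (fileId, fileName) per the docstring:
files = store.getFilesInStorageDataset("syn123", fullpath=True)
# -> [("syn456", "MyDataset/sample1.csv"), ("syn457", "MyDataset/sample2.csv"), ...]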
def auditFormatTypeByFileName(syn, synId, annoKey, annoDict):
    """
    Audit entity file type annotations by checking the file name against the file type annotation

    :param syn:      A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:    A Synapse ID of Project, Folder, or File
    :param annoKey:  The annotation key for file type (i.e. "fileType", "fileFormat", or "formatType")
    :param annoDict: A dict where key is the extension of the filename,
                     value is the corresponding file type value in entity annotations

    Return:
        If synId is an ID of a Project/Folder
            A dict with 3 keys where each value is a list of File Synapse IDs
            (i.e. {"incorrect":[], "missingInAnno":["syn1"], "missingInDict":["syn2","syn3"]})
        If synId is an ID of a File
            A string with 4 possible answers:
                1. "correct"
                2. "incorrect"
                3. "missingInAnno" - no file type in entity annotations
                4. "missingInDict" - file type is not found in file type annoDict

    Example:
        result = auditFormatTypeByFileName(syn,"syn12345","fileType",
                                           {".bam":"bam", ".doc":"word", "bw":"bigwig"})
    """
    needAnnotated = {"incorrect": [], "missingInAnno": [], "missingInDict": []}
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print "%s is a File \n" % synId
        _helperAuditFormatTypeByFileName(syn, starting, annoKey, annoDict, needAnnotated)
        result = "correct"
        for key in needAnnotated.keys():
            if len(needAnnotated[key]):
                result = key
        return result
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print "Getting File %s ..." % i[1]
                _helperAuditFormatTypeByFileName(syn, temp, annoKey, annoDict, needAnnotated)
        return needAnnotated
def auditCommonDict(syn, synId, annoDict):
    """
    Audit entity annotations against a common dictionary shared among all entities

    :param syn:      A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:    A Synapse ID of Project, Folder, or File
    :param annoDict: A dict of annotations shared among entities

    Return:
        If synId is an ID of a Project/Folder
            entityMissAllAnno:  A list of Synapse IDs that have not been annotated
            incorrectAnnotated: A dict where key is the annotation key and value is a list of
                                entities that were annotated incorrectly (i.e. {"sampleId":["syn1","syn2"]})
            missedAnno:         A dict where key is the annotation key and value is a list of
                                entities that were missing the annotation (i.e. {"dataType":["syn3","syn4"]})
        If synId is an ID of a File
            entityMissAllAnno:  A boolean
            incorrectAnnotated: A list of keys that were annotated incorrectly
            missedAnno:         A list of keys that were missing from the entity annotations

    Example:
        entityMissAllAnno, incorrectAnnotated, missingAnno = \
            auditCommonDict(syn,"syn12345",{"dataType":"testing","projectName":"foo"})
    """
    entityMissAllAnno = []
    incorrectAnnotated = {}
    missingAnno = {}
    print "Check annotations against common dictionary. \n"
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print "%s is a File \n" % synId
        _helperAuditCommonDict(syn, starting, annoDict, entityMissAllAnno,
                               incorrectAnnotated, missingAnno)
        noAnno = False
        if len(entityMissAllAnno):
            noAnno = True
        return noAnno, incorrectAnnotated.keys(), missingAnno.keys()
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print "Getting file %s ..." % i[1]
                _helperAuditCommonDict(syn, temp, annoDict, entityMissAllAnno,
                                       incorrectAnnotated, missingAnno)
        return entityMissAllAnno, incorrectAnnotated, missingAnno
def get_center_input_files(syn, synid, center, process="main"):
    '''
    This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name
        process: Process type includes main, vcf, maf and mafSP.
            Defaults to main.

    Returns:
        List of entities with the correct format to pass into validation
    '''
    logger.info("GETTING {center} INPUT FILES".format(center=center))
    clinical_pair_name = [
        "data_clinical_supp_sample_{center}.txt".format(center=center),
        "data_clinical_supp_patient_{center}.txt".format(center=center)
    ]
    center_files = synapseutils.walk(syn, synid)
    clinicalpair_entities = []
    prepared_center_file_list = []
    for _, _, entities in center_files:
        for name, ent_synid in entities:
            ent = syn.get(ent_synid)
            # Clinical file can come as two files.
            # The two files need to be merged together which is
            # why there is this format
            if name in clinical_pair_name:
                clinicalpair_entities.append(ent)
                continue
            prepared_center_file_list.append([ent])
    if clinicalpair_entities:
        # clinicalpair_entities = [x for x in clinicalpair]
        prepared_center_file_list.append(clinicalpair_entities)
    return prepared_center_file_list
def annotations_for_folder(syn, folder_id, tbl, key, value, dry_run=False):
    """Walk a container, find all folders, and apply the key=value annotation
    to all files in them.
    """
    w = synapseutils.walk(syn, folder_id)
    folders = set([x[0][1].lstrip('syn') for x in w])
    q = 'SELECT {} FROM {} WHERE parentId IN ({}) AND {} IS NULL'.format(
        key, tbl, ",".join(folders), key)
    logger.debug(q)
    res = syn.tableQuery(q).asDataFrame()
    logger.info("{} rows to change for key {}.".format(res.shape[0], key))
    res[key] = value
    if not dry_run and not res.empty:
        new_tbl = store_table_and_reindex(syn, tbl, res)
        return new_tbl
    else:
        return res
def check_all_files(synfolderID='syn20735395', syn=syn):
    '''
    Checks all files in the source folder (Synapse scratch space) by comparing
    md5sums to the destination GCP store

    Parameters
    synfolderID: source folder
    syn: a synapse object

    Value: a dataframe showing match or mismatch for each file in source

    Details
    Dest md5 is NaN if the file is missing from dest but present in source.
    '''
    # The first walk() result's file list: [(name, synId), ...]
    l = list(synapseutils.walk(syn, synfolderID))[0][2]
    synIDs = np.array(l)[:, 1]
    dflist = [check1file(y, syn) for y in synIDs]
    df = pd.concat(dflist, axis=0)
    return df
def _get_synapse_file_paths(syn, parent_synid):
    print('Mapping synapse paths.')
    e = syn.get(parent_synid)
    strip_dir = ''
    if e.concreteType == 'org.sagebionetworks.repo.model.Project':
        strip_dir = e.name
    walked = synapseutils.walk(syn, parent_synid)
    contents = [x for x in walked]
    file_paths = []
    for dirpath, dirnames, filenames in contents:
        for filename in filenames:
            file_p = Path(dirpath[0], filename[0])
            # Drop the project name from the front of the path
            file_p = Path(*[x for x in file_p.parts if x != strip_dir])
            file_path = {'file_p': file_p, 'synid': filename[1]}
            file_paths.append(file_path)
    synids = [x['synid'] for x in file_paths]
    se_list = _threaded_syn_get(syn, synids)
    remote_md5_dict = {x.id: x.md5 for x in se_list}
    for file_path in file_paths:
        file_path.update({'md5': remote_md5_dict[file_path['synid']]})
    return file_paths
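# Shape of each element returned by _get_synapse_file_paths, sketched with
# placeholder values:
# {"file_p": Path("data/counts.tsv"), "synid": "syn123", "md5": "<hex md5 digest>"}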
def annotate_sampleInfo():
    directory = synu.walk(syn, "syn4984723")
    for dirpath, dirname, filename in directory:
        annots = {"topic": "dataIdentityAndMapping",
                  "subTopic": "clinicalVariables",
                  "disease": "NF1",
                  "diseaseSubtype": "dermalNF",
                  "tissueType": "neurofibroma",
                  "tissueSubtype": "dermalNeurofibroma",
                  "nf1Genotype": "-/-",
                  "dataType": "annotation",
                  "dataSubtype": "sampleAnnotation"}
        final = dict(dermalNF.items() + annots.items())
        for i in filename:
            temp = syn.get(i[1], downloadFile=False)
            temp.annotations.update(final)
            temp.fileSubtype = i[0].split(".")[-1]
            # Collapse single-element annotation lists down to scalars
            for i in temp.annotations:
                if type(temp[i]) == list and len(temp[i]) == 1:
                    temp[i] = temp[i][0]
            # print(temp.annotations)
            syn.store(temp, forceVersion=False)
def correctAnnot(syn, synId, projSynId, correctionsFile):
    """
    Propagates annotation changes based on tab-delimited input.

    Given a tab-separated file containing annotations to be updated, changes
    annotations across a project. The file contains one line per key-value pair;
    if a line has two entries, they are assumed to be oldKey and newKey, if
    three entries, they are assumed to be key, oldValue, newValue.

    :param syn:             A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:           A Synapse ID of Project or Folder
    :param projSynId:       A Synapse ID of Project (possibly duplicate of synId)
    :param correctionsFile: Path to a tab-delimited file of old and new annotation values

    Example:
        correctAnnot(syn,"syn12345","syn45678","annotation_corrections.txt")
    """
    with open(correctionsFile) as toChange:
        for line in toChange:
            items = line.strip().split('\t')
            if len(items) == 2:  # update keys
                old = items[0]
                new = items[1]
                directory = synu.walk(syn, synId)
                for dirpath, dirname, files in directory:
                    for item in files:
                        temp = syn.getAnnotations(item[1])
                        if old not in temp:
                            continue
                        correctedAnnotations = updateKey(oldKey=old, newKey=new, inAnnot=temp)
                        # item[1] is the entity's Synapse ID (the original
                        # referenced an undefined 'result' here)
                        savedAnnotations = syn.setAnnotations(item[1], correctedAnnotations)
            elif len(items) > 2:  # update values
                kKey = items.pop(0)
                old = items.pop(0)
                sql = 'select id,%s from file where projectId=="%s" and file.%s=="%s"' % (kKey, projSynId, kKey, old)
                results = syn.chunkedQuery(sql)
                for result in results:
                    temp = syn.getAnnotations(result['file.id'])
                    if kKey not in temp:
                        continue
                    temp[kKey] = items
                    savedAnnotations = syn.setAnnotations(result['file.id'], temp)
def get_center_input_files(syn, synid, center, downloadFile=True):
    """This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name

    Returns:
        List of entities with the correct format to pass into validation
    """
    logger.info(f"GETTING {center} INPUT FILES")
    center_files = synapseutils.walk(syn, synid)
    prepared_center_file_list = []
    for _, _, entities in center_files:
        for name, ent_synid in entities:
            ent = syn.get(ent_synid, downloadFile=downloadFile)
            prepared_center_file_list.append([ent])
    return prepared_center_file_list
def delAnnoByKey(syn, synId, keyList):
    """
    Delete annotations by key for a Synapse object

    :param syn:     A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:   A Synapse ID of Project, Folder, or File or a list of Synapse IDs
    :param keyList: A list of annotation keys to be deleted

    Example:
        delAnnoByKey(syn,"syn12345",["dataType","projectName","sampleId"])
        OR
        delAnnoByKey(syn,["syn1","syn2","syn3"],["dataType"])
    """
    print "Delete entity annotations by key(s) - \n %s" % "\n".join(keyList)
    if type(synId) is list:
        print "Input is a list of Synapse IDs \n"
        for synID in synId:
            print "Getting File %s ..." % synID
            temp = syn.get(synID, downloadFile=False)
            _helperDelAnnoByKey(syn, temp, keyList)
    else:
        print "Input is a Synapse ID \n"
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print "%s is a File \n" % synId
            _helperDelAnnoByKey(syn, starting, keyList)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print "Getting File %s ..." % i[1]
                    _helperDelAnnoByKey(syn, temp, keyList)
import synapseclient
import synapseutils
import argparse
import re

syn = synapseclient.login()

parser = argparse.ArgumentParser()
parser.add_argument("-synId", "--synId", help="Provide synapse Id of fastq files")
parser.add_argument("-sampleId", "--sampleId", help="Provide sample Id")
args = parser.parse_args()
print('syn id:', args.synId)
print('sample id:', args.sampleId)

dirName = './'
c = 0
walkedPath = synapseutils.walk(syn, args.synId)
for dirpath, dirname, filename in walkedPath:
    for (inFileName, inFileSynId) in filename:
        downloadDir = dirName
        if args.sampleId in inFileName:
            print('in if:', inFileName)
            entity = syn.get(inFileSynId, downloadLocation=dirName)
            c += 1
print('c:', c)
exit()
def dockerRun(submission, scoring_sh, syn, client):
    # These are the volumes that you want to mount onto your docker container
    OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), submission.id)
    TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'evaluation_data')
    # These are the locations on the docker that you want your mounted volumes
    # to be + permissions in docker (ro, rw).  It has to be in this format '/output:rw'
    MOUNTED_VOLUMES = {OUTPUT_DIR: '/output:rw',
                       TESTDATA_DIR: '/evaluation_data:ro'}
    # All mounted volumes here in a list
    ALL_VOLUMES = [OUTPUT_DIR, TESTDATA_DIR]
    allLogs = synu.walk(syn, CHALLENGE_LOG_FOLDER)
    logFolder = allLogs.next()
    logFolderId = [synId for name, synId in logFolder[1] if name == submission.id][0]
    allPreds = synu.walk(syn, CHALLENGE_PREDICTION_FOLDER)
    predFolder = allPreds.next()
    predFolderId = [synId for name, synId in predFolder[1] if name == submission.id][0]
    dockerDigest = submission.get('dockerDigest')
    submissionJson = json.loads(submission['entityBundleJSON'])
    dockerRepo = submissionJson['entity']['repositoryName']
    dockerImage = dockerRepo + "@" + dockerDigest
    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    # Mount volumes
    volumes = {}
    for vol in ALL_VOLUMES:
        volumes[vol] = {'bind': MOUNTED_VOLUMES[vol].split(":")[0],
                        'mode': MOUNTED_VOLUMES[vol].split(":")[1]}
    # Run docker image
    errors = None
    try:
        container = client.containers.run(dockerImage,
                                          scoring_sh,
                                          detach=True,
                                          volumes=volumes,
                                          name=submission.id + "_t" + str(int(time.time())),
                                          network_disabled=True)
    except docker.errors.APIError as e:
        container = None
        errors = str(e) + "\n"
    # Create the log file
    logFileName = submission.id + "_log.txt"
    logSynId = None
    open(logFileName, 'w').close()
    # While docker is still running (the docker python client doesn't update status)
    if container is not None:
        while subprocess.Popen(['docker', 'inspect', '-f', '{{.State.Running}}', container.name],
                               stdout=subprocess.PIPE).communicate()[0] == "true\n":
            logFileText = container.logs()
            with open(logFileName, 'w') as logFile:
                logFile.write(logFileText)
            statinfo = os.stat(logFileName)
            # Only store the log file if it is > 0 bytes and <= 50 kB
            if statinfo.st_size > 0 and statinfo.st_size / 1000.0 <= 50:
                ent = File(logFileName, parent=logFolderId)
                try:
                    logs = syn.store(ent)
                    logSynId = logs.id
                except synapseclient.exceptions.SynapseHTTPError as e:
                    pass
            time.sleep(60)
        # Must run again to make sure all the logs are captured
        logFileText = container.logs()
        with open(logFileName, 'w') as logFile:
            logFile.write(logFileText)
        statinfo = os.stat(logFileName)
        # Only store the log file if it is > 0 bytes and <= 50 kB
        if statinfo.st_size > 0 and statinfo.st_size / 1000.0 <= 50:
            ent = File(logFileName, parent=logFolderId)
            try:
                logs = syn.store(ent)
                logSynId = logs.id
            except synapseclient.exceptions.SynapseHTTPError as e:
                pass
        container.remove()
        try:
            client.images.remove(dockerImage)
        except:
            print("Unable to remove image")
    statinfo = os.stat(logFileName)
    if statinfo.st_size == 0:
        with open(logFileName, 'w') as logFile:
            if errors is not None:
                logFile.write(errors)
            else:
                logFile.write("No Logs, or logs exceed size limit")
            logFile.flush()
        ent = File(logFileName, parent=logFolderId)
        try:
            logs = syn.store(ent)
            logSynId = logs.id
        except synapseclient.exceptions.SynapseHTTPError as e:
            pass
    if logSynId is None:
        logFile = synu.walk(syn, logFolderId)
        logFiles = logFile.next()
        logSynId = logFiles[2][0][1]
    # Zip up predictions and store them in CHALLENGE_PREDICTION_FOLDER
    if len(os.listdir(OUTPUT_DIR)) > 0:
        zipf = zipfile.ZipFile(submission.id + '_predictions.zip', 'w', zipfile.ZIP_DEFLATED)
        zipdir(OUTPUT_DIR, zipf)
        zipf.close()
        ent = File(submission.id + '_predictions.zip', parent=predFolderId)
        predictions = syn.store(ent)
        prediction_synId = predictions.id
        os.system("rm -rf %s" % OUTPUT_DIR)
        os.remove(submission.id + '_predictions.zip')
    else:
        prediction_synId = None
    os.remove(logFileName)
    return (prediction_synId, logSynId)
def checkAgainstDict(syn, synId, annotDictId, verbose=True):
    """
    Compares annotations in use against a dictionary.

    Gets all annotation keys and values in use in a project and compares
    against those specified by a dictionary. Prints non-matching terms.

    :param syn:         A Synapse object: syn = synapseclient.login() - Must be logged into synapse
    :param synId:       A Synapse ID of Project or Folder
    :param annotDictId: A Synapse ID of an annotation dictionary in YAML

    Return:
        If synId is an ID of a Project/Folder
            wrongKeys:   A set of keys in use in synId that are not in annotDictId.
            wrongValues: A set of values in use in synId that are not in annotDictId.

    Example:
        wrongKeys, wrongValues = checkAgainstDict(syn,"syn12345","syn45678")
    """
    yamlEnt = syn.get(annotDictId)
    with open(yamlEnt.path) as f:
        annotations = yaml.load(f)
    allKeysInProject = set()
    allValsInProject = set()
    systemKeysToExclude = ['creationDate', 'etag', 'id', 'uri', 'accessControl']
    directory = synu.walk(syn, synId)
    for dirpath, dirname, files in directory:
        for item in files:
            temp = syn.getAnnotations(item[1])
            for key in temp:
                if key in systemKeysToExclude:
                    continue
                allKeysInProject.add(key)
                if isinstance(temp[key], list):
                    for val in temp[key]:
                        allValsInProject.add(str(val))
                else:
                    allValsInProject.add(str(temp[key]))
    print 'Number of key terms in project: %d' % len(allKeysInProject)
    print 'Number of value terms in project: %d' % len(allValsInProject)
    wrongKeys = set()
    wrongValues = set()
    allKeysInVocab = set(annotations.keys())
    if verbose and not allKeysInProject <= allKeysInVocab:
        print '\nKeys in use that are not found in dictionary: \n'
        wrongKeys = allKeysInProject.difference(allKeysInVocab)
        for item in wrongKeys:
            print '%s' % item
    allValsInVocab = set()
    for key, vl in annotations.iteritems():
        if isinstance(vl, list):
            for element in vl:
                allValsInVocab.add(str(element))
        else:
            allValsInVocab.add(str(vl))
    if verbose and not allValsInProject <= allValsInVocab:
        print '\nValues in use that are not found in dictionary: \n'
        wrongValues = allValsInProject.difference(allValsInVocab)
        for item in wrongValues:
            print '%s' % item
    return (wrongKeys, wrongValues)
def test_walk():
    walked = []
    firstfile = utils.make_bogus_data_file()
    schedule_for_cleanup(firstfile)
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(second_folder.id)
    file_entity = syn.store(File(firstfile, parent=project_entity))
    schedule_for_cleanup(file_entity.id)
    walked.append(((project_entity.name, project_entity.id),
                   [(folder_entity.name, folder_entity.id),
                    (second_folder.name, second_folder.id)],
                   [(file_entity.name, file_entity.id)]))
    nested_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=folder_entity))
    schedule_for_cleanup(nested_folder.id)
    secondfile = utils.make_bogus_data_file()
    schedule_for_cleanup(secondfile)
    second_file = syn.store(File(secondfile, parent=nested_folder))
    schedule_for_cleanup(second_file.id)
    thirdfile = utils.make_bogus_data_file()
    schedule_for_cleanup(thirdfile)
    third_file = syn.store(File(thirdfile, parent=second_folder))
    schedule_for_cleanup(third_file.id)
    walked.append(((os.path.join(project_entity.name, folder_entity.name), folder_entity.id),
                   [(nested_folder.name, nested_folder.id)],
                   []))
    walked.append(((os.path.join(project_entity.name, folder_entity.name, nested_folder.name),
                    nested_folder.id),
                   [],
                   [(second_file.name, second_file.id)]))
    walked.append(((os.path.join(project_entity.name, second_folder.name), second_folder.id),
                   [],
                   [(third_file.name, third_file.id)]))
    # walk() uses query(), which returns results that are eventually consistent
    # with Synapse but not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" % third_file.id) \
             .get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)
    temp = synapseutils.walk(syn, project_entity.id)
    temp = list(temp)
    # Must sort the lists inside each tuple, because order matters for the assert:
    # folders are returned in an ordering that depends on their (random uuid) names
    for i in walked:
        for x in i:
            if type(x) == list:
                x.sort()
    for i in temp:
        for x in i:
            if type(x) == list:
                x.sort()
        assert i in walked
    print("CHECK: synapseutils.walk on a file should return an empty generator")
    temp = synapseutils.walk(syn, second_file.id)
    assert list(temp) == []