Example #1
def createLinkVersion(syn, genie_version, caseListEntities, genePanelEntities, databaseSynIdMappingDf):
    versioning = genie_version.split(".")
    logger.info(genie_version)
    main = versioning[0]
    releaseSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'release'].values[0]
    publicSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'public'].values[0]
    #second = ".".join(versioning[1:])
    releases = synapseutils.walk(syn, releaseSynId)
    mainReleaseFolders = next(releases)[1]
    releaseFolderSynId = [synId for folderName, synId in mainReleaseFolders if folderName == "Release %s" % main] 
    if len(releaseFolderSynId) > 0:
        secondRelease = synapseutils.walk(syn, releaseFolderSynId[0])
        secondReleaseFolders = next(secondRelease)[1]
        secondReleaseFolderSynIdList = [synId for folderName, synId in secondReleaseFolders if folderName == genie_version] 
        if len(secondReleaseFolderSynIdList) > 0:
            secondReleaseFolderSynId = secondReleaseFolderSynIdList[0]
        else:
            secondReleaseFolderSynId = syn.store(synapseclient.Folder(genie_version, parent = releaseFolderSynId[0])).id
    else:
        mainReleaseFolderId = syn.store(synapseclient.Folder("Release %s" % main, parent = releaseSynId)).id
        secondReleaseFolderSynId = syn.store(synapseclient.Folder(genie_version, parent = mainReleaseFolderId)).id

    caselistId = db_to_staging.find_caselistid(syn, secondReleaseFolderSynId)

    publicRelease = syn.getChildren(publicSynId)
    # Link every public file except folders, the clinical file, and gene
    # panel files (case lists and gene panels are linked separately below)
    for ents in publicRelease:
        if (ents['type'] != "org.sagebionetworks.repo.model.Folder"
                and ents['name'] != "data_clinical.txt"
                and not ents['name'].startswith("data_gene_panel")):
            syn.store(synapseclient.Link(ents['id'],
                                         parent=secondReleaseFolderSynId,
                                         targetVersion=ents['versionNumber']))
    for ents in caseListEntities:
        syn.store(synapseclient.Link(ents.id, parent=caselistId,
                                     targetVersion=ents.versionNumber))
    #Store gene panels
    for ents in genePanelEntities:
        syn.store(synapseclient.Link(ents.id, parent=secondReleaseFolderSynId,
                                     targetVersion=ents.versionNumber))
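
All of the examples on this page consume the same os.walk-style generator: synapseutils.walk yields one tuple per container, of the form ((dirpath, dirpathId), [(foldername, folderId), ...], [(filename, fileId), ...]). A minimal sketch of that iteration, assuming a logged-in client and a hypothetical container ID "syn123":

import synapseclient
import synapseutils

syn = synapseclient.login()
for (dirname, dir_id), folders, files in synapseutils.walk(syn, "syn123"):
    # dirname is a path such as "My Project/My Folder"; dir_id is its Synapse ID
    for folder_name, folder_id in folders:
        print(folder_name, folder_id)
    for file_name, file_id in files:
        print(file_name, file_id)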
Example #2
def test_walk():
    walked = []
    firstfile = utils.make_bogus_data_file()
    schedule_for_cleanup(firstfile)
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    folder_entity = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    second_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(second_folder.id)
    file_entity = syn.store(File(firstfile, parent=project_entity))
    schedule_for_cleanup(file_entity.id)

    walked.append(((project_entity.name, project_entity.id), [
        (folder_entity.name, folder_entity.id),
        (second_folder.name, second_folder.id)
    ], [(file_entity.name, file_entity.id)]))

    nested_folder = syn.store(
        Folder(name=str(uuid.uuid4()), parent=folder_entity))
    schedule_for_cleanup(nested_folder.id)
    secondfile = utils.make_bogus_data_file()
    schedule_for_cleanup(secondfile)
    second_file = syn.store(File(secondfile, parent=nested_folder))
    schedule_for_cleanup(second_file.id)
    thirdfile = utils.make_bogus_data_file()
    schedule_for_cleanup(thirdfile)
    third_file = syn.store(File(thirdfile, parent=second_folder))
    schedule_for_cleanup(third_file.id)

    walked.append(((os.path.join(project_entity.name,
                                 folder_entity.name), folder_entity.id),
                   [(nested_folder.name, nested_folder.id)], []))
    walked.append(
        ((os.path.join(os.path.join(project_entity.name, folder_entity.name),
                       nested_folder.name), nested_folder.id), [],
         [(second_file.name, second_file.id)]))
    walked.append(((os.path.join(project_entity.name, second_folder.name),
                    second_folder.id), [], [(third_file.name, third_file.id)]))

    temp = synapseutils.walk(syn, project_entity.id)
    temp = list(temp)
    #Must sort the tuples returned, because order matters for the assert
    #Folders are returned in a different ordering depending on the name
    for i in walked:
        for x in i:
            if isinstance(x, list):
                x.sort()
    for i in temp:
        for x in i:
            if isinstance(x, list):
                x.sort()
        assert i in walked

    print("CHECK: synapseutils.walk on a file should return empty generator")
    temp = synapseutils.walk(syn, second_file.id)
    assert list(temp) == []
Example #3
def test_walk():
    walked = []
    firstfile = utils.make_bogus_data_file()
    schedule_for_cleanup(firstfile)
    project_entity = syn.store(Project(name=str(uuid.uuid4())))
    schedule_for_cleanup(project_entity.id)
    folder_entity = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(folder_entity.id)
    second_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=project_entity))
    schedule_for_cleanup(second_folder.id)
    file_entity = syn.store(File(firstfile, parent=project_entity))
    schedule_for_cleanup(file_entity.id)

    walked.append(((project_entity.name,project_entity.id),
                   [(folder_entity.name, folder_entity.id),
                    (second_folder.name, second_folder.id)],
                   [(file_entity.name, file_entity.id)]))

    nested_folder = syn.store(Folder(name=str(uuid.uuid4()), parent=folder_entity))
    schedule_for_cleanup(nested_folder.id)
    secondfile = utils.make_bogus_data_file()
    schedule_for_cleanup(secondfile)
    second_file = syn.store(File(secondfile, parent=nested_folder))
    schedule_for_cleanup(second_file.id)
    thirdfile = utils.make_bogus_data_file()
    schedule_for_cleanup(thirdfile)
    third_file = syn.store(File(thirdfile, parent=second_folder))
    schedule_for_cleanup(third_file.id)


    walked.append(((os.path.join(project_entity.name,folder_entity.name),folder_entity.id),
                   [(nested_folder.name,nested_folder.id)],
                   []))
    walked.append(((os.path.join(os.path.join(project_entity.name,folder_entity.name),nested_folder.name),nested_folder.id),
                   [],
                   [(second_file.name,second_file.id)]))
    walked.append(((os.path.join(project_entity.name,second_folder.name),second_folder.id),
                   [],
                   [(third_file.name,third_file.id)]))


    temp = synu.walk(syn, project_entity.id)
    temp = list(temp)
    #Must sort the tuples returned, because order matters for the assert
    #Folders are returned in a different ordering depending on the name
    for i in walked:
        for x in i:
            if isinstance(x, list):
                x.sort()
    for i in temp:
        for x in i:
            if isinstance(x, list):
                x.sort()
        assert i in walked

    print("CHECK: Cannot synu.walk a file returns empty generator")
    temp = synu.walk(syn, second_file.id)
    assert list(temp) == []
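
Several examples below (e.g. #1 and #4) take only the first tuple from the generator to list a container's immediate subfolders instead of walking the whole tree. A minimal sketch of that idiom, assuming a logged-in client syn and a hypothetical container ID:

import synapseutils

# The first yielded tuple describes the starting container itself:
# ((path, id), [immediate subfolders], [immediate files])
_, subfolders, files = next(synapseutils.walk(syn, "syn123"))
folder_ids = {name: syn_id for name, syn_id in subfolders}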
Example #4
def dockerValidate(submission, syn, user, password):
    submissionJson = json.loads(submission['entityBundleJSON'])
    assert submissionJson['entity'].get('repositoryName') is not None, "Must submit a docker container"
    dockerRepo = submissionJson['entity']['repositoryName'].replace("docker.synapse.org/","")
    #assert dockerRepo.startswith("docker.synapse.org")
    assert submission.get('dockerDigest') is not None, "Must submit a docker container with a docker sha digest"
    dockerDigest = submission['dockerDigest']
    index_endpoint = 'https://docker.synapse.org'
    #dockerImage = dockerRepo + "@" + dockerDigest

    #Check if docker is able to be pulled
    dockerRequestURL = '{0}/v2/{1}/manifests/{2}'.format(index_endpoint, dockerRepo, dockerDigest)
    token = getAuthToken(dockerRequestURL, user, password)

    resp = requests.get(dockerRequestURL,
                        headers={'Authorization': 'Bearer %s' % token})
    assert resp.status_code == 200, "Docker image + sha digest must exist"
    
    #Must check docker image size
    #Synapse docker registry
    dockerSize = sum([layer['size'] for layer in resp.json()['layers']])
    assert dockerSize/1000000000.0 < 1000, "Docker image must be less than a terabyte"

    #Send email to me if harddrive is full 
    #should be stateless, if there needs to be code changes to the docker agent
    preds = synu.walk(syn, CHALLENGE_PREDICTION_FOLDER)
    predFolders = next(preds)[1]
    predSynId = [synId for name, synId in predFolders if str(submission.id) == name]

    logs = synu.walk(syn, CHALLENGE_LOG_FOLDER)
    logsFolders = next(logs)[1]
    logsSynId = [synId for name, synId in logsFolders if str(submission.id) == name]

    if len(predSynId) == 0:
        predFolder = syn.store(Folder(submission.id, parent = CHALLENGE_PREDICTION_FOLDER))
        predFolder = predFolder.id
    else:
        predFolder = predSynId[0]
    if len(logsSynId) == 0:
        logFolder = syn.store(Folder(submission.id, parent = CHALLENGE_LOG_FOLDER))
        logFolder = logFolder.id
        for participant in submission.contributors:
            if participant['principalId'] in ADMIN_USER_IDS: 
                access = ['CREATE', 'READ', 'DOWNLOAD', 'UPDATE', 'DELETE', 'CHANGE_PERMISSIONS', 'MODERATE', 'CHANGE_SETTINGS']
            else:
                access = ['READ','DOWNLOAD']
            #Comment set permissions out if you don't want to allow participants to see the pred files
            #syn.setPermissions(predFolder, principalId = participant['principalId'], accessType = access)
            syn.setPermissions(logFolder, principalId = participant['principalId'], accessType = access)
    else:
        logFolder = logsSynId[0]    
        #Add more message if you want to return the prediction files
    return(True, "Your submission has been validated!  As your submission is being run, please go here: https://www.synapse.org/#!Synapse:%s to check on your log file." % logFolder)
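
getAuthToken is not defined in this example. A plausible sketch of the Docker Registry v2 token handshake it would perform (the helper and its parsing are assumptions, not the author's implementation): request the manifest URL anonymously, read realm/service/scope from the 401 response's WWW-Authenticate header, then exchange basic-auth credentials for a short-lived bearer token.

import requests

def getAuthToken(dockerRequestURL, user, password):
    # Hypothetical helper. An unauthenticated request yields 401 with a header such as:
    #   WWW-Authenticate: Bearer realm="https://...",service="...",scope="repository:...:pull"
    challenge = requests.get(dockerRequestURL).headers['WWW-Authenticate']
    fields = dict(part.split('=', 1) for part in
                  challenge.replace('Bearer ', '').replace('"', '').split(','))
    resp = requests.get(fields['realm'],
                        params={'service': fields['service'],
                                'scope': fields.get('scope')},
                        auth=(user, password))
    return resp.json()['token']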
Example #5
def annotate_proteomics():
	###PROTEOMICS###
	directory = synu.walk(syn,"syn6101352")
	for dirpath, dirname, filename in directory:
		annots = {"topic":"proteomicsExperiment",
				  "disease":"NF1",
				  "diseaseSubtype":"dermalNF",
				  "tissueType":"neurofibroma",
				  "tissueSubtype":"dermalNeurofibroma",
				  "nf1Genotype":"-/-"}	
		if os.path.basename(dirpath[0]) == "Original Calls":
			annots.update({"dataType":"proteomicsData",
					  "dataSubtype":"peptideIdentification"
					  "fileFormat":"text",
					  "formatSubtype":"tsv"})
		else:
			annots.update({"dataType":"proteomicsData",
					  "dataSubtype":"proteinMatrix"
					  "fileFormat":"text",
					  "formatSubtype":"tsv"})
		# dict(a.items() + b.items()) only works in Python 2; merge by unpacking
		proteomics = {**dermalNF, **annots}
		for i in filename:
			temp = syn.get(i[1], downloadFile=False)
			temp.annotations.update(proteomics)
			# Collapse single-element annotation lists to scalars
			for key in temp.annotations:
				if isinstance(temp[key], list) and len(temp[key]) == 1:
					temp[key] = temp[key][0]
			syn.store(temp, forceVersion=False)
Example #6
def updateAnnoByDict(syn,synId,annoDict):
    """
    Update annotations by giving a dict
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File, OR a list of Synapse IDs of Files
    :param annoDict:       A dict of annotations
    
    Example:

       updateAnnoByDict(syn,"syn12345",{"dataType":"testing","projectName":"foo"})
       OR
       updateAnnoByDict(syn,["syn1","syn2"],{"dataType":"testing","projectName":"foo"})
       
    """
    
    if type(synId) is list:
        print("Input is a list of Synapse IDs \n")
        for synID in synId:
            print("Getting File %s ..." % synID)
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateAnnoByDict(syn, temp, annoDict)
    else:
        print("Input is a Synapse ID \n")
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print("%s is a File \n" % synId)
            _helperUpdateAnnoByDict(syn, starting, annoDict)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print("Getting File %s ..." % i[1])
                    _helperUpdateAnnoByDict(syn, temp, annoDict)
Example #7
def getCenterInputFiles(syn, synId, center, process="main"):
    """
    This function walks through each center's input directory and validates every file
    """
    ################################################################
    ## If a file has not changed, then it does not need to be processed!
    ################################################################
    logger.info("GETTING %s INPUT FILES" % center)
    CLINICAL_PAIR_NAME = [
        "data_clinical_supp_sample_%s.txt" % center,
        "data_clinical_supp_patient_%s.txt" % center
    ]
    walked = synu.walk(syn, synId)
    clinicalpair = []
    allFiles = []
    for dirpath, dirname, filenames in walked:
        for name, synid in filenames:
            logger.info(name)
            paired = False
            if name in CLINICAL_PAIR_NAME:
                paired = True
                clinicalpair.append(synid)
            if len(clinicalpair) == 2:
                syns = [i for i in clinicalpair]
                paths = [reNameFile(syn, i) for i in clinicalpair]
                clinicalpair = []
                allFiles.append((syns, paths))
            elif not paired:
                if process == "vcf":
                    allFiles.append(([synid], [reNameFile(syn, synid)]))
                elif not name.endswith(".vcf"):
                    allFiles.append(([synid], [reNameFile(syn, synid)]))
    return (allFiles)
Example #8
def annotate_snp():
	directory = synu.walk(syn,"syn5004874")
	for dirpath, dirname, filename in directory:
		annots = {"topic":"geneticVariation",
				  "subTopic":"dnaPolymorphism",
				  "disease":"NF1",
				  "diseaseSubtype":"dermalNF",
				  "tissueType":"neurofibroma",
				  "tissueSubtype":"dermalNeurofibroma",
				  "nf1Genotype":"-/-"}
		final = {**dermalNF, **annots}  # Python 3 dict merge
		for i in filename:
			temp = syn.get(i[1],downloadFile=False)
			temp.annotations.update(final)
			if i[0].startswith("3096"):
				temp.dataType = "report"
				temp.dataSubtype = "snpArrayReport"
				temp.fileType = "text"
				temp.fileSubtype = "csv"
			else:
				temp.dataType = "annotation"
				temp.dataSubtype = "snpAnnotation"
				temp.fileType = "text"
				temp.fileSubtype = "csv"			
			# Collapse single-element annotation lists to scalars (avoid reusing i)
			for key in temp.annotations:
				if isinstance(temp[key], list) and len(temp[key]) == 1:
					temp[key] = temp[key][0]
			#print(temp.annotations)
			syn.store(temp, forceVersion=False)
Example #9
def curate_raw_data(syn):
    raw_data_folders = [
        SHIMMER_BACK, SHIMMER_LEFT_ANKLE, SHIMMER_LEFT_WRIST,
        SHIMMER_RIGHT_ANKLE, SHIMMER_RIGHT_WRIST
    ]
    raw_data_locations = [
        "Back", "LeftAnkle", "LeftWrist", "RightAnkle", "RightWrist"
    ]
    data_cols = [
        "subject_id", "device", "device_position", "participant_day",
        "timestamp_start", "timestamp_end", "source_file",
        "data_file_handle_id"
    ]
    raw_data = pd.DataFrame(columns=data_cols)
    for folder, device_location in zip(raw_data_folders, raw_data_locations):
        w = su.walk(syn, folder)
        parent, folders, _ = next(w)
        records = []
        for folder_name, folder_id in folders:
            subject, _, subject_files = next(w)
            subject_num = int(re.search(r"\d+", folder_name).group())
            subject_loc = "BOS"
            subject_id = "{}_{}".format(subject_num, subject_loc)
            for file_name, file_id in subject_files:
                file_day = int(re.search(r"\d+", file_name).group())
                syn_file = syn.get(file_id)
                df = pd.read_table(syn_file.path)
                timestamp_start = min(df.timestamp)
                timestamp_end = max(df.timestamp)
                fhid = syn_file['dataFileHandleId']
                records.append([
                    subject_id, "Shimmer", device_location, file_day,
                    timestamp_start, timestamp_end, file_id, fhid
                ])
        raw_data_table = pd.DataFrame(records, columns=data_cols)
        fhids_to_copy = raw_data_table['data_file_handle_id'].tolist()
        source_files = raw_data_table["source_file"].tolist()
        new_fhids = []
        for i in range(0, len(fhids_to_copy), 100):  # copy file handles in batches of 100
            fhids_subset = fhids_to_copy[i:i + 100]
            source_files_subset = source_files[i:i + 100]
            new_fhids_subset = su.copyFileHandles(
                syn=syn,
                fileHandles=fhids_subset,
                associateObjectTypes=["FileEntity"] * len(fhids_subset),
                associateObjectIds=source_files_subset,
                contentTypes=["text/tab-separated-values"] * len(fhids_subset),
                fileNames=[None] * len(fhids_subset))
            new_fhids_subset = [
                int(i['newFileHandle']['id'])
                for i in new_fhids_subset['copyResults']
            ]
            new_fhids = new_fhids + new_fhids_subset
        fhid_mapping = {k: v for k, v in zip(fhids_to_copy, new_fhids)}
        raw_data_table["data_file_handle_id"] = \
                raw_data_table["data_file_handle_id"].map(fhid_mapping)
        # DataFrame.append was removed in pandas 2.0; concat is equivalent here
        raw_data = pd.concat([raw_data, raw_data_table],
                             ignore_index=True, sort=False)
    return (raw_data)
Example #10
def download_relevant_children(syn, parent, filtering_prefix, diary, sensor,
                               download_in_parallel=False):
    """
    Returns
    -------
    dict with key synapse_id (str) and values
    synapse_file (File), measurement_ids (list)
    """
    _, _, entity_info = list(su.walk(syn, parent))[0]
    entity_info = [(i, j) for i, j in entity_info if i.startswith(filtering_prefix)]
    relevant_entities = {}
    for fname, syn_id in entity_info:
        relevant_entries = find_relevant_diary_entries(fname, diary, sensor)
        if len(relevant_entries):
            relevant_entities[syn_id] = {"synapse_file": None,
                                         "measurement_ids": relevant_entries.measurement_id}
    ordered_synapse_ids = list(relevant_entities.keys())
    if download_in_parallel:
        # multiprocessing.dummy.Pool is a thread pool; threads suit these
        # I/O-bound syn.get calls
        mp = multiprocessing.dummy.Pool(4)
        children = mp.map(syn.get, ordered_synapse_ids)
    else:
        children = list(map(syn.get, ordered_synapse_ids))
    for syn_id, f in zip(ordered_synapse_ids, children):
        relevant_entities[syn_id]["synapse_file"] = f
    return relevant_entities
Example #11
def annotate_WGS():
	directory = synu.walk(syn,"syn4984626")
	for dirpath, dirname, filename in directory:
		annots = {"topic":"geneticVariation",
				  "subTopic":"dnaMutation",
				  "disease":"NF1",
				  "diseaseSubtype":"dermalNF",
				  "tissueType":"neurofibroma",
				  "tissueSubtype":"dermalNeurofibroma",
				  "nf1Genotype":"-/-"}
		final = {**dermalNF, **annots}  # Python 3 dict merge
		for i in filename:
			temp = syn.get(i[1],downloadFile=False)
			temp.annotations.update(final)
			if i[0].endswith(".vcf.gz") or i[0].endswith(".vcf"):
				temp.fileFormat = "typedText"
				temp.formatSubtype = "VCF"
			elif i[0].endswith(".maf.gz") or i[0].endswith(".maf"):
				temp.fileFormat = "typedText"
				temp.formatSubtype = "MAF"				
			# Collapse single-element annotation lists to scalars (avoid reusing i)
			for key in temp.annotations:
				if isinstance(temp[key], list) and len(temp[key]) == 1:
					temp[key] = temp[key][0]
			#print(temp.annotations)
			syn.store(temp, forceVersion=False)
Example #12
def updateFormatTypeByFileName(syn,synId,annoKey,annoDict):
    """
    Audit entity file type annotations
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File, OR a list of Synapse IDs of Files
    :param annoKey:        The annotation key for file type (e.g. "fileType", "fileFormat", or "formatType")
    :param annoDict:       A dict where key is the extension of the filename,
                           value is the corresponding file type value in entity annotations
        
    Example:
    
       updateFormatTypeByFileName(syn,"syn12345","fileType",{".bam":"bam", ".doc":"word", "bw":"bigwig"})
       OR
       updateFormatTypeByFileName(syn,["syn1","syn2"],"fileType",{".bam":"bam", ".doc":"word", "bw":"bigwig"})
       
    """
    if type(synId) is list:
        print("Input is a list of Synapse IDs \n")
        for synID in synId:
            print("Getting File %s ..." % synID)
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateFormatTypeByFileName(syn, temp, annoKey, annoDict)
    else:
        print("Input is a Synapse ID \n")
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print("%s is a File \n" % synId)
            _helperUpdateFormatTypeByFileName(syn, starting, annoKey, annoDict)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print("Getting File %s ..." % i[1])
                    _helperUpdateFormatTypeByFileName(syn, temp, annoKey, annoDict)
Example #13
def get_public_to_consortium_synid_mapping(syn, releaseSynId, test=False):
    """
    Gets the mapping between public version name
    and its Synapse ids (Can probably be replaced with folder view)
    """
    temp = synapseutils.walk(syn, releaseSynId)
    officialPublic = dict()
    for dirpath, dirnames, filenames in temp:
        release = os.path.basename(dirpath[0])
        # checkRelease = release.split(".")
        final = [i.split("-") for i in release.split(".")]
        checkRelease = []
        for i in final:
            checkRelease.extend(i)
        if test:
            officialPublic["TESTpublic"] = "syn12299959"
        else:
            if len(checkRelease) == 3 and checkRelease[0] != "0":
                if int(checkRelease[1]) > 0:
                    if checkRelease[0] in ["1", "2"]:
                        public_release_name = str(int(checkRelease[0]) +
                                                  1) + ".0.0"
                    else:
                        public_release_name = str(int(
                            checkRelease[0])) + ".0-public"
                    officialPublic[public_release_name] = dirpath[1]
    return officialPublic
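
A worked example of the name mapping above, run on plain strings (the folder names are hypothetical): a consortium release folder such as "5.3-consortium" splits into ["5", "3", "consortium"], so it maps to the public name "5.0-public", while major versions 1 and 2 map to "2.0.0" and "3.0.0".

for release in ["1.1.0", "5.3-consortium"]:
    checkRelease = []
    for part in (i.split("-") for i in release.split(".")):
        checkRelease.extend(part)
    print(release, "->", checkRelease)
# "1.1.0"          -> ['1', '1', '0']           => public name "2.0.0"
# "5.3-consortium" -> ['5', '3', 'consortium']  => public name "5.0-public"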
Example #14
def updateAnnoByMetadata(syn, synId, metaDf, refCol, cols2Add,fileExts):
    """
    Audit entity annotations against metadata
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File, OR a list of Synapse IDs of Files
    :param metaDf:         A pandas data frame of entity metadata
    :param refCol:         The name of the column in metaDf that matches one of the entity attributes
    :param cols2Add:       A list of columns in metaDf that need to be added as entity annotations
    :param fileExts:       A list of all file extensions (PsychENCODE ONLY!!!)
        
    Example:
    
        updateAnnoByMetadata(syn,"syn12345",metadata,"id",["dataType","tester"],[".bam",".csv"])
       
    """
    
    if type(synId) is list:
        print("Input is a list of Synapse IDs \n")
        for synID in synId:
            print("Getting File %s ..." % synID)
            temp = syn.get(synID, downloadFile=False)
            _helperUpdateAnnoByMetadata(syn, temp, metaDf, refCol, cols2Add, fileExts)
    else:
        print("Input is a Synapse ID \n")
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print("%s is a File \n" % synId)
            _helperUpdateAnnoByMetadata(syn, starting, metaDf, refCol, cols2Add, fileExts)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print("Getting File %s ..." % i[1])
                    _helperUpdateAnnoByMetadata(syn, temp, metaDf, refCol, cols2Add, fileExts)
Example #15
def searchInMetadata(syn, synId, metaDf, refCol,col2Check,values2Check,fileExts):
    """
    Search for a list of Synapse IDs with a given column and expected values
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project or Folder
    :param metaDf:         A pandas data frame of entity metadata
    :param refCol:         The name of the column in metaDf that matches one of the entity attributes
    :param col2Check:      The name of the column in metaDf you are searching
    :param values2Check:   A list of values you are searching for
    :param fileExts:       A list of all file extensions (PsychENCODE ONLY!!!)
    
    Return:
      A list of Synapse IDs
        
    Example:
       IDs = searchInMetadata(syn,"syn123",metadata,"id","tester",["foo","bar"],[".bam",".csv"])
       
    """
    synapseIds = []
    print("Search in metadata for key: %s \n" % col2Check)
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print("ERROR: %s is not a Synapse ID of a Project or Folder" % synId)
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print("Getting File %s ..." % i[1])
                _helperSearchInMetadata(syn, temp, metaDf, refCol, col2Check, values2Check, fileExts, synapseIds)

        return synapseIds
Example #16
def _getSynapseDir(syn, synapse_id, local_root, dir_list):
    """
    1. Walks through the Synapse parent location hierarchy.
    2. Updates folders in Synapse to match the local dir.
    3. Gets key-value pairs of dirname and Synapse ID.

    :param syn:
    :param synapse_id:
    :param local_root:
    :param dir_list:
    :return:
    """
    synapse_dir = {}
    synapse_root = syn.get(synapse_id)

    for (dirpath, dirpath_id), _, _ in synapseutils.walk(syn, synapse_id):
        dirpath = dirpath.replace(synapse_root.name,
                                  os.path.abspath(local_root))
        synapse_dir[dirpath] = dirpath_id

    for directory in dir_list:

        # dict.has_key() was removed in Python 3; use the "in" operator
        if directory not in synapse_dir:
            new_folder = synapseclient.Folder(
                os.path.basename(directory),
                synapse_dir[os.path.dirname(directory)])
            new_folder = syn.store(new_folder)
            synapse_dir[directory] = new_folder.id

    return synapse_dir
Example #17
def annotate_rna():
	directory = synu.walk(syn,"syn6035832")
	for dirpath, dirname, filename in directory:
		annots = {"topic":"alignment",
				  "subTopic":"RNAseq"}
		if "ENCODE controls" in  dirpath[0]:
			annots.update({"disease":"normal",
					   "nf1Genotype":"+/+",
					   "tissueType":"skin"})
		else:
			annots.update({"disease":"NF1",
					   "diseaseSubtype":"dermalNF",
					   "tissueType":"neurofibroma",
					   "tissueSubtype":"dermalNeurofibroma",
					   "nf1Genotype":"-/-"})
		if os.path.basename(dirpath[0]) == "Cufflinks Quantitation":
			annots.update({"dataType":"geneExpressionData",
					   "dataSubtype":"geneCountsFile",
					   "fileType":"text"})
		elif os.path.basename(dirpath[0]) == "FeatureCounts Quantitation":
			annots.update({"dataType":"geneExpressionData",
					   "dataSubtype":"geneCountsFile",
					   "fileType":"text"})
		elif os.path.basename(dirpath[0]) == "Gene matrices":
			annots.update({"dataType":"geneExpressionData",
					   "dataSubtype":"geneExpressionMatrix",
					   "fileType":"text",
					   "fileSubtype":"csv"})
		elif os.path.basename(dirpath[0]) == "Dermal NF tumor alignments":
			annots.update({"dataType":"sequenceAlignment",
					   "dataSubtype":"rnaSequenceAlignment",
					   "fileType":"binary"})
		elif os.path.basename(dirpath[0]) == "BAM files":
			annots.update({"dataType":"sequenceAlignment",
					   "dataSubtype":"rnaSequenceAlignment",
					   "fileType":"binary"})
		elif os.path.basename(dirpath[0]) == "Pre-aligned quantitation":
			annots.update({"dataType":"geneExpressionData",
					   "dataSubtype":"geneCountsFile",
					   "fileType":"text",
					   "fileSubtype":"tsv"})
		final = {**dermalNF, **annots}  # Python 3 dict merge
		for i in filename:
			temp = syn.get(i[1],downloadFile=False)
			temp.annotations.update(final)
			if i[0].endswith(".gtf"):
				temp.fileSubtype = "gtf"
			elif i[0].endswith(".bam"):
				temp.fileSubtype = "BAM"
			elif i[0].endswith(".bai"):
				temp.fileSubtype = "BAI"
			elif temp.get('fileSubtype',None) is None:
				temp.fileSubtype = ''
			# Collapse single-element annotation lists to scalars (avoid reusing i)
			for key in temp.annotations:
				if isinstance(temp[key], list) and len(temp[key]) == 1:
					temp[key] = temp[key][0]
			#print(temp.annotations)
			syn.store(temp, forceVersion=False)
Example #18
def synwalk_to_df(syn, synId):
    res = []
    for root, dirs, entities in su.walk(syn, synId):
        if len(entities) > 0:
            path_names = [root[0]] * len(entities)
            path_ids = [root[1]] * len(entities)
            entity_names = [name for name, _ in entities]
            entity_ids = [syn_id for _, syn_id in entities]
            res.extend(zip(path_names, path_ids, entity_names, entity_ids))
    return pd.DataFrame(
        res, columns=['folderPath', 'folderId', 'entityName', 'entityId'])
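
A usage sketch (the container ID is a placeholder, and a logged-in client syn is assumed): flattening a whole container into a DataFrame makes it easy to filter by folder path or join against metadata.

df = synwalk_to_df(syn, "syn123")
raw = df[df["folderPath"].str.contains("raw_data")]
print(raw[["entityName", "entityId"]])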
Example #19
def validateFiles(synId, center):
	"""
	This function walks through each center's input directory and validates every file
	"""
	syn = synapseclient.login()
	walked = synu.walk(syn, synId)
	clinicalpair = []
	invalid = []
	validFiles = []
	for dirpath, dirname, filenames in walked:
		for name, synid in filenames:
			file = []
			if name.endswith(".vcf") and VALIDATE_FILENAME['vcf'] % center in name:
				fileType = "vcf"
			elif name.endswith(".bed") and VALIDATE_FILENAME['bed'] % center in name:
				fileType = "bed"
			elif  VALIDATE_FILENAME['maf'] % center == name:
				fileType = "maf"
			elif VALIDATE_FILENAME['cnv'] % center == name:
				fileType = "cnv"
			elif VALIDATE_FILENAME['fusions'] % center == name:
				fileType = "fusions"
			elif VALIDATE_FILENAME['seg'] % center == name:
				fileType = "seg"
			elif VALIDATE_FILENAME['clinical'][0] % center == name:
				fileType = "clinical"
			elif name in VALIDATE_FILENAME['clinical'][1:]:
				clinicalpair.append(synid)
				if len(clinicalpair) == 2:
					fileType = "clinical"
				else:
					fileType = None
			else:
				fileType = None
			if fileType is not None:
				if len(clinicalpair) == 2:
					#Need to pass in both filepath and synid for processing files
					file = [(syn.get(i).path,i) for i in clinicalpair]
					clinicalpair = []
				else:
					file = [(syn.get(synid).path,synid)]
				#Validation only takes in a list of filepaths
				paths = [i[0] for i in file]
				message, valid = validate_genie.main(fileType, paths, center)
			else:
				print("%s: Cannot be processed" % name)
				valid = False
			if not valid:
				invalid.append(name)
			else:
				validFiles.extend(file)
	print(", ".join(invalid) + " can't be processed!")
	return(validFiles)
Example #20
def _accumulate_remotes(synapse_parent_id, syn):
    """Retrieve references to all remote directories and files.
    """
    remotes = {}
    s_base_folder = syn.get(synapse_parent_id)
    for (s_dirpath, s_dirpath_id), _, s_filenames in synapseutils.walk(syn, synapse_parent_id):
        remotes[s_dirpath] = s_dirpath_id
        if s_filenames:
            for s_filename, s_filename_id in s_filenames:
                remotes[os.path.join(s_dirpath, s_filename)] = s_filename_id
    return s_base_folder, remotes
Example #21
def get_center_input_files(syn,
                           synid,
                           center,
                           process="main",
                           downloadFile=True):
    """
    This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name
        process: Process type. Includes main, vcf, maf and mafSP.
                 Defaults to main, in which case vcf files are skipped.

    Returns:
        List of entities with the correct format to pass into validation
    """
    logger.info("GETTING {center} INPUT FILES".format(center=center))
    clinical_pair_name = [
        "data_clinical_supp_sample_{center}.txt".format(center=center),
        "data_clinical_supp_patient_{center}.txt".format(center=center),
    ]

    center_files = synapseutils.walk(syn, synid)
    clinicalpair_entities = []
    prepared_center_file_list = []

    for _, _, entities in center_files:
        for name, ent_synid in entities:
            # This is to remove vcfs from being validated during main
            # processing. Often there are too many vcf files, and it is
            # not necessary for them to be run everytime.
            if name.endswith(".vcf") and process != "mutation":
                continue

            ent = syn.get(ent_synid, downloadFile=downloadFile)

            # Clinical file can come as two files.
            # The two files need to be merged together which is
            # why there is this format

            if name in clinical_pair_name:
                clinicalpair_entities.append(ent)
                continue

            prepared_center_file_list.append([ent])

    if clinicalpair_entities:
        # clinicalpair_entities = [x for x in clinicalpair]
        prepared_center_file_list.append(clinicalpair_entities)

    return prepared_center_file_list
Example #22
def auditAgainstMetadata(syn, synId, metaDf, refCol, cols2Check,fileExts):
    """
    Audit entity annotations against metadata
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File
    :param metaDf:         A pandas data frame of entity metadata
    :param refCol:         The name of the column in metaDf that matches one of the entity attributes
    :param cols2Check:     A list of columns in metaDf to audit against entity annotations
    :param fileExts:       A list of all file extensions (PsychENCODE ONLY!!!)

    Return:
      If synId is an ID of a Project/Folder
        entityMissMetadata:    A list of Synapse IDs that have no matching metadata
        incorrectAnnotated:    A dict where key is the annotation key and value is a list of entities
                               that were annotated incorrectly (e.g. {"sampleId":["syn1","syn2"]})
        missingAnno:           A dict where key is the annotation key and value is a list of entities
                               that were missing the annotation (e.g. {"dataType":["syn3","syn4"]})
      If synId is an ID of a File
        entityMissMetadata:    A boolean. True when the entity has no matching metadata
        incorrectAnnotated:    A list of keys that were annotated incorrectly
        missingAnno:           A list of keys that were not annotated
        
    Example:
       entityMissMetadata,incorrectAnnotated, missingAnno = 
               auditAgainstMetadata(syn,"syn12345",metadata,"id",["dataType","tester"],[".bam",".csv"])
       
    """
    entityMissMetadata = []
    incorrectAnnotated = {}
    missingAnno = {}
    print "Check annotations against metadata.\n"
    starting = syn.get(synId,downloadFile = False)
    if not is_container(starting):
        print "%s is a File \n" % synId
        _helperAuditMetadata(syn,starting,metaDf,refCol,cols2Check,fileExts,
                             entityMissMetadata,incorrectAnnotated,missingAnno)
        noMeta = False
        if len(entityMissMetadata):
            noMeta = True
        return noMeta,incorrectAnnotated.keys(),missingAnno.keys()
    else:
        directory = synu.walk(syn,synId)
        for dirpath,dirname,filename in directory:
            for i in filename:
                temp = syn.get(i[1],downloadFile = False)
                print "Getting File %s ..." % i[1]
                _helperAuditMetadata(syn,temp,metaDf,refCol,cols2Check,fileExts,
                                     entityMissMetadata,incorrectAnnotated,missingAnno)
        return entityMissMetadata,incorrectAnnotated,missingAnno
Example #23
def level_zero_folder(syn, folder_id, tbl, dry_run=False):
    """Assume everything in a Data folder that doesn't have a name like
    'MDD_Level.\\..*' is level 0.
    """
    w = synapseutils.walk(syn, folder_id)
    folders = set([x[0][1].lstrip('syn') for x in w])

    q = "SELECT * FROM {} WHERE (parentId IN ({}) AND (name NOT LIKE 'MDD_%Level_.%') AND (Level is NULL) AND (DataType IS NULL OR DataType <> 'Metadata'))".format(tbl, ",".join(folders))
    logger.debug(q)
    res = syn.tableQuery(q).asDataFrame()
    res['Level'] = 0
    logger.info("{} rows to change for level 0.".format(res.shape[0]))
    if not dry_run and not res.empty:
        new_tbl = store_table_and_reindex(syn, tbl, res)
        return(new_tbl)
Example #24
    def getFilesInStorageDataset(
            self,
            datasetId: str,
            fileNames: List = None,
            fullpath: bool = True) -> List[Tuple[str, str]]:
        """Gets all files in a given dataset folder.

        Args:
            datasetId: synapse ID of a storage dataset.
            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
            fullpath: if True return the full path as part of this filename; otherwise return just base filename

        Returns: 
            A list of files; the list consists of tuples (fileId, fileName).
            
        Raises:
            ValueError: Dataset ID not found.
        """

        # select all files within a given storage dataset folder (top level folder in a Synapse storage project or folder marked with contentType = 'dataset')
        walked_path = synapseutils.walk(self.syn, datasetId)

        file_list = []

        # iterate over all results
        for dirpath, dirname, filenames in walked_path:

            # iterate over all files in a folder
            for filename in filenames:

                if (not "manifest" in filename[0]
                        and not fileNames) or (not fileNames == None
                                               and filename[0] in fileNames):

                    # don't add manifest to list of files unless it is specified in the list of specified fileNames; return all found files
                    # except the manifest if no fileNames have been specified
                    # TODO: refactor for clarity/maintainability

                    if fullpath:
                        # append directory path to filename
                        filename = (dirpath[0] + "/" + filename[0],
                                    filename[1])

                    # add file name file id tuple, rearranged so that id is first and name follows
                    file_list.append(filename[::-1])

        return file_list
Example #25
def auditFormatTypeByFileName(syn,synId,annoKey,annoDict):
    """
    Audit entity file type annotations by checking file name with file type annotation
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File
    :param annoKey:        The annotation key for file type (e.g. "fileType", "fileFormat", or "formatType")
    :param annoDict:       A dict where key is the extension of the filename,
                           value is the corresponding file type value in entity annotations
    
    Return:
    
      If synId is an ID of a Project/Folder
        A dict with 3 keys, where each value is a list of File Synapse IDs
        (i.e. {"incorrect":[], "missingInAnno":["syn1"], "missingInDict":["syn2","syn3"]})
      If synId is an ID of a File
        A string with 4 possible answers:
            1. "correct"
            2. "incorrect"
            3. "missingInAnno" - no file type in entity annotations
            4. "missingInDict" - file type is not found in file type annoDict
        
    Example:
    
       result = auditFormatTypeByFileName(syn,"syn12345","fileType",{".bam":"bam", ".doc":"word", "bw":"bigwig"})
       
    """
    
    needAnnotated = {"incorrect":[],
                     "missingInAnno":[],
                     "missingInDict":[]}
    
    starting = syn.get(synId, downloadFile=False)
    if not is_container(starting):
        print("%s is a File \n" % synId)
        _helperAuditFormatTypeByFileName(syn, starting, annoKey, annoDict, needAnnotated)
        result = "correct"
        for key in needAnnotated.keys():
            if len(needAnnotated[key]):
                result = key
        return result
    else:
        directory = synu.walk(syn, synId)
        for dirpath, dirname, filename in directory:
            for i in filename:
                temp = syn.get(i[1], downloadFile=False)
                print("Getting File %s ..." % i[1])
                _helperAuditFormatTypeByFileName(syn, temp, annoKey, annoDict, needAnnotated)
        return needAnnotated
Example #26
def auditCommonDict(syn, synId, annoDict):
    """
    Audit entity annotations against a common dictionary shared among all entities
    :param syn:            A Synapse object: syn = synapseclient.login() - must be logged into Synapse
    :param synId:          A Synapse ID of a Project, Folder, or File
    :param annoDict:       A dict of annotations shared among entities

    Return:
      If synId is an ID of a Project/Folder
        entityMissAllAnno:     A list of Synapse IDs that have not been annotated
        incorrectAnnotated:    A dict where key is the annotation key and value is a list of entities
                               that were annotated incorrectly (e.g. {"sampleId":["syn1","syn2"]})
        missingAnno:           A dict where key is the annotation key and value is a list of entities
                               that were missing the annotation (e.g. {"dataType":["syn3","syn4"]})
      If synId is an ID of a File
        entityMissAllAnno:     A boolean. True when the entity has no annotations at all
        incorrectAnnotated:    A list of keys that were annotated incorrectly
        missingAnno:           A list of keys that were missing from the entity annotations

    Example:

       entityMissAllAnno, incorrectAnnotated, missingAnno = \
               auditCommonDict(syn,"syn12345",{"dataType":"testing","projectName":"foo"})
       
    """

    entityMissAllAnno = []
    incorrectAnnotated = {}
    missingAnno = {}
    print "Check annotations against common dictionary. \n"
    starting = syn.get(synId,downloadFile = False)
    if not is_container(starting):
        print "%s is a File \n" % synId
        _helperAuditCommonDict(syn,starting,annoDict,entityMissAllAnno,incorrectAnnotated,missingAnno)
        noAnno = False
        if len(entityMissAllAnno):
            noAnno = True
        return noAnno,incorrectAnnotated.keys(),missingAnno.keys()
    else:
        directory = synu.walk(syn,synId)
        for dirpath,dirname,filename in directory:
            for i in filename:
                temp = syn.get(i[1],downloadFile = False)
                print "Getting file %s ..." % i[1]
                _helperAuditCommonDict(syn,temp,annoDict,entityMissAllAnno,incorrectAnnotated,missingAnno)
        return entityMissAllAnno,incorrectAnnotated,missingAnno
Example #27
def get_center_input_files(syn, synid, center, process="main"):
    '''
    This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name
        process: Process type. Includes main, vcf, maf and mafSP.
                 Defaults to main.

    Returns:
        List of entities with the correct format to pass into validation
    '''
    logger.info("GETTING {center} INPUT FILES".format(center=center))
    clinical_pair_name = [
        "data_clinical_supp_sample_{center}.txt".format(center=center),
        "data_clinical_supp_patient_{center}.txt".format(center=center)
    ]

    center_files = synapseutils.walk(syn, synid)
    clinicalpair_entities = []
    prepared_center_file_list = []

    for _, _, entities in center_files:
        for name, ent_synid in entities:

            ent = syn.get(ent_synid)

            # Clinical file can come as two files.
            # The two files need to be merged together which is
            # why there is this format

            if name in clinical_pair_name:
                clinicalpair_entities.append(ent)
                continue

            prepared_center_file_list.append([ent])

    if clinicalpair_entities:
        # clinicalpair_entities = [x for x in clinicalpair]
        prepared_center_file_list.append(clinicalpair_entities)

    return prepared_center_file_list
Example #28
def annotations_for_folder(syn, folder_id, tbl, key, value, dry_run=False):
    """Walk a container and find all folders and apply key=value 
       annotation to all files in them.
    """
    w = synapseutils.walk(syn, folder_id)
    folders = set([x[0][1].lstrip('syn') for x in w])
    q = 'SELECT {} FROM {} WHERE parentId IN ({}) AND {} IS NULL'.format(key, tbl, ",".join(folders), key)
    logger.debug(q)
    res = syn.tableQuery(q).asDataFrame()
    logger.info("{} rows to change for key {}.".format(res.shape[0], key))

    res[key] = value

    if not dry_run and not res.empty:
        new_tbl = store_table_and_reindex(syn, tbl, res)
        return(new_tbl)
    else:
        return(res)
Example #29
def check_all_files(synfolderID='syn20735395', syn=syn):
    '''
    Checks all files in source folder Synapse scratch space by comparing md5sum to
    destination GCP store

    Parameters
    synfolderID: source folder
    syn: a synapse object

    Value: a dataframe showing match or mismatch for each file in source

    Details
    Dest md5 is NaN if the file is missing from dest but present in source.
    '''
    # Only the first walk tuple is used: only files directly under the
    # source folder are checked (subfolders are ignored)
    toplevel_files = list(synapseutils.walk(syn, synfolderID))[0][2]
    synIDs = np.array(toplevel_files)[:, 1]
    dflist = [check1file(y, syn) for y in synIDs]
    df = pd.concat(dflist, axis=0)
    return (df)
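
check1file is not shown here, and the walk above inspects only the folder's top level. A sketch of a recursive variant under the same assumptions (same helpers, same imports):

def check_all_files_recursive(synfolderID='syn20735395', syn=syn):
    '''Like check_all_files, but descends into subfolders as well.'''
    dflist = []
    for _, _, files in synapseutils.walk(syn, synfolderID):
        for _, synID in files:
            dflist.append(check1file(synID, syn))
    return pd.concat(dflist, axis=0) if dflist else pd.DataFrame()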
Example #30
def _get_synapse_file_paths(syn, parent_synid):
    print('Mapping synapse paths.')
    e = syn.get(parent_synid)
    strip_dir = ''
    if e.concreteType == 'org.sagebionetworks.repo.model.Project':
        strip_dir = e.name
    walked = synapseutils.walk(syn, parent_synid)
    contents = [x for x in walked]
    file_paths = []
    for dirpath, dirnames, filenames in contents:
        for filename in filenames:
            file_p = Path(dirpath[0], filename[0])
            file_p = Path(*[x for x in file_p.parts if x != strip_dir])
            file_path = {'file_p': file_p, 'synid': filename[1]}
            file_paths.append(file_path)
    synids = [x['synid'] for x in file_paths]
    se_list = _threaded_syn_get(syn, synids)
    remote_md5_dict = {x.id: x.md5 for x in se_list}
    for file_path in file_paths:
        file_path.update({'md5': remote_md5_dict[file_path['synid']]})
    return file_paths
Example #31
def annotate_sampleInfo():
	directory = synu.walk(syn,"syn4984723")
	for dirpath, dirname, filename in directory:
		annots = {"topic":"dataIdentityAndMapping",
				  "subTopic":"clinicalVariables",
				  "disease":"NF1",
				  "diseaseSubtype":"dermalNF",
				  "tissueType":"neurofibroma",
				  "tissueSubtype":"dermalNeurofibroma",
				  "nf1Genotype":"-/-",
				  "dataType":"annotation",
				  "dataSubtype":"sampleAnnotation"}
		final = {**dermalNF, **annots}  # Python 3 dict merge
		for i in filename:
			temp = syn.get(i[1],downloadFile=False)
			temp.annotations.update(final)
			temp.fileSubtype = i[0].split(".")[-1]			
			# Collapse single-element annotation lists to scalars (avoid reusing i)
			for key in temp.annotations:
				if isinstance(temp[key], list) and len(temp[key]) == 1:
					temp[key] = temp[key][0]
			#print(temp.annotations)
			syn.store(temp, forceVersion=False)
Example #32
def correctAnnot(syn,synId,projSynId,correctionsFile):
	"""
	Propagates annotation changes based on tab-delimited input.
	Given a tab-separated file containing annotations to be updated, changes annotations across a project. The file contains one line per key-value pair: if a line has two entries, they are assumed to be oldKey and newKey; if it has three entries, they are assumed to be key, oldValue, newValue.

	:param syn:            A Synapse object: syn = synapseclient.login()- Must be logged into synapse
	:param synId:          A Synapse ID of Project or Folder
	:param projSynId:      A Synapse ID of Project (possibly duplicate of synId).
	:param correctionsFile: Path to a tab-delimited file of old and new annotation values.

	Example:
		correctAnnot(syn,"syn12345","syn45678","annotation_corrections.txt")

	"""

	with open(correctionsFile) as toChange:
		for line in toChange:
			items = line.strip().split('\t')
			if len(items) == 2: # update keys
				old = items[0]
				new = items[1]
				directory = synu.walk(syn,synId)
				for dirpath,dirname,files in directory:
					for item in files:
						temp = syn.getAnnotations(item[1])
						if old not in temp: continue
						correctedAnnotations = updateKey(oldKey=old,newKey=new,inAnnot=temp)
						savedAnnotations = syn.setAnnotations(item[1],correctedAnnotations)
			elif len(items) > 2: # update values
				kKey = items.pop(0)
				old = items.pop(0)
				sql = 'select id,%s from file where projectId=="%s" and file.%s=="%s"' % (kKey,projSynId,kKey,old)
				results = syn.chunkedQuery(sql)
				for result in results:
					temp = syn.getAnnotations(result['file.id'])
					if kKey not in temp: continue
					temp[kKey] = items
					savedAnnotations = syn.setAnnotations(result['file.id'],temp)
Example #33
def get_center_input_files(syn, synid, center, downloadFile=True):
    """This function walks through each center's input directory
    to get a list of tuples of center files

    Args:
        syn: Synapse object
        synid: Center input folder synid
        center: Center name

    Returns:
        List of entities with the correct format to pass into validation

    """
    logger.info(f"GETTING {center} INPUT FILES")
    center_files = synapseutils.walk(syn, synid)
    prepared_center_file_list = []

    for _, _, entities in center_files:
        for name, ent_synid in entities:
            ent = syn.get(ent_synid, downloadFile=downloadFile)
            prepared_center_file_list.append([ent])

    return prepared_center_file_list
Example #34
def delAnnoByKey(syn, synId, keyList):
    """
    Delete annotations by key for a Synapse object
    :param syn:            A Synapse object: syn = synapseclient.login()- Must be logged into synapse
    :param synId:          A Synapse ID of a Project, Folder, or File, or a list of Synapse IDs
    :param keyList:        A list of annotation keys to be deleted
   
    Example:
    
       delAnnoByKey(syn,"syn12345",["dataType","projectName","sampleId"])
       OR
       delAnnoByKey(syn,["syn1","syn2","syn3"],["dataType"])
       
    """

    print "Delte entity annotations by key(s) - \n %s" % "\n".join(keyList)

    if type(synId) is list:
        print "Input is a list of Synapse IDs \n"
        for synID in synId:
            print "Getting File %s ..." % synID
            temp = syn.get(synID, downloadFile=False)
            _helperDelAnnoByKey(syn, temp, keyList)
    else:
        print "Input is a Synpase ID \n"
        starting = syn.get(synId, downloadFile=False)
        if not is_container(starting):
            print "%s is a File \n" % synId
            _helperDelAnnoByKey(syn, starting, keyList)
        else:
            directory = synu.walk(syn, synId)
            for dirpath, dirname, filename in directory:
                for i in filename:
                    temp = syn.get(i[1], downloadFile=False)
                    print "Getting File %s ..." % i[1]
                    _helperDelAnnoByKey(syn, temp, keyList)
Example #36
import synapseclient
import synapseutils
import argparse
import re

syn = synapseclient.login()

parser = argparse.ArgumentParser()
parser.add_argument("-synId",
                    "--synId",
                    help="Proivde synapse Id of fastq files")
parser.add_argument("-sampleId", "--sampleId", help="Proivde sample Id")
args = parser.parse_args()
print('syn id:', args.synId)
print('sample id:', args.sampleId)

dirName = './'

c = 0
walkedPath = synapseutils.walk(syn, args.synId)
for dirpath, dirname, filename in walkedPath:
    for (inFileName, inFileSynId) in filename:
        downloadDir = dirName
        if args.sampleId in inFileName:
            print('in if:', inFileName)
            entity = syn.get(inFileSynId, downloadLocation=dirName)
        c += 1

print('c:', c)

exit()
Example #37
def dockerRun(submission, scoring_sh, syn, client):

    #These are the volumes that you want to mount onto your docker container
    OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              submission.id)
    TESTDATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'evaluation_data')
    #These are the locations on the docker that you want your mounted volumes to be + permissions in docker (ro, rw)
    #It has to be in this format '/output:rw'
    MOUNTED_VOLUMES = {
        OUTPUT_DIR: '/output:rw',
        TESTDATA_DIR: '/evaluation_data:ro'
    }
    #All mounted volumes here in a list
    ALL_VOLUMES = [OUTPUT_DIR, TESTDATA_DIR]

    allLogs = synu.walk(syn, CHALLENGE_LOG_FOLDER)
    logFolder = next(allLogs)
    logFolderId = [
        synId for name, synId in logFolder[1] if name == submission.id
    ][0]

    allPreds = synu.walk(syn, CHALLENGE_PREDICTION_FOLDER)
    predFolder = next(allPreds)
    predFolderId = [
        synId for name, synId in predFolder[1] if name == submission.id
    ][0]

    dockerDigest = submission.get('dockerDigest')
    submissionJson = json.loads(submission['entityBundleJSON'])
    dockerRepo = submissionJson['entity']['repositoryName']
    dockerImage = dockerRepo + "@" + dockerDigest

    if not os.path.exists(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)
    #Mount volumes
    volumes = {}
    for vol in ALL_VOLUMES:
        volumes[vol] = {
            'bind': MOUNTED_VOLUMES[vol].split(":")[0],
            'mode': MOUNTED_VOLUMES[vol].split(":")[1]
        }

    # Run docker image
    errors = None
    try:
        container = client.containers.run(dockerImage,
                                          scoring_sh,
                                          detach=True,
                                          volumes=volumes,
                                          name=submission.id + "_t" +
                                          str(int(time.time())),
                                          network_disabled=True)
    except docker.errors.APIError as e:
        container = None
        errors = str(e) + "\n"
    #Create an empty log file
    logFileName = submission.id + "_log.txt"
    logSynId = None
    open(logFileName, 'w').close()
    #Poll while the container is still running; the docker Python client does
    #not refresh container status, so shell out to `docker inspect` and sleep
    #between polls
    if container is not None:
        while subprocess.Popen(
            ['docker', 'inspect', '-f', '{{.State.Running}}', container.name],
                stdout=subprocess.PIPE).communicate()[0].decode().strip() == "true":
            logFileText = container.logs().decode('utf-8', errors='replace')
            with open(logFileName, 'w') as logFile:
                logFile.write(logFileText)
            statinfo = os.stat(logFileName)
            #Only store the log file if it is non-empty and at most 50 KB
            if statinfo.st_size > 0 and statinfo.st_size / 1000.0 <= 50:
                ent = File(logFileName, parent=logFolderId)
                try:
                    logs = syn.store(ent)
                    logSynId = logs.id
                except synapseclient.exceptions.SynapseHTTPError:
                    pass
            time.sleep(60)

        #Must run again to make sure all the logs are captured
        logFileText = container.logs().decode('utf-8', errors='replace')
        with open(logFileName, 'w') as logFile:
            logFile.write(logFileText)
        statinfo = os.stat(logFileName)
        #Only store the log file if it is non-empty and at most 50 KB
        if statinfo.st_size > 0 and statinfo.st_size / 1000.0 <= 50:
            ent = File(logFileName, parent=logFolderId)
            try:
                logs = syn.store(ent)
                logSynId = logs.id
            except synapseclient.exceptions.SynapseHTTPError:
                pass
        container.remove()
        try:
            client.images.remove(dockerImage)
        except Exception:
            print("Unable to remove image")

    statinfo = os.stat(logFileName)
    if statinfo.st_size == 0:
        with open(logFileName, 'w') as logFile:
            if errors is not None:
                logFile.write(errors)
            else:
                logFile.write("No Logs, or logs exceed size limit")
            logFile.flush()
            ent = File(logFileName, parent=logFolderId)
            try:
                logs = syn.store(ent)
                logSynId = logs.id
            except synapseclient.exceptions.SynapseHTTPError:
                pass

    if logSynId is None:
        logWalk = synu.walk(syn, logFolderId)
        logFiles = next(logWalk)
        #Fall back to the first file already in the log folder: [2] is the file
        #list, [0] the first (name, synId) tuple, [1] its Synapse ID
        logSynId = logFiles[2][0][1]
    #Zip up the predictions and store them in CHALLENGE_PREDICTION_FOLDER
    if len(os.listdir(OUTPUT_DIR)) > 0:
        zipf = zipfile.ZipFile(submission.id + '_predictions.zip', 'w',
                               zipfile.ZIP_DEFLATED)
        zipdir(OUTPUT_DIR, zipf)
        zipf.close()

        ent = File(submission.id + '_predictions.zip', parent=predFolderId)
        predictions = syn.store(ent)
        prediction_synId = predictions.id
        os.system("rm -rf %s" % OUTPUT_DIR)
        os.remove(submission.id + '_predictions.zip')
    else:
        prediction_synId = None
    os.remove(logFileName)
    return (prediction_synId, logSynId)
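
# A minimal usage sketch (hypothetical setup): assumes a docker-py client, a
# logged-in Synapse session, and a submission fetched from an evaluation queue.
#
#     import docker
#     client = docker.from_env()
#     prediction_synId, logSynId = dockerRun(submission, 'score.sh', syn, client)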
Beispiel #38
0
def checkAgainstDict(syn, synId, annotDictId, verbose=True):
	"""
	Compares annotations in use against dictionary.
	Gets all annotation keys and values in use in a project and compares against those specified by a dictionary. Prints non-matching terms.

	:param syn:          A Synapse object: syn = synapseclient.login() - must be logged into Synapse
	:param synId:        A Synapse ID of a Project or Folder
	:param annotDictId:  A Synapse ID of an annotation dictionary in YAML format
	:param verbose:      If True, print the non-matching keys and values (default: True)

	Return:
		If synId is an ID of a Project/Folder
			wrongKeys:		    A set of keys in use in synId that are not in annotDictId.
			wrongValues: 	    A set of values in use in synId that are not in annotDictId.

	Example:
		wrongKeys, wrongValues = checkAgainstDict(syn,"syn12345","syn45678")

	"""

	yamlEnt = syn.get(annotDictId)
	with open(yamlEnt.path) as f:
		annotations = yaml.safe_load(f)
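	# The dictionary is assumed to map annotation keys to allowed values, e.g.
	# (hypothetical): {'assay': ['rnaSeq', 'wholeGenomeSeq'], 'species': 'Human'}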

	allKeysInProject = set()
	allValsInProject = set()

	systemKeysToExclude = ['creationDate', 'etag', 'id', 'uri', 'accessControl']

	directory = synu.walk(syn, synId)
	for dirpath, dirname, files in directory:
		for item in files:
			temp = syn.getAnnotations(item[1])
			for key in temp:
				if key in systemKeysToExclude:
					continue
				allKeysInProject.add(key)
				if isinstance(temp[key], list):
					for val in temp[key]:
						allValsInProject.add(str(val))
				else:
					allValsInProject.add(str(temp[key]))

	print('Number of key terms in project: %d' % len(allKeysInProject))
	print('Number of value terms in project: %d' % len(allValsInProject))
	allKeysInVocab = set(annotations.keys())
	wrongKeys = allKeysInProject.difference(allKeysInVocab)
	if verbose and wrongKeys:
		print('\nKeys in use that are not found in dictionary: \n')
		for item in wrongKeys:
			print('%s' % item)

	allValsInVocab = set()
	for key, vl in annotations.items():
		if isinstance(vl, list):
			for element in vl:
				allValsInVocab.add(str(element))
		else:
			allValsInVocab.add(str(vl))
	wrongValues = allValsInProject.difference(allValsInVocab)
	if verbose and wrongValues:
		print('\nValues in use that are not found in dictionary: \n')
		for item in wrongValues:
			print('%s' % item)
	
	return wrongKeys, wrongValues