def test_is_url():
    """Check that utils.is_url accepts URLs and rejects everything else."""
    # (candidate string, whether it should be recognised as a URL)
    expectations = [
        ("http://mydomain.com/foo/bar/bat?asdf=1234&qewr=ooo", True),
        ("http://xkcd.com/1193/", True),
        ("syn123445", False),
        ("wasssuuuup???", False),
        ("file://foo.com/path/to/file.xyz", True),
        ("file:///path/to/file.xyz", True),
        ("file:/path/to/file.xyz", True),
        ("file:///c:/WINDOWS/clock.avi", True),
        ("file:c:/WINDOWS/clock.avi", True),
        # A bare Windows path has a drive letter, not a URL scheme.
        ("c:/WINDOWS/ugh/ugh.ugh", False),
    ]
    for candidate, should_be_url in expectations:
        verdict = utils.is_url(candidate)
        if should_be_url:
            assert verdict, candidate
        else:
            assert not verdict, candidate
def test_windows_file_urls():
    """Check that a Windows-style file URL is recognised and mapped to a path."""
    windows_url = 'file:///c:/WINDOWS/clock.avi'
    assert utils.is_url(windows_url)

    # verify_exists=False: the path does not need to exist on the test machine.
    converted = utils.file_url_to_path(windows_url, verify_exists=False)
    assert converted == 'c:/WINDOWS/clock.avi', utils.file_url_to_path(
        windows_url)
def _checkProvenace(item, path):
    """Validate a single provenance entry of a manifest row.

    :param item: provenance value (local file path, URL or Synapse id), or None
    :param path: path of the manifest file entry the provenance belongs to;
                 only used in error messages

    :returns: None when item is None; whatever ``syn._getFromFile`` returns
              for an existing local file that is not in ``df.index``;
              otherwise the (possibly normalized) item

    :raises SynapseProvenanceError: if item is not an existing local file, not
                                    a URL and not a Synapse id
    """
    if item is None:
        return item

    expanded = os.path.expandvars(os.path.expanduser(item))
    local_candidate = os.path.abspath(expanded)

    if not os.path.isfile(local_candidate):
        # Not a local file: it has to be a URL or a Synapse id.
        is_reference = (utils.is_url(item)
                        or utils.is_synapse_id(item) is not None)
        if not is_reference:
            raise SynapseProvenanceError(
                ("The provenance record for file: %s is incorrect.\n"
                 "Specifically %s, is neither a valid URL or synapseId.")
                % (path, item))
        return item

    # Local file: from here on refer to it by its full path.
    item = local_candidate
    if item in df.index:
        # The file is part of this upload, nothing more to look up.
        return item

    # The file is not being uploaded, so look it up via syn._getFromFile.
    try:
        return syn._getFromFile(item)
    except SynapseFileNotFoundError:
        # TODO absence of a raise here appears to be a bug and yet tests fail
        # if this is raised. The exception is deliberately only constructed.
        SynapseProvenanceError((
            "The provenance record for file: %s is incorrect.\n"
            "Specifically %s is not being uploaded and is not in Synapse."
            % (path, item)))
    return item
def create_external_file_handle(syn, path, mimetype=None, md5=None, file_size=None):
    """Create an external file handle for a URL or local file path.

    Nothing is uploaded; only the file handle is created through
    ``syn._createExternalFileHandle``.

    :param syn:       object providing ``_createExternalFileHandle`` and
                      ``cache.add``
    :param path:      URL or local path; ``~`` and environment variables are
                      expanded before it is converted with ``as_url``
    :param mimetype:  passed through to ``_createExternalFileHandle``
    :param md5:       expected md5; for an existing local file it is checked
                      against the calculated md5 and then replaced by it
    :param file_size: size passed as ``fileSize``; for an existing local file
                      it is replaced by the size reported by ``os.stat``

    :returns: the value returned by ``syn._createExternalFileHandle``

    :raises ValueError:               if the converted path is not a URL
    :raises SynapseMd5MismatchError:  if ``md5`` is given and differs from the
                                      md5 calculated for the local file
    """
    is_local_file = False  # defaults to false
    url = as_url(os.path.expandvars(os.path.expanduser(path)))
    if not is_url(url):
        # Interpolate the URL into the message; passing it as a second
        # exception argument left a literal "%s" in the text.
        raise ValueError('externalUrl [%s] is not a valid url' % (url,))

    parsed_url = urllib_parse.urlparse(url)
    if parsed_url.scheme == 'file' and os.path.isfile(parsed_url.path):
        actual_md5 = md5_for_file(parsed_url.path).hexdigest()
        if md5 is not None and md5 != actual_md5:
            raise SynapseMd5MismatchError(
                "The specified md5 [%s] does not match the calculated md5 [%s] for local file [%s]"
                % (md5, actual_md5, parsed_url.path))
        md5 = actual_md5
        file_size = os.stat(parsed_url.path).st_size
        is_local_file = True

    # just creates the file handle because there is nothing to upload
    file_handle = syn._createExternalFileHandle(
        url, mimetype=mimetype, md5=md5, fileSize=file_size)
    if is_local_file:
        # Register the local file in the cache under the new handle id.
        syn.cache.add(file_handle['id'], file_url_to_path(url))
    return file_handle
def _check_path_and_normalize(f):
    """Return ``f`` unchanged if it is a URL, else its normalized file path.

    Writes a ``.`` to stdout on every call.

    :param f: URL or local file path; ``~`` and environment variables are
              expanded for local paths

    :returns: ``f`` itself for a URL, otherwise the absolute path of the file

    :raises IOError: if ``f`` is not a URL and does not point to an existing
                     file
    """
    sys.stdout.write('.')
    if is_url(f):
        return f

    path_normalized = os.path.abspath(
        os.path.expandvars(os.path.expanduser(f)))
    if not os.path.isfile(path_normalized):
        # Interpolate the path: print() does not apply %-formatting to its
        # arguments, so the "%s" used to be printed literally.
        print(
            '\nThe specified path "%s" is either not a file path or does not exist.'
            % (f,))
        raise IOError('The path %s is not a file or does not exist' % f)
    return path_normalized
def _check_size_each_file(df):
    """Reject manifests that contain empty local files.

    :param df: table iterated with ``iterrows()``; each row provides ``path``
               and optionally ``name``

    :raises ValueError: if a non-URL path refers to a file of size 0
    """
    for _, record in df.iterrows():
        location = record['path']
        # Display name for the error message: explicit name, else the basename.
        if 'name' in record:
            label = record['name']
        else:
            label = os.path.basename(record['path'])

        if is_url(location):
            # Only local files are checked.
            continue

        resolved = os.path.expandvars(os.path.expanduser(location))
        if os.stat(resolved).st_size == 0:
            raise ValueError(
                "File {} is empty, empty files cannot be uploaded to Synapse"
                .format(label))
def used(self, target=None, targetVersion=None, wasExecuted=None, url=None, name=None):
    """
    Add a resource used by the activity.

    This method tries to be as permissive as possible. It accepts a string which might be a synapse ID or a URL,
    a synapse entity, a UsedEntity or UsedURL dictionary or a list containing any combination of these.

    In addition, named parameters can be used to specify the fields of either a UsedEntity or a UsedURL.
    If target and optionally targetVersion are specified, create a UsedEntity.
    If url and optionally name are specified, create a UsedURL.

    It is an error to specify both target/targetVersion parameters and url/name parameters in the same call.
    To add multiple UsedEntities and UsedURLs, make a separate call for each or pass in a list.

    In case of conflicting settings for wasExecuted both inside an object and with a parameter, the parameter wins.
    For example, this UsedURL will have wasExecuted set to False::

        activity.used({'url':'http://google.com', 'name':'Goog', 'wasExecuted':True}, wasExecuted=False)

    Entity examples::

        activity.used('syn12345')
        activity.used(entity)
        activity.used(target=entity, targetVersion=2)
        activity.used(codeEntity, wasExecuted=True)
        activity.used({'reference':{'target':'syn12345', 'targetVersion':1}, 'wasExecuted':False})

    URL examples::

        activity.used('http://mydomain.com/my/awesome/data.RData')
        activity.used(url='http://mydomain.com/my/awesome/data.RData', name='Awesome Data')
        activity.used(url='https://github.com/joe_hacker/code_repo', name='Gnarly hacks', wasExecuted=True)
        activity.used({'url':'https://github.com/joe_hacker/code_repo', 'name':'Gnarly hacks'}, wasExecuted=True)

    List example::

        activity.used(['syn12345', 'syn23456', entity,
                       {'reference':{'target':'syn100009', 'targetVersion':2}, 'wasExecuted':True},
                       'http://mydomain.com/my/awesome/data.RData'])

    :param target:        a Synapse id or URL string, a Synapse entity, a UsedEntity / UsedURL dictionary, or a
                          list of any of these
    :param targetVersion: version of the used entity
    :param wasExecuted:   if not None, overrides any 'wasExecuted' already present on the resource; if None,
                          'wasExecuted' defaults to False when the resource does not carry one
    :param url:           URL of a used resource, as an alternative to target
    :param name:          name of the UsedURL; defaults to the URL itself

    :raises ValueError:   if a string target is not a valid Synapse id, or if two conflicting versions are given
    :raises SynapseError: if the parameters match none of the supported forms
    """
    # NOTE(review): each branch collects disallowed arguments with
    # _get_any_bad_args and hands them to _raise_incorrect_used_usage, which
    # presumably raises when the list is non-empty - confirm against the
    # helpers, which are defined elsewhere.

    # -- A list of targets
    if isinstance(target, list):
        badargs = _get_any_bad_args(['targetVersion', 'url', 'name'], locals())
        _raise_incorrect_used_usage(badargs, 'list of used resources')

        for item in target:
            self.used(item, wasExecuted=wasExecuted)
        return

    # -- UsedEntity
    elif is_used_entity(target):
        badargs = _get_any_bad_args(['targetVersion', 'url', 'name'], locals())
        _raise_incorrect_used_usage(
            badargs, 'dictionary representing a used resource')

        # The caller's dictionary is used (and updated) directly, not copied.
        resource = target
        if 'concreteType' not in resource:
            resource['concreteType'] = 'org.sagebionetworks.repo.model.provenance.UsedEntity'

    # -- Used URL
    elif is_used_url(target):
        badargs = _get_any_bad_args(['targetVersion', 'url', 'name'], locals())
        _raise_incorrect_used_usage(badargs, 'URL')

        # The caller's dictionary is used (and updated) directly, not copied.
        resource = target
        if 'concreteType' not in resource:
            resource['concreteType'] = 'org.sagebionetworks.repo.model.provenance.UsedURL'

    # -- Synapse Entity
    elif is_synapse_entity(target):
        badargs = _get_any_bad_args(['url', 'name'], locals())
        _raise_incorrect_used_usage(badargs, 'Synapse entity')

        reference = {'targetId': target['id']}
        if 'versionNumber' in target:
            reference['targetVersionNumber'] = target['versionNumber']
        if targetVersion:
            # An explicit targetVersion wins over the entity's own version.
            reference['targetVersionNumber'] = int(targetVersion)
        resource = {
            'reference': reference,
            'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedEntity'
        }

    # -- URL parameter
    elif url:
        badargs = _get_any_bad_args(['target', 'targetVersion'], locals())
        _raise_incorrect_used_usage(badargs, 'URL')

        resource = {
            'url': url,
            # Default the name to the URL, as the URL-string branch below
            # does. It used to fall back to `target`, which must not be
            # specified together with url, leaving the name empty.
            'name': name if name else url,
            'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedURL'
        }

    # -- URL as a string
    elif is_url(target):
        badargs = _get_any_bad_args(['targetVersion'], locals())
        _raise_incorrect_used_usage(badargs, 'URL')

        resource = {
            'url': target,
            'name': name if name else target,
            'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedURL'
        }

    # -- Synapse Entity ID (assuming the string is an ID)
    elif isinstance(target, str):
        badargs = _get_any_bad_args(['url', 'name'], locals())
        _raise_incorrect_used_usage(badargs, 'Synapse entity')

        vals = target.split('.')  # Handle synapseIds of the form syn234.4
        if not is_synapse_id(vals[0]):
            raise ValueError('%s is not a valid Synapse id' % target)
        if len(vals) == 2:
            if targetVersion and int(targetVersion) != int(vals[1]):
                raise ValueError(
                    'Two conflicting versions for %s were specified' % target)
            targetVersion = int(vals[1])

        reference = {'targetId': vals[0]}
        if targetVersion:
            reference['targetVersionNumber'] = int(targetVersion)
        resource = {
            'reference': reference,
            'concreteType': 'org.sagebionetworks.repo.model.provenance.UsedEntity'
        }

    else:
        raise SynapseError(
            'Unexpected parameters in call to Activity.used().')

    # Set wasExecuted
    if wasExecuted is None:
        # Default to False
        if 'wasExecuted' not in resource:
            resource['wasExecuted'] = False
    else:
        # wasExecuted parameter overrides setting in an object
        resource['wasExecuted'] = wasExecuted

    # Add the used resource to the activity
    self['used'].append(resource)
def syncToSynapse(syn, manifestFile, dryRun=False, sendMessages=True, retries=MAX_RETRIES):
    """Synchronizes files specified in the manifest file to Synapse

    :param syn:           A synapse object as obtained with syn = synapseclient.login()
    :param manifestFile:  A tsv file with file locations and metadata to be pushed to Synapse.
                          See below for details
    :param dryRun:        Performs validation without uploading if set to True (default is False)
    :param sendMessages:  If True (the default) the upload is run through the notifyMe decorator,
                          otherwise it is run directly
    :param retries:       Passed on to notifyMe; only used when sendMessages is True

    Given a file describing all of the uploads, uploads the content to Synapse and optionally notifies you via
    Synapse messaging (email) at specific intervals, on errors and on completion.

    **Manifest file format**

    The format of the manifest file is a tab delimited file with one row per file to upload and columns describing
    the file. The minimum required columns are **path** and **parent** where path is the local file path and parent
    is the Synapse Id of the project or folder where the file is uploaded to. In addition to these columns you can
    specify any of the parameters to the File constructor (**name**, **synapseStore**, **contentType**) as well as
    parameters to the syn.store command (**used**, **executed**, **activityName**, **activityDescription**,
    **forceVersion**).
    Used and executed can be semi-colon (";") separated lists of Synapse ids, urls and/or local filepaths of files
    already stored in Synapse (or being stored in Synapse by the manifest).
    Any additional columns will be added as annotations.

    **Required fields:**

    ======  ======================  ============================
    Field   Meaning                 Example
    ======  ======================  ============================
    path    local file path or URL  /path/to/local/file.txt
    parent  synapse id              syn1235
    ======  ======================  ============================

    **Common fields:**

    ===============  ===========================  ============
    Field            Meaning                      Example
    ===============  ===========================  ============
    name             name of file in Synapse      Example_file
    forceVersion     whether to update version    False
    ===============  ===========================  ============

    **Provenance fields:**

    ====================  =====================================  ============================================
    Field                 Meaning                                Example
    ====================  =====================================  ============================================
    used                  List of items used to generate file    syn1235; /path/to_local/file.txt
    executed              List of items executed                 https://github.org/; /path/to_local/code.py
    activityName          Name of activity in provenance         "Ran normalization"
    activityDescription   Text description on what was done      "Ran algorithm xyx with parameters..."
    ====================  =====================================  ============================================

    **Annotations:**

    Any columns that are not in the reserved names described above will be interpreted as annotations of the file

    **Other optional fields:**

    ===============  ==========================================  ============
    Field            Meaning                                     Example
    ===============  ==========================================  ============
    synapseStore     Boolean describing whether to upload files  True
    contentType      content type of file to overload defaults   text/html
    ===============  ==========================================  ============

    **Example manifest file**

    ===============  ========  =======  =======  ===========================  ============================
    path             parent    annot1   annot2   used                         executed
    ===============  ========  =======  =======  ===========================  ============================
    /path/file1.txt  syn1243   "bar"    3.1415   "syn124; /path/file2.txt"    "https://github.org/foo/bar"
    /path/file2.txt  syn12433  "baz"    2.71     ""                           "https://github.org/foo/baz"
    ===============  ========  =======  =======  ===========================  ============================
    """
    df = readManifestFile(syn, manifestFile)

    # Add up the on-disk size of every local file; URLs have no local size.
    total_bytes = 0
    for entry in df.path:
        if is_url(entry):
            continue
        total_bytes += os.stat(
            os.path.expandvars(os.path.expanduser(entry))).st_size

    # Write output on what is getting pushed and estimated times - send out message.
    separator = '=' * 50 + '\n'
    sys.stdout.write(separator)
    sys.stdout.write(
        'We are about to upload %i files with a total size of %s.\n '
        % (len(df), utils.humanizeBytes(total_bytes)))
    sys.stdout.write(separator)

    if dryRun:
        return

    sys.stdout.write('Starting upload...\n')
    if not sendMessages:
        _manifest_upload(syn, df)
        return

    # Wrap the upload with notifyMe before running it.
    notifier = notifyMe(syn, 'Upload of %s' % manifestFile, retries=retries)
    upload = notifier(_manifest_upload)
    upload(syn, df)