Beispiel #1
0
def test_is_url():
    """Check that utils.is_url separates URLs from non-URL strings."""
    # (candidate, whether is_url is expected to accept it), in check order.
    cases = [
        ("http://mydomain.com/foo/bar/bat?asdf=1234&qewr=ooo", True),
        ("http://xkcd.com/1193/", True),
        ("syn123445", False),
        ("wasssuuuup???", False),
        ('file://foo.com/path/to/file.xyz', True),
        ('file:///path/to/file.xyz', True),
        ('file:/path/to/file.xyz', True),
        ('file:///c:/WINDOWS/clock.avi', True),
        ('file:c:/WINDOWS/clock.avi', True),
        # A bare Windows path has a drive letter, not a URL scheme.
        ('c:/WINDOWS/ugh/ugh.ugh', False),
    ]
    for candidate, should_be_url in cases:
        if should_be_url:
            assert utils.is_url(candidate)
        else:
            assert not utils.is_url(candidate)
Beispiel #2
0
def test_windows_file_urls():
    """Check that a Windows-style file URL is recognized and converted to a path."""
    windows_url = 'file:///c:/WINDOWS/clock.avi'
    assert utils.is_url(windows_url)

    # verify_exists=False: the file is not expected to exist on the test machine.
    converted = utils.file_url_to_path(windows_url, verify_exists=False)
    assert converted == 'c:/WINDOWS/clock.avi', utils.file_url_to_path(
        windows_url)
Beispiel #3
0
    def _checkProvenace(item, path):
        """Validate a single provenance entry and return its resolved form.

        :param item: provenance entry: a local file path, a URL, a Synapse id, or None
        :param path: path of the file whose provenance is being checked (used in error messages only)

        :returns: None when item is None; the absolute path for a local file that is listed in ``df.index``;
                  whatever ``syn._getFromFile`` returns for a local file that is not listed there; otherwise
                  the item unchanged.

        :raises SynapseProvenanceError: if item is not an existing file, not a URL and not a Synapse id
        """
        if item is None:
            return None

        expanded = os.path.expanduser(item)
        expanded = os.path.expandvars(expanded)
        local_path = os.path.abspath(expanded)

        if not os.path.isfile(local_path):
            # Not a file on disk, so it has to be a URL or a Synapse id.
            if utils.is_url(item) or utils.is_synapse_id(item) is not None:
                return item
            message = ("The provenance record for file: %s is incorrect.\n"
                       "Specifically %s, is neither a valid URL or synapseId.")
            raise SynapseProvenanceError(message % (path, item))

        # From here on the entry is an existing local file, referred to by its full path.
        if local_path in df.index:
            return local_path

        # The file is not part of this upload, so look it up via syn._getFromFile.
        try:
            return syn._getFromFile(local_path)
        except SynapseFileNotFoundError:
            # TODO absence of a raise here appears to be a bug and yet tests fail if this is raised
            # The error object is built and discarded; the full path is returned below.
            SynapseProvenanceError(
                "The provenance record for file: %s is incorrect.\n"
                "Specifically %s is not being uploaded and is not in Synapse."
                % (path, local_path))
        return local_path
Beispiel #4
0
def create_external_file_handle(syn,
                                path,
                                mimetype=None,
                                md5=None,
                                file_size=None):
    """Create an external file handle for a URL or local path without uploading anything.

    :param syn:       client object; its ``_createExternalFileHandle`` method and ``cache`` are used
    :param path:      URL or file path; ``~`` and environment variables are expanded before it is passed to as_url
    :param mimetype:  passed through to ``syn._createExternalFileHandle``
    :param md5:       expected md5; for an existing local file it is checked against the computed md5 and then
                      replaced by it
    :param file_size: passed through as ``fileSize``; replaced by the on-disk size for an existing local file

    :returns: the file handle returned by ``syn._createExternalFileHandle``

    :raises ValueError:              if the resulting url is not accepted by is_url
    :raises SynapseMd5MismatchError: if md5 is given and differs from the md5 computed for the local file
    """
    url = as_url(os.path.expandvars(os.path.expanduser(path)))
    if not is_url(url):
        # Format the message here: exception constructors do not interpolate extra arguments.
        raise ValueError('externalUrl [%s] is not a valid url' % (url,))

    is_local_file = False  # defaults to false
    parsed_url = urllib_parse.urlparse(url)
    if parsed_url.scheme == 'file' and os.path.isfile(parsed_url.path):
        actual_md5 = md5_for_file(parsed_url.path).hexdigest()
        if md5 is not None and md5 != actual_md5:
            raise SynapseMd5MismatchError(
                "The specified md5 [%s] does not match the calculated md5 [%s] for local file [%s]"
                % (md5, actual_md5, parsed_url.path))
        # For a local file the md5 and size read from disk are the ones reported.
        md5 = actual_md5
        file_size = os.stat(parsed_url.path).st_size
        is_local_file = True

    # just creates the file handle because there is nothing to upload
    file_handle = syn._createExternalFileHandle(url,
                                                mimetype=mimetype,
                                                md5=md5,
                                                fileSize=file_size)
    if is_local_file:
        # Record the local file in the cache under the new file handle id.
        syn.cache.add(file_handle['id'], file_url_to_path(url))
    return file_handle
Beispiel #5
0
def _check_path_and_normalize(f):
    """Return f unchanged if it is a URL, otherwise its absolute, expanded file path.

    :param f: a URL or a local file path (``~`` and environment variables are expanded)

    :returns: f itself when is_url accepts it, else the absolute path of the existing file

    :raises IOError: if f is not a URL and does not point to an existing file
    """
    # One dot per checked entry is written to stdout.
    sys.stdout.write('.')
    if is_url(f):
        return f
    path_normalized = os.path.abspath(os.path.expandvars(
        os.path.expanduser(f)))
    if not os.path.isfile(path_normalized):
        # print() does not interpolate its arguments, so format the message explicitly.
        print(
            '\nThe specified path "%s" is either not a file path or does not exist.'
            % f)
        raise IOError('The path %s is not a file or does not exist' % f)
    return path_normalized
Beispiel #6
0
def _check_size_each_file(df):
    """Reject the manifest if any local (non-URL) file in it is empty.

    :param df: table of files to upload; each row has a ``path`` entry and may have a ``name`` entry

    :raises ValueError: if a local file has a size of zero bytes
    """
    for _, entry in df.iterrows():
        location = entry['path']
        # Use the explicit name when the row has one, else the last path component.
        if 'name' in entry:
            display_name = entry['name']
        else:
            display_name = os.path.basename(entry['path'])

        if is_url(location):
            # URLs are not files on disk, so there is nothing to measure.
            continue

        expanded = os.path.expandvars(os.path.expanduser(location))
        if os.stat(expanded).st_size == 0:
            raise ValueError(
                "File {} is empty, empty files cannot be uploaded to Synapse"
                .format(display_name))
    def used(self,
             target=None,
             targetVersion=None,
             wasExecuted=None,
             url=None,
             name=None):
        """
        Add a resource used by the activity.

        This method tries to be as permissive as possible. It accepts a string which might be a synapse ID or a URL,
        a synapse entity, a UsedEntity or UsedURL dictionary or a list containing any combination of these.

        In addition, named parameters can be used to specify the fields of either a UsedEntity or a UsedURL.
        If target and optionally targetVersion are specified, create a UsedEntity.
        If url and optionally name are specified, create a UsedURL.

        It is an error to specify both target/targetVersion parameters and url/name parameters in the same call.
        To add multiple UsedEntities and UsedURLs, make a separate call for each or pass in a list.

        In case of conflicting settings for wasExecuted both inside an object and with a parameter, the parameter wins.
        For example, this UsedURL will have wasExecuted set to False::

            activity.used({'url':'http://google.com', 'name':'Goog', 'wasExecuted':True}, wasExecuted=False)

        Entity examples::

            activity.used('syn12345')
            activity.used(entity)
            activity.used(target=entity, targetVersion=2)
            activity.used(codeEntity, wasExecuted=True)
            activity.used({'reference':{'target':'syn12345', 'targetVersion':1}, 'wasExecuted':False})

        URL examples::

            activity.used('http://mydomain.com/my/awesome/data.RData')
            activity.used(url='http://mydomain.com/my/awesome/data.RData', name='Awesome Data')
            activity.used(url='https://github.com/joe_hacker/code_repo', name='Gnarly hacks', wasExecuted=True)
            activity.used({'url':'https://github.com/joe_hacker/code_repo', 'name':'Gnarly hacks'}, wasExecuted=True)

        List example::

            activity.used(['syn12345', 'syn23456', entity, \
                          {'reference':{'target':'syn100009', 'targetVersion':2}, 'wasExecuted':True}, \
                          'http://mydomain.com/my/awesome/data.RData'])

        :param target:        a list, a UsedEntity/UsedURL dictionary, a Synapse entity, a URL string or a
                              Synapse id string (optionally of the form ``syn234.4``)
        :param targetVersion: version of the target entity; only applied when truthy
        :param wasExecuted:   if not None, overrides any ``wasExecuted`` value already present in the resource;
                              if None, a missing ``wasExecuted`` defaults to False
        :param url:           URL of a UsedURL resource
        :param name:          name of a UsedURL resource

        :returns: None; the resulting resource dictionary is appended to ``self['used']``

        :raises ValueError:   if a string target is neither a URL nor a valid Synapse id, or if the version
                              embedded in the id conflicts with targetVersion
        :raises SynapseError: if no supported combination of parameters was given
        """
        # NOTE(review): every branch passes locals() to _get_any_bad_args and hands the result to
        # _raise_incorrect_used_usage; presumably these reject parameters that do not fit the branch -
        # confirm against the helper definitions.

        # -- A list of targets
        if isinstance(target, list):
            badargs = _get_any_bad_args(['targetVersion', 'url', 'name'],
                                        locals())
            _raise_incorrect_used_usage(badargs, 'list of used resources')

            # Recurse once per element; only wasExecuted is forwarded to the nested calls.
            for item in target:
                self.used(item, wasExecuted=wasExecuted)
            return

        # -- UsedEntity
        elif is_used_entity(target):
            badargs = _get_any_bad_args(['targetVersion', 'url', 'name'],
                                        locals())
            _raise_incorrect_used_usage(
                badargs, 'dictionary representing a used resource')

            # The caller's dictionary is reused, not copied: keys set below are written into it.
            resource = target
            if 'concreteType' not in resource:
                resource[
                    'concreteType'] = 'org.sagebionetworks.repo.model.provenance.UsedEntity'

        # -- Used URL
        elif is_used_url(target):
            badargs = _get_any_bad_args(['targetVersion', 'url', 'name'],
                                        locals())
            _raise_incorrect_used_usage(badargs, 'URL')

            # As above, the caller's dictionary is reused and modified in place.
            resource = target
            if 'concreteType' not in resource:
                resource[
                    'concreteType'] = 'org.sagebionetworks.repo.model.provenance.UsedURL'

        # -- Synapse Entity
        elif is_synapse_entity(target):
            badargs = _get_any_bad_args(['url', 'name'], locals())
            _raise_incorrect_used_usage(badargs, 'Synapse entity')

            reference = {'targetId': target['id']}
            if 'versionNumber' in target:
                reference['targetVersionNumber'] = target['versionNumber']
            # A truthy targetVersion argument overwrites the entity's own versionNumber.
            if targetVersion:
                reference['targetVersionNumber'] = int(targetVersion)
            resource = {
                'reference':
                reference,
                'concreteType':
                'org.sagebionetworks.repo.model.provenance.UsedEntity'
            }
        # -- URL parameter
        elif url:
            badargs = _get_any_bad_args(['target', 'targetVersion'], locals())
            _raise_incorrect_used_usage(badargs, 'URL')

            # NOTE(review): without a name this falls back to target, not url; if a supplied target is
            # rejected above, the name would end up as None here - confirm this is intended.
            resource = {
                'url': url,
                'name': name if name else target,
                'concreteType':
                'org.sagebionetworks.repo.model.provenance.UsedURL'
            }

        # -- URL as a string
        elif is_url(target):
            badargs = _get_any_bad_args(['targetVersion'], locals())
            _raise_incorrect_used_usage(badargs, 'URL')
            # Without an explicit name, the URL itself is used as the name.
            resource = {
                'url': target,
                'name': name if name else target,
                'concreteType':
                'org.sagebionetworks.repo.model.provenance.UsedURL'
            }

        # -- Synapse Entity ID (assuming the string is an ID)
        elif isinstance(target, str):
            badargs = _get_any_bad_args(['url', 'name'], locals())
            _raise_incorrect_used_usage(badargs, 'Synapse entity')
            vals = target.split('.')  # Handle synapseIds of from syn234.4
            if not is_synapse_id(vals[0]):
                raise ValueError('%s is not a valid Synapse id' % target)
            # Only the exact "<id>.<version>" shape carries a version; it must agree with targetVersion.
            if len(vals) == 2:
                if targetVersion and int(targetVersion) != int(vals[1]):
                    raise ValueError(
                        'Two conflicting versions for %s were specified' %
                        target)
                targetVersion = int(vals[1])
            reference = {'targetId': vals[0]}
            if targetVersion:
                reference['targetVersionNumber'] = int(targetVersion)
            resource = {
                'reference':
                reference,
                'concreteType':
                'org.sagebionetworks.repo.model.provenance.UsedEntity'
            }
        else:
            raise SynapseError(
                'Unexpected parameters in call to Activity.used().')

        # Set wasExecuted
        if wasExecuted is None:
            # Default to False
            if 'wasExecuted' not in resource:
                resource['wasExecuted'] = False
        else:
            # wasExecuted parameter overrides setting in an object
            resource['wasExecuted'] = wasExecuted

        # Add the used resource to the activity
        self['used'].append(resource)
Beispiel #8
0
def syncToSynapse(syn,
                  manifestFile,
                  dryRun=False,
                  sendMessages=True,
                  retries=MAX_RETRIES):
    """Synchronizes files specified in the manifest file to Synapse

    :param syn:             A synapse object as obtained with syn = synapseclient.login()

    :param manifestFile:    A tsv file with file locations and metadata to be pushed to Synapse.
                            See below for details

    :param dryRun:          Performs validation without uploading if set to True (default is False)

    :param sendMessages:    If True (the default), the upload is wrapped with notifyMe so that messages are sent;
                            if False, the upload runs without notifications

    :param retries:         Forwarded to notifyMe as its retries argument; only used when sendMessages is True

    Given a file describing all of the uploads, uploads the content to Synapse and optionally notifies you via Synapse
    messaging (email) at specific intervals, on errors and on completion.

    **Manifest file format**

    The format of the manifest file is a tab delimited file with one row per file to upload and columns describing the
    file. The minimum required columns are **path** and **parent** where path is the local file path and parent is the
    Synapse Id of the project or folder where the file is uploaded to. In addition to these columns you can specify any
    of the parameters to the File constructor (**name**, **synapseStore**, **contentType**) as well as parameters to the
    syn.store command (**used**, **executed**, **activityName**, **activityDescription**, **forceVersion**).
    Used and executed can be semi-colon (";") separated lists of Synapse ids, urls and/or local filepaths of files
    already stored in Synapse (or being stored in Synapse by the manifest).
    Any additional columns will be added as annotations.

    **Required fields:**

    ======   ======================                  ============================
    Field    Meaning                                 Example
    ======   ======================                  ============================
    path     local file path or URL                  /path/to/local/file.txt
    parent   synapse id                              syn1235
    ======   ======================                  ============================

    **Common fields:**

    ===============        ===========================                   ============
    Field                  Meaning                                       Example
    ===============        ===========================                   ============
    name                   name of file in Synapse                       Example_file
    forceVersion           whether to update version                     False
    ===============        ===========================                   ============

    **Provenance fields:**

    ====================   =====================================  ==========================================
    Field                  Meaning                                Example
    ====================   =====================================  ==========================================
    used                   List of items used to generate file    syn1235; /path/to_local/file.txt
    executed               List of items executed                 https://github.org/; /path/to_local/code.py
    activityName           Name of activity in provenance         "Ran normalization"
    activityDescription    Text description on what was done      "Ran algorithm xyx with parameters..."
    ====================   =====================================  ==========================================

    **Annotations:**

    Any columns that are not in the reserved names described above will be interpreted as annotations of the file

    **Other optional fields:**

    ===============          ==========================================  ============
    Field                    Meaning                                     Example
    ===============          ==========================================  ============
    synapseStore             Boolean describing whether to upload files  True
    contentType              content type of file to overload defaults   text/html
    ===============          ==========================================  ============


    **Example manifest file**

    ===============   ========    =======   =======   ===========================    ============================
    path              parent      annot1    annot2    used                           executed
    ===============   ========    =======   =======   ===========================    ============================
    /path/file1.txt   syn1243     "bar"     3.1415    "syn124; /path/file2.txt"      "https://github.org/foo/bar"
    /path/file2.txt   syn12433    "baz"     2.71      ""                             "https://github.org/foo/baz"
    ===============   ========    =======   =======   ===========================    ============================

    """
    df = readManifestFile(syn, manifestFile)

    # Add up the on-disk size of every local entry; URLs have no local size to stat.
    total_bytes = 0
    for entry in df.path:
        if is_url(entry):
            continue
        local_path = os.path.expandvars(os.path.expanduser(entry))
        total_bytes += os.stat(local_path).st_size

    # Announce what is about to be pushed, framed by two separator lines.
    separator = '=' * 50 + '\n'
    sys.stdout.write(separator)
    sys.stdout.write(
        'We are about to upload %i files with a total size of %s.\n ' %
        (len(df), utils.humanizeBytes(total_bytes)))
    sys.stdout.write(separator)

    if dryRun:
        # Validation and the summary are all a dry run does.
        return

    sys.stdout.write('Starting upload...\n')
    if not sendMessages:
        _manifest_upload(syn, df)
        return

    # Wrap the upload with notifyMe before running it.
    with_notifications = notifyMe(syn,
                                  'Upload of %s' % manifestFile,
                                  retries=retries)
    notifying_upload = with_notifications(_manifest_upload)
    notifying_upload(syn, df)