Code example #1
    async def _create(self):
        name = '_TEMP_{0}_TEMP_'.format(str(uuid.uuid4()))
        logging.info('Creating file view project: {0}'.format(name))
        self.view_project = await SynapseProxy.storeAsync(
            syn.Project(name=name))

        logging.info('Creating file view: {0}'.format(name))
        cols = [
            syn.Column(name=self.COL_ID, columnType='ENTITYID'),
            syn.Column(name=self.COL_DATAFILEHANDLEID,
                       columnType='FILEHANDLEID'),
            syn.Column(name=self.COL_NAME,
                       columnType='STRING',
                       maximumSize=256)
        ]
        schema = syn.EntityViewSchema(
            name=name,
            columns=cols,
            properties=None,
            parent=self.view_project,
            scopes=[self.scope],
            includeEntityTypes=[syn.EntityViewType.FILE],
            addDefaultViewColumns=False,
            addAnnotationColumns=False)
        self.view = await SynapseProxy.storeAsync(schema)
Code example #2
def test_table_query():
    """Test command line ability to do table query.

    """

    cols = []
    cols.append(
        synapseclient.Column(name='name',
                             columnType='STRING',
                             maximumSize=1000))
    cols.append(
        synapseclient.Column(name='foo',
                             columnType='STRING',
                             enumValues=['foo', 'bar', 'bat']))
    cols.append(synapseclient.Column(name='x', columnType='DOUBLE'))
    cols.append(synapseclient.Column(name='age', columnType='INTEGER'))
    cols.append(synapseclient.Column(name='cartoon', columnType='BOOLEAN'))

    project_entity = project

    schema1 = syn.store(
        synapseclient.Schema(name=str(uuid.uuid4()),
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    row_reference_set1 = syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)

    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows), )

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(
        map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (
        my_headers_set, expected_headers_set)
Code example #3
def create_cols(table_type, syn=None):
    if table_type == MC10_SENSOR_NAME:
        cols = [
            sc.Column(name="task_id", columnType="STRING"),
            sc.Column(name="sensor_location", columnType="STRING"),
            sc.Column(name="mc10_accelerometer", columnType="FILEHANDLEID"),
            sc.Column(name="mc10_gyroscope", columnType="FILEHANDLEID"),
            sc.Column(name="mc10_emg", columnType="FILEHANDLEID")
        ]
    elif table_type == SMARTWATCH_SENSOR_NAME:
        cols = [
            sc.Column(name="task_id", columnType="STRING"),
            sc.Column(name="smartwatch_accelerometer",
                      columnType="FILEHANDLEID")
        ]
    elif table_type == "scores":
        cols = list(syn.getTableColumns(SCORES))
        for c in cols:
            c.pop('id')
            if c['name'] in SCORES_COL_MAP:
                c['name'] = SCORES_COL_MAP[c['name']]
        cols = [sc.Column(name="task_id", columnType="STRING")] + cols
    else:
        raise TypeError("table_type must be one of [{}, {}, {}]".format(
            MC10_SENSOR_NAME, SMARTWATCH_SENSOR_NAME, "scores"))
    return cols
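
A minimal usage sketch for `create_cols` above (illustrative: it assumes `sc` is `synapseclient`, `syn` is a logged-in client, and `MC10_SENSOR_NAME` and the parent project ID are defined as in the surrounding module; the project ID below is a placeholder):

# Build the column list and store a Schema using it.
cols = create_cols(MC10_SENSOR_NAME)
schema = syn.store(sc.Schema(name="MC10 sensor measurements",
                             columns=cols, parent="syn00000000"))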
Code example #4
    def _create_view(self, entity_types):
        name = '_TEMP_{0}_VIEW_'.format(str(uuid.uuid4()))
        cols = [
            syn.Column(name=self.COL_BENEFACTORID, columnType='ENTITYID'),
            syn.Column(name=self.COL_PROJECTID, columnType='ENTITYID')
        ]
        schema = syn.EntityViewSchema(name=name,
                                      columns=cols,
                                      properties=None,
                                      parent=self.view_project,
                                      scopes=[self.scope],
                                      includeEntityTypes=entity_types,
                                      addDefaultViewColumns=False,
                                      addAnnotationColumns=False)
        return SynapseProxy.client().store(schema)
Code example #5
File: utils.py, Project: milen-sage/annotator
def _keyValCols(keys, values, asSynapseCols):
    """ Get Synapse Column compatible objects from `keys` and `values`.

    Parameters
    ----------
    keys : list
        Column names.
    values : list
        `defaultValue`s of each column.
    asSynapseCols : bool
        Whether to return as synapseclient.Column objects.

    Returns
    -------
    A list of dictionaries compatible with synapseclient.Column objects, or a
    list of synapseclient.Column objects if `asSynapseCols` is True.
    """
    sanitize = lambda v: v if pd.notnull(v) else ''
    keys = list(map(sanitize, keys))
    values = list(map(sanitize, values))
    val_length = map(lambda v: len(v) if v else 50, values)
    cols = [{
        'name': k,
        'maximumSize': l,
        'columnType': "STRING",
        "defaultValue": v
    } for k, v, l in zip(keys, values, val_length)]
    if asSynapseCols:
        cols = list(map(lambda c: sc.Column(**c), cols))
    return cols
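
A short usage sketch for `_keyValCols` (illustrative; `sc` is `synapseclient` and `pd` is pandas, as in the module this function comes from):

# Two STRING columns whose defaultValue comes from `values`; maximumSize is
# the length of the value, falling back to 50 for empty values.
cols = _keyValCols(keys=["assay", "species"],
                   values=["rnaSeq", ""],
                   asSynapseCols=True)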
Code example #6
def test_add_column(syn, new_tables):
    source_table = new_tables["schema"][0]
    target_table = new_tables["schema"][1]
    new_column = sc.Column(columnType="STRING", maximumSize=5, name="new_col")
    source_table.addColumn(new_column)
    source_table = syn.store(source_table)
    target_table = synchronize_schemas(
        syn,
        schema_comparison={"added": ["new_col"]},
        source=source_table["id"],
        target=target_table["id"])
    source_cols = [c["name"] for c in syn.getTableColumns(source_table["id"])]
    target_cols = [c["name"] for c in syn.getTableColumns(target_table["id"])]
    assert all([c in source_cols for c in target_cols])
Code example #7
def update_samples_in_release_table(syn, file_mapping, release,
                                    samples_in_release_synid):
    """
    Update the 'samples in release' table, which tracks the samples in each
    release: 1 means the sample is in the release, and 0 means it is not.

    Args:
        syn: synapse object
        file_mapping: file mapping generated from file mapping function
        release: GENIE release number (e.g. 5.3-consortium)
        samples_in_release_synid: Synapse Id of 'samples in release' Table
    """
    clinical_ent = syn.get(file_mapping["clinical"], followLink=True)
    clinicaldf = pd.read_csv(clinical_ent.path, sep="\t", comment="#")
    cols = [
        i["name"] for i in list(syn.getTableColumns(samples_in_release_synid))
    ]

    if release not in cols:
        schema = syn.get(samples_in_release_synid)
        syn_col = synapseclient.Column(name=release,
                                       columnType="INTEGER",
                                       defaultValue=0)
        new_column = syn.store(syn_col)
        schema.addColumn(new_column)
        schema = syn.store(schema)
    # Samples currently in the 'samples in release' table
    samples_per_release = syn.tableQuery(
        'SELECT SAMPLE_ID, "{}" FROM {}'.format(release,
                                                samples_in_release_synid))

    samples_per_releasedf = samples_per_release.asDataFrame()
    new_samples = clinicaldf[[
        "SAMPLE_ID"
    ]][~clinicaldf.SAMPLE_ID.isin(samples_per_releasedf.SAMPLE_ID)]

    new_samples[release] = 1
    old_samples = clinicaldf[["SAMPLE_ID"]][clinicaldf.SAMPLE_ID.isin(
        samples_per_releasedf.SAMPLE_ID)]

    old_samples[release] = 1
    samples_in_releasedf = new_samples.append(old_samples)
    process_functions.updateDatabase(
        syn,
        samples_per_releasedf,
        samples_in_releasedf,
        samples_in_release_synid,
        ["SAMPLE_ID"],
    )
Code example #8
def table_schema(project_obj):
    cols = [synapseclient.Column(name="recordId", columnType="INTEGER"),
            synapseclient.Column(name="externalId", columnType="STRING"),
            synapseclient.Column(name="substudyMemberships", columnType="STRING"),
            synapseclient.Column(name="bool_property", columnType="BOOLEAN"),
            synapseclient.Column(name="str_property", columnType="STRING"),
            synapseclient.Column(name="raw_data", columnType="FILEHANDLEID")]
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=project_obj["id"])
    return schema
Code example #9
File: bootstrap.py, Project: thomasyu888/synapsegenie
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table

    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = synapseclient.Schema(name=name, columns=cols, parent=parent)
    schema = syn.store(schema)
    return schema
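
A usage sketch for `_create_table` (illustrative values; `syn` is assumed to be a logged-in `synapseclient.Synapse` instance and the parent ID is a placeholder):

col_config = [
    {"name": "id", "columnType": "ENTITYID"},
    {"name": "status", "columnType": "STRING", "maximumSize": 50},
]
schema = _create_table(syn, name="Status Table",
                       col_config=col_config, parent="syn00000000")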
Code example #10
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table

    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = process_functions._create_schema(syn, table_name=name,
                                              parentid=parent, columns=cols)
    return schema
Code example #11
def createColumnsFromJson(path, defaultMaximumSize=250):
    """
    Create a list of Synapse Table Columns from a Synapse annotations JSON file.
    If a column is of type 'STRING' and defaultMaximumSize is specified, that
    value is used as the column's maximum size.

    :param path: path to the annotations JSON file
    :param defaultMaximumSize: maximum size applied to 'STRING' columns
    :return: a list of synapseclient.Column objects
    """
    with open(path) as json_file:
        data = json.load(json_file)

    cols = []
    for d in data:
        d['enumValues'] = [a['value'] for a in d['enumValues']]

        if d['columnType'] == 'STRING' and defaultMaximumSize:
            d['maximumSize'] = defaultMaximumSize

        cols.append(synapseclient.Column(**d))

    return cols
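
The function above assumes each entry in the JSON file is a Column definition whose `enumValues` is a list of objects with a `value` key. A minimal, illustrative input and call:

# annotations.json (illustrative contents):
# [{"name": "assay", "columnType": "STRING",
#   "enumValues": [{"value": "rnaSeq"}, {"value": "wgs"}]}]
cols = createColumnsFromJson("annotations.json", defaultMaximumSize=250)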
Code example #12
File: utils.py, Project: milen-sage/annotator
def addToScope(syn, target, scope):
    """ Add further Folders/Projects to the scope of a file view.

    Parameters
    ----------
    syn : synapseclient.Synapse
    target : str, synapseclient.Schema
        The Synapse ID of the file view to update or its schema.
    scope : str, list
        The Synapse IDs of the entities to add to the scope.

    Returns
    -------
    synapseclient.Schema
    """
    scope = [scope] if isinstance(scope, str) else scope
    target = syn.get(target) if isinstance(target, str) else target
    cols = list(syn.getTableColumns(target.id))
    totalScope = target['scopeIds']
    for s in scope:
        totalScope.append(s)
    # We need to preserve columns that are currently in the file view
    # but aren't created automatically when a new synapseclient.EntityViewSchema
    # is stored.
    defaultCols = getDefaultColumnsForScope(syn, totalScope)
    defaultCols = [sc.Column(**c) for c in defaultCols]
    colNames = [c['name'] for c in cols]
    for c in defaultCols:  # Preexisting columns have priority over defaults
        if c['name'] not in colNames:
            cols.append(c)
    schema = sc.EntityViewSchema(name=target.name,
                                 parent=target.parentId,
                                 columns=cols,
                                 scopes=totalScope,
                                 add_default_columns=False)
    schema = syn.store(schema)
    return schema
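
A usage sketch for `addToScope` (the Synapse IDs are placeholders): add two folders to an existing file view's scope while preserving its current columns.

schema = addToScope(syn, target="syn11111111",
                    scope=["syn22222222", "syn33333333"])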
Code example #13
File: Pipeline.py, Project: milen-sage/annotator
    def createFileView(self, name, parent, scope, addCols=None, schema=None):
        """ Create and store a file view for further manipulation.

        Parameters
        ----------
        name : str
            The name of the file view.
        parent : str
            Synapse ID of project to store file view within.
        scope : str or list
            Synapse IDs of items to include in file view.
        addCols : dict, list, or str
            Columns to add in addition to the default file view columns.
        schema : str or pandas.DataFrame
            A path to a .json file specifying a schema the file view should
            conform to -- or a pandas.DataFrame already in flattened format.
            (See `schema.flattenJson`).

        If `addCols` is a dict:
            Add keys as columns. If a key's value is `None`, then insert an empty
            column. Otherwise, set the `defaultValue` of the column to that value.
            After setting `self.view` to the pandas DataFrame version of the newly
            created file view, all rows in each column will be set to its
            `defaultValue` (unless there is no `defaultValue`, in which case the
            column will be empty). The file view will not be updated on Synapse
            until `self.publish` is called.
        If `addCols` is a list:
            Add columns to schema with no `defaultValue`. `self.view` will be
            unchanged from the file view that is stored on Synapse.
        If `addCols` is a str:
            Assumes the string is a filepath. Attempts to read in the filepath as
            a two-column .csv file, then proceeds as if `addCols` were a dict,
            where the first column holds the keys and the second column holds
            the values.

        Returns
        -------
        The Synapse ID of the newly created file view.
        """
        self.backup("createFileView")

        # Fetch default keys, plus any preexisting annotation keys
        cols = utils.getDefaultColumnsForScope(self.syn, scope)

        # Store flattened schema, add keys to active columns list.
        if self.schema is None:
            self.schema = (schemaModule.flattenJson(schema) if isinstance(
                schema, str) else schema)
        if self.schema is not None:
            for k in self.schema.index.unique():
                self.addActiveCols(k)
            schemaCols = utils.makeColumns(list(self.schema.index.unique()),
                                           asSynapseCols=False)
            cols = self._getUniqueCols(schemaCols, cols)

        # Add keys defined during initialization
        if self._activeCols:
            activeCols = utils.makeColumns(self._activeCols,
                                           asSynapseCols=False)
            cols = self._getUniqueCols(activeCols, cols)

        # Add keys passed to addCols
        if addCols:
            if isinstance(addCols, dict):
                unspecifiedCols = [k for k in addCols if addCols[k] is None]
                self.addActiveCols(unspecifiedCols)
            elif isinstance(addCols, list):
                self.addActiveCols(addCols)
            newCols = utils.makeColumns(addCols, asSynapseCols=False)
            cols = self._getUniqueCols(newCols, cols)

        # Store columns to Synapse as EntityViewSchema. Default column values
        # are added to `self.view` but not yet stored to Synapse.
        cols = [sc.Column(**c) for c in cols]
        entityViewSchema = sc.EntityViewSchema(name=name,
                                               columns=cols,
                                               parent=parent,
                                               scopes=scope)
        self._entityViewSchema = self.syn.store(entityViewSchema)
        self.view = utils.synread(self.syn, self._entityViewSchema.id)
        self._index = self.view.index
        if isinstance(addCols, dict):
            self.addDefaultValues(addCols, False)
        return self._entityViewSchema.id
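
An illustrative call, following the docstring above (IDs are placeholders and `pipeline` is an instance of this class): passing `addCols` as a dict adds columns with a `defaultValue`, while a key mapped to `None` adds an empty column.

view_id = pipeline.createFileView(
    name="my-file-view",
    parent="syn00000000",          # project to store the view in
    scope=["syn11111111"],         # folder(s)/project(s) to include
    addCols={"assay": "rnaSeq",    # column whose defaultValue is "rnaSeq"
             "species": None})     # empty column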
Code example #14
File: bootstrap.py, Project: thomasyu888/synapsegenie
def main(syn):

    # Basic setup of the project
    project_name = "Testing Synapse Genie"

    # Determine the short and long names of the centers.
    center_abbreviations = ['AAA', 'BBB', 'CCC']
    center_names = center_abbreviations

    # Create the project
    project = synapseclient.Project(project_name)
    project = syn.store(project)

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [
        synapseclient.Folder(name=name, parent=root_center_folder)
        for name in center_abbreviations
    ]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.

    n_files = 5  # number of files per center to create

    for folder in center_folders:
        for idx in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as fh:
                fh.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            synfile = syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_map_table_defs = [
        {
            'name': 'name',
            'columnType': 'STRING',
            'maximumSize': 250
        },
        {
            'name': 'center',
            'columnType': 'STRING',
            'maximumSize': 50
        },
        {
            'name': 'inputSynId',
            'columnType': 'ENTITYID'
        },
        # {'name': 'stagingSynId',
        #  'columnType': 'ENTITYID'},
        {
            'name': 'release',
            'defaultValue': 'false',
            'columnType': 'BOOLEAN'
        }
        # {'id': '68438',
        #  'name': 'mutationInCisFilter',
        #  'defaultValue': 'true',
        #  'columnType': 'BOOLEAN',
        #  'concreteType': 'org.sagebionetworks.repo.model.table.ColumnModel'}
    ]

    center_map_cols = [
        synapseclient.Column(**col) for col in center_map_table_defs
    ]

    center_schema = synapseclient.Schema(name='Center Table',
                                         columns=center_map_cols,
                                         parent=project)
    center_schema = syn.store(center_schema)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(
        dict(name=center_names,
             center=center_abbreviations,
             inputSynId=center_folder_ids))

    tbl = synapseclient.Table(schema=center_schema, values=center_df)
    tbl = syn.store(tbl)

    # Create a table that stores the error logs for each submitted file.
    error_col_defs = [
        {
            'name': 'id',
            'columnType': 'ENTITYID'
        },
        {
            'name': 'center',
            'columnType': 'STRING',
            'maximumSize': 50,
            'facetType': 'enumeration'
        },
        {
            'name': 'errors',
            'columnType': 'LARGETEXT'
        },
        {
            'name': 'name',
            'columnType': 'STRING',
            'maximumSize': 500
        },
        # {'name': 'versionNumber',
        #  'columnType': 'STRING',
        #  'maximumSize': 50},
        {
            'name': 'fileType',
            'columnType': 'STRING',
            'maximumSize': 50
        }
    ]

    error_map_cols = [synapseclient.Column(**col) for col in error_col_defs]
    error_schema = synapseclient.Schema(name='Error Table',
                                        columns=error_map_cols,
                                        parent=project)
    error_schema = syn.store(error_schema)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table to
    # update or to read state from.

    db_map_col_defs = [{
        'name': 'Database',
        'columnType': 'STRING',
        'maximumSize': 50
    }, {
        'name': 'Id',
        'columnType': 'ENTITYID'
    }]

    db_map_cols = [synapseclient.Column(**col) for col in db_map_col_defs]
    db_map_schema = synapseclient.Schema(name='DB Mapping Table',
                                         columns=db_map_cols,
                                         parent=project)
    db_map_schema = syn.store(db_map_schema)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.tableId
    project = syn.store(project)
    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=[
            'centerMapping', 'validationStatus', 'errorTracker', 'dbMapping',
            'logs'
        ],
             Id=[
                 center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id
             ]))

    db_map_tbl = synapseclient.Table(schema=db_map_schema, values=dbmap_df)
    db_map_tbl = syn.store(db_map_tbl)

    # Make a top level folder for output. Processing for some file types
    # copies a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY',
    #      'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type in the format registry, create an output folder and a table.
    # Some GENIE file types copy a file to a new place, and some update a table. Having both
    # means that both of these operations will be available at the beginning.
    # The mapping between the file type and the folder or table has a consistent naming scheme:
    # the key ('Database' value) is {file_type}_folder or {file_type}_table.
    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(['example_registry'])

    for file_type, obj in format_registry.items():
        file_type_folder = synapseclient.Folder(name=file_type,
                                                parent=output_folder)
        file_type_folder = syn.store(file_type_folder)
        output_folder_map.append(
            dict(Database=f"{file_type}_folder", Id=file_type_folder.id))

        file_type_schema = synapseclient.Schema(name=file_type, parent=project)
        file_type_schema.annotations.primaryKey = default_primary_key
        file_type_schema = syn.store(file_type_schema)

        output_folder_map.append(
            dict(Database=f"{file_type}_table", Id=file_type_schema.id))

    # Add the folders and tables created to the mapping table.
    db_map_tbl = synapseclient.Table(
        schema=db_map_schema, values=pandas.DataFrame(output_folder_map))
    db_map_tbl = syn.store(db_map_tbl)
Code example #15
def test_create_and_update_file_view():

    ## Create a folder
    folder = Folder(str(uuid.uuid4()),
                    parent=project,
                    description='creating a file-view')
    folder = syn.store(folder)

    ## Create dummy file with annotations in our folder
    path = utils.make_bogus_data_file()
    file_annotations = dict(fileFormat='jpg',
                            dataType='image',
                            artist='Banksy',
                            medium='print',
                            title='Girl With Ballon')
    schedule_for_cleanup(path)
    a_file = File(path, parent=folder, annotations=file_annotations)
    a_file = syn.store(a_file)
    schedule_for_cleanup(a_file)

    # Add new columns for the annotations on this file and get their IDs
    my_added_cols = [
        syn.store(synapseclient.Column(name=k, columnType="STRING"))
        for k in file_annotations.keys()
    ]
    my_added_cols_ids = [c['id'] for c in my_added_cols]
    view_default_ids = [
        c['id'] for c in syn._get_default_entity_view_columns('file')
    ]
    col_ids = my_added_cols_ids + view_default_ids
    scopeIds = [folder['id'].lstrip('syn')]

    ## Create an empty entity view whose scope is the folder

    entity_view = EntityViewSchema(name=str(uuid.uuid4()),
                                   scopeIds=scopeIds,
                                   addDefaultViewColumns=True,
                                   addAnnotationColumns=False,
                                   type='file',
                                   columns=my_added_cols,
                                   parent=project)

    entity_view = syn.store(entity_view)
    schedule_for_cleanup(entity_view)

    assert_equals(set(scopeIds), set(entity_view.scopeIds))
    assert_equals(set(col_ids), set(entity_view.columnIds))
    assert_equals('file', entity_view.type)

    ## get the current view-schema
    view = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(view.filepath)

    view_dict = list(
        csv.DictReader(io.open(view.filepath, encoding="utf-8", newline='')))

    # check that all of the annotations were retrieved from the view
    assert set(file_annotations.keys()).issubset(set(view_dict[0].keys()))

    updated_a_file = syn.get(a_file.id, downloadFile=False)

    # Check that the values are the same as what was set
    # Both in the view and on the entity itself
    for k, v in file_annotations.items():
        assert_equals(view_dict[0][k], v)
        assert_equals(updated_a_file.annotations[k][0], v)

    # Make a change to the view and store
    view_dict[0]['fileFormat'] = 'PNG'

    with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as temp:
        schedule_for_cleanup(temp.name)
        temp_filename = temp.name

    with io.open(temp_filename, mode='w', encoding="utf-8",
                 newline='') as temp_file:
        dw = csv.DictWriter(temp_file,
                            fieldnames=view_dict[0].keys(),
                            quoting=csv.QUOTE_NONNUMERIC,
                            lineterminator=str(os.linesep))
        dw.writeheader()
        dw.writerows(view_dict)
        temp_file.flush()
    new_view = syn.store(synapseclient.Table(entity_view.id, temp_filename))
    new_view_dict = list(
        csv.DictReader(io.open(temp_filename, encoding="utf-8", newline='')))
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')

    #query for the change
    start_time = time.time()

    new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
    schedule_for_cleanup(new_view_results.filepath)
    new_view_dict = list(
        csv.DictReader(
            io.open(new_view_results.filepath, encoding="utf-8", newline='')))
    #query until change is seen.
    while new_view_dict[0]['fileFormat'] != 'PNG':
        #check timeout
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        #query again
        new_view_results = syn.tableQuery("select * from %s" % entity_view.id)
        new_view_dict = list(
            csv.DictReader(
                io.open(new_view_results.filepath,
                        encoding="utf-8",
                        newline='')))
    #paranoid check
    assert_equals(new_view_dict[0]['fileFormat'], 'PNG')
Code example #16
def create_cols(table_type, syn=None):
    if table_type == MC10_SENSOR_NAME:
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="sensor_location", columnType="STRING"),
                sc.Column(name="mc10_accelerometer", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_gyroscope", columnType="FILEHANDLEID"),
                sc.Column(name="mc10_emg", columnType="FILEHANDLEID")]
    elif table_type == SMARTWATCH_SENSOR_NAME:
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="smartwatch_accelerometer", columnType="FILEHANDLEID")]
    elif table_type == "diary":
        cols = list(syn.getTableColumns(DIARY))
        cols = [sc.Column(name="measurement_id", columnType="STRING"),
                sc.Column(name="subject_id", columnType="INTEGER"),
                sc.Column(name="timestamp", columnType="DATE"),
                sc.Column(name="activity_intensity", columnType="INTEGER"),
                sc.Column(name="dyskinesia", columnType="INTEGER"),
                sc.Column(name="on_off", columnType="INTEGER"),
                sc.Column(name="tremor", columnType="INTEGER"),
                sc.Column(name="activity_intensity_reported_timestamp", columnType="DATE"),
                sc.Column(name="dyskinesia_reported_timestamp", columnType="DATE"),
                sc.Column(name="on_off_reported_timestamp", columnType="DATE"),
                sc.Column(name="tremor_reported_timestamp", columnType="DATE")]
    else:
        raise TypeError("table_type must be one of [{}, {}, {}]".format(
            MC10_SENSOR_NAME, SMARTWATCH_SENSOR_NAME, "diary"))
    return cols
Code example #17
def test_migrate_project(request, syn, schedule_for_cleanup,
                         storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name,
                                path=file_0_path,
                                parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle[
        'storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name,
                                path=file_1_path,
                                parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name,
                                path=file_2_path,
                                parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name,
                                path=file_3_path,
                                parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(
        synapseclient.Schema(name=test_name,
                             columns=table_1_cols,
                             parent=folder_1_entity))
    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]

    table_1_entity = syn.store(
        synapseclient.RowSet(schema=table_1,
                             rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )

    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn,
                                                          db_path,
                                                          force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity),
                                    downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity),
                                    downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity),
                                    downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity),
                                    downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery("select file_col_1, file_col_2 from {}".format(
        utils.id_of(table_1_entity)))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id, table_1_id,
                objectType='TableEntity')['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) group by status",
            (_MigrationType.FILE.value,
             _MigrationType.TABLE_ATTACHED_FILE.value)).fetchall()

        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated
        # should be 3 migrated files entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of the lines
    # but the presence of the expected lines and the correct # of lines
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
Code example #18
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))

    folder_entity2 = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not immediately consistent,
    # but faster than chunked queries.
    time.sleep(2)
    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', folder_entity2.name, os.path.basename(f))
        for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(
        synapseclient.Schema(name='Foo Table',
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    time.sleep(3)  # get -q is eventually consistent
    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
Code example #19
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                   parent=project_entity))

    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                    parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)


    #Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    #function under test uses queries which are eventually consistent but not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks',
                 'get', '-r',
                 folder_entity.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)


    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ###       are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks',
                 'get', '-q', "select id from file where parentId=='%s'" %
                 folder_entity2.id)
    #Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    cols = []
    cols.append(synapseclient.Column(name='id', columnType='ENTITYID'))

    schema1 = syn.store(synapseclient.Schema(name='Foo Table', columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    print(data1)

    row_reference_set1 = syn.store(synapseclient.RowSet(columns=cols, schema=schema1,
                                   rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
Code example #20
def main():
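    # Note: this example targets Python 2 (print statement, dict.iteritems(),
    # and the Python 2 urllib API).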
    import argparse

    parser = argparse.ArgumentParser(
        description='Convert JSON to Synapse Table Schema')
    parser.add_argument('path', type=str, help='Path (or URL) to JSON file')
    parser.add_argument('--projectId',
                        type=str,
                        help='Synapse Project ID to store schema')
    parser.add_argument('-n',
                        '--dry_run',
                        action="store_true",
                        default=False,
                        help='Dry run')
    parser.add_argument('--synapseJSONSchema',
                        action="store_true",
                        default=False,
                        help="JSON is already in Synapse Table Schema format")
    args = parser.parse_args()

    syn = synapseclient.login(silent=True)

    project = syn.get(args.projectId)

    f = urllib.urlopen(path2url(args.path))
    data = json.load(f)

    url_path = urllib.splittype(args.path)[1]
    filename = os.path.split(url_path)[1]
    schema_name = os.path.splitext(filename)[0]

    if args.synapseJSONSchema:
        schema = synapseclient.Schema(name=schema_name, parent=project)
        schema.columns_to_store = data
    else:
        cols = []

        for k, v in data.iteritems():

            # Handle null values, assume that they will be strings
            if not v:
                column_type = "STRING"
            elif bool in map(type, v):
                column_type = "BOOLEAN"
            elif int in map(type, v):
                column_type = "INTEGER"
            elif float in map(type, v):
                column_type = "DOUBLE"
            else:
                column_type = "STRING"

            cols.append(
                synapseclient.Column(name=k,
                                     columnType=column_type,
                                     enumValues=v,
                                     maximumSize=250))

        schema = synapseclient.Schema(name=schema_name,
                                      columns=cols,
                                      parent=project)

    if args.dry_run:

        schema_as_list = map(dict, schema.columns_to_store)
        new_schema_as_list = []

        _key_order = [
            'name', 'description', 'columnType', 'maximumSize', 'enumValues'
        ]

        for col in schema_as_list:
            col['description'] = ""
            col['source'] = ""

            new_enum_values = []

            for v in col['enumValues']:

                new_value_ordered_dict = collections.OrderedDict()

                new_value_ordered_dict['value'] = v
                new_value_ordered_dict['description'] = ""
                new_value_ordered_dict['source'] = ""

                new_enum_values.append(new_value_ordered_dict)

            col['enumValues'] = new_enum_values

            new_ordered_dict = collections.OrderedDict()
            for k in _key_order:
                new_ordered_dict[k] = col[k]

            new_schema_as_list.append(new_ordered_dict)

        print json.dumps(new_schema_as_list, indent=2)
    else:
        schema = syn.store(schema)
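
In the default (non `--synapseJSONSchema`) mode, the script above assumes the JSON file parses to a dict mapping each column name to a list of candidate values, from which the column type is inferred. An illustrative input, shown as the parsed Python object:

# Parsed contents of an example JSON file (illustrative):
data = {
    "assay": ["rnaSeq", "wgs"],   # all strings -> STRING column with enumValues
    "is_tumor": [True, False],    # booleans    -> BOOLEAN column
    "age": [1, 2, 3],             # integers    -> INTEGER column
}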
Code example #21
def store_tables(syn, raw_data_curated, scores_curated, meds_curated,
                 sleep_curated, feedback_curated):
    # sensor measurements
    raw_data_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="device", columnType="STRING", maximumSize=10),
        sc.Column(name="participant_day", columnType="INTEGER"),
        sc.Column(name="timestamp_start", columnType="DOUBLE"),
        sc.Column(name="timestamp_end", columnType="DOUBLE"),
        sc.Column(name="source_file", columnType="ENTITYID"),
        sc.Column(name="data_file_handle_id", columnType="FILEHANDLEID")
    ]
    raw_data_schema = sc.Schema(name="Sensor Measurements",
                                columns=raw_data_cols,
                                parent=PROJECT)
    raw_data_table = sc.Table(raw_data_schema, raw_data_curated)
    syn.store(raw_data_table)
    # task scores
    scores_curated_table = sc.table.build_table("Task Scores", PROJECT,
                                                scores_curated)
    syn.store(scores_curated_table)
    # medication diary
    meds_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="timestamp", columnType="INTEGER"),
        sc.Column(name="pd_related_medications",
                  columnType="STRING",
                  maximumSize=120),
        sc.Column(name="other_medications",
                  columnType="STRING",
                  maximumSize=120)
    ]
    meds_curated_clean = clean_numeric_cols(meds_curated, ["timestamp"])
    meds_schema = sc.Schema(name="Medication Diary",
                            columns=meds_cols,
                            parent=PROJECT)
    meds_table = sc.Table(meds_schema, meds_curated_clean)
    syn.store(meds_table)
    # sleep diary
    sleep_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="sleep", columnType="INTEGER"),
        sc.Column(name="wake", columnType="INTEGER")
    ]
    sleep_curated_clean = clean_numeric_cols(sleep_curated, ["sleep", "wake"])
    sleep_schema = sc.Schema(name="Sleep Diary",
                             columns=sleep_cols,
                             parent=PROJECT)
    sleep_table = sc.Table(sleep_schema, sleep_curated_clean)
    syn.store(sleep_table)
    # feedback survey
    feedback_cols = [
        sc.Column(name="subject_id", columnType="STRING", maximumSize=6),
        sc.Column(name="charge_smartphone", columnType="INTEGER"),
        sc.Column(name="charge_pebble", columnType="INTEGER"),
        sc.Column(name="experience_watches", columnType="INTEGER"),
        sc.Column(name="experience_devices", columnType="INTEGER"),
        sc.Column(name="clearness_diary", columnType="INTEGER"),
        sc.Column(name="accuracy_diary", columnType="INTEGER"),
        sc.Column(name="additional_feedback_device_phone",
                  columnType="LARGETEXT"),
        sc.Column(name="additional_feedback_diary", columnType="LARGETEXT"),
        sc.Column(name="additional_feedback_experiment",
                  columnType="LARGETEXT")
    ]
    feedback_curated_clean = clean_numeric_cols(feedback_curated, [
        "charge_smartphone", "charge_pebble", "experience_watches",
        "experience_devices", "clearness_diary", "accuracy_diary"
    ])
    feedback_schema = sc.Schema(name="Feedback Survey",
                                columns=feedback_cols,
                                parent=PROJECT)
    feedback_table = sc.Table(feedback_schema, feedback_curated_clean)
    syn.store(feedback_table)