Example #1
def test_get_or_create_schema__call():
    """Makes sure correct parameters are called"""
    schema_name = str(uuid.uuid1())
    parentid = str(uuid.uuid1())
    schema_ent = synapseclient.Schema(name=schema_name,
                                      parentId=parentid)
    returned = synapseclient.Schema(name=schema_name,
                                    id=str(uuid.uuid1()),
                                    parentId=parentid)
    with patch.object(CREATE_CLS,
                      "_find_by_obj_or_create",
                      return_value=returned) as patch_find_or_create:
        new_schema = CREATE_CLS.get_or_create_schema(name=schema_name,
                                                     parentId=parentid)
        assert new_schema == returned
        patch_find_or_create.assert_called_once_with(schema_ent)
def test_schema_change(syn, tables, new_project, sample_table):
    source_table = tables["schema"][0]["id"]
    target_table_cols = deepcopy(tables["columns"][0])
    added_col = target_table_cols.pop(2)
    renamed_original_name = target_table_cols[2]["name"]
    target_table_cols[2]["name"] = "renamed_col"
    target_table_cols[3]["maximumSize"] = 100
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=target_table_cols,
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    incomplete_table = incomplete_table.drop(added_col["name"], axis=1)
    incomplete_table = incomplete_table.rename(
        {renamed_original_name: "renamed_col"}, axis=1)
    table = syn.store(sc.Table(schema, incomplete_table))
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   update=False)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    comparison_table = sample_table.drop("raw_data",
                                         axis=1).reset_index(drop=True)
    updated_table_no_fh = updated_table_no_fh[comparison_table.columns]
    print(updated_table_no_fh)
    print(comparison_table)
    pd.testing.assert_frame_equal(updated_table_no_fh, comparison_table)
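
# A minimal call-shape sketch (not part of the original tests): export_tables
# copies rows from each source Synapse table to its mapped target table. The IDs
# below are hypothetical placeholders; in the tests above, update=False leaves the
# target matching the full source, while update=True appends only the new rows.
def _export_tables_sketch(syn):
    return export_tables(syn,
                         table_mapping={"syn11111111": "syn22222222"},
                         update=False)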
def update_validated_submissions_table(syn, project_id, valid_df):
    """
    Push the latest version of the combined validated submissions 
    table to Synapse.
    """
    try:
        print("Searching for existing 'ValidatedSubmissions' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'ValidatedSubmissions'][0]['id']
        schema = syn.get(schema_id)
        validated_subs_table = syn.tableQuery('select * from {}'.format(schema_id))
        if validated_subs_table.asDataFrame().shape[0] == valid_df.shape[0]:
            print("No new valid submissions since last update.")
        validated_subs_table.schema = schema
        print("Updating 'ValidatedSubmissions' table...")
        update_table = synapseclient.Table(schema, valid_df)
        validated_subs_table = _update_syn_table(validated_subs_table, update_table, 'objectId')
    except IndexError:
        print("Creating 'ValidatedSubmissions' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(valid_df)
        schema = synapseclient.Schema(name='ValidatedSubmissions', columns=cols, parent=project)
        validated_subs_table = synapseclient.Table(schema, valid_df)
    print("Storing 'ValidatedSubmissions' table...")
    validated_subs_table = syn.store(validated_subs_table)
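
# A minimal usage sketch (not from the original source). The project ID is a
# hypothetical placeholder, pandas is assumed to be imported as pd, and valid_df
# must contain the 'objectId' column used above as the merge key.
def _update_validated_submissions_sketch(syn):
    valid_df = pd.DataFrame({"objectId": [9603664, 9603665],
                             "submitterId": ["alice", "bob"]})
    update_validated_submissions_table(syn, "syn11111111", valid_df)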
Example #4
def createMafDatabase(syn,
                      databaseToSynIdMappingDf,
                      testing=False,
                      staging=False):
    mafDatabaseSynId = process_functions.getDatabaseSynId(
        syn, "vcf2maf", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    mafDatabaseEnt = syn.get(mafDatabaseSynId)
    mafCols = list(syn.getTableColumns(mafDatabaseSynId))
    schema = synapseclient.Schema(
        name='Narrow MAF %s Database' % time.time(),
        columns=mafCols,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=databaseToSynIdMappingDf))
    schema.primaryKey = mafDatabaseEnt.primaryKey
    newMafDb = syn.store(schema)
    #Store in the new database synid
    databaseToSynIdMappingDf['Id'][0] = newMafDb.id
    syn.store(
        synapseclient.Table(
            process_functions.getDatabaseSynId(syn, "dbMapping", test=testing),
            databaseToSynIdMappingDf))
    if not staging and not testing:
        #Make sure to store the newly created maf db synid into the staging synapse mapping
        databaseToSynIdMapping = syn.tableQuery(
            "SELECT * FROM syn12094210 where Database = 'vcf2maf'")
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
        databaseToSynIdMappingDf['Id'][0] = newMafDb.id
        syn.store(synapseclient.Table("syn12094210", databaseToSynIdMappingDf))
    #Move and archive old mafdatabase
    mafDatabaseEnt.parentId = "syn7208886"
    mafDatabaseEnt.name = "ARCHIVED " + mafDatabaseEnt.name
    syn.store(mafDatabaseEnt)
    mafDatabaseSynId = newMafDb.id
    #Remove can download permissions from project GENIE team
    syn.setPermissions(mafDatabaseSynId, 3326313, [])
def test_manually_pass_source_tables_dict(syn, tables, new_project,
                                          sample_table):
    source_table = tables["schema"][0]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    rest_of_the_table = deepcopy(sample_table.iloc[len(sample_table) // 2:])
    table = syn.store(sc.Table(schema, incomplete_table))
    source_tables = {source_table: rest_of_the_table}
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   source_tables=source_tables,
                                   update=True)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    update = exported_table[source_table][1]
    correct_table_no_fh = incomplete_table.append(update,
                                                  ignore_index=True,
                                                  sort=False)
    correct_table_no_fh = correct_table_no_fh.drop(
        "raw_data", axis=1).reset_index(drop=True)
    print("returned results \n", updated_table_no_fh)
    print("correct result \n", correct_table_no_fh)
    pd.testing.assert_frame_equal(updated_table_no_fh, correct_table_no_fh)
Example #6
def getOrCreateSchema(syn, parent, name, columns):
    """Get an existing table schema by name and parent or create a new one."""

    schema = synapseclient.Schema(name=name, parent=parent, columns=columns)

    schema = findByNameOrCreate(syn, schema)

    return schema
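
# A minimal usage sketch (not from the original source). The parent project ID is
# a hypothetical placeholder; the intent is that repeated calls with the same name
# and parent return the existing table schema instead of creating a duplicate
# (assuming findByNameOrCreate behaves as its name suggests).
def _get_or_create_schema_sketch(syn):
    cols = [synapseclient.Column(name="recordId", columnType="INTEGER"),
            synapseclient.Column(name="label", columnType="STRING", maximumSize=50)]
    return getOrCreateSchema(syn, parent="syn11111111", name="Demo Table", columns=cols)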
Example #7
def get_or_create_schema(parent, name, columns):
    """Get an existing table schema by name and parent or create a new one."""

    schema = synapseclient.Schema(name=name, parent=parent, columns=columns)

    schema = find_by_name_or_create(schema)

    return schema
Example #8
def table(syn, parent, obj):
    df = read(obj)
    cols = synapseclient.as_table_columns(df)
    schema = synapseclient.Schema(name=str(uuid.uuid4()),
                                  columns=cols,
                                  parent=parent)
    schema = syn.store(schema)
    table = syn.store(synapseclient.Table(schema, df))
    return schema
def test_export_multiple_tables_to_preexisting_update(syn, new_project, tables,
                                                      sample_table):
    source_table = tables["schema"][0]["id"]
    source_table_2 = tables["schema"][1]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    table = syn.store(sc.Table(schema, incomplete_table))
    schema_2 = sc.Schema(name=tables["schema"][1]["name"],
                         columns=tables["columns"][1],
                         parent=new_project["id"])
    incomplete_table_2 = deepcopy(sample_table.iloc[:len(sample_table) // 3])
    table_2 = syn.store(sc.Table(schema_2, incomplete_table_2))
    exported_table = export_tables(syn,
                                   table_mapping={
                                       source_table: table.tableId,
                                       source_table_2: table_2.tableId
                                   },
                                   update=True)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    update = exported_table[source_table][1]
    correct_table_no_fh = incomplete_table.append(update,
                                                  ignore_index=True,
                                                  sort=False)
    correct_table_no_fh = correct_table_no_fh.drop(
        "raw_data", axis=1).reset_index(drop=True)
    updated_table_2 = syn.tableQuery("select * from {}".format(
        table_2.tableId))
    updated_table_2 = updated_table_2.asDataFrame().reset_index(drop=True)
    updated_table_2_no_fh = updated_table_2.drop("raw_data", axis=1)
    update_2 = exported_table[source_table_2][1]
    correct_table_no_fh_2 = incomplete_table_2.append(update_2,
                                                      ignore_index=True,
                                                      sort=False)
    correct_table_no_fh_2 = correct_table_no_fh_2.drop(
        "raw_data", axis=1).reset_index(drop=True)
    print("returned results \n", updated_table_no_fh)
    print("correct result \n", correct_table_no_fh)
    assert (updated_table_no_fh.equals(correct_table_no_fh)
            and updated_table_2_no_fh.equals(correct_table_no_fh_2))
def table_schema(project_obj):
    cols = [synapseclient.Column(name="recordId", columnType="INTEGER"),
            synapseclient.Column(name="externalId", columnType="STRING"),
            synapseclient.Column(name="substudyMemberships", columnType="STRING"),
            synapseclient.Column(name="bool_property", columnType="BOOLEAN"),
            synapseclient.Column(name="str_property", columnType="STRING"),
            synapseclient.Column(name="raw_data", columnType="FILEHANDLEID")]
    schema = synapseclient.Schema(name = str(uuid.uuid4()),
                                  columns = cols,
                                  parent = project_obj["id"])
    return schema
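
# A minimal sketch (not from the original source) of how a fixture like the one
# above is typically consumed: store the schema, then store a Table built from a
# DataFrame whose columns match it. Values are hypothetical and pandas is assumed
# to be imported as pd; the FILEHANDLEID column expects an existing file handle ID.
def _store_sample_table_sketch(syn, project_obj, file_handle_id):
    schema = syn.store(table_schema(project_obj))
    df = pd.DataFrame({"recordId": [1],
                       "externalId": ["ext-001"],
                       "substudyMemberships": ["studyA"],
                       "bool_property": [True],
                       "str_property": ["foo"],
                       "raw_data": [file_handle_id]})
    return syn.store(synapseclient.Table(schema, df))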
Example #11
def test_table_query():
    """Test command line ability to do table query.

    """

    cols = []
    cols.append(
        synapseclient.Column(name='name',
                             columnType='STRING',
                             maximumSize=1000))
    cols.append(
        synapseclient.Column(name='foo',
                             columnType='STRING',
                             enumValues=['foo', 'bar', 'bat']))
    cols.append(synapseclient.Column(name='x', columnType='DOUBLE'))
    cols.append(synapseclient.Column(name='age', columnType='INTEGER'))
    cols.append(synapseclient.Column(name='cartoon', columnType='BOOLEAN'))

    project_entity = project

    schema1 = syn.store(
        synapseclient.Schema(name=str(uuid.uuid4()),
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [['Chris', 'bar', 11.23, 45, False],
             ['Jen', 'bat', 14.56, 40, False],
             ['Jane', 'bat', 17.89, 6, False],
             ['Henry', 'bar', 10.12, 1, False]]

    row_reference_set1 = syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    # Test query
    output = run('synapse', '--skip-checks', 'query',
                 'select * from %s' % schema1.id)

    output_rows = output.rstrip("\n").split("\n")

    # Check the length of the output
    assert len(output_rows) == 5, "got %s rows" % (len(output_rows), )

    # Check that headers are correct.
    # Should be column names in schema plus the ROW_ID and ROW_VERSION
    my_headers_set = output_rows[0].split("\t")
    expected_headers_set = ["ROW_ID", "ROW_VERSION"] + list(
        map(lambda x: x.name, cols))
    assert my_headers_set == expected_headers_set, "%r != %r" % (
        my_headers_set, expected_headers_set)
def test__create_schema():
    """Tests calling of create schema"""
    table_name = str(uuid.uuid1())
    parentid = str(uuid.uuid1())
    columns = [str(uuid.uuid1())]
    annotations = {"foo": "bar"}

    schema = synapseclient.Schema(table_name, columns=columns,
                                  parent=parentid, annotations=annotations)
    with patch.object(syn, "store",
                      return_value=schema) as patch_syn_store:

        new_schema = process_functions._create_schema(syn, table_name, parentid,
                                                      columns=columns,
                                                      annotations=annotations)
        patch_syn_store.assert_called_once_with(schema)
        assert new_schema == schema
Example #13
def _create_table(syn: Synapse, name: str, col_config: List[dict],
                  parent: str) -> Schema:
    """Create Synapse Table

    Args:
        syn: Synapse connection
        name: Table name
        col_config: Column dict configuration
        parent: Synapse id of project

    Returns:
        Stored Synapse Table

    """
    cols = [synapseclient.Column(**col) for col in col_config]
    schema = synapseclient.Schema(name=name, columns=cols, parent=parent)
    schema = syn.store(schema)
    return schema
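
# A minimal usage sketch (not from the original source). col_config is a list of
# dicts whose keys are synapseclient.Column constructor arguments; the parent
# project ID below is a hypothetical placeholder.
def _create_table_sketch(syn):
    col_config = [{"name": "id", "columnType": "ENTITYID"},
                  {"name": "status", "columnType": "STRING", "maximumSize": 50}]
    return _create_table(syn, name="Demo Table", col_config=col_config,
                         parent="syn11111111")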
def _store_dataframe_to_table(syn,
                              df,
                              df_cols,
                              table_id=None,
                              parent_id=None,
                              table_name=None,
                              **kwargs):
    """Store a pandas DataFrame to Synapse in a safe way by formatting the
    the values so that the store operation is not rejected by Synapse.

    Parameters
    ----------
    syn : synapseclient.Synapse
    df : pandas.DataFrame
    df_cols : list of synapseclient.Column objects
    table_id : str, default None
        Synapse ID of a preexisting Synapse Table to store `df` to.
        Either `table_id` or both `parent_id` and `table_name` must
        be supplied as arguments.
    parent_id : str, default None
        Synapse ID of the project to store `df` to as a table.
        Either `table_id` or both `parent_id` and `table_name` must
        be supplied as arguments.
    table_name : str, default None
        Either `table_id` or both `parent_id` and `table_name` must
        be supplied as arguments.
    **kwargs :
        Keyword arguments to provide to syn.store (useful for provenance)
    """
    if table_id is None and parent_id is None and table_name is None:
        raise TypeError("Either the table Synapse ID must be set or "
                        "the parent ID and table name must be set.")
    sanitized_dataframe = _sanitize_dataframe(syn, records=df, cols=df_cols)
    if table_id is None:
        target_table_schema = sc.Schema(name=table_name,
                                        parent=parent_id,
                                        columns=df_cols)
        target_table = sc.Table(schema=target_table_schema,
                                values=sanitized_dataframe,
                                headers=df_cols)
    else:
        target_table = sc.Table(table_id, sanitized_dataframe, headers=df_cols)
    target_table = syn.store(target_table, **kwargs)
    return target_table
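
# A minimal usage sketch (not from the original source) of the two supported call
# patterns: create a new table under a parent project, or store to a preexisting
# table by ID. The Synapse IDs are hypothetical placeholders.
def _store_dataframe_sketch(syn, df):
    df_cols = sc.as_table_columns(df)
    # Create a brand-new table under a project:
    _store_dataframe_to_table(syn, df, df_cols,
                              parent_id="syn11111111", table_name="Demo Table")
    # Or append to an existing table:
    _store_dataframe_to_table(syn, df, df_cols, table_id="syn22222222")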
def _create_schema(syn, table_name, parentid, columns=None, annotations=None):
    """Creates Table Schema

    Args:
        syn: Synapse object
        table_name: Name of table
        parentid: Project synapse id
        columns: Columns of Table
        annotations: Dictionary of annotations to add

    Returns:
        Schema
    """
    schema = synapseclient.Schema(name=table_name,
                                  columns=columns,
                                  parent=parentid,
                                  annotations=annotations)
    new_schema = syn.store(schema)
    return new_schema
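
# A minimal usage sketch (not from the original source); the parent project ID is
# a hypothetical placeholder. Annotations passed here end up on the schema entity.
def _create_schema_sketch(syn):
    cols = [synapseclient.Column(name="PRIMARY_KEY", columnType="STRING")]
    return _create_schema(syn, table_name="Demo Table", parentid="syn11111111",
                          columns=cols, annotations={"primaryKey": "PRIMARY_KEY"})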
def test_export_one_table_to_preexisting_no_update(syn, new_project, tables,
                                                   sample_table):
    source_table = tables["schema"][0]["id"]
    schema = sc.Schema(name=tables["schema"][0]["name"],
                       columns=tables["columns"][0],
                       parent=new_project["id"])
    incomplete_table = deepcopy(sample_table.iloc[:len(sample_table) // 2])
    table = syn.store(sc.Table(schema, incomplete_table))
    exported_table = export_tables(syn,
                                   table_mapping={source_table: table.tableId},
                                   update=False)
    updated_table = syn.tableQuery("select * from {}".format(table.tableId))
    updated_table = updated_table.asDataFrame().reset_index(drop=True)
    updated_table_no_fh = updated_table.drop("raw_data", axis=1)
    comparison_table = sample_table.drop("raw_data",
                                         axis=1).reset_index(drop=True)
    print(updated_table_no_fh)
    print(comparison_table)
    pd.testing.assert_frame_equal(updated_table_no_fh, comparison_table)
Example #17
def create_and_archive_maf_database(syn, database_synid_mappingdf):
    '''
    Creates new MAF database and archives the old database in the staging site

    Args:
        syn: Synapse object
        databaseToSynIdMappingDf: Database to synapse id mapping dataframe

    Return:
        Editted database to synapse id mapping dataframe
    '''
    maf_database_synid = process_functions.getDatabaseSynId(
        syn,
        "vcf2maf",
        project_id=None,
        databaseToSynIdMappingDf=database_synid_mappingdf)
    maf_database_ent = syn.get(maf_database_synid)
    maf_columns = list(syn.getTableColumns(maf_database_synid))
    schema = synapseclient.Schema(
        name='Narrow MAF {current_time} Database'.format(
            current_time=time.time()),
        columns=maf_columns,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=database_synid_mappingdf))
    schema.primaryKey = maf_database_ent.primaryKey
    new_maf_database = syn.store(schema)
    # Store in the new database synid
    database_synid_mappingdf['Id'][database_synid_mappingdf['Database'] ==
                                   'vcf2maf'] = new_maf_database.id

    vcf2maf_mappingdf = database_synid_mappingdf[
        database_synid_mappingdf['Database'] == 'vcf2maf']
    # vcf2maf_mappingdf['Id'][0] = newMafDb.id
    syn.store(synapseclient.Table("syn10967259", vcf2maf_mappingdf))
    # Move and archive old mafdatabase (This is the staging synid)
    maf_database_ent.parentId = "syn7208886"
    maf_database_ent.name = "ARCHIVED " + maf_database_ent.name
    syn.store(maf_database_ent)
    # maf_database_synid = new_maf_database.id
    # Remove can download permissions from project GENIE team
    syn.setPermissions(new_maf_database.id, 3326313, [])
    return (database_synid_mappingdf)
def update_team_stats_table(syn, project_id, team_stats_df):
    """
    Push the latest version of the team stats table to Synapse.
    """
    try:
        print("Searching for existing 'TeamStats' table...")
        schema_id = [t for t in syn.getChildren(project_id, includeTypes=['table'])
                     if t['name'] == 'TeamStats'][0]['id']
        schema = syn.get(schema_id)
        team_stats_table = syn.tableQuery('select * from {}'.format(schema_id))
        team_stats_table.schema = schema
        print("Updating 'TeamStats' table...")
        update_table = synapseclient.Table(schema, team_stats_df)
        team_stats_table = _update_syn_table(team_stats_table, update_table, 'team')
    except IndexError:
        print("Creating 'TeamStats' table...")
        project = syn.get(project_id)
        cols = synapseclient.as_table_columns(team_stats_df)
        schema = synapseclient.Schema(name='TeamStats', columns=cols, parent=project)
        team_stats_table = synapseclient.Table(schema, team_stats_df)
    print("Storing 'TeamStats' table...")
    team_stats_table = syn.store(team_stats_table)
Example #19
SYN = create_autospec(synapseclient.Synapse)
SET_PERMS = {"set"}


@pytest.mark.parametrize(
    "entity,principalid,permission_level,mapped",
    [
        # tuples of (entity, principalid, permission_level, expected permissions mapping)
        (synapseclient.Project(), None, "view",
         permissions.ENTITY_PERMS_MAPPINGS['view']),
        (synapseclient.Folder(parentId="syn123"), None, "download",
         permissions.ENTITY_PERMS_MAPPINGS['download']),
        (synapseclient.Entity(), None, "edit",
         permissions.ENTITY_PERMS_MAPPINGS['edit']),
        (synapseclient.Schema(parentId="syn123"), None, "edit_and_delete",
         permissions.ENTITY_PERMS_MAPPINGS['edit_and_delete']),
        (synapseclient.File(parentId="syn123"), None, "admin",
         permissions.ENTITY_PERMS_MAPPINGS['admin']),
        (synapseclient.Entity(), None, "remove",
         permissions.ENTITY_PERMS_MAPPINGS['remove']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "view",
         permissions.EVALUATION_PERMS_MAPPINGS['view']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "submit",
         permissions.EVALUATION_PERMS_MAPPINGS['submit']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "score",
         permissions.EVALUATION_PERMS_MAPPINGS['score']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "admin",
         permissions.EVALUATION_PERMS_MAPPINGS['admin']),
        (synapseclient.Evaluation(contentSource="syn123"), None, "remove",
         permissions.EVALUATION_PERMS_MAPPINGS['remove'])
Example #20
def main(syn):

    # Basic setup of the project
    project_name = "Testing Synapse Genie"

    # Determine the short and long names of the centers.
    center_abbreviations = ['AAA', 'BBB', 'CCC']
    center_names = center_abbreviations

    # Create the project
    project = synapseclient.Project(project_name)
    project = syn.store(project)

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [
        synapseclient.Folder(name=name, parent=root_center_folder)
        for name in center_abbreviations
    ]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.

    n_files = 5  # number of files per center to create

    for folder in center_folders:
        for idx in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as fh:
                fh.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            synfile = syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_map_table_defs = [
        {
            'name': 'name',
            'columnType': 'STRING',
            'maximumSize': 250
        },
        {
            'name': 'center',
            'columnType': 'STRING',
            'maximumSize': 50
        },
        {
            'name': 'inputSynId',
            'columnType': 'ENTITYID'
        },
        # {'name': 'stagingSynId',
        #  'columnType': 'ENTITYID'},
        {
            'name': 'release',
            'defaultValue': 'false',
            'columnType': 'BOOLEAN'
        }
        # {'id': '68438',
        #  'name': 'mutationInCisFilter',
        #  'defaultValue': 'true',
        #  'columnType': 'BOOLEAN',
        #  'concreteType': 'org.sagebionetworks.repo.model.table.ColumnModel'}
    ]

    center_map_cols = [
        synapseclient.Column(**col) for col in center_map_table_defs
    ]

    center_schema = synapseclient.Schema(name='Center Table',
                                         columns=center_map_cols,
                                         parent=project)
    center_schema = syn.store(center_schema)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(
        dict(name=center_names,
             center=center_abbreviations,
             inputSynId=center_folder_ids))

    tbl = synapseclient.Table(schema=center_schema, values=center_df)
    tbl = syn.store(tbl)

    # Create a table that stores the error logs for each submitted file.
    error_col_defs = [
        {
            'name': 'id',
            'columnType': 'ENTITYID'
        },
        {
            'name': 'center',
            'columnType': 'STRING',
            'maximumSize': 50,
            'facetType': 'enumeration'
        },
        {
            'name': 'errors',
            'columnType': 'LARGETEXT'
        },
        {
            'name': 'name',
            'columnType': 'STRING',
            'maximumSize': 500
        },
        # {'name': 'versionNumber',
        #  'columnType': 'STRING',
        #  'maximumSize': 50},
        {
            'name': 'fileType',
            'columnType': 'STRING',
            'maximumSize': 50
        }
    ]

    error_map_cols = [synapseclient.Column(**col) for col in error_col_defs]
    error_schema = synapseclient.Schema(name='Error Table',
                                        columns=error_map_cols,
                                        parent=project)
    error_schema = syn.store(error_schema)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table to update
    # or get the state of something from.

    db_map_col_defs = [{
        'name': 'Database',
        'columnType': 'STRING',
        'maximumSize': 50
    }, {
        'name': 'Id',
        'columnType': 'ENTITYID'
    }]

    db_map_cols = [synapseclient.Column(**col) for col in db_map_col_defs]
    db_map_schema = synapseclient.Schema(name='DB Mapping Table',
                                         columns=db_map_cols,
                                         parent=project)
    db_map_schema = syn.store(db_map_schema)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.tableId
    project = syn.store(project)
    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=[
            'centerMapping', 'validationStatus', 'errorTracker', 'dbMapping',
            'logs'
        ],
             Id=[
                 center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id
             ]))

    db_map_tbl = synapseclient.Table(schema=db_map_schema, values=dbmap_df)
    db_map_tbl = syn.store(db_map_tbl)

    # Make a top level folder for output. Some processing for
    # file types copy a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY',
    #      'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type format in the format registry, create an output folder and a table.
    # Some GENIE file types copy a file to a new place, and some update a table. Having both
    # means that both of these operations will be available at the beginning.
    # The mapping between the file type and the folder or table have a consistent naming.
    # The key ('Database' value) is {file_type}_folder or {file_type}_table.
    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(['example_registry'])

    for file_type, obj in format_registry.items():
        file_type_folder = synapseclient.Folder(name=file_type,
                                                parent=output_folder)
        file_type_folder = syn.store(file_type_folder)
        output_folder_map.append(
            dict(Database=f"{file_type}_folder", Id=file_type_folder.id))

        file_type_schema = synapseclient.Schema(name=file_type, parent=project)
        file_type_schema.annotations.primaryKey = default_primary_key
        file_type_schema = syn.store(file_type_schema)

        output_folder_map.append(
            dict(Database=f"{file_type}_table", Id=file_type_schema.id))

    # Add the folders and tables created to the mapping table.
    db_map_tbl = synapseclient.Table(
        schema=db_map_schema, values=pandas.DataFrame(output_folder_map))
    db_map_tbl = syn.store(db_map_tbl)
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                   parent=project_entity))

    folder_entity2 = syn.store(synapseclient.Folder(name=str(uuid.uuid4()),
                                                    parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f  = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)


    #Add a file in the Folder as well
    f  = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    #function under test uses queries which are eventually consistent but not immediately after creating the entities
    start_time = time.time()
    while syn.query("select id from entity where id=='%s'" % file_entity.id).get('totalNumberOfResults') <= 0:
        assert_less(time.time() - start_time, QUERY_TIMEOUT_SEC)
        time.sleep(2)

    ### Test recursive get
    output = run('synapse', '--skip-checks',
                 'get', '-r',
                 folder_entity.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', folder_entity2.name, os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)


    ### Test query get
    ### Note: We're not querying on annotations because tests can fail if there
    ###       are lots of jobs queued as happens when staging is syncing
    output = run('synapse', '--skip-checks',
                 'get', '-q', "select id from file where parentId=='%s'" %
                 folder_entity2.id)
    #Verify that we downloaded files from folder_entity2
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    for downloaded, uploaded in zip(new_paths, uploaded_paths[:-1]):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])

    ### Test query get using a Table with an entity column
    ### This should be replaced when Table File Views are implemented in the client
    cols = []
    cols.append(synapseclient.Column(name='id', columnType='ENTITYID'))

    schema1 = syn.store(synapseclient.Schema(name='Foo Table', columns=cols, parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 =[[x.id] for x in file_entities]

    print(data1)

    row_reference_set1 = syn.store(synapseclient.RowSet(columns=cols, schema=schema1,
                                   rows=[synapseclient.Row(r) for r in data1]))

    ### Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    #Verify that we downloaded files:
    new_paths = [os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        print(uploaded, downloaded)
        assert os.path.exists(downloaded)
        assert filecmp.cmp(downloaded, uploaded)
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
Example #22
def store_dataframe_to_synapse(syn, df, parent, name, cols):
    df = df[[c['name'] for c in cols]]
    schema = sc.Schema(name = name, columns = cols, parent = parent)
    table = sc.Table(schema, df)
    table = syn.store(table)
    return table
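
# A minimal usage sketch (not from the original source). Note that the function
# above reorders the DataFrame columns to match the 'name' keys in cols, so the
# two must agree. IDs and values are hypothetical; pandas is assumed as pd.
def _store_dataframe_to_synapse_sketch(syn):
    cols = [sc.Column(name="participant", columnType="STRING", maximumSize=50),
            sc.Column(name="score", columnType="DOUBLE")]
    df = pd.DataFrame({"score": [1.5, 2.0], "participant": ["A", "B"]})
    return store_dataframe_to_synapse(syn, df, parent="syn11111111",
                                      name="Demo Table", cols=cols)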
Example #23
def test_migrate_project(request, syn, schedule_for_cleanup,
                         storage_location_id):
    test_name = request.node.name
    project_name = "{}-{}".format(test_name, uuid.uuid4())
    project = synapseclient.Project(name=project_name)
    project_entity = syn.store(project)

    file_0_path = _create_temp_file()
    schedule_for_cleanup(file_0_path)
    file_0_name = "{}-{}".format(test_name, 1)
    file_0 = synapseclient.File(name=file_0_name,
                                path=file_0_path,
                                parent=project_entity)
    file_0_entity = syn.store(file_0)
    default_storage_location_id = file_0_entity._file_handle[
        'storageLocationId']

    folder_1_name = "{}-{}-{}".format(test_name, 1, uuid.uuid4())
    folder_1 = synapseclient.Folder(parent=project_entity, name=folder_1_name)
    folder_1_entity = syn.store(folder_1)

    file_1_path = _create_temp_file()
    schedule_for_cleanup(file_1_path)
    file_1_name = "{}-{}".format(test_name, 1)
    file_1 = synapseclient.File(name=file_1_name,
                                path=file_1_path,
                                parent=folder_1_entity)
    file_1_entity = syn.store(file_1)

    file_2_path = _create_temp_file()
    schedule_for_cleanup(file_2_path)
    file_2_name = "{}-{}".format(test_name, 2)
    file_2 = synapseclient.File(name=file_2_name,
                                path=file_2_path,
                                parent=folder_1_entity)
    file_2_entity = syn.store(file_2)

    # file 3 shares the same file handle id as file 1
    file_3_path = file_1_path
    file_3_name = "{}-{}".format(test_name, 3)
    file_3 = synapseclient.File(name=file_3_name,
                                path=file_3_path,
                                parent=folder_1_entity)
    file_3.dataFileHandleId = file_1_entity.dataFileHandleId
    file_3_entity = syn.store(file_3)

    table_1_cols = [
        synapseclient.Column(name='file_col_1', columnType='FILEHANDLEID'),
        synapseclient.Column(name='num', columnType='INTEGER'),
        synapseclient.Column(name='file_col_2', columnType='FILEHANDLEID'),
    ]
    table_1 = syn.store(
        synapseclient.Schema(name=test_name,
                             columns=table_1_cols,
                             parent=folder_1_entity))
    table_1_file_col_1_1 = _create_temp_file()
    table_1_file_handle_1 = syn.uploadFileHandle(table_1_file_col_1_1, table_1)
    table_1_file_col_1_2 = _create_temp_file()
    table_1_file_handle_2 = syn.uploadFileHandle(table_1_file_col_1_2, table_1)
    table_1_file_col_2_1 = _create_temp_file()
    table_1_file_handle_3 = syn.uploadFileHandle(table_1_file_col_2_1, table_1)
    table_1_file_col_2_2 = _create_temp_file()
    table_1_file_handle_4 = syn.uploadFileHandle(table_1_file_col_2_2, table_1)

    data = [
        [table_1_file_handle_1['id'], 1, table_1_file_handle_2['id']],
        [table_1_file_handle_3['id'], 2, table_1_file_handle_4['id']],
    ]

    table_1_entity = syn.store(
        synapseclient.RowSet(schema=table_1,
                             rows=[synapseclient.Row(r) for r in data]))

    db_path = tempfile.NamedTemporaryFile(delete=False).name
    schedule_for_cleanup(db_path)

    index_result = synapseutils.index_files_for_migration(
        syn,
        project_entity,
        storage_location_id,
        db_path,
        file_version_strategy='new',
        include_table_files=True,
    )

    counts_by_status = index_result.get_counts_by_status()
    assert counts_by_status['INDEXED'] == 8
    assert counts_by_status['ERRORED'] == 0

    migration_result = synapseutils.migrate_indexed_files(syn,
                                                          db_path,
                                                          force=True)

    file_0_entity_updated = syn.get(utils.id_of(file_0_entity),
                                    downloadFile=False)
    file_1_entity_updated = syn.get(utils.id_of(file_1_entity),
                                    downloadFile=False)
    file_2_entity_updated = syn.get(utils.id_of(file_2_entity),
                                    downloadFile=False)
    file_3_entity_updated = syn.get(utils.id_of(file_3_entity),
                                    downloadFile=False)
    file_handles = [
        f['_file_handle'] for f in (
            file_0_entity_updated,
            file_1_entity_updated,
            file_2_entity_updated,
            file_3_entity_updated,
        )
    ]

    table_1_id = utils.id_of(table_1_entity)
    results = syn.tableQuery("select file_col_1, file_col_2 from {}".format(
        utils.id_of(table_1_entity)))
    table_file_handles = []
    for row in results:
        for file_handle_id in row[2:]:
            file_handle = syn._getFileHandleDownload(
                file_handle_id, table_1_id,
                objectType='TableEntity')['fileHandle']
            table_file_handles.append(file_handle)
    file_handles.extend(table_file_handles)

    _assert_storage_location(file_handles, storage_location_id)
    assert storage_location_id != default_storage_location_id

    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        query_result = cursor.execute(
            "select status, count(*) from migrations where type in (?, ?) group by status",
            (_MigrationType.FILE.value,
             _MigrationType.TABLE_ATTACHED_FILE.value)).fetchall()

        counts = {r[0]: r[1] for r in query_result}

        # should only be one status and they should all be migrated
        # should be 3 migrated files entities + 4 migrated table attached files
        assert len(counts) == 1
        assert counts[_MigrationStatus.MIGRATED.value] == 8

    csv_file = tempfile.NamedTemporaryFile(delete=False)
    schedule_for_cleanup(csv_file.name)
    migration_result.as_csv(csv_file.name)
    with open(csv_file.name, 'r') as csv_file_in:
        csv_contents = csv_file_in.read()

    table_1_id = table_1_entity['tableId']

    # assert the content of the csv. we don't assert any particular order of the lines
    # but the presence of the expected lines and the correct # of lines
    csv_lines = csv_contents.split('\n')
    assert "id,type,version,row_id,col_name,from_storage_location_id,from_file_handle_id,to_file_handle_id,status,exception" in csv_lines  # noqa
    assert f"{file_0_entity.id},file,,,,{default_storage_location_id},{file_0_entity.dataFileHandleId},{file_0_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_1_entity.id},file,,,,{default_storage_location_id},{file_1_entity.dataFileHandleId},{file_1_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_2_entity.id},file,,,,{default_storage_location_id},{file_2_entity.dataFileHandleId},{file_2_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{file_3_entity.id},file,,,,{default_storage_location_id},{file_3_entity.dataFileHandleId},{file_3_entity_updated.dataFileHandleId},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_1,{default_storage_location_id},{table_1_file_handle_1['id']},{table_file_handles[0]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,1,file_col_2,{default_storage_location_id},{table_1_file_handle_2['id']},{table_file_handles[1]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_1,{default_storage_location_id},{table_1_file_handle_3['id']},{table_file_handles[2]['id']},MIGRATED," in csv_lines  # noqa
    assert f"{table_1_id},table,1,2,file_col_2,{default_storage_location_id},{table_1_file_handle_4['id']},{table_file_handles[3]['id']},MIGRATED," in csv_lines  # noqa
    assert "" in csv_lines  # expect trailing newline in a csv
def test_command_get_recursive_and_query():
    """Tests the 'synapse get -r' and 'synapse get -q' functions"""

    project_entity = project

    # Create Folders in Project
    folder_entity = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=project_entity))

    folder_entity2 = syn.store(
        synapseclient.Folder(name=str(uuid.uuid4()), parent=folder_entity))

    # Create and upload two files in sub-Folder
    uploaded_paths = []
    file_entities = []

    for i in range(2):
        f = utils.make_bogus_data_file()
        uploaded_paths.append(f)
        schedule_for_cleanup(f)
        file_entity = synapseclient.File(f, parent=folder_entity2)
        file_entity = syn.store(file_entity)
        file_entities.append(file_entity)
        schedule_for_cleanup(f)

    # Add a file in the Folder as well
    f = utils.make_bogus_data_file()
    uploaded_paths.append(f)
    schedule_for_cleanup(f)
    file_entity = synapseclient.File(f, parent=folder_entity)
    file_entity = syn.store(file_entity)
    file_entities.append(file_entity)

    # get -r uses syncFromSynapse() which uses getChildren(), which is not immediately consistent,
    # but faster than chunked queries.
    time.sleep(2)
    # Test recursive get
    run('synapse', '--skip-checks', 'get', '-r', folder_entity.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', folder_entity2.name, os.path.basename(f))
        for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    # Test query get using a Table with an entity column
    # This should be replaced when Table File Views are implemented in the client
    cols = [synapseclient.Column(name='id', columnType='ENTITYID')]

    schema1 = syn.store(
        synapseclient.Schema(name='Foo Table',
                             columns=cols,
                             parent=project_entity))
    schedule_for_cleanup(schema1.id)

    data1 = [[x.id] for x in file_entities]

    syn.store(
        synapseclient.RowSet(schema=schema1,
                             rows=[synapseclient.Row(r) for r in data1]))

    time.sleep(3)  # get -q are eventually consistent
    # Test Table/View query get
    output = run('synapse', '--skip-checks', 'get', '-q',
                 "select id from %s" % schema1.id)
    # Verify that we downloaded files:
    new_paths = [
        os.path.join('.', os.path.basename(f)) for f in uploaded_paths[:-1]
    ]
    new_paths.append(os.path.join('.', os.path.basename(uploaded_paths[-1])))
    schedule_for_cleanup(folder_entity.name)
    for downloaded, uploaded in zip(new_paths, uploaded_paths):
        assert_true(os.path.exists(downloaded))
        assert_true(filecmp.cmp(downloaded, uploaded))
        schedule_for_cleanup(downloaded)

    schedule_for_cleanup(new_paths[0])
Example #25
def pubmed(args, syn):
    """
    Given a list of grant numbers pulled from a synapse table column, utilizes a pubmed API to generate a search query.
    This query is constructed by the union ('or' logic) of all the grant numbers, which would aid in pulling down a list
    of all PubMed publication id's associated with the grants. Then it will go through the PubMed id's and scrape the
    publication for basic informative information.

    :param args:
    :param syn:
    :return:
    """
    projectId = args.projectId
    project = syn.get(projectId)

    if args.grantviewId is not None:
        grantviewId = args.grantviewId
    else:
        grantviewId = "syn10142562"

    csbcGrants = csbcGrantList(syn, grantviewId)
    grantIds = getGrantQuery(csbcGrants)
    pubmedIds = getPubMedIds(grantIds)
    csbcView = getCenterIdsView(syn, grantviewId)

    # for utf encoding and debugging
    # finalTable.to_csv("csbc.csv", sep=',', index=False, encoding="utf-8")
    # finalTable = pandas.read_csv("csbc.csv", delimiter=',', encoding="utf-8")
    # os.remove("csbc.csv")

    if args.tableId:
        # update existing schema
        tableId = args.tableId
        schema = syn.get(tableId)

        publicationTable = syn.tableQuery("select * from %s" % tableId)
        currentTable = publicationTable.asDataFrame()

        new_pubmed_ids = list(
            set(pubmedIds) -
            set([i.split("=")[1] for i in list(currentTable.PubMed)]))
        finalTable = getPMIDDF(new_pubmed_ids, csbcGrants, csbcView)

        table = synapseclient.Table(schema, finalTable.values.tolist())
        table = syn.store(table)

    else:
        # create a new schema
        # cols = synapseclient.as_table_columns(finalTable)
        finalTable = getPMIDDF(pubmedIds, csbcGrants, csbcView)

        cols = [
            Column(name='CSBC PSON Center',
                   columnType='ENTITYID',
                   maximumSize=50),
            Column(name='Consortium', columnType='STRING', maximumSize=100),
            Column(name='PubMed', columnType='LINK', maximumSize=100),
            Column(name='Journal', columnType='STRING', maximumSize=100),
            Column(name='Publication Year', columnType='DATE'),
            Column(name='Title', columnType='STRING', maximumSize=500),
            Column(name='Authors', columnType='STRING', maximumSize=990),
            Column(name='Grant', columnType='STRING', maximumSize=50),
            Column(name='Data Location', columnType='LINK', maximumSize=1000),
            Column(name='Synapse Location',
                   columnType='ENTITYID',
                   maximumSize=50),
            Column(name='Keywords', columnType='STRING', maximumSize=250)
        ]

        schema = synapseclient.Schema(name=args.tableName,
                                      columns=cols,
                                      parent=project)
        table = synapseclient.Table(schema, finalTable)
        table = syn.store(table)
Example #26
    with patch.object(SYN, "findEntityId", return_value=post_return),\
         patch.object(SYN, "get", return_value=obj),\
         pytest.raises(AssertionError,
                       match="Retrieved .* had type .* rather than .*"):
        GET_CLS._find_entity_by_name(
            parentid="syn12345",
            entity_name="foo.txt",
            concrete_type="Test"
        )


@pytest.mark.parametrize(
    "obj", [synapseclient.Project(name="foo"),
            synapseclient.File(path="foo.txt", parentId="syn12345"),
            synapseclient.Folder(name="foo", parentId="syn12345"),
            synapseclient.Schema(name="foo", parentId="syn12345")]
)
def test__get_obj__entity(obj):
    """Test getting of entities"""
    with patch.object(GET_CLS, "_find_entity_by_name",
                      return_value=obj) as patch_get:
        return_obj = GET_CLS._get_obj(obj)
        patch_get.assert_called_once_with(
            parentid=obj.properties.get("parentId", None),
            entity_name=obj.name,
            concrete_type=obj.properties.concreteType)
        assert obj == return_obj


@pytest.mark.parametrize("obj,get_func",
                         [(synapseclient.Team(name="foo"), "getTeam"),
Example #27
def main():
    import argparse

    parser = argparse.ArgumentParser(
        description='Convert JSON to Synapse Table Schema')
    parser.add_argument('path', type=str, help='Path (or URL) to JSON file')
    parser.add_argument('--projectId',
                        type=str,
                        help='Synapse Project ID to store schema')
    parser.add_argument('-n',
                        '--dry_run',
                        action="store_true",
                        default=False,
                        help='Dry run')
    parser.add_argument('--synapseJSONSchema',
                        action="store_true",
                        default=False,
                        help="JSON is already in Synapse Table Schema format")
    args = parser.parse_args()

    syn = synapseclient.login(silent=True)

    project = syn.get(args.projectId)

    f = urllib.request.urlopen(path2url(args.path))
    data = json.load(f)

    url_path = urllib.parse.urlparse(args.path).path
    filename = os.path.split(url_path)[1]
    schema_name = os.path.splitext(filename)[0]

    if args.synapseJSONSchema:
        schema = synapseclient.Schema(name=schema_name, parent=project)
        schema.columns_to_store = data
    else:
        cols = []

        for k, v in data.items():

            # Handle null values, assume that they will be strings
            if not v:
                column_type = "STRING"
            elif bool in map(type, v):
                column_type = "BOOLEAN"
            elif int in map(type, v):
                column_type = "INTEGER"
            elif float in map(type, v):
                column_type = "DOUBLE"
            else:
                column_type = "STRING"

            cols.append(
                synapseclient.Column(name=k,
                                     columnType=column_type,
                                     enumValues=v,
                                     maximumSize=250))

        schema = synapseclient.Schema(name=schema_name,
                                      columns=cols,
                                      parent=project)

    if args.dry_run:

        schema_as_list = map(dict, schema.columns_to_store)
        new_schema_as_list = []

        _key_order = [
            'name', 'description', 'columnType', 'maximumSize', 'enumValues'
        ]

        for col in schema_as_list:
            col['description'] = ""
            col['source'] = ""

            new_enum_values = []

            for v in col['enumValues']:

                new_value_ordered_dict = collections.OrderedDict()

                new_value_ordered_dict['value'] = v
                new_value_ordered_dict['description'] = ""
                new_value_ordered_dict['source'] = ""

                new_enum_values.append(new_value_ordered_dict)

            col['enumValues'] = new_enum_values

            new_ordered_dict = collections.OrderedDict()
            for k in _key_order:
                new_ordered_dict[k] = col[k]

            new_schema_as_list.append(new_ordered_dict)

        print(json.dumps(new_schema_as_list, indent=2))
    else:
        schema = syn.store(schema)
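
# A small illustration (not from the original source) of the column-type inference
# used above: each JSON key maps to a list of allowed values, and the Python types
# found in that list decide the Synapse columnType, falling back to STRING.
def _infer_column_type_sketch(values):
    if not values:
        return "STRING"
    types = set(map(type, values))
    if bool in types:
        return "BOOLEAN"
    if int in types:
        return "INTEGER"
    if float in types:
        return "DOUBLE"
    return "STRING"
# e.g. _infer_column_type_sketch([1, 2, 3]) -> "INTEGER"
#      _infer_column_type_sketch(["yes", "no"]) -> "STRING"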
Ejemplo n.º 28
0
### Current table is NTAP
existing_table = "syn18496443"
rowset = syn.tableQuery("select * from " + existing_table)
syn.delete(rowset)

# table = synapseclient.table.build_table("NTAP Project Information Integration", 'syn4939478', final_df)
### Table takes in the schema and values (here as a dataframe)
table = syn.store(synapseclient.table.Table(existing_table, final_df))

# %%
cols = syn.getTableColumns(existing_table)

# %%
lst = list(cols)

# %%
# table = syn.store(table)
schema = synapseclient.Schema(columns=lst, parent="syn4939478")

# %%
print(schema)

# %%
synapseclient.table.Table(schema, final_df)

# %%
synapseclient.table.Table(synapseclient.Schema(columns=lst, parent="syn4939478"), final_df)
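
# %%
# A compact alternative sketched here (not in the original notebook): build_table
# infers the columns from the DataFrame and returns a Table ready to store, which
# avoids constructing the Schema by hand. Arguments are (name, parent, values).
# table = synapseclient.table.build_table("NTAP Project Information Integration",
#                                         "syn4939478", final_df)
# table = syn.store(table)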


Example #29
def main(syn, project, format_registry=None, centers=None):
    # TODO: add PRIMARY_KEY annotation to each of the tables
    # Dangerous to have lists as default values
    if format_registry is None:
        format_registry = ['example_registry']
    if centers is None:
        centers = []
    # Determine the short and long names of the centers.
    center_abbreviations = centers
    center_names = center_abbreviations

    # Create a folder for log files generated by the GENIE processes
    # of validation and updating the database tables
    logs_folder = synapseclient.Folder(name='Logs', parent=project)
    logs_folder = syn.store(logs_folder)

    # Folder for individual center folders
    root_center_folder = synapseclient.Folder(name='Centers', parent=project)
    root_center_folder = syn.store(root_center_folder)

    # The folders for each center where they will upload files for validation
    # and submission. There is one folder per center.
    # This currently deviates from the original GENIE setup of having an
    # 'Input' and 'Staging' folder for each center.
    center_folders = [
        synapseclient.Folder(name=name, parent=root_center_folder)
        for name in center_abbreviations
    ]
    center_folders = [syn.store(folder) for folder in center_folders]

    # Make some fake data that only contains basic text to check
    # for validation.

    n_files = 2 # number of files per center to create

    for folder in center_folders:
        for _ in range(n_files):
            tmp = tempfile.NamedTemporaryFile(prefix=f'TEST-{folder.name}',
                                              suffix='.txt')
            with open(tmp.name, mode='w') as file_h:
                file_h.write(random.choice(['ERROR', 'VALID', 'NOPE']))
            syn.store(synapseclient.File(tmp.name, parent=folder))

    # Set up the table that holds the validation status of all submitted
    # files.
    status_schema = create_status_table(syn, project)

    # Set up the table that maps the center abbreviation to the folder where
    # their data is uploaded. This is used by the GENIE framework to find the
    # files to validate for a center.
    center_schema = create_center_map_table(syn, project)

    # Add the center folders created above to this table.
    center_folder_ids = [folder.id for folder in center_folders]
    center_df = pandas.DataFrame(dict(name=center_names,
                                      center=center_abbreviations,
                                      inputSynId=center_folder_ids))
    center_df['release'] = True
    existing_center = syn.tableQuery(f"select * from {center_schema.id}")
    existing_centerdf = existing_center.asDataFrame()
    process_functions.updateDatabase(syn, existing_centerdf, center_df,
                                     center_schema.id, ["center"],
                                     to_delete=True)
    # TODO: Remove centers that aren't part of the list

    # Create a table that stores the error logs for each submitted file.
    error_schema = create_error_tracking_table(syn, project)

    # Create a table that maps the various database tables to a short name.
    # This table is used in many GENIE functions to find the correct table
    # to update or get the state of something from.
    db_map_schema = create_db_mapping_table(syn, project)

    # Add dbMapping annotation
    project.annotations.dbMapping = db_map_schema.id
    project = syn.store(project)
    # Add the tables we already created to the mapping table.
    dbmap_df = pandas.DataFrame(
        dict(Database=['centerMapping', 'validationStatus', 'errorTracker',
                       'dbMapping', 'logs'],
             Id=[center_schema.id, status_schema.id, error_schema.id,
                 db_map_schema.id, logs_folder.id])
    )

    # Make a top level folder for output. Some processing for
    # file types copy a file from one place to another.
    output_folder = synapseclient.Folder(name='Output', parent=project)
    output_folder = syn.store(output_folder)

    output_folder_map = []

    # default_table_col_defs = status_table_col_defs = [
    #     {'name': 'PRIMARY_KEY',
    #      'columnType': 'STRING'}
    # ]
    # default_table_cols = [synapseclient.Column(**col)
    #                       for col in default_table_col_defs]

    default_primary_key = 'PRIMARY_KEY'

    # For each file type format in the format registry, create an output 
    # folder and a table.
    # Some GENIE file types copy a file to a new place, and some update a
    # table. Having both means that both of these operations will be available
    # at the beginning.
    # The mapping between the file type and the folder or table has a
    # consistent naming scheme: the key (the 'Database' value) is
    # {file_type}_folder or {file_type}_table.
    # Determine which file formats are going to be used.
    format_registry = config.collect_format_types(format_registry)
    # Get existing database tables
    existing_dbmap = syn.tableQuery(f"select * from {db_map_schema.id}")
    existing_dbmapdf = existing_dbmap.asDataFrame()

    for file_type, obj in format_registry.items():
        if file_type not in existing_dbmapdf['Database'].tolist():
            file_type_folder = synapseclient.Folder(name=file_type,
                                                    parent=output_folder)
            file_type_folder = syn.store(file_type_folder)
            output_folder_map.append(dict(Database=f"{file_type}_folder",
                                          Id=file_type_folder.id))

            file_type_schema = synapseclient.Schema(name=file_type,
                                                    parent=project)
            # The DCC will have to set the schema and primary key
            # after this is created.
            file_type_schema.annotations.primaryKey = default_primary_key
            file_type_schema = syn.store(file_type_schema)

            output_folder_map.append(dict(Database=file_type,
                                          Id=file_type_schema.id))
        else:
            print("Database already exists")

    # Add the folders and tables created to the mapping table.
    dbmap_df = pandas.concat([dbmap_df, pandas.DataFrame(output_folder_map)],
                             ignore_index=True)

    process_functions.updateDatabase(syn, existing_dbmapdf, dbmap_df,
                                     db_map_schema.id, ["Database"])
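
# A minimal usage sketch for main(): the project ID and center abbreviations
# below are placeholders; 'example_registry' is the default registry name used
# by main() above.
#
#     syn = synapseclient.login()
#     project = syn.get("syn00000000")
#     main(syn, project,
#          format_registry=["example_registry"],
#          centers=["ABC", "DEF"])
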
def transferTables(syn,
                   sourceProjId,
                   uploadProjId,
                   extId_Str='',
                   simpleNameFilters=None,
                   healthCodeList=None):
    """Transfer tables from a source project to the upload (target) project.

    Tables are selected by external IDs containing extId_Str, grouped using
    simpleNameFilters, optionally filtered by health codes, and then merged
    and uploaded one table per activity."""
    # Avoid a mutable default argument for the name filters
    if simpleNameFilters is None:
        simpleNameFilters = []

    # Dataframe of all tables, retrieved with synapsebridgehelpers.get_tables
    all_tables = synapsebridgehelpers.get_tables(syn, sourceProjId,
                                                 simpleNameFilters)

    # Converting externalIds to healthCodes
    if extId_Str != '':
        res = synapsebridgehelpers.externalIds2healthCodes(
            syn, list(all_tables['table.id']))
        res = res[res['externalId'].str.contains(extId_Str)]
        healthCodeList = list(res['healthCode'])

    # List of tables sorted by activity and filtered using healthcodes
    tables_dict = synapsebridgehelpers.filterTablesByActivity(
        syn, all_tables, healthCodes=healthCodeList)

    # Iterate over each activity in tables_dict
    for activity_, activityTableIds in tables_dict.items():
        df_list = []  # dataframes corresponding to this activity
        # columns of type FILEHANDLEID across all dataframes for this activity
        cols_filehandleid = []

        # Loop over all tables corresponding to this activity
        for activity_table_id in activityTableIds:
            result = synapsebridgehelpers.tableWithFileIds(
                syn,
                table_id=activity_table_id,
                healthcodes=healthCodeList)
            cols_filehandleid = cols_filehandleid + list(
                set(result['cols']) - set(cols_filehandleid))
            df_list.append(result['df'])

        # Concatenating all tables to form one table for the activity
        df_main = pd.concat(df_list)
        cols = synapseclient.as_table_columns(df_main)

        # Change the type of columns that are FILEHANDLEIDs as calculated before
        for col in cols:
            if col.name in cols_filehandleid:
                col.columnType = 'FILEHANDLEID'

        # Merging tables with mismatched dtypes can change column types in the
        # resulting dataframe. The code below casts each column back into a
        # form that syn.store accepts (FILEHANDLEID columns, for instance,
        # need integer values). Note that `item == item` is False only for
        # NaN, so missing values are replaced with empty strings.
        for col in cols:
            if col.columnType == 'STRING':
                df_main[col.name] = [
                    str(item) if item == item else ''
                    for item in df_main[col.name]
                ]
            elif col.columnType == 'INTEGER':
                df_main[col.name] = [
                    int(item) if item == item else ''
                    for item in df_main[col.name]
                ]
            elif col.columnType == 'FILEHANDLEID':
                df_main[col.name] = [
                    int(item) if (item != '' and item == item) else ''
                    for item in df_main[col.name]
                ]
            else:
                df_main[col.name] = [
                    item if item == item else '' for item in df_main[col.name]
                ]

        # Updating the schema and uploading the merged table
        schema = synapseclient.Schema(name=activity_,
                                      columns=cols,
                                      parent=uploadProjId)
        table = synapseclient.Table(schema, df_main)
        table = syn.store(table)
        # Record provenance linking the new table to its source tables
        syn.setProvenance(table.schema.id,
                          activity=synapseclient.activity.Activity(
                              used=tables_dict[activity_]))
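

if __name__ == "__main__":
    # A minimal usage sketch: the project IDs, external-ID substring, and name
    # filter below are placeholders, not values from this codebase, and must
    # be replaced before running.
    syn = synapseclient.login()
    transferTables(syn,
                   sourceProjId="syn00000001",
                   uploadProjId="syn00000002",
                   extId_Str="TEST",
                   simpleNameFilters=["tapping"],
                   healthCodeList=None)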