Example #1
def test_one_table_one_identifier(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 identifier="ABC")
    reference = sample_table.query("externalId == 'ABC'").reset_index(
        drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #2
def test_one_table_one_query(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 query=["bool_property = true"])
    reference = sample_table.query("bool_property == True").reset_index(
        drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #3
def test_one_table_multiple_identifier(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 identifier=["ABC", "FGH"])
    reference = sample_table.query(
        "externalId == 'ABC' or "
        "externalId == 'FGH'").reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #4
def test_change_identifier_col(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 identifier="red",
                                 identifier_col="str_property")
    reference = sample_table.query("str_property == 'red'").reset_index(
        drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #5
def test_change_substudy_col(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 substudy="blue",
                                 substudy_col="str_property")
    reference = sample_table[['blue' in s for s in sample_table.str_property]]
    reference = reference.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #6
def test_one_table_function_identifier(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 identifier=lambda s: s.startswith("A"))
    reference = sample_table[[
        s.startswith("A") for s in sample_table.externalId
    ]]
    reference = reference.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #7
def test_one_table_one_substudy(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 substudy="my-study")
    reference = sample_table[[
        'my-study' in s for s in sample_table.substudyMemberships
    ]]
    reference = reference.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #8
def test_one_table_multiple_substudy(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 substudy=["my-study", "other-study"])
    reference = sample_table[[
        'my-study' in s or 'other-study' in s
        for s in sample_table.substudyMemberships
    ]]
    reference = reference.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #9
def test_one_everything(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=tables["schema"][0]["id"],
                                 query=["str_property = 'blue'"],
                                 substudy="other-study",
                                 identifier="DEF")
    reference = sample_table.query(
        "str_property == 'blue' and externalId == 'DEF'")
    reference = reference[[
        'other-study' in s for s in reference.substudyMemberships
    ]]
    reference = reference.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #10
def test_mix_everything(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=[s["id"] for s in tables["schema"]],
                                 query=["bool_property = true"],
                                 substudy="other-study",
                                 identifier=["DEF", "ABC", "FGH"])
    reference = sample_table.query("bool_property == True")
    reference = reference[[
        'other-study' in s for s in reference.substudyMemberships
    ]]
    reference = reference[[
        s in ["DEF", "ABC", "FGH"] for s in reference.externalId
    ]]
    reference = reference.reset_index(drop=True)
    assert (result[0].reset_index(drop=True).equals(reference)
            and result[1].reset_index(drop=True).equals(reference))
Example #11
def export_tables(syn,
                  table_mapping,
                  source_tables=None,
                  target_project=None,
                  update=True,
                  reference_col="recordId",
                  copy_file_handles=None,
                  **kwargs):
    """Copy rows from one Synapse table to another. Or copy tables
    to a new table in a separate project.

    Parameters
    ----------
    syn : synapseclient.Synapse
    table_mapping : dict, list, or str
        If exporting records from one or more tables to preexisting target
        tables, table_mapping is a dictionary mapping the Synapse IDs of
        source tables to the Synapse IDs of their target tables. If exporting
        table records to tables that do not yet exist in a separate project,
        table_mapping can be a list of Synapse IDs or a single Synapse ID
        string.
    source_tables : dict
        A mapping from Synapse ID (str) values of source tables to pandas.DataFrame
        where each DataFrame consists of rows to be copied from the source table.
        This is a more explicit way to specify which rows will be copied from
        source to target tables. To implicitly specify which rows will be copied,
        pass named arguments via **kwargs; they will be forwarded to
        synapsebridgehelpers.query_across_tables.
    target_project : str, default None
        If exporting table records to tables that do not yet exist in a
        separate project, specify the target project's Synapse ID here.
    update : bool, default True
        When exporting records to preexisting target tables, whether to
        append only new records to the target tables (True) or to completely
        overwrite the target tables' records (False). Note that target rows
        whose `reference_col` value matches a source row are left unchanged
        even when `update` is True; only records not yet present in the
        target are appended.
    reference_col : str or list
        If `update` is True, use this column (or columns) as the table index
        to determine which records are already present in the target table.
    copy_file_handles : bool, default None
        Whether to copy the file handles from the source table to the target
        table. By default, we will attempt to copy the source table records
        without copying any file handles (copy_file_handles = None) and, if
        an error is thrown, we will then copy the file handles before attempting
        to store the table again. If the user explicitly sets
        copy_file_handles = False, an exception will be raised if any of the file
        handles in the source table are not owned by the user. Setting
        copy_file_handles = True always creates copies of file handles, whether
        the user owns them or not.
    **kwargs
        Additional named arguments to pass to synapsebridgehelpers.query_across_tables

    Returns
    -------
    dict
        A mapping from each source table's Synapse ID to a tuple of
        (target table Synapse ID, pandas.DataFrame of the records exported
        to that target table).
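
    Examples
    --------
    A minimal usage sketch; the Synapse IDs below are placeholders.
    Append new records from two source tables to preexisting target
    tables, matching rows on the default `reference_col` ("recordId")::

        exported = export_tables(
            syn, table_mapping={"syn111": "syn222", "syn333": "syn444"})

    Copy source tables to brand new tables in another project::

        exported = export_tables(
            syn,
            table_mapping=["syn111", "syn333"],
            target_project="syn555")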
    """
    results = {}
    if isinstance(table_mapping, (list, str)):  # export to brand new tables
        if target_project is None:
            raise TypeError("If passing a list to table_mapping, "
                            "target_project must be set.")
        if source_tables is None:
            new_records = synapsebridgehelpers.query_across_tables(
                syn, tables=table_mapping, as_data_frame=True, **kwargs)
            if isinstance(table_mapping, str):
                source_tables = {table_mapping: new_records[0]}
            else:
                source_tables = {
                    t: df
                    for t, df in zip(table_mapping, new_records)
                }
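        # create a new table in target_project for each source table's records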
        for source_id, source_table in source_tables.items():
            source_table_info = syn.get(source_id)
            source_table_cols = list(syn.getTableColumns(source_id))
            if copy_file_handles:
                source_table = replace_file_handles(
                    syn,
                    df=source_table,
                    source_table_id=source_id,
                    source_table_cols=source_table_cols)
            try:
                target_table = _store_dataframe_to_table(
                    syn,
                    df=source_table,
                    df_cols=source_table_cols,
                    parent_id=target_project,
                    table_name=source_table_info["name"],
                    used=source_id)
            except sc.core.exceptions.SynapseHTTPError as e:  # we don't own the file handles
                if copy_file_handles:  # actually we do, something else is wrong
                    raise sc.core.exceptions.SynapseHTTPError(
                        "There was an issue storing records from {} "
                        "to {}.".format(source_id, target_project)) from e
                elif copy_file_handles is False:  # user explicitly specified no copies
                    raise e
                else:
                    source_table = replace_file_handles(
                        syn,
                        df=source_table,
                        source_table_id=source_id,
                        source_table_cols=source_table_cols)
                    target_table = _store_dataframe_to_table(
                        syn,
                        df=source_table,
                        df_cols=source_table_cols,
                        parent_id=target_project,
                        table_name=source_table_info["name"],
                        used=source_id)
            results[source_id] = (target_table.tableId, source_table)
    elif isinstance(table_mapping, dict):  # export to preexisting tables
        tables = list(table_mapping)
        if source_tables is None:
            new_records = synapsebridgehelpers.query_across_tables(
                syn, tables, **kwargs)
            source_tables = {t: df for t, df in zip(tables, new_records)}
        for source, target in table_mapping.items():
            source_table = source_tables[source]
            if source_table.shape[0] == 0:
                continue
            target_table = syn.tableQuery("select * from {}".format(target))
            target_table = target_table.asDataFrame()
            # has the schema changed?
            source_cols = list(syn.getTableColumns(source))
            target_cols = list(syn.getTableColumns(target))
            schema_comparison = compare_schemas(source_cols=source_cols,
                                                target_cols=target_cols,
                                                source_table=source_table,
                                                target_table=target_table)
            try:  # error after updating schema -> data may be lost from target table
                if sum(list(map(len, schema_comparison.values()))) > 0:
                    synchronize_schemas(syn,
                                        schema_comparison=schema_comparison,
                                        source=source,
                                        target=target,
                                        source_cols=source_cols,
                                        target_cols=target_cols)
                    # synchronize schema of pandas DataFrame with Synapse
                    for col in schema_comparison["removed"]:
                        target_table = target_table.drop(col, axis=1)
                    target_table = target_table.rename(
                        schema_comparison["renamed"], axis=1)
                    target_table = _sanitize_dataframe(syn, target_table,
                                                       target)
                    target_table = target_table.reset_index(drop=True)
                    syn.store(
                        sc.Table(target, target_table, headers=source_cols))
            except Exception as e:
                dump_on_error(target_table, e, syn, source, target)
            if update:
                if reference_col is not None:
                    source_table = source_table.set_index(reference_col,
                                                          drop=False)
                    target_table = target_table.set_index(reference_col,
                                                          drop=False)
                else:
                    raise TypeError(
                        "If updating target tables with new records "
                        "from a source table, you must specify a "
                        "reference column as a basis for comparison.")
                new_records = source_table.loc[source_table.index.difference(
                    target_table.index)]
                if len(new_records):
                    source_table_info = syn.get(source)
                    source_table_cols = list(syn.getTableColumns(source))
                    if copy_file_handles:
                        new_records = replace_file_handles(
                            syn,
                            df=new_records,
                            source_table_id=source,
                            source_table_cols=source_table_cols)
                    try:
                        target_table = _store_dataframe_to_table(
                            syn,
                            df=new_records,
                            df_cols=source_table_cols,
                            table_id=target,
                            used=source)
                    except sc.core.exceptions.SynapseHTTPError as e:  # we don't own the file handles
                        if copy_file_handles:  # actually we do, something else is wrong
                            raise sc.core.exceptions.SynapseHTTPError(
                                "There was an issue storing records from {} "
                                "to {}.".format(source, target)) from e
                        elif copy_file_handles is False:  # user specified no copies
                            raise e
                        else:
                            new_records = replace_file_handles(
                                syn,
                                df=new_records,
                                source_table_id=source,
                                source_table_cols=source_table_cols)
                            target_table = _store_dataframe_to_table(
                                syn,
                                df=new_records,
                                df_cols=source_table_cols,
                                table_id=target,
                                used=source)
                    results[source] = (target, new_records)
            else:  # delete existing rows, store upstream rows
                target_table = syn.tableQuery(
                    "select * from {}".format(target))
                syn.delete(target_table.asRowSet())
                source_cols = list(syn.getTableColumns(source))
                table_to_store = source_table
                if copy_file_handles:
                    table_to_store = replace_file_handles(
                        syn,
                        df=source_table,
                        source_table_id=source,
                        source_table_cols=source_cols)
                try:
                    target_table = _store_dataframe_to_table(
                        syn,
                        df=table_to_store,
                        df_cols=source_cols,
                        table_id=target,
                        used=source)
                except sc.core.exceptions.SynapseHTTPError as e:  # we don't own the file handles
                    if copy_file_handles:  # actually we do, something else is wrong
                        raise sc.core.exceptions.SynapseHTTPError(
                            "There was an issue storing records from {} "
                            "to {}.".format(source, target)) from e
                    elif copy_file_handles is False:  # user specified no copies
                        raise e
                    else:
                        table_to_store = replace_file_handles(
                            syn,
                            df=table_to_store,
                            source_table_id=source,
                            source_table_cols=source_cols)
                        target_table = _store_dataframe_to_table(
                            syn,
                            df=table_to_store,
                            df_cols=source_cols,
                            table_id=target,
                            used=source)
                results[source] = (target, table_to_store)
    else:
        raise TypeError(
            "table_mapping must be either a list (if exporting "
            "tables to a target_project), str (if exporting a single "
            "table to a project), or a dict (if exporting "
            "tables to preexisting tables).")
    return results
Example #12
def test_str_query(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=[s["id"] for s in tables["schema"]],
                                 query="select * from {}")
    reference = sample_table.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #13
def test_one_table(syn, tables, sample_table):
    result = query_across_tables(syn, tables=tables["schema"][0]["id"])
    reference = sample_table.reset_index(drop=True)
    pd.testing.assert_frame_equal(result[0].reset_index(drop=True), reference)
Example #14
def test_str_query_exception(syn, tables):
    with pytest.raises(TypeError):
        query_across_tables(syn, [s["id"] for s in tables["schema"]],
                            query="select * from {}",
                            substudy="my-study")
Example #15
def test_two_tables(syn, tables, sample_table):
    result = query_across_tables(syn,
                                 tables=[s["id"] for s in tables["schema"]])
    reference = sample_table.reset_index(drop=True)
    assert (result[0].reset_index(drop=True).equals(reference)
            and result[1].reset_index(drop=True).equals(reference))