def create_new_fileformat_table(
    syn: Synapse,
    file_format: str,
    newdb_name: str,
    projectid: str,
    archive_projectid: str,
) -> dict:
    """Creates new database table based on old database table and archives
    old database table

    Args:
        syn: Synapse object
        file_format: File format to update
        newdb_name: Name of new database table
        projectid: Project id where new database should live
        archive_projectid: Project id where old database should be moved

    Returns:
        {"newdb_ent": New database synapseclient.Table,
         "newdb_mappingdf": new databse pd.DataFrame,
         "moved_ent": old database synpaseclient.Table}
    """
    db_info = get_dbmapping(syn, projectid)
    database_mappingdf = db_info["df"]
    dbmapping_synid = db_info["synid"]

    olddb_synid = getDatabaseSynId(syn,
                                   file_format,
                                   databaseToSynIdMappingDf=database_mappingdf)
    olddb_ent = syn.get(olddb_synid)
    olddb_columns = list(syn.getTableColumns(olddb_synid))

    newdb_ent = _create_schema(
        syn,
        table_name=newdb_name,
        columns=olddb_columns,
        parentid=projectid,
        annotations=olddb_ent.annotations,
    )

    newdb_mappingdf = _update_database_mapping(syn, database_mappingdf,
                                               dbmapping_synid, file_format,
                                               newdb_ent.id)
    # Automatically rename the archived entity with an ARCHIVED prefix
    # This will attempt to resolve any issues if a table with the same name
    # already exists at that location
    new_table_name = f"ARCHIVED {time.time()}-{olddb_ent.name}"
    moved_ent = _move_entity(syn,
                             olddb_ent,
                             archive_projectid,
                             name=new_table_name)
    return {
        "newdb_ent": newdb_ent,
        "newdb_mappingdf": newdb_mappingdf,
        "moved_ent": moved_ent,
    }
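
# A minimal usage sketch for create_new_fileformat_table; the Synapse ids and
# names below are placeholders (not real entities) and a logged-in client is
# required:
import synapseclient

syn = synapseclient.login()
new_tables = create_new_fileformat_table(
    syn,
    file_format="maf",                 # hypothetical file format key
    newdb_name="maf_database_v2",      # hypothetical new table name
    projectid="syn00000001",           # placeholder project id
    archive_projectid="syn00000002",   # placeholder archive project id
)
print(new_tables["newdb_ent"].id, new_tables["moved_ent"].name)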
Example #2
def mirror(syn: Synapse,
           entity: Union[File, Folder, Project],
           destination: Union[File, Folder, Project],
           force: bool = False,
           dryrun: bool = False):
    """Mirrors (sync) wiki pages by using the wikipage titles between two
    Synapse Entities.  This function only works if `entity` and `destination`
    are the same type and both must have wiki pages.  Only wiki pages with the
    same titles will be copied from `entity` to `destination` - if there is
    a wiki page that you want to add, you will have to create a wiki page
    first in the `destination` with the same name.

    Args:
        entity: Synapse File, Project, Folder Entity or Id with
                Wiki you want to copy
        destination: Synapse File, Project, Folder Entity or Id
                     with Wiki that matches entity
        force: Update a page even if it's the same. Defaults to False.
        dryrun: Show the pages that would change but don't update. Defaults
                to False.

    """
    entity = syn.get(entity, downloadFile=False)
    destination = syn.get(destination, downloadFile=False)
    if type(entity) is not type(destination):
        raise ValueError("Can only mirror wiki pages between similar "
                         "entity types")

    # Get entity/destination pages and mapping of wiki pages
    pages_and_mappings = _get_wikipages_and_mapping(syn, entity, destination)

    if dryrun:
        logger.info("Your wiki pages will not be mirrored. `dryrun` is True")
    _update_wiki(syn,
                 **pages_and_mappings,
                 force=force,
                 dryrun=dryrun,
                 entity=entity,
                 destination=destination)
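
# A minimal usage sketch for mirror(); the project ids are placeholders and
# both entities are assumed to already have wiki pages:
import synapseclient

syn = synapseclient.login()
mirror(syn, "syn00000001", "syn00000002", dryrun=True)  # preview matching pages
mirror(syn, "syn00000001", "syn00000002", force=True)   # push all matching pages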
def get_dbmapping(syn: Synapse, project_id: str) -> dict:
    """Gets database mapping information
    Args:
        syn: Synapse connection
        project_id: Project id where new data lives
    Returns:
        {'synid': database mapping syn id,
         'df': database mapping pd.DataFrame}
    """
    project_ent = syn.get(project_id)
    dbmapping_synid = project_ent.annotations.get("dbMapping", [""])[0]
    database_mappingdf = get_syntabledf(syn,
                                        f'select * from {dbmapping_synid}')
    return {'synid': dbmapping_synid, 'df': database_mappingdf}
def get_dbmapping(syn: Synapse, projectid: str) -> dict:
    """Gets database mapping information

    Args:
        syn: Synapse connection
        projectid: Project id where new data lives

    Returns:
        {'synid': database mapping syn id,
         'df': database mapping pd.DataFrame}

    """
    project_ent = syn.get(projectid)
    dbmapping_synid = project_ent.annotations.get("dbMapping", [""])[0]
    database_mapping = syn.tableQuery(f"select * from {dbmapping_synid}")
    database_mappingdf = database_mapping.asDataFrame()
    return {"synid": dbmapping_synid, "df": database_mappingdf}
Example #5
def _combine_center_file_errors(syn: Synapse,
                                center_errorsdf: pd.DataFrame) -> str:
    """Combine all center errors into one printable string

    Args:
        syn: Synapse connection
        center_errorsdf: Center errors dataframe

    Returns:
        Center errors in a pretty formatted string

    """
    center_errors = ""
    for _, row in center_errorsdf.iterrows():
        ent = syn.get(row["id"], downloadFile=False)
        file_errors = row["errors"].replace("|", "\n")
        error_text = f"\t{ent.name} ({ent.id}):\n\n{file_errors}\n\n"
        center_errors += error_text
    return center_errors
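
# A minimal sketch of the input _combine_center_file_errors expects: a dataframe
# with an "id" column of Synapse file ids (placeholders below) and a
# pipe-delimited "errors" column, which the function splits into lines:
import pandas as pd

example_errorsdf = pd.DataFrame({
    "id": ["syn00000001", "syn00000002"],
    "errors": ["missing column A|invalid header", "duplicate sample ids"],
})
# Requires a logged-in client and real file ids to resolve entity names:
# center_errors = _combine_center_file_errors(syn, example_errorsdf)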
def main():
    # Parse command-line arguments
    args = parse_arguments()
    # Set up Synapse
    syn = Synapse()
    syn.login(args.username, args.password, rememberMe=args.remember)
    # Retrieve Synapse entity (e.g., project, folder)
    entity = syn.get(args.synid, downloadFile=False)
    log("Entity", entity)
    # Retrieve team
    team = syn.getTeam(args.team)  # TODO: Handle users with try-catch
    log("Team", team)
    # Assign specified permissions for given entity and team
    permissions = syn.setPermissions(entity,
                                     team.id,
                                     accessType=args.permissions)
    log("Permissions", permissions)
    # Celebrate
    print("Success!")
def get_registered_challenges(syn: Synapse,
                              userid: str = None) -> Iterator[Project]:
    """Get the Synapse Challenge Projects a user is registered to.
    Defaults to the logged in synapse user.

    Args:
        syn: Synapse connection
        userid: Specify userid if you want to know the challenges
                another Synapse user is registered to.

    Yields:
        A synapseclient.Project

    """
    challenge_api = ChallengeApi(syn=syn)
    # This will return the logged in user profile if None is passed in
    profile = syn.getUserProfile(userid)
    userid = profile.ownerId
    registered = challenge_api.get_registered_challenges(participantId=userid)
    for challenge in registered:
        challenge_ent = syn.get(challenge.projectId)
        print(challenge_ent.name)
        yield challenge_ent
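
# A minimal usage sketch: list the challenge projects the logged-in user is
# registered to (requires valid Synapse credentials):
import synapseclient

syn = synapseclient.login()
for challenge_project in get_registered_challenges(syn):
    print(challenge_project.id)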
Example #8
class KrakenDownload(object):
    """Utility to download Kraken DB and place them in a local directory

    ::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download('toydb')
        kd.download('minikraken')

    A large database (8Gb) is available on Synapse and has the following DOI::

        doi:10.7303/syn6171000

    It can be downloaded manually or, if you have a Synapse login
    (https://www.synapse.org), you can use::

        from sequana import KrakenDownload
        kd = KrakenDownload()
        kd.download("sequana_db1")
    """
    dv = DevTools()

    def download(self, name, verbose=True):
        if name == "minikraken":
            self._download_minikraken(verbose=verbose)
        elif name == "toydb":
            self._download_kraken_toydb(verbose=verbose)
        elif name == "sequana_db1":
            self._download_sequana_db1(verbose=verbose)
        else:
            raise ValueError(
                "name must be toydb or minikraken, or sequana_db1")

    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"
        ]
        filenames = [
            "database.idx", "database.kdb", "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"
        ]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)

    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(
                filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz",
                 filename)
        # unzipping. requires tar and gzip

    def _download_from_synapse(self, synid, target_dir):
        try:
            from synapseclient import Synapse
        except ImportError:
            raise ImportError(
                "Please install synapseclient using 'pip install synapseclient'"
            )
        try:
            self._synapse.get(synid, downloadLocation=target_dir)
        except Exception:
            # No client yet (or the previous call failed): create one,
            # log in, and retry the download
            self._synapse = Synapse()
            self._synapse.login()
            self._synapse.get(synid, downloadLocation=target_dir)

    def _download_sequana_db1(self, verbose=True):
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info(
            "Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        from os.path import exists
        filename = dir1 + "ena_list.txt"
        if exists(filename) and md5(
                filename) == "a9cc6268f3338d1632c4712a412593f2":
            pass
        else:
            self._download_from_synapse('syn6171700', dir1)

        # database.idx
        filename = dir1 + "database.idx"
        if exists(filename) and md5(
                filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
            pass
        else:
            self._download_from_synapse('syn6171017', dir1)

        # database.kdb ; this one is large (8Gb)
        filename = dir1 + "database.kdb"
        if exists(filename) and md5(
                filename) == "ff698696bfc88fe83bc201937cd9cbdf":
            pass
        else:
            self._download_from_synapse('syn6171107', dir1)

        # Then, the taxonomy directory
        filename = dir1 + "names.dmp"
        if exists(filename) and md5(
                filename) == "10bc7a63c579de02112d125a51fd65d0":
            pass
        else:
            self._download_from_synapse('syn6171286', dir2)

        filename = dir1 + "nodes.dmp"
        if exists(filename) and md5(
                filename) == "a68af5a60434e2067c4a0a16df873980":
            pass
        else:
            self._download_from_synapse('syn6171289', dir2)

        filename = dir1 + "taxons.txt"
        if exists(filename) and md5(
                filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
            pass
        else:
            self._download_from_synapse('syn6171290', dir2)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget(
            "https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
import pandas as pd
from synapseclient import Synapse

from msda import process_phospho_ms as pm
from msda import preprocessing as pr
from msda import pca
from msda import kmeans
from msda.clustering import plot_clustermap as pc
from msda import enrichr_api as ai
from msda import mapping

# Load individual 10-plexes, process and
# normalize data into a single dataset.
# --------------------------------------
syn = Synapse()
syn.login()

set1_ids = ['syn10534323', 'syn10534325', 'syn10534331', 'syn10534329']
set1_df_list = [pd.read_excel(syn.get(synid).path) for synid in set1_ids]
# no recorded value for syn1053432
df_set1, _ = pm.merge(set1_df_list[:-1])
# Filter peptides with localization score less than 13
df_set1 = pm.filter_max_score(df_set1, max_score_cutoff=13.0)
set1_columns = [
    str(s).replace('default', 'Set1') for s in df_set1.columns.tolist()
]
set1_columns = [s.replace('max_score', 'set1_max_score') for s in set1_columns]
df_set1.columns = set1_columns

set2_ids = ['syn10534326', 'syn10534328', 'syn10534332', 'syn10534333']
set2_df_list = [pd.read_excel(syn.get(synid).path) for synid in set2_ids]
df_set2, _ = pm.merge(set2_df_list)
# Filter peptides with localization score less than 13
df_set2 = pm.filter_max_score(df_set2, max_score_cutoff=13.0)
Example #11
# Download Synapse reference and example data listed in synapse_inputs.json
# into "./data"
from getpass import getpass
from synapseclient import Synapse
import json

print('''
This script helps download the reference and example data
from synapse.org. You need a Synapse user account.
https://www.synapse.org/

Data will be downloaded to ./data
''')
username = input("Enter your Synapse account email: ")
password = getpass()

syn = Synapse()

print('Logging in to synapse.org...')
syn.login(username, password, rememberMe=True)

print('Loading synapse_inputs.json')
with open('synapse_inputs.json') as f:
    inputs = json.load(f)

for k, v in inputs.items():
    print('Downloading "{}"'.format(k))
    syn.get(v, downloadLocation='data')
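
# A hedged sketch of the synapse_inputs.json format the script above expects:
# a flat mapping of labels to Synapse ids (the labels and ids here are
# placeholders for illustration):
import json

example_inputs = {
    "reference_genome": "syn00000001",  # placeholder id
    "example_reads": "syn00000002",     # placeholder id
}
with open("synapse_inputs.json", "w") as f:
    json.dump(example_inputs, f, indent=2)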
def process_mutation_workflow(
    syn: Synapse,
    center: str,
    validfiles: pd.DataFrame,
    genie_annotation_pkg: str,
    database_mappingdf: str,
    workdir: str,
) -> str:
    """Process vcf/maf workflow

    Args:
        syn: Synapse connection
        center: Center name
        validfiles: Center validated files
        genie_annotation_pkg: Genome Nexus annotation tools
        database_mappingdf: Database to synapse id mapping dataframe
        workdir: Working directory

    Returns:
        Annotated Maf Path

    """
    # Get valid files
    mutation_files = validfiles["fileType"].isin(["maf", "vcf"])
    valid_mutation_files = validfiles["path"][mutation_files].tolist()
    # If there are no valid mutation files, return
    if not valid_mutation_files:
        logger.info("No mutation data")
        return
    # Certificate to use GENIE Genome Nexus
    syn.get(
        "syn22053204",
        ifcollision="overwrite.local",
        downloadLocation=genie_annotation_pkg,
    )
    # Genome Nexus Jar file
    syn.get(
        "syn22084320",
        ifcollision="overwrite.local",
        downloadLocation=genie_annotation_pkg,
    )

    annotated_maf_path = annotate_mutation(
        center=center,
        mutation_files=valid_mutation_files,
        genie_annotation_pkg=genie_annotation_pkg,
        workdir=workdir,
    )

    maf_tableid = database_mappingdf.Id[database_mappingdf["Database"] ==
                                        "vcf2maf"].iloc[0]
    flatfiles_synid = database_mappingdf.Id[database_mappingdf["Database"] ==
                                            "centerMaf"].iloc[0]
    # Split into narrow maf and store into db / flat file
    split_and_store_maf(
        syn=syn,
        center=center,
        maf_tableid=maf_tableid,
        annotated_maf_path=annotated_maf_path,
        flatfiles_synid=flatfiles_synid,
        workdir=workdir,
    )

    return annotated_maf_path
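
# A minimal usage sketch for process_mutation_workflow; the center name,
# dataframes, and directories below are placeholders for illustration:
#
#     annotated_maf = process_mutation_workflow(
#         syn=syn,
#         center="CENTER_A",
#         validfiles=validfiles_df,               # needs "fileType" and "path" columns
#         genie_annotation_pkg="/tmp/annotation-tools",
#         database_mappingdf=database_mappingdf,  # needs "Database" and "Id" columns
#         workdir="/tmp/genie_workdir",
#     )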