Example #1
import re  # used below to extract subject IDs from file names
# `Molgenis` and `statusMsg` are project helpers (e.g., from a local rd3tools module)

# for local dev, set credentials
from dotenv import load_dotenv
from os import environ

load_dotenv()

# host=environ['MOLGENIS_PROD_HOST']
host = environ['MOLGENIS_ACC_HOST']
rd3 = Molgenis(url=host)
rd3.login(username=environ['MOLGENIS_ACC_USR'],
          password=environ['MOLGENIS_ACC_PWD'])

# pull RD3 data
files = rd3.get(entity='rd3_portal_cluster',
                q='type=="phenopacket"',
                attributes='release,name,type',
                batch_size=10000)

subjects = rd3.get(entity='rd3_freeze1_subject',
                   attributes='id,subjectID,patch',
                   batch_size=10000)

statusMsg('File metadata entries pulled: {}'.format(len(files)))
statusMsg('Subject metadata entries pulled: {}'.format(len(subjects)))

# extract subject ID
for file in files:
    file['subject'] = re.sub(
        pattern=r'((.[0-9]{4}-[0-9]{2}-[0-9]{2})?(.json))$',
        repl='',
        string=file['name'])
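
# illustration with hypothetical file names: the pattern strips a trailing
# ".json" with an optional ".YYYY-MM-DD" date in front of it (note that the
# dots are unescaped, so `.` matches any character), leaving the subject ID
assert re.sub(r'((.[0-9]{4}-[0-9]{2}-[0-9]{2})?(.json))$', '',
              'P0000123.2021-06-01.json') == 'P0000123'
assert re.sub(r'((.[0-9]{4}-[0-9]{2}-[0-9]{2})?(.json))$', '',
              'P0000123.json') == 'P0000123'
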
Example #2
# /////////////////////////////////////////////////////////////////////////////

# ~ 1 ~
# Start Molgenis Session and Pull Required Data
#
# In order to process new phenopacket files, it is important to compare values
# with existing RD3 metadata. This allows us to import only the values that have
# changed rather than everything. Once the contents of the files have been
# processed and evaluated, we can import them into RD3. These values will be
# imported into the `subject` and `subjectinfo` tables. The attributes that
# are managed by this script are listed in the GET requests below.

# pull subject metadata for the current freeze
freeze = rd3.get(
    entity=paths['rd3_subjects'],
    attributes=','.join([
        'id', 'subjectID', 'clinical_status', 'disease', 'phenotype',
        'hasNotPhenotype', 'phenopacketsID', 'patch'
    ]),
    batch_size=10000)

# pull subjectinfo data
freeze_info = rd3.get(entity=paths['rd3_subjectinfo'],
                      attributes='id,dateofBirth,ageOfOnset,patch',
                      batch_size=10000)

# extract subject IDs for later
# freeze_ids = rd3tools.flatten_attr(freeze, 'id')
freeze_ids = [row['id'] for row in freeze]
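
# minimal sketch (not part of the original script) of the "import only what
# changed" idea described at the top of this section; `newPhenopacketData` and
# the record below are hypothetical placeholders for processed phenopacket data
newPhenopacketData = [
    {'id': 'P0000123_original', 'clinical_status': True}  # made-up example
]
existingSubjects = {row['id']: row for row in freeze}
changedRecords = [
    record for record in newPhenopacketData
    if record['id'] in existingSubjects and any(
        record[key] != existingSubjects[record['id']].get(key)
        for key in record if key != 'id'
    )
]
statusMsg('Phenopacket records with changed values: {}'.format(len(changedRecords)))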

# pull HPO and disease codes, and then flatten
hpo_codes_raw = rd3.get(entity='rd3_phenotype', batch_size=10000)
disease_codes_raw = rd3.get(entity='rd3_disease', batch_size=10000)
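
# flatten the lookup codes using the same pattern as `freeze_ids` above
# (assumes each row exposes an `id` attribute)
hpo_codes = [row['id'] for row in hpo_codes_raw]
disease_codes = [row['id'] for row in disease_codes_raw]
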
# migrate data from one server to the other:
# pull data then switch tokens and restart connection
# portalData = rd3.get(releaseName,batch_size=10000)
# rd3.importData(entity='rd3_portal_release_freeze3', data=portalData)
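
# A fuller sketch of that migration, kept commented out like the lines above.
# The production credential variable names are assumptions that follow the
# pattern used in Example #1.
# portalData = rd3.get('rd3_portal_release_freeze3', batch_size=10000)
# rd3 = Molgenis(url=environ['MOLGENIS_PROD_HOST'])
# rd3.login(username=environ['MOLGENIS_PROD_USR'],
#           password=environ['MOLGENIS_PROD_PWD'])
# rd3.importData(entity='rd3_portal_release_freeze3', data=portalData)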

#//////////////////////////////////////////////////////////////////////////////

# ~ 0 ~
# Create Reference Datasets
# Pull reference tables to create mapping tables for recoding raw values into
# RD3 terminology. Add additional mappings as needed.

# ~ 0a ~
# Create ERN Mapping
erns = dt.Frame(rd3.get('rd3_ERN'))
del erns['_href']

# as key pair dictionary
ernMappings = toKeyPairs(
    data=erns[:, {'from': f.identifier, 'to': f.identifier}]
    .to_pandas()
    .to_dict('records'),
    keyAttr='from',
    valueAttr='to')

# Define additional ERN mappings based on past/present values. Each variation
# must be mapped to an existing ERN identifier. The format is:
# `'variation': 'RD3 ERN identifier'`
ernMappings.update({
    'ERN-CRANIO': 'ERNCRANIO',
    # add other observed variations here as needed
})
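
# example usage (hypothetical raw value): recode an incoming ERN label to its
# RD3 identifier, falling back to the original value when no mapping exists
rawErnValue = 'ERN-CRANIO'
recodedErn = ernMappings.get(rawErnValue, rawErnValue)
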
# Pull Data
# The novelomics releases come from rd3_portal_novelomics. Data
# is sent from EGA and Tubingen, and sometimes supplied by CNAG. To run this
# script, pull both novelomics portal tables, reference entities, and create
# a list of existing subject and sample IDs.
#
# Pull mapping tables or define them below.

# ~ 0a ~
# Pull portal tables
# After the initial run, make sure the query param is uncommented.
statusMsg('Pulling data from the portal....')

shipment = dt.Frame(
    rd3.get(entity='rd3_portal_novelomics_shipment',
            q='processed==False',
            batch_size=10000))

experiment = dt.Frame(
    rd3.get(entity='rd3_portal_novelomics_experiment',
            q='processed==False',
            batch_size=10000))

del shipment['_href']
del experiment['_href']
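
# optional status report (an addition, not from the original script)
statusMsg('Shipment records pulled: {}'.format(shipment.nrows))
statusMsg('Experiment records pulled: {}'.format(experiment.nrows))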

# ~ 0b ~
# Build Patch Information
# Determine if there are any new releases based on the type of analysis. If there
# are, stop this script and complete the following:
#   1. Determine if this is an actual new study or if this data should be

# map source column names to RD3 attribute names (variable name assumed)
columnMappings = {
    'sample_id': 'sampleID',
    'participant_subject': 'subjectID',
    'pathological state': 'pathologicalState',
    'tumor cell fraction': 'percentageTumorCells'
}
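
# a sketch of how the mapping could be applied, assuming it is meant to rename
# the incoming `newData` columns; datatable's `names` setter accepts a dict of
# {old_name: new_name}
# newData.names = columnMappings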

newData[:, dt.update(sampleID=as_type(f.sampleID, str))]

newData.key = 'sampleID'

# ~ 1b ~
# Pull the deepwes data from RD3
# Unnest reference attributes and set key

samples = rd3.get(entity='rd3_noveldeepwes_sample',
                  attributes='id,sampleID,subject',
                  batch_size=10000)

for row in samples:
    row['subject'] = row['subject']['subjectID']

samples = dt.Frame(samples)
del samples['_href']

samples.key = 'sampleID'

# ~ 1c ~
# Join datasets
newSamplesData = samples[:, :, dt.join(newData)]
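
# optional check (an addition, not from the original script): RD3 samples with
# no matching incoming record will have missing values in the joined columns,
# assuming the incoming data carries a `subjectID` column as mapped above
unmatchedSamples = newSamplesData[dt.isna(f.subjectID), :]
statusMsg('RD3 samples without new deepwes data: {}'.format(unmatchedSamples.nrows))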

# recode attribute
# Pull subject metadata from `rd3_freeze[x]_subject`, where `[x]` is the freeze
# that the new PED files are tied to (e.g., `rd3_freeze2_subject`). The
# following attributes are pulled:
#
#   - `id`: the molgenis row ID; a concatenation of subject ID and release
#   - `subjectID`: RD3 P number
#   - `sex1`: patient's sex
#   - `fid`: family ID
#
# It isn't necessary to run extensive checks that compare PED file data with
# the values that are in RD3 as PED files should be considered the most
# up to date.

# pull subject metadata for the current freeze
freeze_subject_metadata = rd3.get(
    entity=paths['rd3_subjects'],
    # q = 'patch=freeze1_patch1',
    attributes='id,subjectID,sex1,fid',
    batch_size=10000)

# flatten subjectIDs for faster comparison later on
subject_ids = [row['subjectID'] for row in freeze_subject_metadata]
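
# illustration (hypothetical subject ID): use the flattened IDs to confirm that
# a subject from a PED file exists in RD3 before processing it
pedSubjectId = 'P0000123'  # made-up example
if pedSubjectId not in subject_ids:
    statusMsg('Subject {} is not in RD3; skipping'.format(pedSubjectId))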

# In addition to subject metadata, it is important to pull file metadata to identify
# which files have changed and should be processed. We will pull the following
# attributes:
#
#   - `EGA`: the EGA file ID
#   - `name`: the full name of the file
#   - `md5`: checksum
#
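# The file-metadata pull described above; `rd3_freeze1_file` is an assumed
# entity name that follows the freeze-table pattern used elsewhere (adjust it
# to the release being processed).
file_metadata = rd3.get(entity='rd3_freeze1_file',
                        attributes='EGA,name,md5',
                        batch_size=10000)
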
availableReleases = regularReleases + novelomicsReleases

statusMsg('Pulling metadata....')

# fetch subject metadata
subjects = []
for release in availableReleases:
    statusMsg('Fetching subject metadata for', release)
    data = rd3.get(
        entity=f"rd3_{release}_subject",
        batch_size=10000,
        attributes=','.join([
            'id', 'subjectID',
            'sex1',
            'fid', 'mid', 'pid',
            'clinical_status', 'disease', 'phenotype', 'hasNotPhenotype',
            'organisation', 'ERN',
            'solved',
            'patch'
        ])
    )

    # clean data: flatten nested reference objects and collapse mref lists
    for row in data:
        row['sex1'] = row.get('sex1', {}).get('identifier')
        row['mid'] = row.get('mid', {}).get('id')
        row['pid'] = row.get('pid', {}).get('id')
        if row.get('disease'):
            row['disease'] = ','.join([record['id'] for record in row['disease']])
        else:
            row['disease'] = None  # assumption: clear the field when no codes are present