import re
import subprocess

# `addForwardSlash` and `statusMsg` are helpers defined elsewhere in this module


def listFiles(path, filter: str = None):
    """List Files
    List all files at a given path

    @param path location to directory
    @param filter a string containing a file type used to filter results
        (can also be a pattern)

    @return a list of dictionaries
    """
    available_files = subprocess.Popen(
        ['ssh', 'corridor+fender', 'ls', path],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    path = addForwardSlash(path)
    data = []
    for f in available_files.stdout:
        data.append({'filename': f.strip(), 'filepath': path + f.strip()})
    available_files.kill()
    if filter:
        # compile the pattern once and apply it to each filename
        q = re.compile(filter)
        filtered = [d for d in data if q.search(d['filename'])]
        statusMsg('Found {} files'.format(len(filtered)))
        return filtered
    else:
        statusMsg('Found {} files'.format(len(data)))
        return data
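
# Usage sketch (hypothetical path): the `filter` argument is compiled as a
# regular expression, so a file extension can be passed as a pattern.
# jsonFiles = listFiles(path='/groups/example/release1', filter=r'\.json$')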
Example #2
import json
import subprocess
from os.path import basename


def _checkResponseStatus(self, response, label):
    if (response.status_code // 100) != 2:
        err = response.json().get('errors')[0].get('message')
        statusMsg(
            f'Failed to import data into {label} ({response.status_code}): {err}'
        )
    else:
        statusMsg(f'Imported data into {label}')

def readJson(path: str = None):
    """Read Json file
    Read a json file located on the cluster

    @param path location of the file
    @return list containing contents of a json file
    """
    proc = subprocess.Popen(['ssh', 'corridor+fender', 'cat', path],
                            stdout=subprocess.PIPE,
                            stdin=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    try:
        raw = proc.communicate(timeout=15)
        data = json.loads(raw[0])
        proc.kill()
        return data
    except subprocess.TimeoutExpired:
        statusMsg('Error: unable to fetch file {}'.format(
            str(basename(path))))
        proc.kill()
        return ''
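
# Usage sketch (hypothetical path): readJson reads the file over SSH, so the
# path must exist on the cluster; a timeout returns an empty string instead.
# phenopacket = readJson(path='/groups/example/phenopackets/P0000001.json')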
Example #4
# manual review. These codes will either need to be added to the appropriate
# lookup table or need to be mapped to a new value. These cases should be
# reconciled before importing into RD3.
#
# Using the object `shouldProcess`, you can control which data elements are
# processed in the phenopacket files. This may be useful if you need to
# refresh only one type of element, such as the disease codes (see the
# illustrative loop sketch below).
#

# create list of all available JSON files
# allFiles = rd3tools.cluster_list_files(path = paths['cluster_phenopacket'])
allFiles = clustertools.listFiles(path=paths['cluster_phenopacket'])
phenopacketFiles = [
    file for file in allFiles if re.search(r'\.json$', file['filename'])
]
statusMsg('Found', len(phenopacketFiles), 'phenopacket files')

# init loop params and objects
shouldProcess = {
    'subject': True,
    'pheno': True,
    'dx': True
}  # set props to process
hpo_codes_not_found = []
disease_codes_not_found = []
onset_codes_not_found = []
unavailable = []
phenopackets = []

# start file processing
statusMsg('Starting file processing...')
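
# Illustrative sketch of how the loop could use the `shouldProcess` flags
# (the helpers `readPhenopacket`, `extractSubjectInfo`, `extractPhenotypicCodes`,
# and `extractDiseaseCodes` are hypothetical placeholders, not functions from
# this module): each element type is gated by its flag so that a single type
# of code can be refreshed without reprocessing everything else.
#
# for file in phenopacketFiles:
#     statusMsg('Processing {}'.format(file['filename']))
#     contents = readPhenopacket(file['filepath'])  # hypothetical helper
#     record = {'phenopacketsID': file['filename']}
#     if shouldProcess['subject']:
#         record.update(extractSubjectInfo(contents))
#     if shouldProcess['pheno']:
#         record.update(extractPhenotypicCodes(contents, hpo_codes_not_found))
#     if shouldProcess['dx']:
#         record.update(extractDiseaseCodes(contents, disease_codes_not_found))
#     phenopackets.append(record)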
Example #5
host = environ['MOLGENIS_ACC_HOST']
rd3 = Molgenis(url=host)
rd3.login(username=environ['MOLGENIS_ACC_USR'],
          password=environ['MOLGENIS_ACC_PWD'])

# pull RD3 data
files = rd3.get(entity='rd3_portal_cluster',
                q='type=="phenopacket"',
                attributes='release,name,type',
                batch_size=10000)

subjects = rd3.get(entity='rd3_freeze1_subject',
                   attributes='id,subjectID,patch',
                   batch_size=10000)

statusMsg('File metadata entries pulled: {}'.format(len(files)))
statusMsg('Subject metadata entries pulled: {}'.format(len(subjects)))

# extract subject ID
for file in files:
    file['subject'] = re.sub(
        pattern=r'((\.[0-9]{4}-[0-9]{2}-[0-9]{2})?(\.json))$',
        repl='',
        string=file['name'])
ids = [file['subject'] for file in files]
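
# For example (hypothetical file names), both 'P0000123.2021-06-01.json' and
# 'P0000123.json' are reduced to the subject ID 'P0000123'.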

# update patch for matching subjects
for s in subjects:
    if s['subjectID'] in ids:
        patches = ['freeze1_patch1']
        for patch in s.get('patch'):
# pull unique values
rawOrgs = dt.unique(release[:, 'organisation_name'])

# check organisations
rawOrgs['orgExists'] = dt.Frame([
    d in organisations['identifier'].to_list()[0]
    for d in rawOrgs['organisation_name'].to_list()[0]
])

# flag cases
if rawOrgs[f.orgExists == False, :].nrows:
    statusMsg(
        'Error in Organisation Validation:',
        rawOrgs[f.orgExists == False, :].nrows, 'values do not exist.',
        ','.join(
            map(
                str, rawOrgs[f.orgExists == False,
                             f.organisation_name].to_list()[0])))

# if all organisations have been reviewed, add the values to RD3 and recode
# the main dataset
newOrgs = rawOrgs[f.orgExists == False, {'name': f.organisation_name}]

# clean up values
newOrgs[['name', 'identifier']] = dt.Frame([
    d.strip().lower().replace(' ', '-') for d in newOrgs['name'].to_list()[0]
])
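
# For example (hypothetical value), an organisation entered as 'UMC Utrecht'
# becomes 'umc-utrecht', which is used for both the name and identifier columns.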

# recode organizations if applicable
# release['organisation_name'] = dt.Frame([
#//////////////////////////////////////////////////////////////////////////////

# ~ 0 ~
# Pull Data
# The source of the novelomics releases is rd3_portal_novelomics. Data
# is sent from EGA and Tubingen, and sometimes supplied by CNAG. To run this
# script, pull both novelomics portal tables, reference entities, and create
# a list of existing subject and sample IDs.
#
# Pull mapping tables or define them below.

# ~ 0a ~
# Pull portal tables
# After the initial run, make sure the query param (q='processed==False') is
# uncommented so that only unprocessed records are pulled.
statusMsg('Pulling data from the portal....')

shipment = dt.Frame(
    rd3.get(entity='rd3_portal_novelomics_shipment',
            q='processed==False',
            batch_size=10000))

experiment = dt.Frame(
    rd3.get(entity='rd3_portal_novelomics_experiment',
            q='processed==False',
            batch_size=10000))

del shipment['_href']
del experiment['_href']

# ~ 0b ~
#
# See https://zzz.bwh.harvard.edu/plink/data.shtml for more information.
#
# By default all rows are given an "upload" status. If any of the validation
# steps fail, the upload status is set to FALSE. See the script `rd3tools.py`
# for more information on how values are validated.
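
# A minimal sketch of that pattern, assuming a parsed list of PED rows
# (`validateSex` is a hypothetical placeholder; the actual checks live in
# `rd3tools.py`):
#
# for row in ped_rows:
#     row['upload'] = True
#     if not validateSex(row['sex']):
#         statusMsg('Invalid sex value for {}'.format(row['id']))
#         row['upload'] = False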

# Create a list of all PED files that are stored at a specific path on the
# cluster. Make sure all non-PED files are removed.
available_ped_files_raw = clustertools.listFiles(path=paths['cluster_ped'])
available_ped_files = []
for file in available_ped_files_raw:
    if re.search(r'(\.ped|\.ped\.cip)$', file.get('filename')):
        available_ped_files.append(file)

statusMsg(f'Processing {len(available_ped_files)} PED files')

# ~ 2a ~
# For each file, extract contents, validate data and transform into RD3
# terminology.
raw_ped_data = []
starttime = datetime.utcnow().strftime('%H:%M:%S.%f')[:-4]
for pedfile in available_ped_files:
    statusMsg('Processing file {}'.format(pedfile['filename']))
    # result = rd3tools.find_dict(
    #     data = freeze_files_metadata,
    #     attr = 'filename',
    #     value = pedfile['filename'] + '.cip'
    # )
    # should_process = False
    # if result:
    createUrlFilter
)

from datatable import dt, f, first
from os import environ
from dotenv import load_dotenv
import numpy as np
import urllib
from tqdm import tqdm


# ~ 0 ~ 
# Fetch Metadata for all releases

# init database connection
statusMsg('Connecting to RD3....')
load_dotenv()

# rd3=Molgenis(url=environ['MOLGENIS_ACC_HOST'])
# rd3.login(
#     username=environ['MOLGENIS_ACC_USR'],
#     password=environ['MOLGENIS_ACC_PWD']
# )

rd3 = Molgenis(url=environ['MOLGENIS_PROD_HOST'])
rd3.login(
    username=environ['MOLGENIS_PROD_USR'],
    password=environ['MOLGENIS_PROD_PWD']
)