import re
import subprocess


def listFiles(path, filter: str = None):
    """List Files
    List all files at a given path

    @param path location of the directory
    @param filter a string containing a file type used to filter results
        (can also be a pattern)

    @return a list of dictionaries
    """
    available_files = subprocess.Popen(
        ['ssh', 'corridor+fender', 'ls', path],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    path = addForwardSlash(path)
    data = []
    for f in available_files.stdout:
        data.append({'filename': f.strip(), 'filepath': path + f.strip()})
    available_files.kill()
    if filter:
        # compile the pattern once rather than on every iteration
        q = re.compile(filter)
        filtered = [d for d in data if re.search(q, d['filename'])]
        statusMsg('Found {} files'.format(len(filtered)))
        return filtered
    statusMsg('Found {} files'.format(len(data)))
    return data
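# Usage sketch for listFiles. The path and pattern below are illustrative
# placeholders, not actual cluster locations:
#
#   pedFiles = listFiles(path='/groups/example/ped', filter=r'(\.ped)$')
#   for file in pedFiles:
#       print(file['filename'], file['filepath'])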
def _checkResponseStatus(self, response, label):
    if (response.status_code // 100) != 2:
        err = response.json().get('errors')[0].get('message')
        statusMsg(
            f'Failed to import data into {label} ({response.status_code}): {err}'
        )
    else:
        statusMsg(f'Imported data into {label}')
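# Hedged usage sketch: this method presumably runs after an import request.
# `self.session` and `url` are assumptions for illustration, not confirmed
# attributes of the client class:
#
#   response = self.session.post(url, json=payload)
#   self._checkResponseStatus(response, label='rd3_freeze1_subject')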
import json
from os.path import basename


def readJson(path: str = None):
    """Read Json File
    Read a json file located on the cluster

    @param path location of the file

    @return list containing contents of a json file
    """
    proc = subprocess.Popen(
        ['ssh', 'corridor+fender', 'cat', path],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    try:
        raw = proc.communicate(timeout=15)
        data = json.loads(raw[0])
        proc.kill()
        return data
    except subprocess.TimeoutExpired:
        statusMsg('Error: unable to fetch file {}'.format(basename(path)))
        proc.kill()
        return ''
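# Usage sketch (the path is an illustrative placeholder). On timeout the
# function returns an empty string, so truthiness can be used as a guard:
#
#   contents = readJson(path='/groups/example/metadata/subject.json')
#   if contents:
#       print(len(contents))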
# manual review. These codes will either need to be added to the appropriate
# lookup table or need to be mapped to a new value. These cases should be
# reconciled before importing into RD3.
#
# Using the object `shouldProcess`, you can process certain data elements
# in the phenopacket files. This may be useful if you need to refresh only
# the HPO codes or the disease codes. (A gating sketch follows the setup
# below.)

# create list of all available JSON files
# allFiles = rd3tools.cluster_list_files(path = paths['cluster_phenopacket'])
allFiles = clustertools.listFiles(path=paths['cluster_phenopacket'])
phenopacketFiles = [
    file for file in allFiles
    if re.search(r'(\.json)$', file['filename'])
]
statusMsg('Found', len(phenopacketFiles), 'phenopacket files')

# init loop params and objects
shouldProcess = {'subject': True, 'pheno': True, 'dx': True}  # props to process
hpo_codes_not_found = []
disease_codes_not_found = []
onset_codes_not_found = []
unavailable = []
phenopackets = []

# start file processing
statusMsg('Starting file processing...')
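# A minimal sketch of how `shouldProcess` might gate the processing loop.
# `extractSubjectInfo`, `extractPhenotypicInfo`, and `extractDiseaseInfo`
# are hypothetical helper names used for illustration only:
#
#   for file in phenopacketFiles:
#       contents = clustertools.readJson(path=file['filepath'])
#       result = {}
#       if shouldProcess['subject']:
#           result['subject'] = extractSubjectInfo(contents)
#       if shouldProcess['pheno']:
#           result['pheno'] = extractPhenotypicInfo(contents)
#       if shouldProcess['dx']:
#           result['dx'] = extractDiseaseInfo(contents)
#       phenopackets.append(result)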
host = environ['MOLGENIS_ACC_HOST']
rd3 = Molgenis(url=host)
rd3.login(
    username=environ['MOLGENIS_ACC_USR'],
    password=environ['MOLGENIS_ACC_PWD'])

# pull RD3 data
files = rd3.get(
    entity='rd3_portal_cluster',
    q='type=="phenopacket"',
    attributes='release,name,type',
    batch_size=10000)

subjects = rd3.get(
    entity='rd3_freeze1_subject',
    attributes='id,subjectID,patch',
    batch_size=10000)

statusMsg('File metadata entries pulled: {}'.format(len(files)))
statusMsg('Subject metadata entries pulled: {}'.format(len(subjects)))

# extract subject ID by stripping the optional date stamp and .json extension
for file in files:
    file['subject'] = re.sub(
        pattern=r'((\.[0-9]{4}-[0-9]{2}-[0-9]{2})?(\.json))$',
        repl='',
        string=file['name'])

ids = [file['subject'] for file in files]

# update patch
for s in subjects:
    if s['subjectID'] in ids:
        patches = ['freeze1_patch1']
        for patch in s.get('patch'):
# pull unique values
rawOrgs = dt.unique(release[:, 'organisation_name'])

# check organisations
rawOrgs['orgExists'] = dt.Frame([
    d in organisations['identifier'].to_list()[0]
    for d in rawOrgs['organisation_name'].to_list()[0]
])

# flag cases
if rawOrgs[f.orgExists == False, :].nrows:
    statusMsg(
        'Error in Organisation Validation:',
        rawOrgs[f.orgExists == False, :].nrows,
        'values do not exist:',
        ','.join(
            map(str,
                rawOrgs[f.orgExists == False, f.organisation_name].to_list()[0])))

# if all organisations have been reviewed, add the values to RD3 and recode
# the main dataset
newOrgs = rawOrgs[f.orgExists == False, {'name': f.organisation_name}]

# clean up values
newOrgs[['name', 'identifier']] = dt.Frame([
    d.strip().lower().replace(' ', '-')
    for d in newOrgs['name'].to_list()[0]
])

# recode organisations if applicable (see the sketch below)
# release['organisation_name'] = dt.Frame([
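# A minimal sketch of the recode hinted at in the commented line above,
# assuming `newOrgs` now holds cleaned name/identifier pairs. The mapping
# approach is an assumption for illustration, not the confirmed
# implementation:
#
#   orgMappings = dict(zip(
#       newOrgs['name'].to_list()[0],
#       newOrgs['identifier'].to_list()[0]
#   ))
#   release['organisation_name'] = dt.Frame([
#       orgMappings.get(d, d)
#       for d in release['organisation_name'].to_list()[0]
#   ])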
#//////////////////////////////////////////////////////////////////////////////

# ~ 0 ~
# Pull Data
# The novelomics releases come from rd3_portal_novelomics. Data is sent from
# EGA and Tubingen, and sometimes supplied by CNAG. To run this script, pull
# both novelomics portal tables and the reference entities, and create a list
# of existing subject and sample IDs.
#
# Pull mapping tables or define them below.

# ~ 0a ~
# Pull portal tables
# After the initial run, make sure the query param is uncommented.
statusMsg('Pulling data from the portal....')

shipment = dt.Frame(
    rd3.get(
        entity='rd3_portal_novelomics_shipment',
        q='processed==False',
        batch_size=10000))

experiment = dt.Frame(
    rd3.get(
        entity='rd3_portal_novelomics_experiment',
        q='processed==False',
        batch_size=10000))

# drop the Molgenis metadata column added by the API
del shipment['_href']
del experiment['_href']

# ~ 0b ~
#
# See https://zzz.bwh.harvard.edu/plink/data.shtml for more information.
#
# By default, all rows are given an "upload" status. If any of the validation
# steps fail, the upload status is set to FALSE. See the script `rd3tools.py`
# for more information on how values are validated.

# Create a list of all PED files that are stored at a specific path on the
# cluster. Make sure all non-PED files are removed. (A quick illustration of
# the filename pattern appears after this block.)
available_ped_files_raw = clustertools.listFiles(path=paths['cluster_ped'])
available_ped_files = []
for file in available_ped_files_raw:
    if re.search(r'(\.ped|\.ped\.cip)$', file.get('filename')):
        available_ped_files.append(file)

statusMsg(f'Processing {len(available_ped_files)} PED files')

# ~ 2a ~
# For each file, extract contents, validate data, and transform into RD3
# terminology.
raw_ped_data = []
starttime = datetime.utcnow().strftime('%H:%M:%S.%f')[:-4]
for pedfile in available_ped_files:
    statusMsg('Processing file {}'.format(pedfile['filename']))
    # result = rd3tools.find_dict(
    #     data = freeze_files_metadata,
    #     attr = 'filename',
    #     value = pedfile['filename'] + '.cip'
    # )
    # should_process = False
    # if result:
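# Quick illustration of the PED filename pattern used above (the filenames
# are made-up examples):
#
#   re.search(r'(\.ped|\.ped\.cip)$', 'FAM0001.ped')      # match
#   re.search(r'(\.ped|\.ped\.cip)$', 'FAM0001.ped.cip')  # match
#   re.search(r'(\.ped|\.ped\.cip)$', 'FAM0001.json')     # no match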
    createUrlFilter
)
from datatable import dt, f, first
from os import environ
from dotenv import load_dotenv
import numpy as np
import urllib
from tqdm import tqdm

# ~ 0 ~
# Fetch Metadata for all releases

# init database connection
statusMsg('Connecting to RD3....')
load_dotenv()

# rd3 = Molgenis(url=environ['MOLGENIS_ACC_HOST'])
# rd3.login(
#     username=environ['MOLGENIS_ACC_USR'],
#     password=environ['MOLGENIS_ACC_PWD']
# )

rd3 = Molgenis(url=environ['MOLGENIS_PROD_HOST'])
rd3.login(
    username=environ['MOLGENIS_PROD_USR'],
    password=environ['MOLGENIS_PROD_PWD']
)