[antigen_type]_where_homozygous.npy: a boolean array indicating which columns (tiles) were kept from the
    population after removing poorly sequenced columns. If settings.REMOVE_HOMOZYGOUS==False, the array
    will be entirely true
python_variables.py: python file defining training_names and test_names
"""
import helping_functions as fns
from sklearn import preprocessing
import arvados # Import the Arvados sdk module
import imp
import os
import time
import numpy as np
########################################################################################################################
# Read constants
# 'num-retries' job parameter, coerced to int; passed to the output collection below.
NUM_RETRIES = int(arvados.getjobparam('num-retries'))
# NOTE(review): `assert` statements are removed when Python runs with -O, so this check
# is skipped in optimized mode.
assert NUM_RETRIES > 0, "'num-retries' must be strictly positive"
# 'antigen-type' job parameter, coerced to str.
antigen_type = str(arvados.getjobparam('antigen-type'))
########################################################################################################################
#Set-up collection and logging file to write out to
out = arvados.collection.Collection(num_retries=NUM_RETRIES)
# Handle on 'time_log.txt' inside the collection, opened for writing; receives timing lines.
time_logging_fh = out.open('time_log.txt', 'w')
########################################################################################################################
# Load settings
t0 = time.time()
# Load the file at the path returned by get_job_param_mount('settings') as a module named 'settings'.
settings = imp.load_source('settings', arvados.get_job_param_mount('settings'))
t1 = time.time()
# Record how long (in seconds) the settings load took.
time_logging_fh.write('Loading settings %fs\n' % (t1 - t0))
########################################################################################################################
#Get path lengths and path integers
#!/usr/bin/env python # Import the hashlib module (part of the Python standard library) to compute md5. import hashlib # Import the Arvados sdk module import arvados # Get information about the task from the environment this_task = arvados.current_task() # Get the "input" field from "script_parameters" on the job creation object this_job_input = arvados.getjobparam('input') # Create the object access to the collection referred to in the input collection = arvados.CollectionReader(this_job_input) # Create an object to write a new collection as output out = arvados.CollectionWriter() # Set the name of output file within the collection out.set_current_file_name("md5sum.txt") # Get an iterator over the files listed in the collection all_files = collection.all_files() # Iterate over each file for input_file in all_files: # Create the object that will actually compute the md5 hash digestor = hashlib.new('md5')
import json
import os
import time
import numpy as np


def get_bool(param):
    """Parse a job parameter as a boolean.

    Accepts the strings 'true'/'t' and 'false'/'f' in any letter case, with
    optional surrounding whitespace. A value that is already a ``bool`` is
    returned unchanged, so a parameter supplied as a real JSON boolean also
    works.

    Args:
        param: the raw parameter value, normally a string.

    Returns:
        True or False.

    Raises:
        ValueError: if ``param`` is not a recognised spelling, including
            non-string values such as None. ValueError is a subclass of
            Exception, so callers catching Exception are unaffected.
    """
    if isinstance(param, bool):
        return param
    try:
        # Normalise once instead of calling .lower() in every comparison.
        normalized = param.strip().lower()
    except AttributeError:
        # Not string-like (e.g. None when the parameter is missing); fall
        # through to the descriptive error below.
        normalized = None
    if normalized in ('true', 't'):
        return True
    if normalized in ('false', 'f'):
        return False
    raise ValueError("%s cannot be parsed as boolean. Please use true/false." % (param,))


# Read constants
########################################################################################################################
#Get name to save png file under
PNG_NAME = arvados.getjobparam('png-name')
#Get whether we should use quality or regular numpy files
QUALITY = get_bool(arvados.getjobparam('quality'))
#Get percentage of the tiles we should use in PCA
PERCENT_TO_RETRIEVE = float(arvados.getjobparam('percent-to-analyze'))
#Get if we should only load certain parts of the genome
# Defaults to an empty list when 'accepted-paths' is not supplied.
ACCEPTED_PATHS = []
input_accepted_paths = arvados.getjobparam('accepted-paths')
if input_accepted_paths is not None:
    try:
        ACCEPTED_PATHS = json.loads(input_accepted_paths)
    except ValueError:
        raise Exception("Unable to read 'accepted-paths' input as json input: %s" % (input_accepted_paths))
    # Each entry is unpacked as a (lower_path, upper_path) pair of hex strings.
    for lower_path, upper_path in ACCEPTED_PATHS:
        assert re.match('^[0-9a-f]+$', lower_path) is not None, \
            "'accepted-paths' input contains an incorrect path hex string (%s) (it does not match '^[0-9a-f]+$')" % (lower_path)
#!/usr/bin/python

import arvados
import re
import hashlib
import string

api = arvados.api('v1')

piece = 0
manifest_text = ""

# Look for paired reads

# Reader over the collection named by the 'reads' job parameter.
inp = arvados.CollectionReader(arvados.getjobparam('reads'))

manifest_list = []


def nextline(reader, start):
    """Return the offset of the next newline in ``reader``, relative to ``start``.

    The data is scanned in reads of up to 128 bytes using
    ``reader.readfrom(offset, size)``.

    Args:
        reader: object exposing ``readfrom(offset, size)`` that returns a
            string (empty once no more data is available).
        start: absolute offset at which to begin scanning.

    Returns:
        The number of characters between ``start`` and the first newline at
        or after it, or -1 if the data ends before a newline is found.
    """
    scanned = 0
    while True:
        chunk = reader.readfrom(start + scanned, 128)
        if not chunk:
            # Ran out of data without seeing a newline.
            return -1
        # str.find replaces string.find, which no longer exists in Python 3.
        pos = chunk.find("\n")
        if pos > -1:
            # Report the position relative to the caller's ``start``, not
            # relative to the chunk that happened to contain the newline.
            return scanned + pos
        # Advance by what was actually read so a short read skips nothing.
        scanned += len(chunk)
import matplotlib as mpl # must be imported first to ensure matplotlib visualization works mpl.use('Agg') import matplotlib.pyplot as plt import helping_functions as fns from sklearn import cross_validation, preprocessing, grid_search, base import arvados # Import the Arvados sdk module import re # used for error checking import imp import os import time import numpy as np ######################################################################################################################## # Read constants NUM_RETRIES = int(arvados.getjobparam('num-retries')) assert NUM_RETRIES > 0, "'num-retries' must be strictly positive" antigen_type = str(arvados.getjobparam('antigen-type')) ######################################################################################################################## #Set-up collection and logging file to write out to out = arvados.collection.Collection(num_retries=NUM_RETRIES) time_logging_fh = out.open('time_log.txt', 'w') info_fh = out.open('log.txt', 'w') ######################################################################################################################## # Load settings t0 = time.time() settings = imp.load_source('settings', arvados.get_job_param_mount('settings')) t1 = time.time() time_logging_fh.write('Loading settings %fs\n' %(t1-t0)) ########################################################################################################################
#!/usr/bin/python import arvados import re import hashlib import string api = arvados.api('v1') piece = 0 manifest_text = "" # Look for paired reads inp = arvados.CollectionReader(arvados.getjobparam('reads')) manifest_list = [] chunking = False #arvados.getjobparam('chunking') def nextline(reader, start): n = -1 while True: r = reader.readfrom(start, 128) if r == '': break n = string.find(r, "\n") if n > -1: break else:
import matplotlib as mpl # must be imported first to ensure matplotlib visualization works mpl.use('Agg') import matplotlib.pyplot as plt import helping_functions as fns from sklearn import cross_validation, preprocessing, grid_search, base import arvados # Import the Arvados sdk module import re # used for error checking import imp import os import time import numpy as np ######################################################################################################################## # Read constants NUM_RETRIES = int(arvados.getjobparam('num-retries')) assert NUM_RETRIES > 0, "'num-retries' must be strictly positive" antigen_type = str(arvados.getjobparam('antigen-type')) ######################################################################################################################## #Set-up collection and logging file to write out to out = arvados.collection.Collection(num_retries=NUM_RETRIES) time_logging_fh = out.open('time_log.txt', 'w') info_fh = out.open('log.txt', 'w') ######################################################################################################################## # Load settings t0 = time.time() settings = imp.load_source('settings', arvados.get_job_param_mount('settings')) t1 = time.time() time_logging_fh.write('Loading settings %fs\n' % (t1 - t0))
import numpy as np


def convert_to_tile_int(position, path_lengths):
    """Convert a flat tile position into a combined path/step integer.

    The path index is the first ``i`` for which ``position`` does not exceed
    ``path_lengths[i + 1]``; the step is ``position - path_lengths[i]``.
    The result is the integer whose hexadecimal spelling is the path
    (zero-padded to 3 digits) followed by the step (zero-padded to 4 digits).

    Args:
        position: integer position to convert.
        path_lengths: indexable sequence of boundaries; presumably cumulative
            tile counts starting at 0 -- TODO confirm against the caller.

    Returns:
        int built from the concatenated hex path and hex step.

    Raises:
        IndexError: if ``position`` is greater than every boundary.
    """
    def _hex_digits(value, width):
        # Lower-case hex without a '0x' prefix, left-padded with zeros.
        return format(value, 'x').zfill(width)

    path_index = 0
    # NOTE(review): the comparison is strict, so a position exactly equal to a
    # boundary stays in the lower path; confirm this matches the intended
    # 0-based/1-based convention.
    while position > path_lengths[path_index + 1]:
        path_index += 1
    step_in_path = position - path_lengths[path_index]
    return int(_hex_digits(path_index, 3) + _hex_digits(step_in_path, 4), 16)


########################################################################################################################
# Job parameters
NUM_RETRIES = int(arvados.getjobparam('num-retries'))
assert NUM_RETRIES > 0, "'num-retries' must be strictly positive"
antigen_type = str(arvados.getjobparam('antigen-type'))

########################################################################################################################
# Output collection and the files written into it
out = arvados.collection.Collection(num_retries=NUM_RETRIES)
time_logging_fh = out.open('time_log.txt', 'w')
info_fh = out.open('log.txt', 'w')
variable_fh = out.open('classifier_variables.py', 'w')

########################################################################################################################
# Load the settings module and log how long that took
t0 = time.time()
settings = imp.load_source('settings', arvados.get_job_param_mount('settings'))
t1 = time.time()
time_logging_fh.write('Loading settings %fs\n' % (t1 - t0))