def extract_seq(args):
    # Download the file from Shock to the working directory.
    if args['nodeId'] is not None:
        shockClient = ShockClient(args['shockUrl'], args['auth'])
        shockClient.download_to_path(args['nodeId'], args['sourceFile'])

    # Extract the sequences from the source file.
    numReads = 0
    with open(args['destFile'], 'w') as f:
        if args['sequenceLen'] > 0:  # A length to trim to was specified
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                seq = str(seqRecord.seq)
                if len(seq) < args['sequenceLen']:
                    continue
                if len(seq) > args['sequenceLen']:
                    seq = seq[:args['sequenceLen']]
                f.write(seq + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        elif args['maxReads'] > 0:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        else:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')
                numReads += 1  # count reads here too so the minimum-reads check below works

    # Delete the file if it does not have enough reads.
    if args['minReads'] > 0 and numReads < args['minReads']:
        os.remove(args['destFile'])
    return 0
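# For reference, a minimal usage sketch of extract_seq() on a local (non-Shock) file.
# Everything below is hypothetical; the file names and parameter values are
# illustrative only and show the keys the args dictionary is expected to carry.
from Bio import SeqIO   # extract_seq() parses sequences with Biopython's SeqIO

example_args = {
    'nodeId': None,                  # no Shock node, so read a local file instead
    'shockUrl': None,                # unused when nodeId is None
    'auth': None,                    # unused when nodeId is None
    'format': 'fasta',               # any format accepted by SeqIO.parse()
    'sequenceLen': 100,              # trim reads to 100 bases and skip shorter reads
    'minReads': 10,                  # delete the output if fewer than 10 reads survive
    'maxReads': 1000,                # stop after extracting 1000 reads
    'sourceFile': 'reads1.fasta',    # hypothetical input file
    'destFile': 'reads1.sequence',   # one bare sequence per line is written here
}
extract_seq(example_args)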
def loadDatabaseFiles(self, mylog):
    ''' Load the static database files from Shock.

        The static database files are stored in the directory specified by the
        data_folder_path configuration variable.  A file is only downloaded if
        the file is not available on this system or the file has been updated
        in Shock.

        @param mylog Log object for messages
        @return Nothing
        @raise MissingFileError when database file is not found in Shock
    '''

    # Get the current info about the static database files from the cache file.
    cacheFilename = self.StatusFiles['cache_file']
    if os.path.exists(cacheFilename):
        fileCache = json.load(open(cacheFilename, 'r'))
    else:
        fileCache = dict()

    # Create a shock client.
    shockClient = ShockClient(self.shockURL)

    # See if the static database files on this system are up-to-date with files stored in Shock.
    shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
    for key in shockFiles:
        # Get info about the file stored in Shock.
        localPath = shockFiles[key]
        name = os.path.basename(localPath)
        nodelist = shockClient.query_node({'lookupname': 'ProbAnnoData/' + name})
        if len(nodelist) == 0:
            message = 'Database file %s is not available from %s\n' % (name, self.shockURL)
            mylog.log_message(log.ERR, message)  # MBM
            raise MissingFileError(message)
        node = nodelist[0]

        # Download the file if the checksum does not match or the file is not available on this system.
        download = False
        if key in fileCache:
            if node['file']['checksum']['md5'] != fileCache[key]['file']['checksum']['md5']:
                download = True
        else:
            download = True
        if not os.path.exists(localPath):
            download = True
        if download:
            sys.stderr.write('Downloading %s to %s\n' % (key, localPath))
            shockClient.download_to_path(node['id'], localPath)
            fileCache[key] = node
            mylog.log_message(log.INFO, 'Downloaded %s to %s' % (key, localPath))

    # Save the updated cache file.
    json.dump(fileCache, open(cacheFilename, 'w'), indent=4)
    return
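# A sketch (an assumption, not taken from the Shock API docs) of the minimal shape of
# one fileCache entry that the checksum comparison in loadDatabaseFiles() relies on.
# The key name is hypothetical; only the fields the method actually reads are shown.
example_cache_entry = {
    'otu_id_file': {                  # hypothetical logical file key
        'id': 'abc123',               # Shock node id passed to download_to_path()
        'file': {
            'checksum': {'md5': 'd41d8cd98f00b204e9800998ecf8427e'},
        },
    },
}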
def test_buildmatrix(self):
    ''' Run build_matrix() with four simple sequence files and verify the returned distance matrix. '''

    # Create a client.
    cbdClient = CompressionBasedDistance(self._config['cbd_url'],
                                         user_id=self._config['test_user'],
                                         password=self._config['test_pwd'])
    token = cbdClient._headers['AUTHORIZATION']

    # Create the input parameters.
    input = dict()
    input['format'] = 'fasta'
    input['scale'] = 'std'
    input['sequence_length'] = 0
    input['min_reads'] = 0
    input['max_reads'] = 0
    input['extreme'] = 1
    input['node_ids'] = list()

    # Upload the files to Shock.
    shockClient = ShockClient(self._config['shock_url'], token)
    for filename in InputFiles:
        node = shockClient.create_node(filename, '')
        input['node_ids'].append(node['id'])

    # Run the build_matrix() function to generate a distance matrix.
    jobid = cbdClient.build_matrix(input)

    # Wait for the distance matrix to be built.
    time.sleep(30)

    # Get the distance matrix and save it to a file.
    outputPath = 'client-tests/unittest.csv'
    args = [ os.path.join(os.environ['KB_TOP'], 'bin/cbd-getmatrix'), jobid, outputPath ]
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (so, se) = proc.communicate()
    if proc.returncode != 0:
        print so
        print se
    self.assertEqual(proc.returncode, 0)

    # Confirm the returned distance matrix matches the saved valid output.
    vf = open('client-tests/output.csv', 'r')
    tf = open(outputPath, 'r')
    for vline in vf:
        tline = tf.readline()
        self.assertEqual(vline, tline)
    self.assertEqual(tf.readline(), '')
    vf.close()
    tf.close()
    os.remove(outputPath)
def storeDatabaseFiles(self, token):
    ''' Store the static database files to Shock.

        @param token: Authorization token for authenticating to shock
        @return Nothing
    '''

    # Create a shock client.
    shockClient = ShockClient(self.shockURL, token=token)

    # Upload all of the static database files to shock.
    fileCache = dict()
    shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
    for key in shockFiles:
        localPath = shockFiles[key]
        name = os.path.basename(localPath)
        if os.path.exists(localPath):
            sys.stderr.write('Saving "%s"...' % (localPath))

            # See if the file already exists in Shock.
            query = {'lookupname': LOOKUP_NAME_PREFIX + '/' + name}
            nodelist = shockClient.query_node(query)

            # Remove all instances of the file in Shock.
            if nodelist is not None:
                for node in nodelist:
                    shockClient.delete_node(node['id'])

            # Build the attributes for this file and store as json in a separate file.
            moddate = time.ctime(os.path.getmtime(localPath))
            attr = {'lookupname': LOOKUP_NAME_PREFIX + '/' + name, 'moddate': moddate}
            attrFilename = os.path.join(self.dataFolderPath, name + '.attr')
            attrFid = open(attrFilename, 'w')
            json.dump(attr, attrFid, indent=4)
            attrFid.close()

            # Upload the file to Shock.
            metadata = shockClient.create_node(localPath, attrFilename)
            fileCache[key] = metadata
            os.remove(attrFilename)

            # Remove the list of users from the read ACL to give the file public read permission.
            # Note this needs to change for Shock version 0.9.5 but not sure how to set public ACLs.
            readacl = shockClient.get_acl(metadata['id'])
            shockClient.delete_acl(metadata['id'], 'read', readacl['read'][0])
            sys.stderr.write('done\n')

        else:
            sys.stderr.write('Could not find "%s" so it was not saved\n' % (localPath))

    # Save the metadata on all of the database files.
    cacheFilename = os.path.join(self.dataFolderPath, self.StatusFiles['cache_file'])
    json.dump(fileCache, open(cacheFilename, 'w'), indent=4)
    return
input['sequence_length'] = args.sequenceLen
input['min_reads'] = args.minReads
input['max_reads'] = args.maxReads
if args.extreme:
    input['extreme'] = 1
else:
    input['extreme'] = 0
input['node_ids'] = list()

# Create a cbd client (which must be authenticated).
if args.url is None:
    args.url = get_url()
cbdClient = CompressionBasedDistance(url=args.url)

# Create a shock client.
shockClient = ShockClient(args.shockurl, cbdClient._headers['AUTHORIZATION'])

# Parse the input file with the list of sequence files.
(fileList, extensions, numMissingFiles) = parse_input_file(args.inputPath)
if numMissingFiles > 0:
    exit(1)

# Set the format based on the sequence file extension if the format argument was not specified.
if args.format is None:
    if len(extensions) == 1:
        input['format'] = extensions.keys()[0]
    else:
        print "The format of the sequence files could not be determined. Set the format with the --format argument."
        exit(1)
else:
    input['format'] = args.format
class CompressionBasedDistance:

    ''' Calculate the compression based distance metric and save distance matrix to a file.

        @param fileList List of paths to compressed files
        @param scale Scale of distance values, 'std' for 0 to 1, 'inf' for 0 to infinity
        @param outputFile Path to file with output distance matrix
        @return Nothing
    '''

    def _cbdCalculator(self, fileList, scale, outputFile):
        # Parse the files.
        single_sizes = dict()
        pair_sizes = dict()
        for sourceFile in fileList:
            # Should strip prefix too
            fbase = os.path.basename(sourceFile)
            # This works as long as '.sorted.xz' only occurs at the end of the path.
            fname = fbase.replace('.sorted.xz', '')
            if PairSeparator in fname:
                pair_sizes[fname] = os.path.getsize(sourceFile)
            else:
                single_sizes[fname] = os.path.getsize(sourceFile)

        # Map file names to indices.
        fnames = single_sizes.keys()
        fnames.sort()
        indices = dict()
        for name, i in zip(fnames, range(len(fnames))):
            indices[name] = i

        # Compute the distance scores.
        pair_names = pair_sizes.keys()
        cbd_array = numpy.zeros((len(fnames), len(fnames)), dtype=float)
        for pair in pair_names:
            name1, name2 = pair.split(PairSeparator)
            c1 = float(single_sizes[name1])
            c2 = float(single_sizes[name2])
            c12 = float(pair_sizes[pair])
            distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
            if distance > 1.0:
                part1 = "Distance %f is greater than 1.0. " % (distance)
                part2 = "Check sequence read lengths and relative number of sequence reads. "
                part3 = "(c1=%f %s, c2=%f %s c12=%f %s)" % (c1, name1, c2, name2, c12, pair)
                raise ValueError(part1 + part2 + part3)
            if scale == 'inf':
                distance = distance / (1.0 - distance)
            cbd_array[indices[name1], indices[name2]] = distance
            cbd_array[indices[name2], indices[name1]] = distance

        # Build the output file in CSV format.
        outf = open(outputFile, 'w')
        outf.write('ID,' + ','.join(fnames) + '\n')
        for i in range(len(fnames)):
            outf.write(fnames[i] + ',' + ','.join(['{0:g}'.format(x) for x in cbd_array[i, :]]) + '\n')
        outf.close()
        return

    ''' Cleanup after running a job.

        @note All temporary files are removed even when there is an error.
        @return Nothing
    '''

    def _cleanup(self):
        # Delete input fasta files from Shock.
        for nodeId in self.input['node_ids']:
            try:
                self.shockClient.delete_node(nodeId)
            except Exception as e:
                self._log(log.ERR, 'Error deleting node %s from Shock: %s' % (nodeId, e.message))

        # Remove the work directory.
        shutil.rmtree(self.jobDirectory)

        # Stop the process pool.
        self.pool.close()
        self.pool.join()
        return

    ''' Log a message to the system log.

        @param level Message level (INFO, WARNING, etc.)
        @param message Message text
        @return Nothing
    '''

    def _log(self, level, message):
        # Create a logger if this is the first time the method has been called.
        if self.logger is None:
            submod = os.environ.get('KB_SERVICE_NAME', 'CompressionBasedDistance')
            self.logger = log.log(submod, ip_address=True, authuser=True, module=True,
                                  method=True, call_id=True, config=os.getenv('KB_DEPLOYMENT_CONFIG'))

        # Log the message.
        self.logger.log_message(level, message, self.context['client_ip'], self.context['user_id'],
                                self.context['module'], self.context['method'], self.context['call_id'])
        return

    def __init__(self):
        self.logger = None

    ''' Run a job to build a distance matrix.

        When successful the distance matrix csv file is stored in Shock.

        @param job Dictionary with configuration variables, context variables, and input variables for job
        @raise ExtractError: Error extracting sequences from input sequence file
        @raise SeqLenError: Error with lengths of sequences in input sequence file
        @raise SortError: Error sorting a raw sequence file
        @raise MergeError: Error merging a raw sequence file
        @raise CompressError: Error compressing a raw sequence file
        @raise ShockError: Error saving file to Shock
        @return Nothing
    '''

    def runJob(self, job):
        self.config = job['config']
        self.context = job['context']
        self.input = job['input']

        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))

        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
        self._log(log.INFO, 'Job ' + job['id'] + ' running with work folder ' + self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:
                # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict()  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file, result: %d" % (result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' % (self.jobDirectory, os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:
                # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict()  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file: %s" % (e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue
            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" % (sequenceList[index]))
        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '--output=%s' % (destFile), sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p, q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile + '.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = [ '/usr/bin/xz', '--keep', level, '--no-warn', sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" % (e.message, e.cmd, e.stdout, e.stderr))

        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)

        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down.  Save the distance matrix in the work directory for possible recovery.
            os.rename(csvFile, '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")

        # Mark the job as complete.
        results = { 'shocknodes': [ node['id'] ], 'shockurl': self.config['shock_url'] }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
        self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()
        return

    def calculate(self, listFilePath, scale, csvFile):
        # Each line of the list file is a path to a compressed file.
        compressedList = list()
        listFile = open(listFilePath, 'r')
        for line in listFile:
            compressedList.append(line.strip())
        listFile.close()

        # Calculate the distance matrix.
        self._cbdCalculator(compressedList, scale, csvFile)
        return
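# A small worked example of the distance computed in _cbdCalculator(), using made-up
# compressed sizes: with c1 = c2 = 1000 and c12 = 1400 the 'std' distance is
# 1 - 2*(1000 + 1000 - 1400)/2000 = 0.4, and the 'inf' scaling maps it to 0.4/0.6.
c1, c2, c12 = 1000.0, 1000.0, 1400.0            # hypothetical compressed sizes in bytes

std_distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
assert abs(std_distance - 0.4) < 1e-9           # 1 - 2*600/2000 = 0.4

inf_distance = std_distance / (1.0 - std_distance)
print inf_distance                              # approximately 0.667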
print ujsClient.get_detailed_error(args.jobID)
    ujsClient.delete_job(args.jobID)
    exit(1)

# Check if the job is complete.
if not info['complete']:
    print "Job '%s' has status '%s' and is working on task %s of %s. Check again later." \
        % (args.jobID, info['status'], info['total_progress'], info['max_progress'])
    exit(1)

# Show job info.
if args.showTimes:
    print 'Job started at %s and finished at %s' % (info['started'], info['last_update'])

# Create a shock client.
shockClient = ShockClient(info['results']['shockurl'], ujsClient._headers['AUTHORIZATION'])

# Download the output to the specified file and remove the file from shock.
try:
    shockClient.download_to_path(info['results']['shocknodes'][0], args.outputPath)
except Exception as e:
    print 'Error downloading distance matrix from %s: %s' % (info['results']['shockurl'], e.message)
    traceback.print_exc(file=sys.stdout)
try:
    shockClient.delete_node(info['results']['shocknodes'][0])
except Exception as e:
    print 'Error deleting distance matrix file from %s: %s' % (info['results']['shockurl'], e.message)
    traceback.print_exc(file=sys.stdout)

# Delete the job.
ujsClient.delete_job(args.jobID)