Example #1
def extract_seq(args):
    # Download the file from Shock to the working directory.
    if args['nodeId'] is not None:
        shockClient = ShockClient(args['shockUrl'], args['auth'])
        shockClient.download_to_path(args['nodeId'], args['sourceFile'])

    # Extract the sequences from the source file.
    numReads = 0
    with open(args['destFile'], 'w') as f:
        if args['sequenceLen'] > 0: # A length to trim to was specified
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                seq = str(seqRecord.seq)
                if len(seq) < args['sequenceLen']:
                    continue
                if len(seq) > args['sequenceLen']:
                    seq = seq[:args['sequenceLen']]
                f.write(str(seq) + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        elif args['maxReads'] > 0:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')
                numReads += 1
                if numReads == args['maxReads']:
                    break
        else:
            for seqRecord in SeqIO.parse(args['sourceFile'], args['format']):
                f.write(str(seqRecord.seq) + '\n')

    # Delete the file if it does not have enough reads.
    if args['minReads'] > 0 and numReads < args['minReads']:
        os.remove(args['destFile'])
    return 0
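extract_seq is written to run inside a worker process (see runJob further down), but it can also be called directly. A minimal sketch of a direct call, assuming os, Bio.SeqIO and extract_seq are already imported as in the example, and that reads.fasta is a hypothetical local FASTA file; passing nodeId=None skips the Shock download:

args = {
    'nodeId': None,                   # no Shock download, use the local file as-is
    'shockUrl': None,
    'auth': None,
    'sourceFile': 'reads.fasta',      # hypothetical input file
    'destFile': 'reads.sequence',     # one sequence per line is written here
    'format': 'fasta',
    'sequenceLen': 0,                 # 0 means do not trim or filter by length
    'maxReads': 100,                  # stop after 100 sequences
    'minReads': 0,                    # 0 means keep the output file regardless of read count
}
extract_seq(args)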
Example #3
    def loadDatabaseFiles(self, mylog):
        ''' Load the static database files from Shock.

            The static database files are stored in the directory specified by the
            data_folder_path configuration variable.  A file is only downloaded if
            the file is not available on this system or the file has been updated
            in Shock.

            @param mylog Log object for messages
            @return Nothing
            @raise MissingFileError when database file is not found in Shock
        '''

        # Get the current info about the static database files from the cache file.
        cacheFilename = self.StatusFiles['cache_file']
        if os.path.exists(cacheFilename):
            fileCache = json.load(open(cacheFilename, "r"))
        else:
            fileCache = dict()

        # Create a shock client.
        shockClient = ShockClient(self.shockURL)

        # See if the static database files on this system are up-to-date with files stored in Shock.
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            # Get info about the file stored in Shock.
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            nodelist = shockClient.query_node(
                {'lookupname': 'ProbAnnoData/' + name})
            if len(nodelist) == 0:
                message = "Database file %s is not available from %s\n" % (
                    name, self.shockURL)
                mylog.log_message(log.ERR, message)  # MBM
                raise MissingFileError(message)
            node = nodelist[0]

            # Download the file if the checksum does not match or the file is not available on this system.
            download = False
            if key in fileCache:
                if node['file']['checksum']['md5'] != fileCache[key]['file'][
                        'checksum']['md5']:
                    download = True
            else:
                download = True
            if not os.path.exists(localPath):
                download = True
            if download:
                sys.stderr.write("Downloading %s to %s\n" % (key, localPath))
                shockClient.download_to_path(node["id"], localPath)
                fileCache[key] = node
                mylog.log_message(log.INFO,
                                  'Downloaded %s to %s' % (key, localPath))

        # Save the updated cache file.
        json.dump(fileCache, open(cacheFilename, "w"), indent=4)
        return
Example #4
    def loadDatabaseFiles(self, mylog):
        ''' Load the static database files from Shock.

            The static database files are stored in the directory specified by the
            data_folder_path configuration variable.  A file is only downloaded if
            the file is not available on this system or the file has been updated
            in Shock.

            @param mylog Log object for messages
            @return Nothing
            @raise MissingFileError when database file is not found in Shock
        '''
        
        # Get the current info about the static database files from the cache file.
        cacheFilename = self.StatusFiles['cache_file']
        if os.path.exists(cacheFilename):
            fileCache = json.load(open(cacheFilename, "r"))
        else:
            fileCache = dict()
        
        # Create a shock client.
        shockClient = ShockClient(self.shockURL)

        # See if the static database files on this system are up-to-date with files stored in Shock.
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            # Get info about the file stored in Shock.
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            nodelist = shockClient.query_node( { 'lookupname': 'ProbAnnoData/'+name } )
            if len(nodelist) == 0:
                message = "Database file %s is not available from %s\n" %(name, self.shockURL)
                mylog.log_message(log.ERR, message) # MBM
                raise MissingFileError(message)
            node = nodelist[0]
            
            # Download the file if the checksum does not match or the file is not available on this system.
            download = False
            if key in fileCache:
                if node['file']['checksum']['md5'] != fileCache[key]['file']['checksum']['md5']:
                    download = True
            else:
                download = True
            if not os.path.exists(localPath):
                download = True
            if download:
                sys.stderr.write("Downloading %s to %s\n" %(key, localPath))
                shockClient.download_to_path(node["id"], localPath)
                fileCache[key] = node
                mylog.log_message(log.INFO, 'Downloaded %s to %s' %(key, localPath))
                
        # Save the updated cache file.
        json.dump(fileCache, open(cacheFilename, "w"), indent=4)
        return
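In both versions the decision to re-download a database file reduces to three checks: the file has never been cached, the MD5 checksum reported by Shock differs from the cached checksum, or the local copy is missing. A standalone sketch of that decision (hypothetical helper name, same node and cache layout as above):

def needs_download(key, node, fileCache, localPath):
    # Re-download when the file was never cached, the checksum in Shock changed,
    # or the local copy does not exist.
    if key not in fileCache:
        return True
    if node['file']['checksum']['md5'] != fileCache[key]['file']['checksum']['md5']:
        return True
    return not os.path.exists(localPath)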
Example #5
    def test_buildmatrix(self):
        ''' Run build_matrix() with four simple sequence files and verify the returned distance matrix.'''

        # Create a client.
        cbdClient = CompressionBasedDistance(self._config['cbd_url'], user_id=self._config['test_user'], password=self._config['test_pwd'])
        token = cbdClient._headers['AUTHORIZATION']
        
        # Create the input parameters.
        input = dict()
        input['format'] = 'fasta'
        input['scale'] = 'std'
        input['sequence_length'] = 0
        input['min_reads'] = 0
        input['max_reads'] = 0
        input['extreme'] = 1
        input['node_ids'] = list()

        # Upload the files to Shock.
        shockClient = ShockClient(self._config['shock_url'], token)
        for filename in InputFiles:
            node = shockClient.create_node(filename, '')
            input['node_ids'].append(node['id'])
        
        # Run the buildmatrix() function to generate a distance matrix.
        jobid = cbdClient.build_matrix(input)

        # Wait for the distance matrix to be built.
        time.sleep(30)

        # Get the distance matrix and save to a file.
        outputPath = 'client-tests/unittest.csv'
        args = [ os.path.join(os.environ['KB_TOP'], 'bin/cbd-getmatrix'), jobid,  outputPath ]
        proc = subprocess.Popen(args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        (so, se) = proc.communicate()
        if proc.returncode != 0:
            print so
            print se
        self.assertEqual(proc.returncode, 0)
        
        # Confirm the returned distance matrix matches the saved valid output.
        vf = open('client-tests/output.csv', 'r')
        tf = open(outputPath, 'r')
        for vline in vf: 
            tline = tf.readline()
            self.assertEqual(vline, tline)
        self.assertEqual(tf.readline(), '')
        vf.close()
        tf.close()
        os.remove(outputPath)
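InputFiles is not defined in this excerpt. From the way it is used and from the docstring ("four simple sequence files"), it is assumed to be a module-level list of paths to the test FASTA files, for example (hypothetical paths):

InputFiles = [
    'client-tests/sample1.fasta',
    'client-tests/sample2.fasta',
    'client-tests/sample3.fasta',
    'client-tests/sample4.fasta',
]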
Example #6
    def storeDatabaseFiles(self, token):
        ''' Store the static database files to Shock.

            @param token: Authorization token for authenticating to shock
            @return Nothing
        '''
        
        # Create a shock client.
        shockClient = ShockClient(self.shockURL, token=token)
        
        # Upload all of the static database files to shock.
        fileCache = dict()
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            if os.path.exists(localPath):
                sys.stderr.write('Saving "%s"...' %(localPath))
                
                # See if the file already exists in Shock.
                query = { 'lookupname': 'ProbAnnoData/'+name }
                nodelist = shockClient.query_node(query)
                
                # Remove all instances of the file in Shock.
                if nodelist is not None:
                    for node in nodelist:
                        shockClient.delete_node(node['id'])
     
                # Build the attributes for this file and store as json in a separate file.
                moddate = time.ctime(os.path.getmtime(localPath))           
                attr = { 'lookupname': 'ProbAnnoData/'+name, 'moddate': moddate }
                attrFilename = os.path.join(self.dataFolderPath, name+'.attr')
                attrFid = open(attrFilename, 'w')
                json.dump(attr, attrFid, indent=4)
                attrFid.close()
                
                # Upload the file to Shock.
                metadata = shockClient.create_node(localPath, attrFilename)
                fileCache[key] = metadata
                os.remove(attrFilename)
                
                # Remove the list of users from the read ACL to give the file public read permission.
                # Note this needs to change for Shock version 0.9.5 but not sure how to set public ACLs.
                readacl = shockClient.get_acl(metadata['id'])
                shockClient.delete_acl(metadata['id'], 'read', readacl['read'][0])
                sys.stderr.write('done\n')
                
            else:
                sys.stderr.write('Could not find "%s" so it was not saved\n' %(localPath))
                
        # Save the metadata on all of the database files.
        cacheFilename = os.path.join(self.dataFolderPath, self.StatusFiles['cache_file'])
        json.dump(fileCache, open(cacheFilename, 'w'), indent=4)

        return
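For every uploaded file the method writes a small .attr sidecar whose contents are passed to create_node as the Shock node attributes. For a hypothetical database file named OTU_fid_role.tsv, the JSON written to OTU_fid_role.tsv.attr would look like:

{
    "lookupname": "ProbAnnoData/OTU_fid_role.tsv",
    "moddate": "Mon Jun  3 10:15:42 2013"
}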
Example #7
    def storeDatabaseFiles(self, token):
        ''' Store the static database files to Shock.
            @param token: Authorization token for authenticating to shock
            @return Nothing
        '''

        # Create a shock client.
        shockClient = ShockClient(self.shockURL, token=token)

        # Upload all of the static database files to shock.
        fileCache = dict()
        shockFiles = dict(self.DataFiles.items() + self.SearchFiles.items())
        for key in shockFiles:
            localPath = shockFiles[key]
            name = os.path.basename(localPath)
            if os.path.exists(localPath):
                sys.stderr.write('Saving "%s"...' % (localPath))

                # See if the file already exists in Shock.
                query = {'lookupname': LOOKUP_NAME_PREFIX + '/' + name}
                nodelist = shockClient.query_node(query)

                # Remove all instances of the file in Shock.
                if nodelist is not None:
                    for node in nodelist:
                        shockClient.delete_node(node['id'])

                # Build the attributes for this file and store as json in a separate file.
                moddate = time.ctime(os.path.getmtime(localPath))
                attr = {
                    'lookupname': LOOKUP_NAME_PREFIX + '/' + name,
                    'moddate': moddate
                }
                attrFilename = os.path.join(self.dataFolderPath,
                                            name + '.attr')
                attrFid = open(attrFilename, 'w')
                json.dump(attr, attrFid, indent=4)
                attrFid.close()

                # Upload the file to Shock.
                metadata = shockClient.create_node(localPath, attrFilename)
                fileCache[key] = metadata
                os.remove(attrFilename)

                # Remove the list of users from the read ACL to give the file public read permission.
                # Note this needs to change for Shock version 0.9.5 but not sure how to set public ACLs.
                readacl = shockClient.get_acl(metadata['id'])
                shockClient.delete_acl(metadata['id'], 'read',
                                       readacl['read'][0])
                sys.stderr.write('done\n')

            else:
                sys.stderr.write('Could not find "%s" so it was not saved\n' %
                                 (localPath))

        # Save the metadata on all of the database files.
        cacheFilename = os.path.join(self.dataFolderPath,
                                     self.StatusFiles['cache_file'])
        json.dump(fileCache, open(cacheFilename, 'w'), indent=4)

        return
Example #8
    input['sequence_length'] = args.sequenceLen
    input['min_reads'] = args.minReads
    input['max_reads'] = args.maxReads
    if args.extreme:
        input['extreme'] = 1
    else:
        input['extreme'] = 0
    input['node_ids'] = list()

    # Create a cbd client (which must be authenticated).
    if args.url is None:
        args.url = get_url()
    cbdClient = CompressionBasedDistance(url=args.url)

    # Create a shock client.
    shockClient = ShockClient(args.shockurl,
                              cbdClient._headers['AUTHORIZATION'])

    # Parse the input file with the list of sequence files.
    (fileList, extensions, numMissingFiles) = parse_input_file(args.inputPath)
    if numMissingFiles > 0:
        exit(1)

    # Set the format based on the sequence file extension if the format argument was not specified.
    if args.format is None:
        if len(extensions) == 1:
            input['format'] = extensions.keys()[0]
        else:
            print "The format of the sequence files could not be determined.  Set the format with the --format argument."
            exit(1)
    else:
        input['format'] = args.format
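parse_input_file is not shown in this excerpt. Based on how its return values are used here, a plausible sketch (hypothetical implementation; the real function may differ) reads one sequence file path per line, counts the file extensions seen so the format can be inferred when they all agree, and reports missing files:

def parse_input_file(inputPath):
    # Hypothetical sketch: each line of the input file names one sequence file.
    fileList = []
    extensions = {}
    numMissingFiles = 0
    with open(inputPath, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                continue
            if not os.path.exists(path):
                print "Sequence file '%s' does not exist" % (path)
                numMissingFiles += 1
                continue
            fileList.append(path)
            ext = os.path.splitext(path)[1].lstrip('.')
            extensions[ext] = extensions.get(ext, 0) + 1
    return (fileList, extensions, numMissingFiles)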
Example #9
class CompressionBasedDistance:
    ''' Calculate the compression based distance metric and save distance matrix to a file.

        @param fileList List of paths to compressed files
        @param scale Scale of distance values, 'std' for 0 to 1, 'inf' for 0 to infinity
        @param outputFile Path to file with output distance matrix
        @return Nothing
    '''
    def _cbdCalculator(self, fileList, scale, outputFile):
        # Parse the files.
        single_sizes = dict()
        pair_sizes = dict()

        for sourceFile in fileList:
            # Should strip prefix too
            fbase = os.path.basename(sourceFile)
            # This works as long as '.sorted.xz' only occurs at the end of the path.
            fname = fbase.replace('.sorted.xz', '')
            if PairSeparator in fname:
                pair_sizes[fname] = os.path.getsize(sourceFile)
            else:
                single_sizes[fname] = os.path.getsize(sourceFile)

        # Map file names to indices.
        fnames = single_sizes.keys()
        fnames.sort()
        indices = dict()

        for name, i in zip(fnames, range(len(fnames))):
            indices[name] = i

        # Compute the distance scores.
        pair_names = pair_sizes.keys()
        cbd_array = numpy.zeros((len(fnames), len(fnames)), dtype=float)
        for pair in pair_names:
            name1, name2 = pair.split(PairSeparator)
            c1 = float(single_sizes[name1])
            c2 = float(single_sizes[name2])
            c12 = float(pair_sizes[pair])
            distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
            if distance > 1.0:
                part1 = "Distance %f is greater than 1.0.  " % (distance)
                part2 = "Check sequence read lengths and relative number of sequence reads.  "
                part3 = "(c1=%f %s, c2=%f %s c12=%f %s)" % (c1, name1, c2,
                                                            name2, c12, pair)
                raise ValueError(part1 + part2 + part3)
            if scale == 'inf':
                distance = distance / (1.0 - distance)
            cbd_array[indices[name1], indices[name2]] = distance
            cbd_array[indices[name2], indices[name1]] = distance

        # Build the output file in CSV format.
        outf = open(outputFile, 'w')
        outf.write('ID,' + ','.join(fnames) + '\n')
        for i in range(len(fnames)):
            outf.write(fnames[i] + ',' +
                       ','.join(['{0:g}'.format(x)
                                 for x in cbd_array[i, :]]) + '\n')
        outf.close()
        return

    ''' Cleanup after running a job.

        @note All temporary files are removed even when there is an error.
        @return Nothing
    '''

    def _cleanup(self):
        # Delete input fasta files from Shock.
        for nodeId in self.input['node_ids']:
            try:
                self.shockClient.delete_node(nodeId)
            except Exception as e:
                self._log(
                    log.ERR, 'Error deleting node %s from Shock: %s' %
                    (nodeId, e.message))

        # Remove the work directory.
        shutil.rmtree(self.jobDirectory)

        # Stop the process pool.
        self.pool.close()
        self.pool.join()

        return

    ''' Log a message to the system log.

        @param level Message level (INFO, WARNING, etc.)
        @param message Message text
        @return Nothing
    '''

    def _log(self, level, message):
        # Create a logger if this is the first time the method has been called.
        if self.logger is None:
            submod = os.environ.get('KB_SERVICE_NAME',
                                    'CompressionBasedDistance')
            self.logger = log.log(submod,
                                  ip_address=True,
                                  authuser=True,
                                  module=True,
                                  method=True,
                                  call_id=True,
                                  config=os.getenv('KB_DEPLOYMENT_CONFIG'))

        # Log the message.
        self.logger.log_message(level, message, self.context['client_ip'],
                                self.context['user_id'],
                                self.context['module'], self.context['method'],
                                self.context['call_id'])
        return

    def __init__(self):
        self.logger = None

    ''' Run a job to build a distance matrix.

        When successful the distance matrix csv file is stored in Shock.

        @param job Dictionary with configuration variables, context variables, and input variables for job
        @raise ExtractError: Error extracting sequences from input sequence file
        @raise SeqLenError: Error with lengths of sequences in input sequence file
        @raise SortError: Error sorting a raw sequence file
        @raise MergeError: Error merging a raw sequence file
        @raise CompressError: Error compressing a raw sequence file
        @raise ShockError: Error saving file to Shock
        @return Nothing
    '''

    def runJob(self, job):

        self.config = job['config']
        self.context = job['context']
        self.input = job['input']

        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'],
                                       self.context['token'])

        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'],
                                    token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))

        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'],
                                         job['id'])
        self._log(
            log.INFO, 'Job ' + job['id'] + ' running with work folder ' +
            self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'extracting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' % (os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file, result: %d"
                    % (result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' % (self.jobDirectory,
                                           os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile:  # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict(
            )  # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError(
                    "Error extracting sequences from input sequence file: %s" %
                    (e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %
                                  (sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError(
                "There are not enough sequence files that meet the sequence length or number of sequences criteria."
            )

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'sorting sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' % (os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '--output=%s' % (destFile), sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError(
                    "Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(
                job['id'], self.context['token'],
                'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p, q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' % (os.path.splitext(pbase)[0],
                                       PairSeparator,
                                       os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = ['/usr/bin/sort', '-m', '--output=%s' % (destFile), p, q]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError(
                    "Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'compressing sequence files', 1,
                                          timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile + '.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = ['/usr/bin/xz', '--keep', level, '--no-warn', sourceFile]
            result = self.pool.apply_async(run_command, (args, ))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError(
                    "Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'"
                    % (e.message, e.cmd, e.stdout, e.stderr))

        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'calculating distance matrix', 1,
                                          timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' % (job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)

        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'],
                                          'storing output file in shock', 1,
                                          timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(
                csvFile,
                '%s/%s.csv' % (self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError(
                "Error saving distance matrix file to Shock. A Shock node was not created."
            )

        # Mark the job as complete.
        results = {
            'shocknodes': [node['id']],
            'shockurl': self.config['shock_url']
        }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None,
                               results)
        self._log(log.INFO, 'Job ' + job['id'] + ' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()

        return

    def calculate(self, listFilePath, scale, csvFile):

        # Each line of the list file is a path to a compressed file.
        compressedList = list()
        listFile = open(listFilePath, 'r')
        for line in listFile:
            compressedList.append(line.strip())
        listFile.close()

        # Calculate the distance matrix.
        self._cbdCalculator(compressedList, scale, csvFile)
        return
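The value computed by _cbdCalculator is a normalized compression distance: for two single files that compress to c1 and c2 bytes and a merged file that compresses to c12 bytes, distance = 1 - 2*(c1 + c2 - c12)/(c1 + c2). A quick worked example with made-up sizes:

c1, c2, c12 = 100.0, 120.0, 180.0       # hypothetical compressed sizes in bytes
distance = 1.0 - (2.0 * ((c1 + c2 - c12) / (c1 + c2)))
print distance                          # about 0.636, the 'std' scale value (0 to 1)
print distance / (1.0 - distance)       # 1.75, the 'inf' scale value (0 to infinity)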
Example #11
class CompressionBasedDistance:
    
    ''' Calculate the compression based distance metric and save distance matrix to a file.

        @param fileList List of paths to compressed files
        @param scale Scale of distance values, 'std' for 0 to 1, 'inf' for 0 to infinity
        @param outputFile Path to file with output distance matrix
        @return Nothing
    '''

    def _cbdCalculator(self, fileList, scale, outputFile):
        # Parse the files.
        single_sizes = dict()
        pair_sizes = dict()
        
        for sourceFile in fileList:
            # Should strip prefix too
            fbase = os.path.basename(sourceFile)
            # This works as long as '.sorted.xz' only occurs at the end of the path.
            fname = fbase.replace('.sorted.xz', '')
            if PairSeparator in fname:
                pair_sizes[fname] = os.path.getsize(sourceFile)
            else:
                single_sizes[fname] = os.path.getsize(sourceFile)

        # Map file names to indices.
        fnames = single_sizes.keys()
        fnames.sort()
        indices = dict()
        
        for name,i in zip(fnames, range(len(fnames))):
            indices[name] = i
        
        # Compute the distance scores.
        pair_names = pair_sizes.keys()
        cbd_array = numpy.zeros((len(fnames), len(fnames)), dtype=float)
        for pair in pair_names:
            name1, name2 = pair.split(PairSeparator)
            c1 = float(single_sizes[name1])
            c2 = float(single_sizes[name2])
            c12 = float(pair_sizes[pair])
            distance = 1.0 - ( 2.0 * ( (c1 + c2 - c12) / (c1 + c2) ) )
            if distance > 1.0:
                part1 = "Distance %f is greater than 1.0.  " %(distance)
                part2 = "Check sequence read lengths and relative number of sequence reads.  "
                part3 = "(c1=%f %s, c2=%f %s c12=%f %s)" %(c1, name1, c2, name2, c12, pair)
                raise ValueError(part1+part2+part3)
            if scale == 'inf':
                distance = distance/(1.0 - distance)
            cbd_array[indices[name1],indices[name2]] = distance
            cbd_array[indices[name2],indices[name1]] = distance
            
        # Build the output file in CSV format.
        outf = open(outputFile, 'w')
        outf.write('ID,' + ','.join(fnames) + '\n')
        for i in range(len(fnames)):
            outf.write(fnames[i] + ',' + ','.join(['{0:g}'.format(x) for x in cbd_array[i,:]]) + '\n')
        outf.close()
        return
    
    ''' Cleanup after running a job.

        @note All temporary files are removed even when there is an error.
        @return Nothing
    '''

    def _cleanup(self):
        # Delete input fasta files from Shock.
        for nodeId in self.input['node_ids']:
            try:
                self.shockClient.delete_node(nodeId)
            except Exception as e:
                self._log(log.ERR, 'Error deleting node %s from Shock: %s' %(nodeId, e.message))
            
        # Remove the work directory.
        shutil.rmtree(self.jobDirectory)
            
        # Stop the process pool.
        self.pool.close()
        self.pool.join()
        
        return
    
    ''' Log a message to the system log.

        @param level Message level (INFO, WARNING, etc.)
        @param message Message text
        @return Nothing
    '''

    def _log(self, level, message):
        # Create a logger if this is the first time the method has been called.
        if self.logger is None:
            submod = os.environ.get('KB_SERVICE_NAME', 'CompressionBasedDistance')
            self.logger = log.log(submod, ip_address=True, authuser=True, module=True, method=True,
                call_id=True, config=os.getenv('KB_DEPLOYMENT_CONFIG'))

        # Log the message.
        self.logger.log_message(level, message, self.context['client_ip'], self.context['user_id'], self.context['module'],
                                self.context['method'], self.context['call_id'])
        return

    def __init__(self):
        self.logger = None

    ''' Run a job to build a distance matrix.

        When successful the distance matrix csv file is stored in Shock.

        @param job Dictionary with configuration variables, context variables, and input variables for job
        @raise ExtractError: Error extracting sequences from input sequence file
        @raise SeqLenError: Error with lengths of sequences in input sequence file
        @raise SortError: Error sorting a raw sequence file
        @raise MergeError: Error merging a raw sequence file
        @raise CompressError: Error compressing a raw sequence file
        @raise ShockError: Error saving file to Shock
        @return Nothing
    '''

    def runJob(self, job):
        
        self.config = job['config']
        self.context = job['context']
        self.input = job['input']
        
        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])
        
        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))
        
        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
        self._log(log.INFO, 'Job '+job['id']+' running with work folder '+self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' %(os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file, result: %d" %(result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' %(self.jobDirectory, os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file: %s" %(e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %(sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' %(os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '--output=%s' %(destFile), sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
             
        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p,q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' %(os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
            args = [ '/usr/bin/sort', '-m', '--output=%s' %(destFile), p, q ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
                   
        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile+'.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
            args = [ '/usr/bin/xz', '--keep', level, '--no-warn', sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
        
        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' %(job['id']))
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)
        
        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(csvFile, '%s/%s.csv' %(self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")
        
        # Mark the job as complete.
        results = { 'shocknodes': [ node['id'] ], 'shockurl': self.config['shock_url'] }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
        self._log(log.INFO, 'Job '+job['id']+' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()
        
        return

    def calculate(self, listFilePath, scale, csvFile):

        # Each line of the list file is a path to a compressed file.
        compressedList = list()
        listFile = open(listFilePath, 'r')
        for line in listFile:
            compressedList.append(line.strip())
        listFile.close()

        # Calculate the distance matrix.
        self._cbdCalculator(compressedList, scale, csvFile)
        return
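The matrix file written by _cbdCalculator is plain CSV: an 'ID' header row followed by one labeled row per sequence file. A small sketch of reading it back into a numpy array (assuming numpy is available, as in the examples above):

import csv
import numpy

def read_distance_matrix(csvPath):
    # Returns (names, matrix) where matrix[i, j] is the distance between names[i] and names[j].
    with open(csvPath, 'r') as f:
        reader = csv.reader(f)
        names = next(reader)[1:]    # drop the leading 'ID' cell
        matrix = numpy.zeros((len(names), len(names)), dtype=float)
        for i, row in enumerate(reader):
            matrix[i, :] = [float(x) for x in row[1:]]
    return names, matrix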
Example #12
    def runJob(self, job):
        
        self.config = job['config']
        self.context = job['context']
        self.input = job['input']
        
        # Create a shock client and authenticate as the user.
        self.shockClient = ShockClient(self.config['shock_url'], self.context['token'])
        
        # Create a user and job state client and authenticate as the user.
        ujsClient = UserAndJobState(self.config['userandjobstate_url'], token=self.context['token'])

        # Create a process pool.
        self.pool = Pool(processes=int(self.config['num_pool_processes']))
        
        # Create a work directory for storing intermediate files.
        self.jobDirectory = make_job_dir(self.config['work_folder_path'], job['id'])
        self._log(log.INFO, 'Job '+job['id']+' running with work folder '+self.jobDirectory)

        # Download input fasta files from Shock and extract sequences to work directory.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'extracting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sequenceList = []
        for nodeId in self.input['node_ids']:
            node = self.shockClient.get_node(nodeId)
            sourceFile = os.path.join(self.jobDirectory, node['file']['name'])
            destFile = '%s.sequence' %(os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = nodeId
            args['sourceFile'] = sourceFile
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
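        # At this point resultList holds only the extractions for files downloaded
        # from Shock; the results for local file paths are appended below and all
        # results are checked again in the second loop.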
        for result in resultList:
            if result.get() != 0:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file, result: %d" %(result.get()))
        for path in self.input['file_paths']:
            sourceFile = os.path.basename(path)
            destFile = '%s/%s.sequence' %(self.jobDirectory, os.path.splitext(sourceFile)[0])
            if PairSeparator in destFile: # Check for pair separator string in file name and replace as needed.
                destFile = destFile.replace(PairSeparator, '-')
            sequenceList.append(destFile)
            args = dict() # Needs to be scoped here so each process gets its own copy
            args['format'] = self.input['format']
            args['shockUrl'] = self.config['shock_url']
            args['auth'] = self.context['token']
            args['sequenceLen'] = self.input['sequence_length']
            args['minReads'] = self.input['min_reads']
            args['maxReads'] = self.input['max_reads']
            args['nodeId'] = None
            args['sourceFile'] = path
            args['destFile'] = destFile
            result = self.pool.apply_async(extract_seq, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except Exception as e:
                self._cleanup()
                raise ExtractError("Error extracting sequences from input sequence file: %s" %(e.message))

        # Confirm that each file met the criteria for sequence length and number of sequences.
        filesToRemove = list()
        for index in range(len(sequenceList)):
            # See if the file did not have the minimum number of sequences.
            if not os.path.exists(sequenceList[index]):
                filesToRemove.append(index)
                continue

            # See if the file has no data.
            if os.path.getsize(sequenceList[index]) == 0:
                self._cleanup()
                raise SeqLenError("Sequence file '%s' has no sequences" %(sequenceList[index]))

        filteredList = list()
        for index in range(len(sequenceList)):
            if index not in filesToRemove:
                filteredList.append(sequenceList[index])
        if len(filteredList) < 2:
            self._cleanup()
            raise SeqLenError("There are not enough sequence files that meet the sequence length or number of sequences criteria.")

        # Sort the sequences.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'sorting sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        sortedList = []
        for sourceFile in filteredList:
            destFile = '%s.sorted' %(os.path.splitext(sourceFile)[0])
            sortedList.append(destFile)
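            # Sorting the extracted reads makes each file independent of input
            # order, which presumably also helps the compressor when pairs of
            # files are merged below.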
            args = [ '/usr/bin/sort', '--output=%s' %(destFile), sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise SortError("Error sorting sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
             
        # Create combined and sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'merging all pairs of sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        for p,q in combinations(sortedList, 2):
            pbase = os.path.basename(p)
            qbase = os.path.basename(q)
            dbase = '%s%s%s.sorted' %(os.path.splitext(pbase)[0], PairSeparator, os.path.splitext(qbase)[0])
            destFile = os.path.join(self.jobDirectory, dbase)
            sortedList.append(destFile)
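            # The merged file name joins the two source names with PairSeparator so
            # the distance calculation can match each pair to its merged file.
            # Appending to sortedList means the merged files are compressed in the
            # next step; combinations() took a snapshot of the list, so this loop
            # is not affected by the appends.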
            args = [ '/usr/bin/sort', '-m', '--output=%s' %(destFile), p, q ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise MergeError("Error merging sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
                   
        # Compress all sorted files.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'compressing sequence files', 1, timestamp(3600))
        except:
            pass
        resultList = []
        compressedList = []
        for sourceFile in sortedList:
            compressedList.append(sourceFile+'.xz')
            if self.input['extreme']:
                level = '-9e'
            else:
                level = '-9'
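            # xz -9 is the highest standard preset; the -9e form adds the slower
            # 'extreme' mode for a slightly better ratio.  --keep leaves the
            # uncompressed sorted file in place.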
            args = [ '/usr/bin/xz', '--keep', level, '--no-warn', sourceFile ]
            result = self.pool.apply_async(run_command, (args,))
            resultList.append(result)
        for result in resultList:
            try:
                result.get()
            except CommandError as e:
                self._cleanup()
                raise CompressError("Error compressing sequence file: %s\nCommand: '%s'\nStdout: '%s'\nStderr: '%s'" %(e.message, e.cmd, e.stdout, e.stderr))
        
        # Calculate the distance matrix.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'calculating distance matrix', 1, timestamp(3600))
        except:
            pass
        csvFile = os.path.join(self.jobDirectory, '%s.csv' %(job['id']))
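        # _cbdCalculator (defined elsewhere) presumably derives a pairwise distance
        # from the compressed file sizes and writes the matrix to csvFile; a
        # possible implementation is sketched at the end of the previous example.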
        self._cbdCalculator(compressedList, self.input['scale'], csvFile)
        
        # Store the output file in shock.
        try:
            ujsClient.update_job_progress(job['id'], self.context['token'], 'storing output file in shock', 1, timestamp(3600))
        except:
            pass
        node = self.shockClient.create_node(csvFile, '')
        if not node['id']:
            # Shock let us down. Save the distance matrix in the work directory for possible recovery.
            os.rename(csvFile, '%s/%s.csv' %(self.config['work_folder_path'], job['id']))
            self._cleanup()
            raise ShockError("Error saving distance matrix file to Shock. A Shock node was not created.")
        
        # Mark the job as complete.
        results = { 'shocknodes': [ node['id'] ], 'shockurl': self.config['shock_url'] }
        ujsClient.complete_job(job['id'], self.context['token'], 'done', None, results)
        self._log(log.INFO, 'Job '+job['id']+' completed successfully')

        # Cleanup after ourselves.
        self._cleanup()
        
        return
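
# run_command and CommandError are defined elsewhere in this module.  The sketch
# below only illustrates the behavior implied by how they are used above
# (apply_async workers that raise an error carrying message, cmd, stdout, and
# stderr); the real implementation may differ.
import subprocess

class CommandErrorSketch(Exception):
    def __init__(self, message, cmd, stdout, stderr):
        Exception.__init__(self, message)
        self.message = message
        self.cmd = cmd
        self.stdout = stdout
        self.stderr = stderr

def run_command_sketch(args):
    # Run an external program, capturing its output so a failure can be
    # reported with the full command line, stdout, and stderr.
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = proc.communicate()
    if proc.returncode != 0:
        raise CommandErrorSketch('command failed with exit status %d' %(proc.returncode),
                                 ' '.join(args), stdout, stderr)
    return 0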
Example #13
0
        ujsClient.delete_job(args.jobID)
        exit(1)

    # Check if the job is complete.
    if not info['complete']:
        print "Job '%s' has status '%s' and is working on task %s of %s.  Check again later." \
            %(args.jobID, info['status'], info['total_progress'], info['max_progress'])
        exit(1)

    # Show job info.
    if args.showTimes:
        print 'Job started at %s and finished at %s' % (info['started'],
                                                        info['last_update'])

    # Create a shock client.
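    # Reuse the authorization token already held by the user and job state client.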
    shockClient = ShockClient(info['results']['shockurl'],
                              ujsClient._headers['AUTHORIZATION'])

    # Download the output to the specified file and remove the file from shock.
    try:
        shockClient.download_to_path(info['results']['shocknodes'][0],
                                     args.outputPath)
    except Exception as e:
        print 'Error downloading distance matrix from %s: %s' % (
            info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    try:
        shockClient.delete_node(info['results']['shocknodes'][0])
    except Exception as e:
        print 'Error deleting distance matrix file from %s: %s' % (
            info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
Example #14
0
    input['sequence_length'] = args.sequenceLen
    input['min_reads'] = args.minReads
    input['max_reads'] = args.maxReads
    if args.extreme:
        input['extreme'] = 1
    else:
        input['extreme'] = 0
    input['node_ids'] = list()

    # Create a cbd client (which must be authenticated).
    if args.url is None:
        args.url = get_url()
    cbdClient = CompressionBasedDistance(url=args.url)
    
    # Create a shock client.
    shockClient = ShockClient(args.shockurl, cbdClient._headers['AUTHORIZATION'])
    
    # Parse the input file with the list of sequence files.
    (fileList, extensions, numMissingFiles) = parse_input_file(args.inputPath)
    if numMissingFiles > 0:
        exit(1)

    # Set the format based on the sequence file extension if the format argument was not specified.
    if args.format is None:
        if len(extensions) == 1:
            input['format'] = extensions.keys()[0]
        else:
            print "The format of the sequence files could not be determined.  Set the format with the --format argument."
            exit(1)
    else:
        input['format'] = args.format
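
# parse_input_file is defined elsewhere; this sketch only reflects the behavior
# implied by how its return values are used above: a list of existing sequence
# file paths, a dict counting file extensions (used to guess --format when it is
# not given), and the number of listed files that are missing.  The details
# (one path per line, extension used directly as the format name) are assumptions.
import os

def parse_input_file_sketch(inputPath):
    fileList = list()
    extensions = dict()
    numMissingFiles = 0
    with open(inputPath, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                continue
            if not os.path.exists(path):
                print "Sequence file '%s' does not exist" %(path)
                numMissingFiles += 1
                continue
            fileList.append(path)
            ext = os.path.splitext(path)[1].lstrip('.')
            extensions[ext] = extensions.get(ext, 0) + 1
    return (fileList, extensions, numMissingFiles)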
Example #15
0
        print ujsClient.get_detailed_error(args.jobID)
        ujsClient.delete_job(args.jobID)
        exit(1)

    # Check if the job is complete.
    if not info['complete']:
        print "Job '%s' has status '%s' and is working on task %s of %s.  Check again later." \
            %(args.jobID, info['status'], info['total_progress'], info['max_progress'])
        exit(1)

    # Show job info.
    if args.showTimes:
        print 'Job started at %s and finished at %s' %(info['started'], info['last_update'])

    # Create a shock client.
    shockClient = ShockClient(info['results']['shockurl'], ujsClient._headers['AUTHORIZATION'])
       
    # Download the output to the specified file and remove the file from shock.
    try:
        shockClient.download_to_path(info['results']['shocknodes'][0], args.outputPath)
    except Exception as e:
        print 'Error downloading distance matrix from %s: %s' %(info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    try:
        shockClient.delete_node(info['results']['shocknodes'][0])
    except Exception as e:
        print 'Error deleting distance matrix file from %s: %s' %(info['results']['shockurl'], e.message)
        traceback.print_exc(file=sys.stdout)
    
    # Delete the job.
    ujsClient.delete_job(args.jobID)