Code Example #1
    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id
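
The core pattern in this helper, accepting either a workspace name or a numeric id and resolving names through DataFileUtil.ws_name_to_id, recurs in the examples below. A minimal standalone sketch of that pattern; the import paths assume the older KBase SDK client layout (not installed_clients) and may differ per module:

    import os
    from DataFileUtil.DataFileUtilClient import DataFileUtil
    from DataFileUtil.baseclient import ServerError as DFUError

    def resolve_ws_id(ws_name_or_id):
        """Return a numeric workspace id, accepting either a name or an id."""
        if isinstance(ws_name_or_id, int):
            return ws_name_or_id
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        try:
            return dfu.ws_name_to_id(ws_name_or_id)
        except DFUError as se:
            # surface only the first sentence of the server error, as in the example above
            raise ValueError(se.message.split('.')[0])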
Code Example #2
    def UploadFromMEME(self, ctx, params):
        """
        :param params: instance of type "UploadGibbsInParams" -> structure:
           parameter "path" of String, parameter "ws_name" of String,
           parameter "obj_name" of String
        :returns: instance of type "UploadOutput" -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMEME
        print('Extracting motifs')
        motifList = MU.parse_meme_output(params['path'])
        print(motifList)

        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['SequenceSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(motifList, MSO)
        MSU.CheckLength(MSO, params['min_len'], params['max_len'])
        if 'absolute_locations' in params:
            for motif in MSO['Motifs']:
                for loc in motif['Motif_Locations']:
                    if loc['sequence_id'] in params['absolute_locations']:
                        loc['sequence_id'] = params['contig']
                        absStart = int(params['start'])
                        loc['start'] = absStart
                        loc['end'] = absStart + loc['end']

        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #END UploadFromMEME

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMEME return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
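
The closing steps of this method, dfu.save_objects followed by assembling a wsid/objid/version reference from the returned info tuple, appear in nearly every example on this page. A small helper sketch capturing that step (the index positions 6, 0 and 4 are taken from the usage above):

    def obj_info_to_ref(info):
        """Build a 'wsid/objid/version' reference from a workspace object info tuple."""
        # info[6] = workspace id, info[0] = object id, info[4] = version
        return "%s/%s/%s" % (info[6], info[0], info[4])

    # e.g. motif_set_ref = obj_info_to_ref(dfu.save_objects(save_objects_params)[0])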
Code Example #3
    def _save_to_ws_and_report(self, ws_id, source, assembly_data):
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        workspace_id = dfu.ws_name_to_id(self.getWsName())
        print("Workspace id: {}".format(workspace_id))
        info = dfu.save_objects({
            'id': workspace_id,  # numeric id of the workspace resolved above
            "objects": [{
                "type": "KBaseGenomeAnnotations.Assembly-3.0",
                "data": assembly_data,
                "name": ws_id
            }]
        })[0]
        #print("Data from save to ws: {}".format(json.dumps(info, indent=2)))
        assembly_ref = "%s/%s/%s" % (info[6], info[0], info[4])

        return assembly_ref
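
As a hedged sanity check, the reference returned by this helper can be fetched back with DataFileUtil.get_objects, the same call used in example #9 below. The reference value is a placeholder and the field name follows the Assembly data built in example #4:

    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_ref = '12345/6/1'  # placeholder wsid/objid/version from _save_to_ws_and_report
    saved = dfu.get_objects({'object_refs': [assembly_ref]})['data'][0]
    print(saved['data']['num_contigs'])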
Code Example #4
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generator yielding only the SeqRecords that meet the minimum contig length '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
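
A hedged sketch of driving FastaToAssembly end to end. All values are placeholders; only the keys checked by validate_params and stage_input above are used:

    import os

    fta = FastaToAssembly(callback_url=os.environ['SDK_CALLBACK_URL'],
                          scratch='/kb/module/work/tmp')
    params = {
        'workspace_name': 'my_workspace',                     # required
        'assembly_name': 'my_assembly',                       # required
        'file': {'path': '/kb/module/work/tmp/contigs.fa'},   # exactly one of file/shock_id/ftp_url
        'min_contig_length': 500,                             # optional length filter
    }
    assembly_info = fta.import_fasta(None, params)  # ctx is not used by import_fasta
    print(assembly_info)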
Code Example #5
class ExprMatrixUtils:
    """
     Contains a set of functions for expression level calculations.
    """

    PARAM_IN_WS_NAME = 'workspace_name'
    PARAM_IN_OBJ_NAME = 'output_obj_name'
    PARAM_IN_EXPSET_REF = 'expressionset_ref'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        pass

    def process_params(self, params):
        """
        validates params passed to gen expression matrix method
        """
        for p in [self.PARAM_IN_EXPSET_REF,
                  self.PARAM_IN_OBJ_NAME,
                  self.PARAM_IN_WS_NAME
                 ]:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)
        self.ws_id = ws_name_id

    def get_expressionset_data(self, expressionset_ref):

        expr_set_obj = self.ws_client.get_objects2(
            {'objects': [{'ref': expressionset_ref}]})['data'][0]

        expr_set_obj_type = expr_set_obj.get('info')[2]
        expr_set_data = dict()
        expr_set_data['ws_name'] = expr_set_obj.get('info')[7]
        expr_set_data['obj_name'] = expr_set_obj.get('info')[1]

        if re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d', expr_set_obj_type):
            expr_set_data['genome_ref'] = expr_set_obj['data']['genome_id']
            expr_obj_refs = list()
            for expr_obj in expr_set_obj['data']['mapped_expression_ids']:
                expr_obj_refs.append(expr_obj.values()[0])
            expr_set_data['expr_obj_refs'] = expr_obj_refs

        elif re.match('KBaseSets.ExpressionSet-\d.\d', expr_set_obj_type):
            items = expr_set_obj.get('data').get('items')
            expr_obj_refs = list()
            for item in items:
                expr_obj_refs.append(item['ref'])
            expr_obj = self.ws_client.get_objects2(
                {'objects': [{'ref': expr_obj_refs[0]}]})['data'][0]
            expr_set_data['genome_ref'] = expr_obj['data']['genome_id']
            expr_set_data['expr_obj_refs'] = expr_obj_refs
        else:
            raise TypeError(self.PARAM_IN_EXPSET_REF + ' should be of type ' +
                            'KBaseRNASeq.RNASeqExpressionSet ' +
                            'or KBaseSets.ExpressionSet')
        return expr_set_data

    def save_expression_matrix(self, tables, expr_set_data, em_obj_name, hidden = 0):

        all_rows = {}    # build a dictionary of keys only which is a union of all row ids (gene_ids)
        self.logger.info( '***** length of tables is {0}'.format( len( tables )))
        for table in tables:
            for r in table.keys():
                all_rows[r] = []

        for gene_id in all_rows.keys():
            row = []
            for table in tables:
                if gene_id in table:
                    #logger.info('append ' + gene_id)
                    #logger.info(pformat(table[gene_id]))
                    #all_rows[gene_id].append(table[gene_id])
                    row.append(table[gene_id])
                else:
                    #logger.info('append 0')
                    row.append(0)
            all_rows[gene_id] = row
            #logger.info(all_rows[gene_id])

        em_data = {
                    'genome_ref': expr_set_data['genome_ref'],
                    'scale': 'log2',
                    'type': 'level',
                    'data': {
                            'row_ids': [],
                            'values': [],
                            'col_ids': expr_set_data['expr_obj_names']
                            },
                    'feature_mapping' : {},
                    'condition_mapping': expr_set_data['condition_map']
                   }

        # we need to load row-by-row to preserve the order
        self.logger.info('loading expression matrix data')

        for gene_id in all_rows.keys():
            em_data['feature_mapping'][gene_id] = gene_id
            em_data['data']['row_ids'].append(gene_id)
            em_data['data']['values'].append(all_rows[gene_id])

        try:
            self.logger.info( 'saving em_data em_name {0}'.format(em_obj_name))
            obj_info = self.dfu.save_objects({'id': self.ws_id,
                                              'objects': [
                                                          { 'type': 'KBaseFeatureValues.ExpressionMatrix',
                                                            'data': em_data,
                                                            'name': em_obj_name,
                                                            'hidden': hidden,
                                                            'extra_provenance_input_refs': [
                                                                em_data.get('genome_ref'),
                                                                self.params[self.PARAM_IN_EXPSET_REF]]
                                                          }
                                                    ]})[0]
            self.logger.info('ws save return:\n' + pformat(obj_info))
        except Exception as e:
            self.logger.exception(e)
            raise Exception('Failed Saving Expression Matrix to Workspace')

        return str(obj_info[6]) + '/' + str(obj_info[0]) + '/' + str(obj_info[4])

    def get_expression_matrix(self, params):

        self.process_params(params)
        self.params = params

        expressionset_ref = params.get(self.PARAM_IN_EXPSET_REF)

        expr_set_data = self.get_expressionset_data(expressionset_ref)
        expr_obj_names = list()
        fpkm_tables = list()
        tpm_tables = list()
        condition_map = dict()
        tpm_table = None
        for expr_obj_ref in expr_set_data['expr_obj_refs']:
            try:
                self.logger.info('*** getting expression set {0} from workspace ****'
                                 .format(expr_obj_ref))

                expr = self.ws_client.get_objects2(
                                            {'objects':
                                            [{'ref': expr_obj_ref}]})['data'][0]

            except Exception, e:
                self.logger.exception(e)
                raise Exception('Unable to download expression object {0} from workspace {1}'.
                                format(expr_obj_ref, expr_set_data['ws_name']))

            expr_name = expr.get('info')[1]
            expr_obj_names.append(expr_name)
            condition_map.update({expr_name: expr.get('data').get('condition')})
            num_interp = expr.get('data').get('numerical_interpretation')
            if num_interp != 'FPKM':
                raise Exception(
                    'Did not get expected FPKM value from numerical interpretation key from '
                    'Expression object {0}, instead got {1}'.format(expr_obj_ref, num_interp))

            pr_comments = expr.get('data').get('processing_comments', None)  # log2 Normalized
            if pr_comments is not None:
                self.logger.info('pr_comments are {0}'.format(pr_comments))

            fpkm_table = expr.get('data').get('expression_levels') # QUESTION: is this really FPKM levels?
            self.logger.info('FPKM keycount: {0}'.format(len(fpkm_table.keys())))
            fpkm_tables.append(fpkm_table)

            tpm_table = None  # Cufflinks doesn't generate TPM
            if 'tpm_expression_levels' in expr['data']:  # so we need to check for this key
                tpm_table = expr.get('data').get('tpm_expression_levels')
                self.logger.info('TPM keycount: {0}'.format(len(tpm_table.keys())))
                tpm_tables.append(tpm_table)

        expr_set_data['expr_obj_names'] = expr_obj_names
        expr_set_data['condition_map'] = condition_map
        output_obj_name = params.get(self.PARAM_IN_OBJ_NAME)
        fpkm_ref = self.save_expression_matrix(fpkm_tables,
                                               expr_set_data,
                                               '{0}_FPKM_ExpressionMatrix'.format(output_obj_name))
        tpm_ref = None
        if tpm_table is not None:
            tpm_ref = self.save_expression_matrix(tpm_tables,
                                                  expr_set_data,
                                                  '{0}_TPM_ExpressionMatrix'.format(output_obj_name))
        return fpkm_ref, tpm_ref
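
A hedged invocation sketch for ExprMatrixUtils. The config keys mirror those read in __init__, SDK_CALLBACK_URL is assumed to be set in the environment, and the workspace name, object name and reference are placeholders:

    import logging

    config = {'scratch': '/kb/module/work/tmp',
              'workspace-url': 'https://kbase.us/services/ws'}
    emu = ExprMatrixUtils(config, logger=logging.getLogger(__name__))
    params = {
        'workspace_name': 'my_workspace',
        'output_obj_name': 'my_expression_matrix',
        'expressionset_ref': '123/4/5',  # KBaseSets.ExpressionSet or KBaseRNASeq.RNASeqExpressionSet
    }
    fpkm_ref, tpm_ref = emu.get_expression_matrix(params)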
Code Example #6
File: ReadsUtilsImpl.py  Project: monicaaj/kb_blast
    def _proc_upload_reads_params(self, ctx, params):
        fwdid = params.get('fwd_id')
        if not fwdid:
            raise ValueError('No reads file provided')
        wsid = params.get('wsid')
        wsname = params.get('wsname')
        if not self.xor(wsid, wsname):
            raise ValueError(
                'Exactly one of the workspace ID or name must be provided')
        dfu = DataFileUtil(self.callback_url, token=ctx['token'])
        if wsname:
            self.log('Translating workspace name to id')
            if not isinstance(wsname, six.string_types):
                raise ValueError('wsname must be a string')
            wsid = dfu.ws_name_to_id(wsname)
            self.log('translation done')
        del wsname
        objid = params.get('objid')
        name = params.get('name')
        if not self.xor(objid, name):
            raise ValueError(
                'Exactly one of the object ID or name must be provided')
        revid = params.get('rev_id')
        interleaved = 1 if params.get('interleaved') else 0
        kbtype = 'KBaseFile.SingleEndLibrary'
        single_end = True
        if interleaved or revid:
            kbtype = 'KBaseFile.PairedEndLibrary'
            single_end = False
        if revid:
            interleaved = 0
        seqtype = params.get('sequencing_tech')
        if not seqtype:
            raise ValueError('The sequencing technology must be provided')

        sg = 1
        if 'single_genome' in params and not params['single_genome']:
            sg = 0
        o = {
            'sequencing_tech': seqtype,
            'single_genome': sg,
            # 'read_count': params.get('read_count'),
            # 'read_size': params.get('read_size'),
            # 'gc_content': params.get('gc_content')
        }
        self._add_field(o, params, 'strain')
        self._add_field(o, params, 'source')
        ism = params.get('insert_size_mean')
        self._check_pos(ism, 'insert_size_mean')
        issd = params.get('insert_size_std_dev')
        self._check_pos(issd, 'insert_size_std_dev')
        if not single_end:
            o.update({
                'insert_size_mean':
                ism,
                'insert_size_std_dev':
                issd,
                'interleaved':
                interleaved,
                'read_orientation_outward':
                1 if params.get('read_orientation_outward') else 0
            })
        return o, wsid, name, objid, kbtype, single_end, fwdid, revid
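
The helpers self.xor, self._add_field and self._check_pos are referenced above but not shown in this excerpt. Minimal sketches consistent with how they are called; these are assumptions, not the module's actual implementations:

    @staticmethod
    def xor(a, b):
        # true when exactly one of the two values is truthy
        return bool(a) != bool(b)

    def _add_field(self, obj, params, field):
        # copy an optional field from params into the output object if present
        if field in params and params[field] is not None:
            obj[field] = params[field]

    def _check_pos(self, num, name):
        # numeric parameters such as insert_size_mean must be positive if supplied
        if num is not None and num <= 0:
            raise ValueError(name + ' must be > 0')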
Code Example #7
    def upload_html_set(self, ctx, params):
        """
        Upload an HTML file set to the KBase data stores.
        :param params: instance of type "UploadHTMLSetInput" (Input to the
           upload_html_set function. Required arguments: One of: wsid - the
           id of the workspace where the reads will be saved (preferred).
           wsname - the name of the workspace where the reads will be saved.
           One of: objid - the id of the workspace object to save over name -
           the name to which the workspace object will be saved path - the
           path to the directory with the HTML files. This directory will be
           compressed and loaded into the KBase stores.) -> structure:
           parameter "wsid" of Long, parameter "wsname" of String, parameter
           "objid" of Long, parameter "name" of String, parameter "path" of
           String
        :returns: instance of type "UploadHTMLSetOutput" (Output of the
           upload_html_set function. obj_ref - a reference to the new
           Workspace object in the form X/Y/Z, where X is the workspace ID, Y
           is the object ID, and Z is the version.) -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: out
        #BEGIN upload_html_set
        del ctx
        wsid = params.get('wsid')
        wsname = params.get('wsname')
        if not self.xor(wsid, wsname):
            raise ValueError(
                'Exactly one of the workspace ID or name must be provided')
        dfu = DataFileUtil(self.callback_url)
        if wsname:
            self.log('Translating workspace name to id')
            if not isinstance(wsname, six.string_types):
                raise ValueError('wsname must be a string')
            wsid = dfu.ws_name_to_id(wsname)
            self.log('translation done')
        del wsname
        objid = params.get('objid')
        name = params.get('name')
        if not self.xor(objid, name):
            raise ValueError(
                'Exactly one of the object ID or name must be provided')
        htmlpath = params.get('path')
        if not htmlpath:
            raise ValueError('path parameter is required')
        htmlpath = os.path.abspath(os.path.expanduser(htmlpath))
        if not os.path.isdir(htmlpath):
            raise ValueError('path must be a directory')
        zipfile = dfu.pack_file({
            'file_path': htmlpath,
            'pack': 'zip'
        })['file_path']
        if os.path.getsize(zipfile) > self.MAX_ZIP_SIZE:
            os.remove(zipfile)
            raise ValueError('Zipfile from specified directory is greater ' +
                             'than maximum size allowed: ' +
                             str(self.MAX_ZIP_SIZE))
        fh, tf = tempfile.mkstemp(dir=self.scratch)
        os.close(fh)
        with open(tf, 'w') as objfile, open(zipfile, 'rb') as z:
            objfile.write('{"file":"')
            d = z.read(self.CHUNKSIZE)
            while d:
                objfile.write(base64.b64encode(d))
                d = z.read(self.CHUNKSIZE)
            objfile.write('"}')
        os.remove(zipfile)
        so = {
            'type': 'HTMLFileSetUtils.HTMLFileSet-0.1',  # TODO release
            'data_json_file': tf
        }
        if name:
            so['name'] = name
        else:
            so['objid'] = objid
        wsio = WsLargeDataIO(self.callback_url,
                             service_ver='dev')  # TODO remove dev @IgnorePep8
        ret = wsio.save_objects({'id': wsid, 'objects': [so]})[0]
        os.remove(tf)
        out = {'obj_ref': str(ret[6]) + '/' + str(ret[0]) + '/' + str(ret[4])}
        #END upload_html_set

        # At some point might do deeper type checking...
        if not isinstance(out, dict):
            raise ValueError('Method upload_html_set return value ' +
                             'out is not type dict as required.')
        # return the results
        return [out]
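
A hedged call sketch for upload_html_set. The workspace id, object name and directory path are placeholders, impl stands for the service implementation instance, and exactly one of wsid/wsname and one of objid/name must be supplied, as enforced above:

    params = {
        'wsid': 12345,                              # or 'wsname': 'my_workspace'
        'name': 'my_html_set',                      # or 'objid' to save over an existing object
        'path': '/kb/module/work/tmp/html_report',  # directory that gets zipped and stored
    }
    out = impl.upload_html_set(ctx, params)[0]
    print(out['obj_ref'])  # e.g. '12345/7/1'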
Code Example #8
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of type "get_promoter_for_gene_output_params" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs

        #TODO: have these guys return output paths
        for key, value in params.iteritems():
            print key
        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']
        promoterFastaFilePath = self.get_promoter_for_gene(ctx, params)[0]

        gibbsCommandList = []
        for i in range(motMin, motMax + 1, 2):
            gibbsCommandList.append(
                GU.build_gibbs_command(promoterFastaFilePath, i))

        for g in gibbsCommandList:
            GU.run_gibbs_command(g)
        #gibbsCommand = GU.build_gibbs_command(promoterFastaFilePath)
        #GU.run_gibbs_command(gibbsCommand)
        #print(promoterFastaFilePath)
        homerMotifCommand = HU.build_homer_motif_command(promoterFastaFilePath)
        homerLocationCommand = HU.build_homer_location_command(
            promoterFastaFilePath)
        os.mkdir(self.shared_folder + '/homer_out')
        #print(homerMotifCommand)
        HU.run_homer_command(homerMotifCommand)
        HU.run_homer_command(homerLocationCommand)

        MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
        MEU.run_meme_command(MEMEMotifCommand)

        gibbsMotifList = GU.parse_gibbs_output(motMin, motMax)
        homerMotifList = HU.parse_homer_output()
        memeMotifList = MEU.parse_meme_output()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        lineCount = 0
        with open(promoterFastaFilePath, 'r') as pFile:
            for line in pFile:
                lineCount += 1
        numFeat = lineCount / 2
        with open(promoterFastaFilePath, 'r') as pFile:
            fileStr = pFile.read()
        promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
        with open(htmlDir + '/promoters.html', 'w') as promHTML:
            promHTML.write(promHtmlStr)
        JsonPath = '/kb/module/work/tmp'
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/gibbs.json', htmlDir + '/gibbs.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/homer_out/homer.json', htmlDir + '/homer.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/meme_out/meme.json', htmlDir + '/meme.html',
            str(numFeat)
        ])
        fullMotifList = []
        for h in homerMotifList:
            add = True
            for g in gibbsMotifList:
                if h['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            for m in memeMotifList:
                if m['Iupac_signature'] == h['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(h)
        for g in gibbsMotifList:
            add = True
            for m in memeMotifList:
                if m['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(g)
        for m in memeMotifList:
            fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)
        dfu = DataFileUtil(self.callback_url)
        parsed = ['gibbs.html', 'homer.html', 'meme.html', 'promoters.html']
        indexHtmlStr = '<html>'
        #use js to load the page content
        for p in parsed:
            indexHtmlStr += '<head><script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script> <script> $(function(){$("#' + p.replace(
                '.html', '_content') + '").load("' + p + '"); });</script> '
        indexHtmlStr += """<style>
            body {font-family: Arial;}

            /* Style the tab */
            .tab {
            overflow: hidden;
    border: 1px solid #ccc;
    background-color: #f1f1f1;
}

/* Style the buttons inside the tab */
.tab button {
    background-color: inherit;
    float: left;
    border: none;
    outline: none;
    cursor: pointer;
    padding: 14px 16px;
    transition: 0.3s;
    font-size: 17px;
}

/* Change background color of buttons on hover */
.tab button:hover {
    background-color: #ddd;
}

/* Create an active/current tablink class */
.tab button.active {
    background-color: #ccc;
}

/* Style the tab content */
.tabcontent {
    display: none;
    padding: 6px 12px;
    border: 1px solid #ccc;
    border-top: none;
}
</style></head> """
        indexHtmlStr += '<body>'
        #adding tabs
        indexHtmlStr += '<div class="tab">\n'
        for p in parsed:
            indexHtmlStr += '<button class="tablinks" onclick="openReport(event, \'' + p.replace(
                '.html', '_content') + '\')">' + p.replace('.html',
                                                           '') + '</button>'
        indexHtmlStr += '</div>'
        for p in parsed:
            indexHtmlStr += '<div id="' + p.replace(
                '.html', '_content') + '" class="tabcontent"></div>'
        indexHtmlStr += """<script>
function openReport(evt, reportName) {
    var i, tabcontent, tablinks;
    tabcontent = document.getElementsByClassName("tabcontent");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }
    tablinks = document.getElementsByClassName("tablinks");
    for (i = 0; i < tablinks.length; i++) {
        tablinks[i].className = tablinks[i].className.replace(" active", "");
    }
    document.getElementById(reportName).style.display = "block";
    evt.currentTarget.className += " active";
}
</script>"""

        #for p in parsed:
        #    indexHtmlStr += '<a href="' + p + '">' + p.replace('.html','') +' Output</a>\n'
        #indexHtmlStr += '</body></html>'
        with open(htmlDir + '/index.html', 'w') as html_handle:
            html_handle.write(str(indexHtmlStr))

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except:
            raise ValueError('error uploading HTML file to shock')

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['FeatureSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(fullMotifList, MSO)
        objname = 'MotifSet' + str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))

        #Pass motif set into this
        save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGwasData.MotifSet',
            'data': MSO,
            'name': objname
        }]

        info = dfu.save_objects(save_objects_params)[0]
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref':
                motif_set_ref,
                'description':
                'Motif Set generated by identify promoter'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            #'name': 'promoter_download.zip',
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
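
A hedged call sketch for find_motifs based on the parameter structure in its docstring. The reference values and workspace name are placeholders and impl stands for the service implementation instance:

    params = {
        'workspace_name': 'my_workspace',
        'genome_ref': '123/4/5',
        'featureSet_ref': '123/6/1',
        'promoter_length': 250,
        'motif_min_length': 8,   # optional; defaulted above if absent
        'motif_max_length': 16,  # optional; defaulted above if absent
    }
    output = impl.find_motifs(ctx, params)[0]
    print(output['report_name'], output['report_ref'])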
Code Example #9
class variation_importer_utils:
    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat':
            random.uniform(-90, 90),
            'lon':
            random.uniform(-180, 180),
            'elevation':
            random.uniform(0, 100),
            'description':
            "".join([random.choice(string.ascii_letters) for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self,
                             location_filepath,
                             genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs
        if not (self._compare(locations.iloc[:, 0].astype(str).tolist(),
                              genotypes)):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not (self._compare(col_names[0:3], expected_columns)):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(self.scratch,
                                             'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1.  No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        validation_output_files = [
            f for f in os.listdir(validation_output_dir) if f.endswith('.txt')
        ]
        if not validation_output_files:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")
        validation_output_filepath = os.path.join(validation_output_dir,
                                                  validation_output_files[0])

        log("Validator output filepath: {}".format(validation_output_filepath))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # Contigs returns just a dict with key and contig_id
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']
        return raw_contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted."
                    )
                raise ValueError(
                    "Invalid VCF.  ##fileformat line in meta is improperly formatted."
                )
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")

                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])
        return version, contigs, genotypes

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store']
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not (file.endswith(tuple(excluded_extensions))
                            # file.endswith('.zip') or
                            # file.endswith('.vcf') or
                            # file.endswith('.vcf.gz') or
                            # file.endswith('.html') or
                            # file.endswith('.DS_Store')
                            ):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results, variation_file_path):

        stats_results = self._generate_variation_stats(
            params['additional_output_type'], variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()
        objects = []
        if (variation_results['valid_variation_file']):
            objects = [{
                'ref':
                variation_results['variation_obj_ref'],
                'description':
                'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
            _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:

                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.  It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warnings generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += '<h4>The following contigs were not found in the reference genome {}.  The possible contigs have been written to the file {}.  Please see the associated links to download.</h4>'.format(
                        variation_results.get('genome_ref'),
                        'valid_contigs.txt')
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was included in the VCF file header!  Please recreate the VCF file with each contig described in the meta description </h4>'
                report = report.replace('Validation_Results',
                                        validation_content)

                if (stats_output.get('stats_file_dir')):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                                        <table>
                                            <tr>
                                                <th>Number of SNPs</th>
                                                <th>Number of Genotypes </th>
                                            </tr>
                                        '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if stats_output and stats_output.get('stats_img_dir'):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = os.path.basename(image)
                        caption = image.replace('.png', '')
                        image_content += '<p style="text-align:center">' \
                            '<img align="center" src="{}"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)

                else:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results", image_content)
        except Exception as e:
            print("Error generating HTML report: {}".format(e))
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)
        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except Exception as e:
            raise ValueError('Error uploading HTML to shock: {}'.format(e))

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, additional_output_type,
                                  variation_filepath):
        """
            :param commments go here
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)

        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')

        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes" : []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if (tokens[0] == 'Error'):
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif (tokens[0] == 'Warning'):
                plink_output['warnings'].append(line)
                print(line)
            # elif(tokens[0] == 'Note'):
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()
        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime.  Please see log file."
                )

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory, variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make function to do Rscript calls.
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()

            if z.returncode != 0:
                log("Error creating HWE zoom file.")

        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("MAF command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()

        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }
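
    # NOTE (illustrative sketch, not part of the original module): the TODO above
    # asks for a helper that wraps the repeated Rscript invocations.  One minimal
    # way to do that, reusing only calls already used in this class, could be:
    def _run_rscript(self, script_path, script_args, work_dir):
        """Hypothetical helper: run an R script and log a non-zero exit code."""
        cmd = ['Rscript', '--no-save', '--vanilla', script_path] + list(script_args)
        proc = subprocess.Popen(cmd,
                                cwd=work_dir,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=False)
        proc.wait()
        if proc.returncode != 0:
            log("Rscript {} exited with code {}".format(script_path,
                                                        proc.returncode))
        return proc.returncode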

    def _save_variation_to_ws(self, workspace_name, variation_obj,
                              variation_filepath, kinship_matrix):
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': 'TestVariationImporterName'
            }]
        })[0]

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """
            :param params: dict containing all input parameters.
        """

        returnVal = {}
        valid_vcf_file = True

        try:
            vcf_filepath = self.pretend_download_staging_file(
                params['staging_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['staging_file_subdir_path']))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['location_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['location_file_subdir_path']))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes(
            vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if (vcf_version < 4.1):
            log("VCF file is version {}.  Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}.  Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))
        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs.keys():
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], variation_object, vcf_filepath,
                kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal
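
    # Illustrative call of validate_vcf (the object name `importer` and all
    # parameter values below are placeholders, not taken from the original module):
    #
    #   importer.validate_vcf({
    #       'staging_file_subdir_path': 'variants.vcf',
    #       'location_file_subdir_path': 'sample_locations.tsv',
    #       'genome_ref': '123/4/5',
    #       'workspace_name': 'my_workspace',
    #   })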
Code example #10
class GenomeInterface:
    def _validate_save_one_genome_params(self, params):
        """
        _validate_save_one_genome_params:
                validates params passed to save_one_genome method
        """

        log('start validating save_one_genome params')

        # check for required parameters
        for p in ['workspace', 'name', 'data']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _check_shock_response(self, response, errtxt):
        """
        _check_shock_response: check shock node response (Copied from DataFileUtil)
        """
        log('start checking shock response')

        if not response.ok:
            try:
                err = json.loads(response.content)['error'][0]
            except Exception:
                # this means shock is down or not responding.
                log("Couldn't parse response error content from Shock: " +
                    response.content)
                response.raise_for_status()
            raise ValueError(errtxt + str(err))

    def _own_handle(self, genome_data, handle_property):
        """
        _own_handle: check that the handle in handle_property points to a shock node owned by the calling user
        """

        log('start checking handle {} ownership'.format(handle_property))

        if handle_property in genome_data:
            handle_id = genome_data[handle_property]
            hs = HandleService(self.handle_url, token=self.token)
            handles = hs.hids_to_handles([handle_id])
            shock_id = handles[0]['id']

            # Copy from DataFileUtil.own_shock_node implementation:
            header = {'Authorization': 'Oauth {}'.format(self.token)}
            res = requests.get(self.shock_url + '/node/' + shock_id +
                               '/acl/?verbosity=full',
                               headers=header,
                               allow_redirects=True)
            self._check_shock_response(
                res, 'Error getting ACLs for Shock node {}: '.format(shock_id))
            owner = res.json()['data']['owner']['username']
            user_id = self.auth_client.get_user(self.token)

            if owner != user_id:
                log('start copying node to owner: {}'.format(user_id))
                dfu_shock = self.dfu.copy_shock_node({
                    'shock_id': shock_id,
                    'make_handle': True
                })
                handle_id = dfu_shock['handle']['hid']
                genome_data[handle_property] = handle_id

    def _check_dna_sequence_in_features(self, genome):
        """
        _check_dna_sequence_in_features: check dna sequence in each feature
        """
        log('start checking dna sequence in each feature')

        if 'features' in genome:
            features_to_work = {}
            for feature in genome['features']:
                if not ('dna_sequence' in feature and feature['dna_sequence']):
                    features_to_work[feature['id']] = feature['location']

            if len(features_to_work) > 0:
                aseq = AssemblySequenceAPI(self.sw_url, token=self.token)
                get_dna_params = {'requested_features': features_to_work}
                if 'assembly_ref' in genome:
                    get_dna_params['assembly_ref'] = genome['assembly_ref']
                elif 'contigset_ref' in genome:
                    get_dna_params['contigset_ref'] = genome['contigset_ref']
                else:
                    # Nothing to do (it may be test genome without contigs)...
                    return
                dna_sequences = aseq.get_dna_sequences(
                    get_dna_params)['dna_sequences']
                for feature in genome['features']:
                    if feature['id'] in dna_sequences:
                        feature['dna_sequence'] = dna_sequences[feature['id']]
                        feature['dna_sequence_length'] = len(
                            feature['dna_sequence'])

    def __init__(self, config):
        self.ws_url = config.workspaceURL
        self.handle_url = config.handleURL
        self.shock_url = config.shockURL
        self.sw_url = config.srvWizURL
        self.token = config.token
        self.auth_service_url = config.authServiceUrl
        self.callback_url = config.callbackURL

        self.ws = Workspace(self.ws_url, token=self.token)
        self.auth_client = _KBaseAuth(self.auth_service_url)
        self.dfu = DataFileUtil(self.callback_url)

    def save_one_genome(self, params):
        log('start saving genome object')

        self._validate_save_one_genome_params(params)

        workspace = params['workspace']
        name = params['name']
        data = params['data']

        # check all handles point to shock nodes owned by calling user
        self._own_handle(data, 'genbank_handle_ref')
        self._own_handle(data, 'gff_handle_ref')

        self._check_dna_sequence_in_features(data)

        if 'hidden' in params and str(
                params['hidden']).lower() in ('yes', 'true', 't', '1'):
            hidden = 1
        else:
            hidden = 0

        if isinstance(workspace, int) or workspace.isdigit():
            workspace_id = workspace
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace)

        dfu_save_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': name,
                'hidden': hidden
            }]
        }

        dfu_oi = self.dfu.save_objects(dfu_save_params)[0]

        returnVal = {'info': dfu_oi}

        return returnVal
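
    # Illustrative usage of save_one_genome (the variable names `cfg` and
    # `genome_dict` are placeholders, not part of the example above):
    #
    #   gi = GenomeInterface(cfg)
    #   result = gi.save_one_genome({
    #       'workspace': 'my_workspace',
    #       'name': 'my_genome',
    #       'data': genome_dict,
    #       'hidden': 0,
    #   })
    #   print(result['info'])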
Code example #11
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM from FPKM
        :return:
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0"
                )
        else:
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict
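
    # Worked example of the conversion above (illustrative numbers only):
    # with FPKM values {'g1': 10.0, 'g2': 30.0}, sum_fpkm = 40.0, so
    #   TPM(g1) = 10.0 / 40.0 * 1e6 = 250000.0
    #   TPM(g2) = 30.0 / 40.0 * 1e6 = 750000.0
    # and the returned dicts store log2(FPKM + 1) and log2(TPM + 1).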

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref':
            genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have reference to the assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome Annotation object Failed :  {}"
                .format("".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file
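
    # For example, with the download directory containing 'reads_sorted.bam'
    # and 'reads.bam', the first pattern selects 'reads_sorted.bam'; the second
    # pattern is only consulted when no '*_sorted.bam' file is found.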

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params[
                'max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params[
                'min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params[
                'overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command
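
    # For illustration (parameter values below are made up), a params dict like
    #   {'num_threads': 4, 'max_intron_length': 300000,
    #    'result_directory': '/kb/module/work/out', 'gtf_file': 'genes.gtf',
    #    'input_file': 'accepted_hits.bam'}
    # would produce roughly:
    #   /opt/cufflinks/cufflinks -q --no-update-check -p 4 --max-intron-length 300000
    #       -o /kb/module/work/out -G genes.gtf accepted_hits.bam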

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path':
            result_file_path,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if '/' not in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads', 1), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
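
    # Naming example (suffix values are illustrative): with
    # expression_suffix='_expression' and expression_set_suffix='_expression_set',
    # an RNASeqAlignment named 'sample1_alignment' yields
    # params['expression_name'] == 'sample1_expression', while a
    # ReadsAlignmentSet named 'cond_alignment_set' yields
    # params['expression_set_name'] == 'cond_expression_set'.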

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type.\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
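
    # Hedged usage sketch (values are hypothetical): run_cufflinks_app expects a
    # params dict along these lines and returns result/report references:
    #
    #     params = {
    #         'alignment_object_ref': '123/4/5',
    #         'workspace_name': 'my_workspace',
    #         'expression_suffix': '_expression',
    #         'expression_set_suffix': '_expression_set',
    #         # ...plus any Cufflinks options consumed downstream
    #     }
    #     result = cufflinks_util.run_cufflinks_app(params)
    #     # result includes 'result_directory', 'expression_obj_ref' and report info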
コード例 #12
0
class FastaGFFToGenome:
    def __init__(self, config):
        self.cfg = config
        self.dfu = DataFileUtil(self.cfg.callbackURL)

    def import_file(self, params):

        # 1) validate parameters
        self._validate_import_file_params(params)

        # 2) construct the input directory staging area
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)
        file_paths = self._stage_input(params, input_directory)

        # 3) extract out the parameters
        params = self._set_parsed_params(params)

        # 4) do the upload
        result = self.upload_genome(
            shock_service_url=self.cfg.shockURL,
            handle_service_url=self.cfg.handleURL,
            workspace_service_url=self.cfg.workspaceURL,
            callback_url=self.cfg.callbackURL,
            input_fasta_file=file_paths["fasta_file"],
            input_gff_file=file_paths["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            taxon_wsname=params['taxon_wsname'],
            taxon_reference=params['taxon_reference'],
            source=params['source'],
            genome_type=params['type'],
            release=params['release'])

        # 5) generate report
        output_data_ref = params['workspace_name'] + "/" + params['genome_name']
        reportObj = {
            'objects_created': [{
                'ref': output_data_ref,
                'description': 'KBase Genome object'
            }],
            'text_message':
            result['report_string']
        }

        reportClient = KBaseReport(os.environ['SDK_CALLBACK_URL'])
        report_info = reportClient.create({
            'report':
            reportObj,
            'workspace_name':
            params['workspace_name']
        })

        # 6) clear the temp directory
        shutil.rmtree(input_directory)

        # 7) return the result
        info = result['genome_info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return details

    def upload_genome(self,
                      shock_service_url=None,
                      handle_service_url=None,
                      workspace_service_url=None,
                      callback_url=None,
                      input_gff_file=None,
                      input_fasta_file=None,
                      workspace_name=None,
                      core_genome_name=None,
                      scientific_name="unknown_taxon",
                      taxon_wsname='ReferenceTaxons',
                      taxon_reference=None,
                      source=None,
                      release=None,
                      genome_type=None):

        # retrieve taxon
        taxonomy, taxon_reference = self._retrieve_taxon(
            taxon_reference, taxon_wsname, scientific_name)

        # reading in Fasta file
        assembly = self._retrieve_fasta_file(input_fasta_file,
                                             core_genome_name, scientific_name,
                                             source)

        if taxon_reference is not None:
            assembly["taxon_ref"] = taxon_reference

        # reading in GFF file
        feature_list = self._retrieve_gff_file(input_gff_file)

        # compile links between features
        feature_hierarchy = self._generate_feature_hierarchy(feature_list)

        # retrieve genome feature list
        (genome_features_list, genome_mrnas_list,
         genome_cdss_list) = self._retrieve_genome_feature_list(
             feature_list, feature_hierarchy, assembly)

        # remove sequences before loading
        for contig in assembly["contigs"]:
            del assembly["contigs"][contig]["sequence"]

        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': input_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            assembly['assembly_id']
        })

        # generate genome info
        genome = self._gen_genome_info(core_genome_name, scientific_name,
                                       assembly_ref, genome_features_list,
                                       genome_cdss_list, genome_mrnas_list,
                                       source, assembly, taxon_reference,
                                       taxonomy, input_gff_file)

        workspace_id = self.dfu.ws_name_to_id(workspace_name)
        genome_info = self.dfu.save_objects({
            "id":
            workspace_id,
            "objects": [{
                "name": core_genome_name,
                "type": "KBaseGenomes.Genome",
                "data": genome
            }]
        })[0]
        report_string = ''

        return {'genome_info': genome_info, 'report_string': report_string}

    def _validate_import_file_params(self, params):
        """
        validate_import_file_params:
                    validates params passed to FastaGFFToGenome.import_file method

        """

        # check for required parameters
        for p in ['workspace_name', 'genome_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # one and only one of 'path', or 'shock_id' is required
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            if not isinstance(file, dict):
                raise ValueError(
                    'Required "{}" field must be a map/dict'.format(key))
            n_valid_fields = 0
            if 'path' in file and file['path'] is not None:
                n_valid_fields += 1
            if 'shock_id' in file and file['shock_id'] is not None:
                n_valid_fields += 1
            if 'ftp_url' in file and file['ftp_url'] is not None:
                n_valid_fields += 1
                raise ValueError(
                    'FTP link is currently not supported for FastaGFFToGenome')
            if n_valid_fields < 1:
                error_msg = 'Required "{}" field must include one source: '.format(
                    key)
                error_msg += 'path | shock_id'
                raise ValueError(error_msg)
            if n_valid_fields > 1:
                error_msg = 'Required "{}" field has too many sources specified: '.format(
                    key)
                error_msg += str(file.keys())
                raise ValueError(error_msg)

        # check for valid type param
        valid_types = ['Reference', 'User upload', 'Representative']
        if params.get('type') and params['type'] not in valid_types:
            error_msg = 'Entered value for type is not one of the valid entries of '
            error_msg += '[' + ''.join('"' + str(e) + '", '
                                       for e in valid_types)[0:-2] + ']'
            raise ValueError(error_msg)
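
    # Hedged example (values are hypothetical): a params dict that passes
    # validation supplies exactly one source (path or shock_id) per file field:
    #
    #     params = {
    #         'workspace_name': 'my_workspace',
    #         'genome_name': 'my_genome',
    #         'fasta_file': {'path': '/kb/module/work/genome.fa'},
    #         'gff_file': {'shock_id': 'abc-123'},
    #         'type': 'User upload',
    #     }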

    def _set_parsed_params(self, params):
        log('Setting params')

        # default params
        default_params = {
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'scientific_name': 'unknown_taxon',
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'type': 'User upload',
            'metadata': {}
        }

        for field in default_params:
            if field not in params:
                params[field] = default_params[field]

        log(json.dumps(params, indent=1))

        return params

    def _stage_input(self, params, input_directory):
        """
        stage_input: Setup the input_directory by fetching the files and uncompressing if needed

        """

        file_paths = dict()
        for key in ('fasta_file', 'gff_file'):
            file = params[key]
            file_path = None
            if 'path' in file and file['path'] is not None:
                local_file_path = file['path']
                file_path = os.path.join(input_directory,
                                         os.path.basename(local_file_path))
                log('Moving file from {} to {}'.format(local_file_path,
                                                       file_path))
                shutil.copy2(local_file_path, file_path)

            if 'shock_id' in file and file['shock_id'] is not None:
                # handle shock file
                log('Downloading file from SHOCK node: {}-{}'.format(
                    self.cfg.sharedFolder, file['shock_id']))
                sys.stdout.flush()
                file_name = self.dfu.shock_to_file({
                    'file_path': input_directory,
                    'shock_id': file['shock_id']
                })['node_file_name']
                file_path = os.path.join(input_directory, file_name)

            # extract the file if it is compressed
            if file_path is not None:
                print("staged input file =" + file_path)
                sys.stdout.flush()
                dfUtil_result = self.dfu.unpack_file({'file_path': file_path})
                file_paths[key] = dfUtil_result['file_path']
            else:
                raise ValueError(
                    'No valid files could be extracted based on the input')

        return file_paths
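
    # Hedged sketch (paths are hypothetical): a local 'path' source is copied
    # into input_directory and unpacked via DataFileUtil, so a gzipped input
    # comes back decompressed:
    #
    #     file_paths = self._stage_input(
    #         {'fasta_file': {'path': '/data/genome.fa.gz'},
    #          'gff_file': {'path': '/data/genome.gff'}},
    #         input_directory)
    #     # e.g. {'fasta_file': '<input_directory>/genome.fa',
    #     #       'gff_file': '<input_directory>/genome.gff'}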

    def _retrieve_taxon(self, taxon_reference, taxon_wsname, scientific_name):
        """
        _retrieve_taxon: retrieve taxonomy and taxon_reference

        """
        taxon_id = -1
        taxon_object_name = "unknown_taxon"

        # retrieve lookup object if scientific name provided
        if (taxon_reference is None
                and scientific_name != "unknown_taxon"):
            # retrieve taxon lookup object then find taxon id
            taxon_lookup = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/taxon_lookup"],
                'ignore_errors':
                0
            })['data'][0]['data']['taxon_lookup']

            if (scientific_name[0:3] in taxon_lookup
                    and scientific_name in taxon_lookup[scientific_name[0:3]]):
                taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
                taxon_object_name = "{}_taxon".format(str(taxon_id))

        # retrieve Taxon object
        taxon_info = {}
        if (taxon_reference is None):
            taxon_info = self.dfu.get_objects({
                'object_refs': [taxon_wsname + "/" + taxon_object_name],
                'ignore_errors':
                0
            })['data'][0]
            taxon_reference = "{}/{}/{}".format(taxon_info['info'][6],
                                                taxon_info['info'][0],
                                                taxon_info['info'][4])
        else:
            taxon_info = self.dfu.get_objects({
                "object_refs": [taxon_reference],
                'ignore_errors': 0
            })['data'][0]

        taxonomy = taxon_info['data']['scientific_lineage']

        return taxonomy, taxon_reference

    def _retrieve_fasta_file(self, input_fasta_file, core_genome_name,
                             scientific_name, source):
        """
        _retrieve_fasta_file: retrieve info from fasta_file
                              https://www.biostars.org/p/710/

        """
        log("Reading FASTA file")

        assembly = {
            "contigs": {},
            "dna_size": 0,
            "gc_content": 0,
            "md5": [],
            "base_counts": {}
        }
        contig_seq_start = 0

        input_file_handle = open(input_fasta_file, 'rb')

        # alternate header and sequence
        faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                                  lambda line: line[0] == ">"))
        for header in faiter:
            # drop the ">"
            header = next(header)[1:].strip()
            # join all sequence lines to one.
            seq = "".join(s.strip() for s in next(faiter))

            try:
                fasta_header, fasta_description = header.split(' ', 1)
            except ValueError:
                fasta_header = header
                fasta_description = None

            # Handle record
            seq = seq.upper()

            # Build contig objects for Assembly
            seq_count = dict(collections.Counter(seq))

            # to delete at end, but required for now
            contig_dict = {"sequence": seq}

            Ncount = 0
            if "N" in seq_count:
                Ncount = seq_count["N"]
            contig_dict["Ncount"] = Ncount

            for character in seq_count:
                if character in assembly["base_counts"]:
                    assembly["base_counts"][character] += seq_count[character]
                else:
                    assembly["base_counts"][character] = seq_count[character]

            contig_seq_length = len(seq)
            assembly["dna_size"] += contig_seq_length
            contig_gc_length = seq.count("G")
            contig_gc_length += seq.count("C")
            contig_dict["gc_content"] = float("{0:.2f}".format(
                float(contig_gc_length) / float(contig_seq_length)))
            assembly["gc_content"] += contig_gc_length
            contig_dict["contig_id"] = fasta_header
            contig_dict["name"] = fasta_header
            contig_dict["length"] = contig_seq_length
            contig_dict["md5"] = hashlib.md5(seq).hexdigest()
            assembly["md5"].append(contig_dict["md5"])

            if fasta_description is not None:
                contig_dict["description"] = fasta_description

            contig_dict["is_circular"] = "Unknown"
            contig_dict["start_position"] = contig_seq_start
            contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])
            assembly["contigs"][fasta_header] = contig_dict

            # used for start of next sequence and total gc_content
            contig_seq_start += contig_seq_length

        assembly["gc_content"] = float("{0:.2f}".format(
            float(assembly["gc_content"]) / float(contig_seq_start)))
        assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
        assembly["assembly_id"] = core_genome_name + "_assembly"
        assembly["name"] = scientific_name
        assembly["external_source"] = source
        assembly["external_source_id"] = os.path.basename(input_fasta_file)
        assembly["external_source_origination_date"] = str(
            os.stat(input_fasta_file).st_ctime)
        assembly["num_contigs"] = len(assembly["contigs"].keys())
        assembly["type"] = "Unknown"
        assembly[
            "notes"] = "Note MD5s are generated from uppercasing the sequences"

        return assembly
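
    # The header/sequence alternation above relies on itertools.groupby keyed on
    # lines starting with '>'.  A minimal standalone sketch of the same pattern
    # (fasta_path is hypothetical):
    #
    #     import itertools
    #     with open(fasta_path) as fh:
    #         groups = (g for _, g in itertools.groupby(fh, lambda l: l[0] == '>'))
    #         for header_group in groups:
    #             header = next(header_group)[1:].strip()
    #             seq = ''.join(s.strip() for s in next(groups))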

    def _retrieve_gff_file(self, input_gff_file):
        """
        _retrieve_gff_file: retrieve info from gff_file
    
        """
        log("Reading GFF file")

        feature_list = dict()
        is_phytozome = 0
        is_patric = 0

        gff_file_handle = open(input_gff_file, 'rb')
        current_line = gff_file_handle.readline()
        line_count = 0

        while (current_line != ''):
            current_line = current_line.strip()

            if (current_line.isspace() or current_line == ""
                    or current_line.startswith("#")):
                pass
            else:
                #Split line
                (contig_id, source_id, feature_type, start, end, score, strand,
                 phase, attributes) = current_line.split('\t')

                #Checking to see if Phytozome
                if ("phytozome" in source_id or "Phytozome" in source_id):
                    is_phytozome = 1

                #Checking to see if PATRIC
                if ("PATRIC" in source_id):
                    is_patric = 1

                #PATRIC prepends their contig ids with some gibberish
                if (is_patric and "|" in contig_id):
                    contig_id = contig_id.split("|", 1)[1]

                #Features grouped by contigs first
                if (contig_id not in feature_list):
                    feature_list[contig_id] = list()

                #Populating basic feature object
                ftr = {
                    'contig': contig_id,
                    'source': source_id,
                    'type': feature_type,
                    'start': int(start),
                    'end': int(end),
                    'score': score,
                    'strand': strand,
                    'phase': phase,
                    'attributes': attributes
                }

                #Populating with attribute key-value pair
                #This is where the feature id is from
                for attribute in attributes.split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Use of 1 to limit split as '=' character can also be made available later
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    ftr[key] = value

                feature_list[contig_id].append(ftr)

            current_line = gff_file_handle.readline()

        gff_file_handle.close()

        #Some GFF/GTF files don't use "ID" so we go through the possibilities
        feature_list = self._add_missing_identifiers(feature_list)

        #Most bacterial files have only CDSs
        #In order to work with prokaryotic and eukaryotic gene structure synonymously
        #Here we add feature dictionaries representing the parent gene and mRNAs
        feature_list = self._add_missing_parents(feature_list)

        #Phytozome has the annoying habit of editing their identifiers so we fix them
        if (is_phytozome):
            self._update_phytozome_features(feature_list)

        #All identifiers need to be checked so that they follow the same general rules
        #Rules are listed within the function itself
        feature_list = self._update_identifiers(feature_list)

        #If phytozome, the edited files need to be re-printed as GFF so that it works better with RNA-Seq pipeline
        if (is_phytozome):
            self._print_phytozome_gff(input_gff_file, feature_list)

        return feature_list
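
    # Hedged sketch of the attribute parsing used above: each GFF attribute is
    # split at most once on '=' (or on a space when '=' is absent):
    #
    #     attributes = 'ID=gene1;Name=Abc1;note=has = inside'
    #     parsed = {}
    #     for attribute in attributes.split(';'):
    #         attribute = attribute.strip()
    #         if attribute == '':
    #             continue
    #         if '=' in attribute:
    #             key, value = attribute.split('=', 1)
    #         elif ' ' in attribute:
    #             key, value = attribute.split(' ', 1)
    #         else:
    #             continue  # unparseable attribute
    #         parsed[key] = value
    #     # parsed == {'ID': 'gene1', 'Name': 'Abc1', 'note': 'has = inside'}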

    def _add_missing_identifiers(self, feature_list):

        #General rule is to iterate through a range of possibilities if "ID" is missing
        for contig in feature_list.keys():
            for i in range(len(feature_list[contig])):
                if ("ID" not in feature_list[contig][i]):
                    for key in ("transcriptId", "proteinId", "PACid", "pacid",
                                "Parent"):
                        if (key in feature_list[contig][i]):
                            feature_list[contig][i]['ID'] = feature_list[
                                contig][i][key]
                            break

                    #If the process fails, throw an error
                    for ftr_type in ("gene", "mRNA", "CDS"):
                        if (ftr_type not in feature_list[contig][i]):
                            continue

                        if ("ID" not in feature_list[contig][i]):
                            log("Error: Cannot find unique ID to utilize in GFF attributes: "+ \
                                    feature_list[contig][i]['contig']+"."+ \
                                    feature_list[contig][i]['source']+"."+ \
                                    feature_list[contig][i]['type']+": "+ \
                                    feature_list[contig][i]['attributes'])
        return feature_list

    def _generate_feature_hierarchy(self, feature_list):

        feature_hierarchy = {contig: {} for contig in feature_list}

        #Need to remember mRNA/gene links for CDSs
        mRNA_gene_dict = {}
        exon_list_position_dict = {}

        for contig in feature_list:
            for i in range(len(feature_list[contig])):
                ftr = feature_list[contig][i]

                if ("gene" in ftr["type"]):
                    feature_hierarchy[contig][ftr["ID"]] = {
                        "utrs": [],
                        "mrnas": [],
                        "cdss": [],
                        "index": i
                    }

                if ("UTR" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[
                        ftr["Parent"]]]["utrs"].append({
                            "id": ftr["ID"],
                            "index": i
                        })

                if ("RNA" in ftr["type"]):
                    feature_hierarchy[contig][ftr["Parent"]]["mrnas"].append({
                        "id":
                        ftr["ID"],
                        "index":
                        i,
                        "cdss": []
                    })
                    mRNA_gene_dict[ftr["ID"]] = ftr["Parent"]
                    exon_list_position_dict[ftr["ID"]] = len(
                        feature_hierarchy[contig][ftr["Parent"]]["mrnas"]) - 1

                if ("CDS" in ftr["type"]):
                    feature_hierarchy[contig][mRNA_gene_dict[ftr["Parent"]]]["mrnas"]\
                        [exon_list_position_dict[ftr["Parent"]]]["cdss"].append( { "id": ftr["ID"], "index" : i } )

        return feature_hierarchy
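
    # Hedged illustration of the structure built above (IDs and indices are
    # hypothetical):
    #
    #     feature_hierarchy['chr1']['gene1'] == {
    #         'utrs':  [{'id': 'gene1.five_prime_UTR', 'index': 3}],
    #         'mrnas': [{'id': 'mRNA1', 'index': 1,
    #                    'cdss': [{'id': 'CDS1', 'index': 2}]}],
    #         'cdss':  [],
    #         'index': 0,
    #     }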

    def _add_missing_parents(self, feature_list):

        #General rules is if CDS or RNA missing parent, add them
        for contig in feature_list.keys():
            ftrs = feature_list[contig]
            new_ftrs = []
            for i in range(len(ftrs)):
                if ("Parent" not in ftrs[i]):
                    #Assuming parent doesn't exist at all, so create de novo instead of trying to find it
                    if ("RNA" in ftrs[i]["type"] or "CDS" in ftrs[i]["type"]):
                        new_gene_ftr = copy.deepcopy(ftrs[i])
                        new_gene_ftr["type"] = "gene"
                        ftrs[i]["Parent"] = new_gene_ftr["ID"]
                        new_ftrs.append(new_gene_ftr)

                    if ("CDS" in ftrs[i]["type"]):
                        new_rna_ftr = copy.deepcopy(ftrs[i])
                        new_rna_ftr["type"] = "mRNA"
                        new_ftrs.append(new_rna_ftr)
                        ftrs[i]["Parent"] = new_rna_ftr["ID"]

                new_ftrs.append(ftrs[i])
            feature_list[contig] = new_ftrs
        return feature_list
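
    # Hedged example: an orphan CDS gains synthetic gene and mRNA parents, so a
    # lone feature {'ID': 'cds1', 'type': 'CDS'} expands (in order) to:
    #
    #     [{'ID': 'cds1', 'type': 'gene'},
    #      {'ID': 'cds1', 'type': 'mRNA', 'Parent': 'cds1'},
    #      {'ID': 'cds1', 'type': 'CDS',  'Parent': 'cds1'}]
    #
    # The duplicated IDs are disambiguated later by _update_identifiers.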

    def _update_phytozome_features(self, feature_list):

        #General rule is to use the "Name" field where possible
        #And update parent attribute correspondingly
        for contig in feature_list.keys():
            feature_position_dict = {}
            for i in range(len(feature_list[contig])):

                #Maintain old_id for reference
                #Sometimes ID isn't available, so use PACid
                old_id = None
                for key in ("ID", "PACid", "pacid"):
                    if (key in feature_list[contig][i]):
                        old_id = feature_list[contig][i][key]
                        break
                if (old_id is None):
                    #This should be an error
                    log("Cannot find unique ID, PACid, or pacid in GFF attributes: " +
                        feature_list[contig][i]["contig"] + "." +
                        feature_list[contig][i]["source"] + ": " +
                        feature_list[contig][i]["attributes"])
                    continue

                #Retain old_id
                feature_position_dict[old_id] = i

                #In Phytozome, gene and mRNA have "Name" field, CDS do not
                if ("Name" in feature_list[contig][i]):
                    feature_list[contig][i]["ID"] = feature_list[contig][i][
                        "Name"]

                if ("Parent" in feature_list[contig][i]):
                    #Update Parent to match new ID of parent ftr
                    feature_list[contig][i]["Parent"] = feature_list[contig][
                        feature_position_dict[feature_list[contig][i]
                                              ["Parent"]]]["ID"]

        return feature_list

    def _update_identifiers(self, feature_list):

        #General rules:
        #1) Genes keep identifier
        #2) RNAs keep identifier only if its different from gene, otherwise append ".mRNA"
        #3) CDS always uses RNA identifier with ".CDS" appended
        #4) CDS appended with an incremented digit

        CDS_count_dict = dict()
        mRNA_parent_dict = dict()

        for contig in feature_list.keys():
            for ftr in feature_list[contig]:
                if ("Parent" in ftr):

                    #Retain old_id of parents
                    old_id = ftr["ID"]

                    if (ftr["ID"] == ftr["Parent"] or "CDS" in ftr["type"]):
                        ftr["ID"] = ftr["Parent"] + "." + ftr["type"]

                    #link old to new ids for mRNA to use with CDS
                    if ("RNA" in ftr["type"]):
                        mRNA_parent_dict[old_id] = ftr["ID"]

                    if ("CDS" in ftr["type"]):
                        #Increment CDS identifier
                        if (ftr["ID"] not in CDS_count_dict):
                            CDS_count_dict[ftr["ID"]] = 1
                        else:
                            CDS_count_dict[ftr["ID"]] += 1
                        ftr["ID"] = ftr["ID"] + "." + str(
                            CDS_count_dict[ftr["ID"]])

                        #Recall new mRNA id for parent
                        ftr["Parent"] = mRNA_parent_dict[ftr["Parent"]]

        return feature_list
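
    # Hedged example of the renaming rules (IDs are hypothetical):
    #
    #     gene 'g1'                 -> 'g1'        (unchanged)
    #     mRNA 'g1' (Parent 'g1')   -> 'g1.mRNA'
    #     CDS       (Parent 'g1')   -> 'g1.CDS.1', 'g1.CDS.2', ...
    #                                  with Parent rewritten to 'g1.mRNA'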

    def _print_phytozome_gff(self, input_gff_file, feature_list):

        #Write modified feature ids to new file
        input_gff_file = input_gff_file.replace("gene", "edited_gene") + ".gz"
        try:
            print("Printing to new file: " + input_gff_file)
            gff_file_handle = gzip.open(input_gff_file, 'wb')
        except IOError:
            print("Failed to open " + input_gff_file)
            raise

        for contig in sorted(feature_list):
            for ftr in feature_list[contig]:

                #Re-build attributes
                attributes_dict = {}
                for attribute in ftr["attributes"].split(";"):
                    attribute = attribute.strip()

                    #Sometimes empty string
                    if (attribute == ""):
                        continue

                    #Use of 1 to limit split as '=' character can also be made available later
                    #Sometimes lack of "=", assume spaces instead
                    if ("=" in attribute):
                        key, value = attribute.split("=", 1)
                    elif (" " in attribute):
                        key, value = attribute.split(" ", 1)
                    else:
                        log("Warning: attribute " + attribute +
                            " cannot be separated into key,value pair")
                        continue

                    if (ftr[key] != value):
                        value = ftr[key]
                    attributes_dict[key] = value

                ftr["attributes"] = ";".join(key + "=" + attributes_dict[key]
                                             for key in attributes_dict.keys())

                new_line = "\t".join(
                    str(ftr[key]) for key in [
                        'contig', 'source', 'type', 'start', 'end', 'score',
                        'strand', 'phase', 'attributes'
                    ])
                gff_file_handle.write(new_line + "\n")
        gff_file_handle.close()
        return

    def _retrieve_genome_feature_list(self, feature_list, feature_hierarchy,
                                      assembly):

        genome_features_list = list()
        genome_mrnas_list = list()
        genome_cdss_list = list()
        genome_translation_issues = list()

        for contig in feature_hierarchy:
            for gene in feature_hierarchy[contig]:

                #We only iterate through the gene objects
                #And then for each gene object, retrieve the necessary mRNA and CDS objects indirectly

                ftr = feature_list[contig][feature_hierarchy[contig][gene]
                                           ["index"]]
                contig_sequence = assembly["contigs"][
                    ftr["contig"]]["sequence"]
                gene_ftr = self._convert_ftr_object(
                    ftr, contig_sequence
                )  #reverse-complementation for negative strands done here

                #Add non-optional terms
                gene_ftr["mrnas"] = list()
                gene_ftr["cdss"] = list()
                gene_ftr["ontology_terms"] = dict()

                #Retaining longest sequences for gene feature
                longest_protein_length = 0
                longest_protein_sequence = ""
                for mRNA in feature_hierarchy[contig][gene]["mrnas"]:

                    ########################################################
                    # Construct mRNA Ftr
                    ########################################################
                    ftr = feature_list[contig][mRNA["index"]]
                    contig_sequence = assembly["contigs"][
                        ftr["contig"]]["sequence"]
                    mRNA_ftr = self._convert_ftr_object(
                        ftr, contig_sequence
                    )  #reverse-complementation for negative strands done here

                    #Modify mrna object for use in mrna array
                    #Objects will be un-used until further notice
                    mRNA_ftr['parent_gene'] = gene_ftr['id']

                    #If there are CDS, then New CDS ID without incrementation as they were aggregated
                    if (len(mRNA['cdss']) > 0):
                        mRNA_ftr['cds'] = mRNA_ftr['id'] + ".CDS"
                    else:
                        mRNA_ftr['cds'] = ""

                    #Add to mrnas array
                    genome_mrnas_list.append(mRNA_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["mrnas"].append(mRNA_ftr["id"])

                    ########################################################
                    # Construct transcript, protein sequence, UTR, CDS locations
                    ########################################################

                    #At time of writing, all of this aggregation should probably be done in a single function
                    cds_exons_locations_array = list()
                    cds_cdna_sequence = str()
                    protein_sequence = str()
                    if (len(mRNA["cdss"]) > 0):
                        (cds_exons_locations_array, cds_cdna_sequence, protein_sequence) = \
                            self._cds_aggregation_translation(mRNA["cdss"],feature_list[contig],assembly,genome_translation_issues)

                    UTRs = list()
                    if ("utrs" in feature_hierarchy[contig][gene] and
                            len(feature_hierarchy[contig][gene]["utrs"]) > 0):
                        for UTR in feature_hierarchy[contig][gene]["utrs"]:
                            ftr = feature_list[contig][UTR["index"]]
                            if ("Parent" in ftr
                                    and ftr["Parent"] == mRNA_ftr["id"]):
                                UTRs.append(ftr)

                    mrna_exons_locations_array = copy.deepcopy(
                        cds_exons_locations_array)
                    mrna_transcript_sequence = str(cds_cdna_sequence)
                    if (len(UTRs) > 0):
                        (mrna_exons_locations_array, mrna_transcript_sequence) = \
                            self._utr_aggregation(UTRs,assembly,mrna_exons_locations_array,cds_cdna_sequence)

                    #Update sequence and locations
                    mRNA_ftr["dna_sequence"] = mrna_transcript_sequence
                    mRNA_ftr["dna_sequence_length"] = len(
                        mrna_transcript_sequence)
                    mRNA_ftr["location"] = mrna_exons_locations_array
                    mRNA_ftr["md5"] = hashlib.md5(
                        mRNA_ftr["dna_sequence"]).hexdigest()

                    #Remove DNA
                    del mRNA_ftr["dna_sequence"]
                    del mRNA_ftr["dna_sequence_length"]

                    #Skip CDS if not present
                    if (len(mRNA["cdss"]) == 0):
                        continue

                    #Remove asterisk representing the stop codon if present
                    if (len(protein_sequence) > 0
                            and protein_sequence[-1] == '*'):
                        protein_sequence = protein_sequence[:-1]

                    #Save longest sequence
                    if (len(protein_sequence) > longest_protein_length):
                        longest_protein_length = len(protein_sequence)
                        longest_protein_sequence = protein_sequence

                    ########################################################
                    # Construct CDS Ftr
                    ########################################################
                    CDS_ftr = dict()
                    CDS_ftr['type'] = 'CDS'

                    #New CDS ID without incrementation as they were aggregated
                    CDS_ftr['id'] = mRNA_ftr['id'] + '.CDS'

                    #Add gene/mrna links
                    CDS_ftr['parent_gene'] = gene_ftr['id']
                    CDS_ftr['parent_mrna'] = mRNA_ftr['id']

                    #Update sequence and locations
                    CDS_ftr["dna_sequence"] = cds_cdna_sequence
                    CDS_ftr["dna_sequence_length"] = len(cds_cdna_sequence)
                    CDS_ftr["location"] = cds_exons_locations_array
                    CDS_ftr["md5"] = hashlib.md5(
                        CDS_ftr["dna_sequence"]).hexdigest()

                    #Add protein
                    CDS_ftr["protein_translation"] = str(
                        protein_sequence).upper()
                    CDS_ftr["protein_translation_length"] = len(
                        CDS_ftr["protein_translation"])
                    #Only generate md5 for dna sequences
                    #CDS_ftr["md5"] = hashlib.md5(CDS_ftr["protein_translation"]).hexdigest()

                    #Add empty non-optional fields for populating in future
                    CDS_ftr["ontology_terms"] = dict()
                    if ("aliases" not in CDS_ftr):
                        CDS_ftr["aliases"] = list()
                    if ("function" not in CDS_ftr):
                        CDS_ftr["function"] = ""

                    #Add to cdss array
                    genome_cdss_list.append(CDS_ftr)

                    #Add ids to gene_ftr arrays
                    gene_ftr["cdss"].append(CDS_ftr["id"])

                gene_ftr["protein_translation"] = longest_protein_sequence
                gene_ftr["protein_translation_length"] = longest_protein_length
                genome_features_list.append(gene_ftr)

        msg = "Genome features processed: {} genes, {} RNAs, and {} CDSs\n".format(
            len(genome_features_list), len(genome_mrnas_list),
            len(genome_cdss_list))
        msg += "{} mRNA(s) had errors during translation".format(
            len(genome_translation_issues))
        log(msg)

        return genome_features_list, genome_mrnas_list, genome_cdss_list

    def _gen_genome_info(self, core_genome_name, scientific_name, assembly_ref,
                         genome_features_list, genome_cdss_list,
                         genome_mrnas_list, source, assembly, taxon_reference,
                         taxonomy, input_gff_file):
        """
        _gen_genome_info: generate genome info

        """
        genome = dict()
        genome["id"] = core_genome_name
        genome["scientific_name"] = scientific_name
        genome["assembly_ref"] = assembly_ref
        genome["features"] = genome_features_list
        genome["cdss"] = genome_cdss_list
        genome["mrnas"] = genome_mrnas_list
        genome["source"] = source
        genome["domain"] = "Eukaryota"
        genome["genetic_code"] = 1
        genome["gc_content"] = assembly["gc_content"]
        genome["dna_size"] = assembly["dna_size"]

        if taxon_reference is not None:
            genome["taxon_ref"] = taxon_reference
            genome["taxonomy"] = taxonomy

        gff_file_to_shock = self.dfu.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = gff_file_to_shock['handle']['hid']

        genome['gff_handle_ref'] = gff_handle_ref

        return genome

    def _convert_ftr_object(self, old_ftr, contig):
        new_ftr = dict()
        new_ftr["id"] = old_ftr["ID"]

        dna_sequence = Seq(contig[old_ftr["start"] - 1:old_ftr["end"]],
                           IUPAC.ambiguous_dna)

        # reverse complement
        if (old_ftr["strand"] == "-"):
            dna_sequence = dna_sequence.reverse_complement()
            old_start = old_ftr["start"]
            old_ftr["start"] = old_ftr["end"]
            old_ftr["end"] = old_start

        new_ftr["dna_sequence"] = str(dna_sequence).upper()
        new_ftr["dna_sequence_length"] = len(dna_sequence)
        new_ftr["md5"] = hashlib.md5(str(dna_sequence)).hexdigest()
        new_ftr["location"] = [[
            old_ftr["contig"], old_ftr["start"], old_ftr["strand"],
            len(dna_sequence)
        ]]
        new_ftr["type"] = old_ftr["type"]

        new_ftr["aliases"] = list()
        for key in ("transcriptId", "proteinId", "PACid", "pacid"):
            if (key in old_ftr.keys()):
                new_ftr["aliases"].append(key + ":" + old_ftr[key])

        return new_ftr
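
    # Hedged example (coordinates are hypothetical): for a minus-strand feature
    # spanning bases 11..40 of 'chr1', the slice is reverse-complemented and the
    # location is recorded as
    #
    #     [['chr1', 40, '-', 30]]
    #
    # i.e. [contig, strand-aware start, strand, length].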

    def _utr_aggregation(self, utr_list, assembly, exons, exon_sequence):

        #create copies of locations and transcript
        utrs_exons = list(exons)
        utr_exon_sequence = exon_sequence

        five_prime_dna_sequence = ""
        three_prime_dna_sequence = ""
        five_prime_locations = list()
        three_prime_locations = list()

        for UTR in (utr_list):
            contig_sequence = assembly["contigs"][UTR["contig"]]["sequence"]
            UTR_ftr = self._convert_ftr_object(
                UTR, contig_sequence
            )  #reverse-complementation for negative strands done here

            #aggregate sequences and locations
            if ("five_prime" in UTR_ftr["id"]):
                five_prime_dna_sequence += UTR_ftr["dna_sequence"]
                five_prime_locations.append(UTR_ftr["location"][0])
            if ("three_prime" in UTR_ftr["id"]):
                three_prime_dna_sequence += UTR_ftr["dna_sequence"]
                three_prime_locations.append(UTR_ftr["location"][0])

        #Handle five_prime UTRs
        if (len(five_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file)
            five_prime_locations = sorted(five_prime_locations,
                                          key=lambda x: x[1])

            #Merge last UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and five_prime_locations[-1][1]+five_prime_locations[-1][3] == utrs_exons[0][1] ) or \
                ( utrs_exons[0][2] == "-" and five_prime_locations[-1][1]-five_prime_locations[-1][3] == utrs_exons[0][1] ) ):

                #Remove last UTR
                last_five_prime_location = five_prime_locations[-1]
                five_prime_locations = five_prime_locations[:-1]

                #"Add" last UTR to first exon
                utrs_exons[0][1] = last_five_prime_location[1]
                utrs_exons[0][3] += last_five_prime_location[3]

            #Prepend other UTRs if available
            if (len(five_prime_locations) > 0):
                utrs_exons = five_prime_locations + utrs_exons

        utr_exon_sequence = five_prime_dna_sequence + utr_exon_sequence

        #Handle three_prime UTRs
        if (len(three_prime_locations) > 0):

            #Sort UTRs by "start" (reverse-complement UTRs in Phytozome appear to be incorrectly ordered in the GFF file
            three_prime_locations = sorted(three_prime_locations,
                                           key=lambda x: x[1])

            #Merge first UTR with CDS if "next" to each other
            if( ( utrs_exons[0][2] == "+" and utrs_exons[-1][1]+utrs_exons[-1][3] == three_prime_locations[0][1] ) or \
                ( utrs_exons[0][2] == "-" and utrs_exons[-1][1]-utrs_exons[-1][3] == three_prime_locations[0][1] ) ):

                #Remove first UTR
                first_three_prime_location = three_prime_locations[0]
                three_prime_locations = three_prime_locations[1:]

                #"Add" first UTR to last exon
                utrs_exons[-1][3] += first_three_prime_location[3]

        #Append other UTRs if available
        if (len(three_prime_locations) > 0):
            utrs_exons = utrs_exons + three_prime_locations

        utr_exon_sequence += three_prime_dna_sequence

        return (utrs_exons, utr_exon_sequence)

    def _cds_aggregation_translation(self, cds_list, feature_list, assembly,
                                     issues):

        dna_sequence = ""
        locations = list()

        # collect phases, and lengths of exons
        # right now, this is only for the purpose of error reporting
        phases = list()
        exons = list()

        #Saving parent mRNA identifier
        Parent_mRNA = cds_list[0]["id"]
        for CDS in (cds_list):
            ftr = feature_list[CDS["index"]]
            phases.append(ftr["phase"])
            Parent_mRNA = ftr["Parent"]

            contig_sequence = assembly["contigs"][ftr["contig"]]["sequence"]
            CDS_ftr = self._convert_ftr_object(
                ftr, contig_sequence
            )  #reverse-complementation for negative strands done here
            exons.append(len(CDS_ftr["dna_sequence"]))

            # Remove base(s) according to phase, but only for first CDS
            if (CDS == cds_list[0] and int(ftr["phase"]) != 0):
                log("Adjusting phase for first CDS: " + CDS["id"])
                CDS_ftr["dna_sequence"] = CDS_ftr["dna_sequence"][
                    int(ftr["phase"]):]

            #aggregate sequences and locations
            dna_sequence += CDS_ftr["dna_sequence"]
            locations.append(CDS_ftr["location"][0])

        # translate sequence
        dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
        rna_sequence = dna_sequence_obj.transcribe()

        # incomplete gene model with no start codon
        if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
            msg = "Missing start codon for {}. Possibly incomplete gene model.".format(
                Parent_mRNA)
            log(msg)

        # You should never have this problem, needs to be reported rather than "fixed"
        codon_count = len(str(rna_sequence)) % 3
        if codon_count != 0:
            msg = "Number of bases for RNA sequence for {} ".format(
                Parent_mRNA)
            msg += "is not divisible by 3. "
            msg += "The resulting protein may well be mis-translated."
            log(msg)
            issues.append(Parent_mRNA)

        protein_sequence = Seq("")
        try:
            protein_sequence = rna_sequence.translate()
        except CodonTable.TranslationError as te:
            log("TranslationError for: " + feature_object["id"], phases, exons,
                " : " + str(te))

        return (locations, dna_sequence.upper(), str(protein_sequence).upper())
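
    # Hedged standalone sketch of the phase-trim-and-translate step above, using
    # Biopython (the sequence is hypothetical and the IUPAC alphabet is omitted):
    #
    #     from Bio.Seq import Seq
    #     cds_dna = 'GGATGAAATAG'    # phase 2: first two bases are trimmed
    #     coding = Seq(cds_dna[2:])  # 'ATGAAATAG'
    #     protein = coding.transcribe().translate()  # -> 'MK*'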
コード例 #13
0
class GenericsUtil:
    def _validate_fetch_data_params(self, params):
        """
        _validate_fetch_data_params:
            validates params passed to fetch_data method
        """

        log('start validating fetch_data params')

        # check for required parameters
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_import_matrix_from_excel_params(self, params):
        """
        _validate_import_matrix_from_excel_params:
            validates params passed to import_matrix_from_excel method
        """
        log('start validating import_matrix_from_excel params')

        # check for required parameters
        for p in ['obj_type', 'matrix_name', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        obj_type = params.get('obj_type')
        if obj_type not in MATRIX_TYPE:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        if params.get('input_file_path'):
            file_path = params.get('input_file_path')
        elif params.get('input_shock_id'):
            file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        elif params.get('input_staging_file_path'):
            file_path = self.dfu.download_staging_file({
                'staging_file_subdir_path':
                params.get('input_staging_file_path')
            }).get('copy_file_path')
        else:
            error_msg = "Must supply either a input_shock_id or input_file_path "
            error_msg += "or input_staging_file_path"
            raise ValueError(error_msg)

        refs_key = [
            'col_conditionset_ref', 'row_conditionset_ref', 'genome_ref',
            'diff_expr_matrix_ref'
        ]
        refs = {k: v for k, v in params.items() if k in refs_key}

        return (obj_type, file_path, params.get('workspace_name'),
                params.get('matrix_name'), refs)

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
        """
        log('Start uploading file to shock: {}'.format(file_path))

        file_to_shock_params = {'file_path': file_path, 'pack': 'zip'}
        shock_id = self.dfu.file_to_shock(file_to_shock_params).get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_dir_to_shock: upload target dir to shock using DataFileUtil
        """
        log('Start uploading directory to shock: {}'.format(directory))

        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
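
    # Note: on Python 3 the same behaviour is available directly as
    # os.makedirs(path, exist_ok=True); the EEXIST check above keeps this helper
    # compatible with Python 2.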

    def _generate_html_string(self, df):
        """
        _generate_html_string: generating a html string from df
        template used: https://developers.google.com/chart/interactive/docs/gallery/table
                       https://developers.google.com/chart/interactive/docs/reference#formatters
        """
        dtypes = df.dtypes
        columns = df.columns

        column_str = ''
        number_columns = []
        for idx, column in enumerate(columns):
            dtype = dtypes[idx].name
            if 'int' in dtype or 'float' in dtype:
                column_str += "data.addColumn('number', '{}')\n".format(column)
                number_columns.append(column)
            else:
                column_str += "data.addColumn('string', '{}')\n".format(column)

        data_str = "data.addRows({})".format(df.values.tolist())

        formatter_str = ''
        for number_column in number_columns:
            mean = round(df[number_column].mean(), 2)
            column_n = columns.tolist().index(number_column)
            formatter_str += "var formatter_{} = ".format(column_n)
            formatter_str += "new google.visualization.BarFormat({base: "
            formatter_str += str(mean)
            formatter_str += ", width: 120});\n"
            formatter_str += "formatter_{}.format(data, {});\n".format(
                column_n, column_n)

        return column_str, data_str, formatter_str

    def _find_between(self, s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)
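
    # Hedged example (type string is hypothetical): called below with escaped
    # delimiters to pull the bare type name out of a typed object string:
    #
    #     self._find_between('KBaseMatrices.ExpressionMatrix-1.1', '\.', '\-')
    #     # -> 'ExpressionMatrix'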

    def _find_type_spec(self, obj_type):
        """
        _find_type_spec: find body spec of type
        """
        obj_type_name = self._find_between(obj_type, '\.', '\-')

        type_info = self.wsClient.get_type_info(obj_type)
        type_spec = type_info.get('spec_def')

        type_spec_list = type_spec.split(obj_type_name + ';')
        obj_type_spec = type_spec_list[0].split('structure')[-1]
        log('Found spec for {}\n{}\n'.format(obj_type, obj_type_spec))

        return obj_type_spec

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')

        constraints = {'contains': [], 'rowsum': [], 'unique': []}

        unique = [
            item.split('\n')[0].strip()
            for item in type_desc.split('@unique')[1:]
        ]
        constraints['unique'] = unique

        contains = [
            item.split('\n')[0].strip()
            for item in type_desc.split('@contains')[1:]
        ]
        constraints['contains'] = contains

        return constraints

    def _find_generics_type(self, obj_type):
        """
        _find_generics_type: try to find generics type in an object
        """

        log('Start finding generics type and name')

        obj_type_spec = self._find_type_spec(obj_type)

        if not obj_type_spec:
            raise ValueError('Cannot retrieve spec for: {}'.format(obj_type))

        generics_types = [
            generics_type for generics_type in GENERICS_TYPE
            if generics_type in obj_type_spec
        ]

        if not generics_types:
            error_msg = 'Cannot find generics type in spec:\n{}\n'.format(
                obj_type_spec)
            raise ValueError(error_msg)

        generics_module = dict()
        for generics_type in generics_types:
            for item in obj_type_spec.split(generics_type)[1:]:
                generics_type_name = item.split(';')[0].strip().split(
                    ' ')[-1].strip()
                generics_module.update({generics_type_name: generics_type})

        log('Found generics type:\n{}\n'.format(generics_module))

        return generics_module

    def _convert_data(self, data, generics_module):
        """
        _convert_data: convert data to df based on data_type
        """

        data_types = generics_module.values()

        if not set(GENERICS_TYPE) >= set(data_types):
            raise ValueError(
                'Found unknown generics data type in:\n{}\n'.format(
                    data_types))

        if list(data_types) == ['FloatMatrix2D']:
            key = [k for k, v in generics_module.items()
                   if v == 'FloatMatrix2D'][0]
            values = data[key]['values']
            index = data[key]['row_ids']
            columns = data[key]['col_ids']
            df = pd.DataFrame(values, index=index, columns=columns)
        # elif 'FloatMatrix2D' in data_types:  # default case
        #     key = generics_module.keys()[generics_module.values().index('FloatMatrix2D')]
        #     values = data[key]['values']
        #     index = data[key]['row_ids']
        #     columns = data[key]['col_ids']
        #     df = pd.DataFrame(values, index=index, columns=columns)
        else:
            raise ValueError('Unexpected Error')

        return df.to_json()
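
    # Hedged illustration (object contents are hypothetical): for
    # generics_module == {'data': 'FloatMatrix2D'} and
    # data == {'data': {'values': [[1.0, 2.0]], 'row_ids': ['gene1'],
    #                   'col_ids': ['s1', 's2']}}
    # the method returns the DataFrame serialized with df.to_json(), e.g.
    #
    #     '{"s1":{"gene1":1.0},"s2":{"gene1":2.0}}'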

    def _retrieve_data(self, obj_ref, generics_module=None):
        """
        _retrieve_data: retrieve object data and return a dataframe in json format
        """
        log('Start retrieving data')
        obj_source = self.dfu.get_objects({"object_refs":
                                           [obj_ref]})['data'][0]

        obj_info = obj_source.get('info')
        obj_data = obj_source.get('data')

        if not generics_module:
            generics_module = self._find_generics_type(obj_info[2])

        try:
            data = {
                k: v
                for k, v in obj_data.items() if k in generics_module.keys()
            }
        except KeyError:
            raise ValueError('Retrieved wrong generics type name')

        data_matrix = self._convert_data(data, generics_module)

        return data_matrix

    def _get_col_cond_list(self, col_mapping, col_conditionset_ref, cols):
        """
        _get_col_cond_list: generate col condition list for excel
        """
        col_cond_list = []

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [col_conditionset_ref]})['data'][0]['data']
        col_condition_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]
        for col in cols:
            condition_id = col_mapping.get(col)
            if condition_id:
                col_cond_list.append(
                    conditionset_data.get('conditions').get(condition_id))
            else:
                col_cond_list.append([''] * len(col_condition_names))

        col_cond_list = map(list, zip(*col_cond_list))
        for idx, col_array in enumerate(col_cond_list):
            col_array.insert(0, col_condition_names[idx])

        return col_cond_list

    def _get_row_cond_list(self, row_mapping, row_conditionset_ref, rows):
        """
        _get_row_cond_list: generate row condition list for excel
        """
        row_cond_list = []

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [row_conditionset_ref]})['data'][0]['data']
        row_condition_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]

        row_cond_list.append(row_condition_names)

        for row in rows:
            condition_id = row_mapping.get(row)
            if condition_id:
                row_cond_list.append(
                    conditionset_data.get('conditions').get(condition_id))
            else:
                row_cond_list.append([''] * len(row_condition_names))

        return row_cond_list

    def _get_data_list(self, cols, rows, values):
        """
        _get_data_list: generate data value list for excel
        """
        data_arrays = []
        cols.insert(0, '')
        data_arrays.append(cols)
        for idx, row in enumerate(rows):
            values[idx].insert(0, row)
        data_arrays += values

        return data_arrays

    def _merge_cond_list(self, excel_list, col_cond_list, row_cond_list):
        """
        _merge_cond_list: merge lists for excel
        """
        col_cond_len = len(col_cond_list)
        for item in excel_list[:col_cond_len]:
            row_len = len(row_cond_list[0]) if row_cond_list else 0
            item[0:0] = [''] * row_len

        if row_cond_list:
            for idx, item in enumerate(excel_list[col_cond_len:]):
                item[0:0] = row_cond_list[idx]

    @staticmethod
    def _is_number(s):
        """
        _is_number: check whether a string represents a number
        """
        try:
            float(s)
            return True
        except ValueError:
            pass

        return False

    def _gen_excel(self, excel_list, obj_name):
        """
        _gen_excel: create excel
        """

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory, '{}.xlsx'.format(obj_name))

        log('Start writing to file: {}'.format(file_path))

        workbook = xlsxwriter.Workbook(file_path, {'nan_inf_to_errors': True})
        worksheet = workbook.add_worksheet()

        row = 1
        for data_entry in excel_list:
            for idx, cell_data in enumerate(data_entry):
                worksheet.write(row, idx, cell_data)

            row += 1

        workbook.close()

        return file_path

    def _write_mapping_sheet(self, file_path, sheet_name, mapping, index):
        """
        _write_mapping_sheet: write mapping to sheet
        """
        df_dict = collections.OrderedDict()

        df_dict[index[0]] = []
        df_dict[index[1]] = []

        for key, value in mapping.items():
            df_dict.get(index[0]).append(key)
            df_dict.get(index[1]).append(value)

        df = pd.DataFrame.from_dict(df_dict)

        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            writer.book = load_workbook(file_path)
            df.to_excel(writer, sheet_name=sheet_name)

    def _filter_constraints(self, constraints, data):

        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint.split(' ')[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search(r'\((.*)\)', in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        log('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith(
                'values'):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search(r'\((.*)\)', value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = m_data_cp.values()
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2(
                    {'objects': [{
                        'ref': obj_ref,
                        'included': [included]
                    }]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [
                            x.get(keys[2]) for x in ref_data.get(keys[0])
                        ]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        log('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data
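
    # Illustration (hedged): _retrieve_value understands three path syntaxes
    # (the example paths are hypothetical; real ones come from the @unique and
    # @contains annotations in the type spec):
    #
    #   'data.row_ids'                -> walk nested attributes of the object
    #   'values(col_mapping)'         -> walk to a mapping and return its values
    #   'genome_ref:features.[*].id'  -> follow a workspace reference, then
    #                                    extract the included sub-path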

    def _generate_report(self, matrix_obj_ref, workspace_name):
        """
        _generate_report: generate summary report
        """

        report_params = {
            'message':
            '',
            'objects_created': [{
                'ref': matrix_obj_ref,
                'description': 'Imported Matrix'
            }],
            'workspace_name':
            workspace_name,
            'report_object_name':
            'import_matrix_from_excel_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """

        validated = True
        failed_constraints = {'contains': [], 'rowsum': [], 'unique': []}

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint)
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint)

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint.split(' ')[0]
            in_values = contains_constraint.split(' ')[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <=
                    set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(contains_constraint)

        return validated, failed_constraints

    def _process_mapping_sheet(self, file_path, sheet_name):
        """
        _process_mapping_sheet: process mapping sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return dict()
        else:
            mapping = {value[0]: value[1] for value in df.values.tolist()}

        return mapping

    def _process_conditionset_sheet(self, file_path, sheet_name, matrix_name,
                                    workspace_id):
        """
        _process_conditionset_sheet: process condition set sheet
        """

        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        except XLRDError:
            return ''
        else:
            obj_name = '{}_{}'.format(sheet_name, matrix_name)
            result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            self._mkdir_p(result_directory)
            file_path = os.path.join(result_directory,
                                     '{}.xlsx'.format(obj_name))
            df.to_excel(file_path)
            import_condition_set_params = {
                'output_obj_name': obj_name,
                'output_ws_id': workspace_id,
                'input_file_path': file_path
            }

            ref = self.cu.file_to_condition_set(import_condition_set_params)

            return ref.get('condition_set_ref')

    def _file_to_data(self, file_path, refs, matrix_name, workspace_id):
        log('Start reading and converting excel file data')
        data = refs

        try:
            pd.read_excel(file_path)
        except XLRDError:
            # TODO: convert csv file to excel
            log('Found csv file')
            raise ValueError('Only .xlsx files are supported')

        # processing data sheet
        try:
            df = pd.read_excel(file_path, sheet_name='data')
        except XLRDError:
            raise ValueError('Cannot find <data> sheet')
        else:
            df.fillna(0, inplace=True)
            matrix_data = {
                'row_ids': df.index.tolist(),
                'col_ids': df.columns.tolist(),
                'values': df.values.tolist()
            }

            data.update({'data': matrix_data})

        # processing col/row_mapping
        col_mapping = self._process_mapping_sheet(file_path, 'col_mapping')
        data.update({'col_mapping': col_mapping})

        row_mapping = self._process_mapping_sheet(file_path, 'row_mapping')
        data.update({'row_mapping': row_mapping})

        # processing col/row_conditionset
        col_conditionset_ref = self._process_conditionset_sheet(
            file_path, 'col_conditionset', matrix_name, workspace_id)
        data.update({'col_conditionset_ref': col_conditionset_ref})

        row_conditionset_ref = self._process_conditionset_sheet(
            file_path, 'row_conditionset', matrix_name, workspace_id)
        data.update({'row_conditionset_ref': row_conditionset_ref})

        # processing metadata
        metadata = self._process_mapping_sheet(file_path, 'metadata')
        data.update(metadata)

        return data
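
    # Illustration (hedged): _file_to_data expects a workbook laid out roughly
    # as follows (the sheet names are the ones looked up above):
    #
    #   data             - required; matrix values (column ids in the header row)
    #   col_mapping      - optional two-column sheet: col id -> condition id
    #   row_mapping      - optional two-column sheet: row id -> condition id
    #   col_conditionset - optional sheet imported as a column ConditionSet
    #   row_conditionset - optional sheet imported as a row ConditionSet
    #   metadata         - optional key/value pairs merged into the object
    #
    # Missing optional sheets simply produce empty mappings or empty refs.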

    def _build_header_str(self, factor_names):

        header_str = ''
        width = 100.0 / len(factor_names)

        header_str += '<tr class="header">'
        header_str += '<th style="width:{0:.2f}%;">Feature ID</th>'.format(
            width)

        for factor_name in factor_names:
            header_str += '<th style="width:{0:.2f}%;"'.format(width)
            header_str += '>{}</th>'.format(factor_name)
        header_str += '</tr>'

        return header_str

    def _build_html_str(self, row_mapping, conditionset_data, row_ids):

        log('Start building html replacement')

        factor_names = [
            factor.get('factor') for factor in conditionset_data.get('factors')
        ]

        header_str = self._build_header_str(factor_names)

        table_str = ''

        conditions = conditionset_data.get('conditions')

        for feature_id, factor_id in row_mapping.items():
            if feature_id in row_ids:
                feature_conditions = conditions.get(factor_id)

                table_str += '<tr>'
                table_str += '<td>{}</td>'.format(feature_id)

                for feature_condition in feature_conditions:
                    table_str += '<td>{}</td>'.format(feature_condition)
                table_str += '</tr>'

        return header_str, table_str

    def _generate_search_html_report(self, header_str, table_str):

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'search.html')

        shutil.copy2(os.path.join(os.path.dirname(__file__), 'kbase_icon.png'),
                     output_directory)
        shutil.copy2(
            os.path.join(os.path.dirname(__file__), 'search_icon.png'),
            output_directory)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'search_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '//HEADER_STR', header_str)
                report_template = report_template.replace(
                    '//TABLE_STR', table_str)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Search Matrix App'
        })

        return html_report

    def _generate_search_report(self, header_str, table_str, workspace_name):
        log('Start creating report')

        output_html_files = self._generate_search_html_report(
            header_str, table_str)

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name':
            'kb_matrix_filter_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _filter_value_data(self, value_data, feature_ids):

        filtered_value_data = dict()
        filtered_value_data['col_ids'] = value_data['col_ids']

        feature_ids = feature_ids.split(',')

        filtered_value_data['row_ids'] = feature_ids
        filtered_value_data['values'] = list()

        values = value_data['values']
        row_ids = value_data['row_ids']
        for feature_id in feature_ids:
            idx = row_ids.index(feature_id)
            filtered_value_data['values'].append(values[idx])

        return filtered_value_data
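
    # Illustration (hedged): given value_data with row_ids ['g1', 'g2', 'g3']
    # and feature_ids 'g1,g3', the helper above returns the same col_ids, the
    # row_ids ['g1', 'g3'], and only the value rows at the matching indexes.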

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.srv_wiz_url = config['srv-wiz-url']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(self.callback_url)
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.cu = ConditionUtils(self.callback_url, service_ver="dev")

    def filter_matrix(self, params):
        """
        filter_matrix: create sub-matrix based on input feature_ids or group by factor name

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        feature_ids: comma-separated string of feature ids to keep in the result matrix
        filtered_matrix_name: name of newly created filtered matrix object
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')
        feature_ids = params.get('feature_ids')
        filtered_matrix_name = params.get('filtered_matrix_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_info = matrix_source.get('info')
        matrix_data = matrix_source.get('data')

        matrix_type = self._find_between(matrix_info[2], '\.', '\-')

        value_data = matrix_data.get('data')
        filtered_value_data = self._filter_value_data(value_data, feature_ids)
        matrix_data['data'] = filtered_value_data

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        filtered_matrix_obj_ref = self.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(matrix_type),
            'obj_name':
            filtered_matrix_name,
            'data':
            matrix_data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_refs': [filtered_matrix_obj_ref]}

        report_output = self._generate_report(filtered_matrix_obj_ref,
                                              workspace_name)

        returnVal.update(report_output)

        return returnVal
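
    # Illustration (hedged) of a filter_matrix call; the reference, workspace
    # and object names below are made-up placeholders:
    #
    #   ret = util.filter_matrix({
    #       'matrix_obj_ref': '1234/5/6',
    #       'workspace_name': 'my_workspace',
    #       'feature_ids': 'gene_1,gene_2',
    #       'filtered_matrix_name': 'filtered_expression_matrix'})
    #   print(ret['matrix_obj_refs'])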

    def search_matrix(self, params):
        """
        search_matrix: generate an HTML report that allows users to select feature ids

        arguments:
        matrix_obj_ref: object reference of a matrix
        workspace_name: workspace name
        """

        matrix_obj_ref = params.get('matrix_obj_ref')
        workspace_name = params.get('workspace_name')

        matrix_source = self.dfu.get_objects({"object_refs":
                                              [matrix_obj_ref]})['data'][0]
        matrix_data = matrix_source.get('data')

        row_mapping = matrix_data.get('row_mapping')
        row_conditionset_ref = matrix_data.get('row_conditionset_ref')

        row_ids = matrix_data['data']['row_ids']

        if not (row_mapping and row_conditionset_ref):
            raise ValueError(
                'Matrix object is missing either row_mapping or row_conditionset_ref'
            )

        conditionset_data = self.dfu.get_objects(
            {"object_refs": [row_conditionset_ref]})['data'][0]['data']

        header_str, table_str = self._build_html_str(row_mapping,
                                                     conditionset_data,
                                                     row_ids)

        returnVal = self._generate_search_report(header_str, table_str,
                                                 workspace_name)

        return returnVal

    def import_matrix_from_excel(self, params):
        """
        import_matrix_from_excel: import matrix object from excel

        arguments:
        obj_type: one of ExpressionMatrix, FitnessMatrix, DifferentialExpressionMatrix
        matrix_name: matrix object name
        workspace_name: name of the workspace the matrix object will be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_conditionset_ref: column ConditionSet reference
        row_conditionset_ref: row ConditionSet reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (obj_type, file_path, workspace_name, matrix_name,
         refs) = self._validate_import_matrix_from_excel_params(params)

        if not isinstance(workspace_name, int):
            workspace_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            workspace_id = workspace_name

        data = self._file_to_data(file_path, refs, matrix_name, workspace_id)

        matrix_obj_ref = self.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            data,
            'workspace_name':
            workspace_id
        })['obj_ref']

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref, workspace_name)

        returnVal.update(report_output)

        return returnVal
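
    # Illustration (hedged) of an import_matrix_from_excel call; the file path,
    # workspace and object names below are made-up placeholders:
    #
    #   ret = util.import_matrix_from_excel({
    #       'obj_type': 'ExpressionMatrix',
    #       'matrix_name': 'my_expression_matrix',
    #       'workspace_name': 'my_workspace',
    #       'input_file_path': '/kb/module/work/tmp/matrix.xlsx'})
    #   print(ret['matrix_obj_ref'])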

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: name of the workspace the matrix object will be saved to

        return:
        obj_ref: object reference
        """
        log('Start saving object')

        obj_type = params.get('obj_type')

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({
            'mod': module_name
        }).get('types')

        for module_type in types:
            if self._find_between(module_type, '\.', '\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').iteritems() if v)
        validate = self.validate_data({'obj_type': obj_type, 'data': data})

        if not validate.get('validated'):
            log('Data failed type checking')
            failed_constraints = validate.get('failed_constraints')
            error_msg = 'Object {} failed type checking:\n'.format(
                params.get('obj_name'))
            if failed_constraints.get('unique'):
                unique_values = failed_constraints.get('unique')
                error_msg += 'Object should have unique field: {}\n'.format(
                    unique_values)
            if failed_constraints.get('contains'):
                contained_values = failed_constraints.get('contains')
                for contained_value in contained_values:
                    subset_value = contained_value.split(' ')[0]
                    super_value = ' '.join(contained_value.split(' ')[1:])
                    error_msg += 'Object field [{}] should contain field [{}]\n'.format(
                        super_value, subset_value)
            raise ValueError(error_msg)

        workspace_name = params.get('workspace_name')
        if not isinstance(workspace_name, int):
            ws_name_id = self.dfu.ws_name_to_id(workspace_name)
        else:
            ws_name_id = workspace_name

        info = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type": obj_type,
                "data": data,
                "name": params.get('obj_name')
            }]
        })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        returnVal = {
            'validated': validated,
            'failed_constraints': failed_constraints
        }

        return returnVal
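
    # Illustration (hedged) of calling validate_data directly; the data below
    # is a made-up placeholder and `util` is an instance of this class:
    #
    #   result = util.validate_data({
    #       'obj_type': 'KBaseMatrices.ExpressionMatrix-1.1',
    #       'data': {'data': {'row_ids': ['g1'], 'col_ids': ['c1'],
    #                         'values': [[0.5]]}}})
    #   if not result['validated']:
    #       print(result['failed_constraints'])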

    def generate_matrix_html(self, params):
        """
        generate_matrix_html: generate an HTML page for the given data

        arguments:
        df: a pandas dataframe

        return:
        html_string: html as a string format
        """

        column_str, data_str, formatter_str = self._generate_html_string(
            params.get('df'))

        with open(
                os.path.join(os.path.dirname(__file__),
                             'matrix_page_template.html'),
                'r') as matrix_page_template_file:
            html_string = matrix_page_template_file.read()
            html_string = html_string.replace('// ADD_COL', column_str)
            html_string = html_string.replace('// ADD_DATA', data_str)
            html_string = html_string.replace('// ADD_FORMATTER',
                                              formatter_str)

        returnVal = {'html_string': html_string}

        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """

        log('--->\nrunning GenericsUtil.fetch_data\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_fetch_data_params(params)

        try:
            data_matrix = self._retrieve_data(params.get('obj_ref'),
                                              params.get('generics_module'))
        except Exception:
            error_msg = 'Running fetch_data returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Please try specifying the generics type and name via generics_module\n'
            raise ValueError(error_msg)

        returnVal = {'data_matrix': data_matrix}

        return returnVal
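
    # Illustration (hedged): turning the fetch_data result back into a
    # dataframe on the caller's side (the object reference is a placeholder):
    #
    #   import pandas as pd
    #   ret = util.fetch_data({'obj_ref': '1234/5/6'})
    #   df = pd.read_json(ret['data_matrix'])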

    def export_matrix(self, params):
        """
        export_matrix: universal downloader for matrix data objects

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: select the generics data to be retrieved from
                        e.g. for a given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        and only data is needed
                        generics_module should be
                        {'data': 'FloatMatrix2D'}
        """
        log('Start exporting matrix')

        if 'input_ref' in params:
            params['obj_ref'] = params.pop('input_ref')

        obj_source = self.dfu.get_objects(
            {"object_refs": [params.get('obj_ref')]})['data'][0]
        obj_data = obj_source.get('data')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        file_path = os.path.join(result_directory,
                                 '{}.xlsx'.format(obj_source.get('info')[1]))

        data_matrix = self.fetch_data(params).get('data_matrix')
        df = pd.read_json(data_matrix)

        df.to_excel(file_path, sheet_name='data')

        if obj_data.get('col_mapping'):
            self._write_mapping_sheet(file_path, 'col_mapping',
                                      obj_data.get('col_mapping'),
                                      ['col_name', 'condition_name'])
            obj_data.pop('col_mapping')

        if obj_data.get('row_mapping'):
            self._write_mapping_sheet(file_path, 'row_mapping',
                                      obj_data.get('row_mapping'),
                                      ['row_name', 'condition_name'])
            obj_data.pop('row_mapping')

        try:
            obj_data.pop('data')
        except KeyError:
            log('Missing key [data]')

        self._write_mapping_sheet(file_path, 'metadata', obj_data,
                                  ['name', 'value'])

        shock_id = self._upload_to_shock(file_path)

        return {'shock_id': shock_id}
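
A minimal usage sketch for the exporter above, assuming the class is the GenericsUtil utility its fetch_data log line names, constructed with the SDK config keys read in __init__; the URLs, token, and object reference below are placeholders, not values from a real deployment:

    config = {
        'workspace-url': 'https://example.org/services/ws',
        'SDK_CALLBACK_URL': 'http://localhost:5000',
        'KB_AUTH_TOKEN': 'placeholder-token',
        'shock-url': 'https://example.org/services/shock-api',
        'srv-wiz-url': 'https://example.org/services/service_wizard',
        'scratch': '/kb/module/work/tmp',
    }
    util = GenericsUtil(config)
    result = util.export_matrix({'input_ref': '1234/5/6'})
    print(result['shock_id'])
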
Code example #14
0
class EditAlignmentSet:
    """
     Contains a set of functions for editing alignment sets.
    """

    PARAM_IN_WS_NAME_ID = 'workspace_name'
    PARAM_IN_OBJ_NAME_ID = 'output_object_name'
    PARAM_IN_ALIGNSET_REF = 'alignment_set_ref'
    PARAM_IN_ALIGNS_ADD = 'alignments_to_add'
    PARAM_IN_ALIGNS_RM = 'alignments_to_remove'

    def __init__(self, config, logger=None):
        self.config = config
        self.logger = logger
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'EAS_' + str(uuid.uuid4()))
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.setAPI = SetAPI(self.callback_url)
        pass

    def _process_params(self, params):
        """
        validates params passed to the edit_alignment_set method
        """
        for p in [
                self.PARAM_IN_ALIGNSET_REF, self.PARAM_IN_OBJ_NAME_ID,
                self.PARAM_IN_WS_NAME_ID
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        ws_name_id = params.get(self.PARAM_IN_WS_NAME_ID)
        if not isinstance(ws_name_id, int):
            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD)
        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM)

        if alignments_to_add is None and alignments_to_remove is None:
            raise ValueError(
                'Either "alignments_to_remove" or "alignments_to_add" should be given'
            )

        return ws_name_id

    def _get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def _get_obj_info(self, ref):
        return self.ws_client.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]

    def _get_set_items(self, alignment_set_ref):

        obj_info = self._get_obj_info(alignment_set_ref)
        obj_type = self._get_type_from_obj_info(obj_info)

        if obj_type in ['KBaseSets.ReadsAlignmentSet']:
            set_data = self.setAPI.get_reads_alignment_set_v1(
                {'ref': alignment_set_ref})
            items = set_data['data']['items']
        elif obj_type in ['KBaseRNASeq.RNASeqAlignmentSet']:
            alignmentset_obj = self.ws_client.get_objects2(
                {'objects': [{
                    'ref': alignment_set_ref
                }]})['data'][0]
            """
            Add each alignment object to align_item and add it to items list
            """
            items = list()
            for alignment_ref in alignmentset_obj['data']['sample_alignments']:
                align_item = dict()
                align_item['ref'] = alignment_ref
                items.append(align_item)
        else:
            raise ValueError(
                '"alignment_set_ref" should be of type KBaseSets.ReadsAlignmentSet or '
                + 'KBaseRNASeq.RNASeqAlignmentSet')

        return items

    def _add_alignments(self, alignment_set_items, alignment_refs_list):

        for alignment_ref in alignment_refs_list:

            found = False
            for set_item in alignment_set_items:
                if set_item.get('ref') == alignment_ref:
                    print('{} already in the input Alignment Set. Not added'.
                          format(alignment_ref))
                    found = True
                    break

            if not found:
                alignment_set_items.append({'ref': alignment_ref})
        return alignment_set_items

    def _remove_alignments(self, input_alignment_set, alignment_set_items,
                           alignments_to_remove):

        for input_item in input_alignment_set:
            if not (input_item.get('ref') in alignments_to_remove):
                alignment_set_items.append(input_item)

        return alignment_set_items

    def _save_alignment_set(self, ws_name, obj_name, set_data):

        res = self.setAPI.save_reads_alignment_set_v1({
            "workspace": ws_name,
            "output_object_name": obj_name,
            "data": set_data
        })
        return res.get('set_ref')

    def edit_alignment_set(self, params):

        ws_name_id = self._process_params(params)
        obj_name = params.get(self.PARAM_IN_OBJ_NAME_ID)

        alignment_set_ref = params.get(self.PARAM_IN_ALIGNSET_REF)

        print('INPUT ALIGNMENT SET REF: ' + alignment_set_ref)

        input_alignment_set = self._get_set_items(alignment_set_ref)

        alignments_to_remove = params.get(self.PARAM_IN_ALIGNS_RM, None)
        alignments_to_add = params.get(self.PARAM_IN_ALIGNS_ADD, None)

        set_items = list()
        if alignments_to_remove is not None:
            set_items = self._remove_alignments(input_alignment_set, set_items,
                                                alignments_to_remove)
        if alignments_to_add is not None:
            set_items = self._add_alignments(set_items, alignments_to_add)

        set_data = {
            'description': 'Edited from {}'.format(alignment_set_ref),
            'items': set_items
        }

        output_alignment_set_ref = self._save_alignment_set(
            ws_name_id, obj_name, set_data)
        return output_alignment_set_ref
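
A minimal usage sketch for EditAlignmentSet, assuming an SDK runtime where the SDK_CALLBACK_URL environment variable is set; the workspace name, URLs, and object references are placeholders:

    eas = EditAlignmentSet({'scratch': '/kb/module/work/tmp',
                            'workspace-url': 'https://example.org/services/ws'})
    new_set_ref = eas.edit_alignment_set({
        'workspace_name': 'my_workspace',
        'output_object_name': 'edited_alignment_set',
        'alignment_set_ref': '1234/5/6',
        'alignments_to_add': ['1234/7/1'],
    })
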
Code example #15
0
def upload_genome(shock_service_url=None,
                  handle_service_url=None,
                  workspace_service_url=None,
                  callback_url=None,
                  input_gff_file=None,
                  input_fasta_file=None,
                  workspace_name=None,
                  core_genome_name=None,
                  scientific_name="unknown_taxon",
                  taxon_wsname='ReferenceTaxons',
                  taxon_reference=None,
                  source=None,
                  release=None,
                  genome_type=None):

    assembly_ref = None
    gff_handle_ref = None
    time_string = str(
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y_%m_%d_%H_%M_%S'))

    dfUtil = DataFileUtil(callback_url)

    ###########################################
    #Retrieve taxon
    #Taxon lookup dependent on full genus
    #Example: Athaliana    Arabidopsis thaliana
    ###########################################
    #default to
    taxon_id = -1
    taxon_object_name = "unknown_taxon"

    #Retrieve lookup object if scientific name provided
    if (taxon_reference is None and scientific_name != "unknown_taxon"):
        #Need to retrieve taxon lookup object then find taxon id
        taxon_lookup = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/taxon_lookup"],
            'ignore_errors':
            0
        })['data'][0]['data']['taxon_lookup']

        if (scientific_name[0:3] in taxon_lookup
                and scientific_name in taxon_lookup[scientific_name[0:3]]):
            taxon_id = taxon_lookup[scientific_name[0:3]][scientific_name]
            taxon_object_name = "%s_taxon" % (str(taxon_id))

    #Retrieve Taxon object
    taxon_info = {}
    if (taxon_reference is None):
        taxon_info = dfUtil.get_objects({
            'object_refs': [taxon_wsname + "/" + taxon_object_name],
            'ignore_errors':
            0
        })['data'][0]
        taxon_reference = "%s/%s/%s" % (taxon_info['info'][6],
                                        taxon_info['info'][0],
                                        taxon_info['info'][4])
    else:
        taxon_info = dfUtil.get_objects({
            "object_refs": [taxon_reference],
            'ignore_errors': 0
        })['data'][0]

    taxonomy = taxon_info['data']['scientific_lineage']
    ###########################################
    #End taxonomy retrieval
    ###########################################

    ###########################################
    #Create logger
    ###########################################
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    # send messages to sys.stderr
    streamHandler = logging.StreamHandler(sys.stderr)

    formatter = logging.Formatter(
        "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
    )
    formatter.converter = time.gmtime
    streamHandler.setFormatter(formatter)

    logger.addHandler(streamHandler)
    ###########################################
    #End logger creation
    ###########################################

    ##########################################
    #Reading in Fasta file, Code taken from https://www.biostars.org/p/710/
    ##########################################
    logger.info("Reading FASTA file.")

    assembly = {
        "contigs": {},
        "dna_size": 0,
        "gc_content": 0,
        "md5": [],
        "base_counts": {}
    }
    contig_seq_start = 0

    input_file_handle = open(input_fasta_file, 'rb')

    # alternate header and sequence
    faiter = (x[1] for x in itertools.groupby(input_file_handle,
                                              lambda line: line[0] == ">"))
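    # itertools.groupby alternates between runs of header lines (key True) and
    # runs of sequence lines (key False); x[1] is each run's iterator, so the
    # loop below consumes one header run, then the matching sequence run.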
    for header in faiter:
        # drop the ">"
        header = header.next()[1:].strip()
        # join all sequence lines to one.
        seq = "".join(s.strip() for s in faiter.next())

        try:
            fasta_header, fasta_description = header.split(' ', 1)
        except ValueError:
            fasta_header = header
            fasta_description = None

        #Handle record
        seq = seq.upper()

        #Build contig objects for Assembly
        seq_count = dict(collections.Counter(seq))

        #to delete at end, but required for now
        contig_dict = {"sequence": seq}

        Ncount = 0
        if "N" in seq_count:
            Ncount = seq_count["N"]
        contig_dict["Ncount"] = Ncount

        for character in seq_count:
            if character in assembly["base_counts"]:
                assembly["base_counts"][character] += seq_count[character]
            else:
                assembly["base_counts"][character] = seq_count[character]

        contig_seq_length = len(seq)
        assembly["dna_size"] += contig_seq_length

        contig_gc_length = seq.count("G")
        contig_gc_length += seq.count("C")
        contig_dict["gc_content"] = float("{0:.2f}".format(
            float(contig_gc_length) / float(contig_seq_length)))
        assembly["gc_content"] += contig_gc_length

        contig_dict["contig_id"] = fasta_header
        contig_dict["name"] = fasta_header
        contig_dict["length"] = contig_seq_length
        contig_dict["md5"] = hashlib.md5(seq).hexdigest()
        assembly["md5"].append(contig_dict["md5"])

        if fasta_description is not None:
            contig_dict["description"] = fasta_description

        contig_dict["is_circular"] = "Unknown"
        contig_dict["start_position"] = contig_seq_start
        contig_dict["num_bytes"] = sys.getsizeof(contig_dict["sequence"])

        assembly["contigs"][fasta_header] = contig_dict

        #used for start of next sequence and total gc_content
        contig_seq_start += contig_seq_length

    assembly["gc_content"] = float("{0:.2f}".format(
        float(assembly["gc_content"]) / float(contig_seq_start)))
    assembly["md5"] = hashlib.md5(",".join(assembly["md5"])).hexdigest()
    assembly["assembly_id"] = core_genome_name + "_assembly"
    assembly["name"] = scientific_name
    assembly["external_source"] = source
    assembly["external_source_id"] = os.path.basename(input_fasta_file)
    assembly["external_source_origination_date"] = str(
        os.stat(input_fasta_file).st_ctime)
    assembly["num_contigs"] = len(assembly["contigs"].keys())
    assembly["type"] = "Unknown"
    assembly[
        "notes"] = "Note MD5s are generated from uppercasing the sequences"

    if taxon_reference is not None:
        assembly["taxon_ref"] = taxon_reference

    logger.info("Reading GFF file.")

    header = list()
    feature_list = dict()
    original_CDS_count = dict()
    original_feature_ids = dict()

    #    gff_file_handle = gzip.open(input_gff_file, 'rb')
    gff_file_handle = open(input_gff_file, 'rb')
    current_line = gff_file_handle.readline()
    gff_object = dict()
    while (current_line != ''):
        current_line = current_line.strip()

        if (current_line.startswith("##") or current_line.startswith("#!")):
            header.append(current_line)
            if ('headers' not in gff_object):
                gff_object['headers'] = list()
            gff_object['headers'].append(current_line)
        else:
            if ('features' not in gff_object):
                gff_object['features'] = list()

            contig_id, source_id, feature_type, start, end, score, strand, phase, attributes = current_line.split(
                '\t')
            attributes_dict = dict()
            for attribute in attributes.split(";"):
                if (attribute == "" or "=" not in attribute):
                    continue
                key, value = attribute.split("=", 1)
                attributes_dict[key] = value

            #ID should be transferred from Name or Parent
            old_id = None
            for key in ("ID", "PACid", "pacid"):
                if (key in attributes_dict):
                    old_id = attributes_dict[key]
                    break
            if (old_id is None):
                eprint(
                    "Cannot find unique ID, PACid, or pacid in GFF attributes: "
                    + attributes)
                continue

            if ("Name" in attributes_dict):
                attributes_dict["ID"] = attributes_dict["Name"]
            else:
                attributes_dict["ID"] = original_feature_ids[
                    attributes_dict["Parent"]] + "." + feature_type

                #if CDS have to increment
                if (feature_type == "CDS"):
                    if (attributes_dict["ID"] not in original_CDS_count):
                        original_CDS_count[attributes_dict["ID"]] = 1
                    else:
                        original_CDS_count[attributes_dict["ID"]] += 1

                    attributes_dict["ID"] += "." + str(
                        original_CDS_count[attributes_dict["ID"]])

            #Update parent
            if ("Parent" in attributes_dict):
                attributes_dict["Parent"] = original_feature_ids[
                    attributes_dict["Parent"]]

            original_feature_ids[old_id] = attributes_dict["ID"]

            #recreate line for GFF
            partial_line, attributes = current_line.rsplit('\t', 1)
            new_line = partial_line + "\t" + ";".join(
                key + "=" + attributes_dict[key]
                for key in attributes_dict.keys())
            gff_object['features'].append(new_line)

            if (contig_id not in assembly["contigs"]):
                logger.warn("Missing contig: " + contig_id)

            if (contig_id not in feature_list):
                feature_list[contig_id] = list()

            feature = {
                'type': feature_type,
                'start': int(start),
                'end': int(end),
                'score': score,
                'strand': strand,
                'phase': phase
            }
            for attribute in attributes.split(";"):
                if (attribute == "" or "=" not in attribute):
                    continue
                key, value = attribute.split("=", 1)
                feature[key] = value

            #Append contig identifier
            feature["contig"] = contig_id
            feature_list[contig_id].append(feature)

        current_line = gff_file_handle.readline()
    gff_file_handle.close()

    #Writing updated lines to gff_file_handle
    input_gff_file = input_gff_file.replace("gene", "edited_gene")
    gff_file_handle = gzip.open(input_gff_file, 'wb')
    if ('headers' in gff_object):
        gff_file_handle.write("\n".join(gff_object["headers"]) + "\n")
    gff_file_handle.write("\n".join(gff_object["features"]))
    gff_file_handle.close()

    #New code inserted to better handle feature identifiers
    #Start by extracting and group them first
    features_identifiers_dict = dict()
    features_identifiers_list = list()
    features_identifiers_count = dict()
    features_parents_dict = dict()
    features_name_id_dict = dict()
    CDS_count = dict()
    for contig in sorted(feature_list):
        for feature in feature_list[contig]:
            #We're only considering gene, mRNA, and CDS for brevity's sake
            if (feature["type"] not in ("gene", "mRNA", "CDS")):
                continue

            #gene and mRNA always have name, CDS do not
            if ("Name" not in feature):
                feature["Name"] = None

            #Update parent following name/id switch
            if ("Parent" in feature
                    and feature["Parent"] in features_name_id_dict):
                feature["Parent"] = features_name_id_dict[feature["Parent"]]

            #ID should be transferred to Name, but need to maintain parent
            if (feature["Name"] is not None):
                features_name_id_dict[feature["ID"]] = feature["Name"]
                feature["ID"] = feature["Name"]
            else:
                feature["ID"] = feature["Parent"] + "." + feature["type"]
                #if CDS have to increment
                if (feature["type"] == "CDS"):
                    if (feature["ID"] not in CDS_count):
                        CDS_count[feature["ID"]] = 1
                    else:
                        CDS_count[feature["ID"]] += 1

                    feature["ID"] += "." + str(CDS_count[feature["ID"]])

            #Collect
            if (feature["type"] == "gene"):
                features_identifiers_dict[feature["ID"]] = dict()
            if (feature["type"] == "mRNA"):
                features_identifiers_dict[feature["Parent"]][
                    feature["ID"]] = dict()
                features_parents_dict[feature["ID"]] = feature["Parent"]
            if (feature["type"] == "CDS"):
                features_identifiers_dict[features_parents_dict[
                    feature["Parent"]]][feature["Parent"]][feature["ID"]] = 1

            features_identifiers_list.append(feature)
            features_identifiers_count[
                feature["ID"]] = len(features_identifiers_list) - 1

    updated_features_identifiers_dict = dict()
    updated_features_list = list()
    updated_features_identifiers_count = dict()
    updated_features_parents_dict = dict()
    updated_CDS_count = dict()
    for gene in sorted(features_identifiers_dict):

        #retrieve original object
        gene_ftr = features_identifiers_list[features_identifiers_count[gene]]

        #store gene
        updated_features_identifiers_dict[gene_ftr["ID"]] = dict()
        updated_features_list.append(gene_ftr)
        updated_features_identifiers_count[
            gene_ftr["ID"]] = len(updated_features_list) - 1

        for mRNA in sorted(features_identifiers_dict[gene],
                           key=lambda x: features_identifiers_count[x]):
            #retrieve feature
            mRNA_ftr = features_identifiers_list[
                features_identifiers_count[mRNA]]

            if ("PAC" in mRNA[0:3]):
                if ("Name" in mRNA_ftr):
                    mRNA_ftr["ID"] = mRNA_ftr["Name"]

            updated_features_identifiers_dict[gene_ftr["ID"]][
                mRNA_ftr["ID"]] = dict()
            updated_features_parents_dict[mRNA_ftr["ID"]] = mRNA_ftr["Parent"]

            updated_features_list.append(mRNA_ftr)
            updated_features_identifiers_count[
                mRNA_ftr["ID"]] = len(updated_features_list) - 1

            for CDS in sorted(features_identifiers_dict[gene][mRNA],
                              key=lambda x: features_identifiers_count[x]):
                #retrieve feature
                CDS_ftr = features_identifiers_list[
                    features_identifiers_count[CDS]]

                if ("PAC" in CDS[0:3]):
                    CDS_ftr["ID"] = mRNA_ftr["ID"] + ".CDS"

                    if (CDS_ftr["ID"] not in updated_CDS_count):
                        updated_CDS_count[CDS_ftr["ID"]] = 1
                    else:
                        updated_CDS_count[CDS_ftr["ID"]] += 1

                    CDS_ftr["ID"] += "." + str(
                        updated_CDS_count[CDS_ftr["ID"]])
                    CDS_ftr["Parent"] = mRNA_ftr["ID"]

                updated_features_identifiers_dict[gene_ftr["ID"]][
                    mRNA_ftr["ID"]][CDS_ftr["ID"]] = 1
                updated_features_parents_dict[
                    CDS_ftr["ID"]] = CDS_ftr["Parent"]

                updated_features_list.append(CDS_ftr)
                updated_features_identifiers_count[
                    CDS_ftr["ID"]] = len(updated_features_list) - 1

    genome_features_list = list()
    genome_mrnas_list = list()
    genome_cdss_list = list()
    for gene in sorted(updated_features_identifiers_dict):
        #retrieve updated object
        gene_ftr = updated_features_list[
            updated_features_identifiers_count[gene]]

        gene_object = convert_ftr_object(
            gene_ftr, assembly["contigs"][gene_ftr["contig"]]["sequence"])
        gene_object["type"] = "gene"

        #New terms, TODO, move to end of gene loop
        gene_object["cdss"] = list()
        gene_object["mrnas"] = list()

        #use function of longest CDS for gene
        longest_protein_length = 0
        longest_protein_sequence = ""
        for mRNA in sorted(
                updated_features_identifiers_dict[gene],
                key=lambda x: updated_features_identifiers_count[x]):
            #retrieve updated object
            mRNA_ftr = updated_features_list[
                updated_features_identifiers_count[mRNA]]

            feature_object = convert_ftr_object(
                mRNA_ftr, assembly["contigs"][mRNA_ftr["contig"]]["sequence"])
            feature_object['parent_gene'] = gene_object['id']

            mrna_object = copy.deepcopy(feature_object)
            cds_object = copy.deepcopy(feature_object)

            cds_object['id'] = mrna_object['id'] + ".CDS"
            mrna_object['cds'] = cds_object['id']

            cds_object['parent_mrna'] = mrna_object['id']

            del mrna_object["dna_sequence"]
            del mrna_object["dna_sequence_length"]

            cds_object["ontology_terms"] = dict()

            gene_object["mrnas"].append(mrna_object["id"])
            gene_object["cdss"].append(cds_object["id"])

            #CDS aggregation needs to be done in order to build protein sequence and list of locations
            CDS_list = sorted(
                updated_features_identifiers_dict[gene][mRNA],
                key=lambda x: updated_features_identifiers_count[x])

            dna_sequence = ""
            locations = list()

            #collect phases, and lengths of exons
            #right now, this is only for the purpose of error reporting
            phases = list()
            exons = list()

            for CDS in (CDS_list):
                #retrieve updated partial CDS
                add_ftr = updated_features_list[
                    updated_features_identifiers_count[CDS]]
                phases.append(add_ftr["phase"])

                add_ftr_obj = convert_ftr_object(
                    add_ftr,
                    assembly["contigs"][add_ftr["contig"]]["sequence"])
                exons.append(len(add_ftr_obj["dna_sequence"]))

                #Remove base(s) according to phase, but only for first CDS
                if (CDS == CDS_list[0] and int(add_ftr["phase"]) != 0):
                    logger.info("Adjusting phase for first CDS: " + CDS)
                    add_ftr_obj["dna_sequence"] = add_ftr_obj["dna_sequence"][
                        int(add_ftr["phase"]):]

                dna_sequence += add_ftr_obj["dna_sequence"]
                locations.append(add_ftr_obj["location"][0])

            #translate sequence
            dna_sequence_obj = Seq(dna_sequence, IUPAC.ambiguous_dna)
            rna_sequence = dna_sequence_obj.transcribe()

            #Incomplete gene model with no start codon
            #Translate as is
            if str(rna_sequence.upper())[:3] not in codon_table.start_codons:
                logger.info("Missing start codon for " + feature_object["id"] +
                            " Assuming incomplete gene model.")
                #temp_seq = 'AUG'+str(rna_sequence.upper())[3:]
                #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna)

            #You should never have this problem, needs to be reported rather than "fixed"
            codon_count = len(str(rna_sequence)) % 3
            if codon_count != 0:
                logger.info(
                    "Number of bases for RNA sequence for " +
                    feature_object["id"] +
                    " is not divisible by 3. The resulting protein may well be mis-translated."
                )
                #temp_seq = str(rna_sequence.upper())+"N"
                #if codon_count == 1:
                #    temp_seq+="N"
                #new_codon_count=len(temp_seq) % 3
                #rna_sequence = Seq(temp_seq, IUPAC.ambiguous_dna)

            protein_sequence = Seq("")
            try:
                protein_sequence = rna_sequence.translate()  #cds=True)
            except CodonTable.TranslationError as te:
                logger.info("TranslationError for: " + feature_object["id"],
                            phases, exons, " : " + str(te))

            cds_object["protein_translation"] = str(protein_sequence).upper()
            cds_object["protein_translation_length"] = len(
                cds_object["protein_translation"])
            cds_object["md5"] = hashlib.md5(
                cds_object["protein_translation"]).hexdigest()

            if (cds_object["protein_translation_length"] >
                    longest_protein_length):
                longest_protein_length = cds_object[
                    "protein_translation_length"]
                longest_protein_sequence = cds_object["protein_translation"]

            del cds_object["dna_sequence"]
            del cds_object["dna_sequence_length"]
            if ("aliases" not in cds_object):
                cds_object["aliases"] = list()
            if ("function" not in cds_object):
                cds_object["function"] = ""

            #End of mRNA loop
            genome_mrnas_list.append(mrna_object)
            genome_cdss_list.append(cds_object)

        #End of gene loop
        gene_object["ontology_terms"] = dict()
        gene_object["protein_translation"] = longest_protein_sequence
        gene_object["protein_translation_length"] = longest_protein_length
        genome_features_list.append(gene_object)

    #remove sequences before loading
    for contig in assembly["contigs"]:
        del assembly["contigs"][contig]["sequence"]


    #    assembly_string = simplejson.dumps(assembly, sort_keys=True, indent=4, ensure_ascii=False)
    #    assembly_file = open("Bulk_Phytozome_Upload/"+assembly["name"]+'.json', 'w+')
    #    assembly_file.write(assembly_string)
    #    assembly_file.close()

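    # No pre-existing assembly reference was supplied, so save the FASTA as a
    # new Assembly object through AssemblyUtil and keep the returned reference.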
    if assembly_ref is None:
        # Save the FASTA through AssemblyUtil. The explicit gunzip step below is
        # currently disabled, so the input file is used as-is.
        gunzipped_fasta_file = input_fasta_file
        #        gunzipped_fasta_file=input_fasta_file[0:-3]
        #        with gzip.open(input_fasta_file, 'rb') as f_in:
        #            with open(gunzipped_fasta_file, 'wb') as f_out:
        #                shutil.copyfileobj(f_in, f_out)

        token = os.environ.get('KB_AUTH_TOKEN')

        logger.info("Attempting Assembly save for %s" %
                    (assembly["assembly_id"]))
        aUtil = AssemblyUtil(callback_url)
        assembly_ref = aUtil.save_assembly_from_fasta({
            'file': {
                'path': gunzipped_fasta_file,
                'assembly_name': assembly['assembly_id']
            },
            'workspace_name': workspace_name,
            'assembly_name': assembly['assembly_id']
        })
        logger.info("Assembly saved for %s" % (assembly["name"]))

        #Remove gunzipped file
        #os.remove(input_fasta_file[0:-3])

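    # Build the KBaseGenomes.Genome object from the accumulated gene, CDS and
    # mRNA lists plus the statistics taken from the assembly.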
    genome = dict()
    genome["id"] = core_genome_name
    genome["scientific_name"] = scientific_name
    genome["assembly_ref"] = assembly_ref
    genome["features"] = genome_features_list
    genome["cdss"] = genome_cdss_list
    genome["mrnas"] = genome_mrnas_list
    genome["source"] = source
    genome["domain"] = "Eukaryota"
    genome["genetic_code"] = 1
    genome["gc_content"] = assembly["gc_content"]
    genome["dna_size"] = assembly["dna_size"]

    if taxon_reference is not None:
        genome["taxon_ref"] = taxon_reference
        genome["taxonomy"] = taxonomy

    UserMeta = dict()
    UserMeta['Taxonomy'] = taxonomy
    UserMeta['Source'] = source
    UserMeta['Domain'] = "Eukaryota"
    UserMeta['Source ID'] = core_genome_name
    UserMeta['Name'] = scientific_name
    UserMeta['Genetic code'] = 1

    UserMeta['GC content'] = assembly["gc_content"]
    UserMeta['Size'] = assembly["dna_size"]
    UserMeta['Number contigs'] = assembly['num_contigs']

    #id_source_version_array = core_genome_name.split("_")
    #version = "_".join(id_source_version_array[2:])
    #UserMeta['Version']=version
    #UserMeta['url']='';

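    # If the source GFF has not already been uploaded, push it to Shock
    # (gzipped, with a handle) so the raw annotation is preserved alongside
    # the Genome object via gff_handle_ref.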
    if gff_handle_ref is None:
        token = os.environ.get('KB_AUTH_TOKEN')
        file_upload = dfUtil.file_to_shock({
            'file_path': input_gff_file,
            'make_handle': 1,
            'pack': "gzip"
        })
        gff_handle_ref = file_upload['handle']['hid']

    genome['gff_handle_ref'] = gff_handle_ref

    #    genome_string = simplejson.dumps(genome, sort_keys=True, indent=4, ensure_ascii=False)
    #    genome_file = open("Bulk_Phytozome_Upload/"+core_genome_name+'.json', 'w+')
    #    genome_file.write(genome_string)
    #    genome_file.close()

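    # Resolve the workspace name to its numeric id and save the Genome object;
    # save_objects returns a list of object info tuples, one per saved object.
    # Note that UserMeta is assembled above but not attached here; if workspace
    # metadata is wanted, it could be passed as the object's 'meta' field.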
    logger.info("Attempting Genome save for %s" % (core_genome_name))
    workspace_id = dfUtil.ws_name_to_id(workspace_name)
    genome_info = dfUtil.save_objects({
        "id": workspace_id,
        "objects": [{
            "name": core_genome_name,
            "type": "KBaseGenomes.Genome",
            "data": genome
        }]
    })[0]
    logger.info("Genome saved for %s" % (core_genome_name))

    return {'genome_info': genome_info, 'report_string': ""}