def test_simple_upload(self):
        genomeFileUtil = self.getImpl()

        ### Test for a Local Function Call - file needs to be just on disk
        tmp_dir = self.__class__.cfg['scratch']
        #file_name = "GCF_000005845.2_ASM584v2_genomic.gbff.gz"
        #shutil.copy(os.path.join("data", file_name), tmp_dir)
        gbk_path = self.getTempGenbank()  # os.path.join(tmp_dir, file_name)
        print('attempting upload via local function directly')
        ws_obj_name = 'MyGenome'
        result = genomeFileUtil.genbank_to_genome_annotation(self.getContext(),
            {
                'file_path': gbk_path,
                'workspace_name': self.getWsName(),
                'genome_name': ws_obj_name
            })
        pprint(result)
        # todo: add test that result is correct

        ### Test for upload from SHOCK - upload the file to shock first
        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'], 
                                token=self.__class__.ctx['token'],
                                service_ver='dev')
        shock_id = data_file_cli.file_to_shock({'file_path': gbk_path})['shock_id']
        ws_obj_name2 = 'MyGenome.2'
        result2 = genomeFileUtil.genbank_to_genome_annotation(self.getContext(),
            {
                'shock_id': shock_id,
                'workspace_name': self.getWsName(),
                'genome_name': ws_obj_name2,
                'convert_to_legacy': 1
            })
        pprint(result2)
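
        # Hedged sketch of the correctness check flagged by the 'todo' above. It leans only
        # on helpers that appear elsewhere in these examples (getWsClient, getWsName) and on
        # object_info[1] being the object name; the exact keys returned by
        # genbank_to_genome_annotation are deliberately not assumed:
        #   info = self.getWsClient().get_object_info_new(
        #       {'objects': [{'ref': self.getWsName() + '/' + ws_obj_name2}]})[0]
        #   self.assertEqual(info[1], ws_obj_name2)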
Example #2
    def _upload_report(self, report_dir, file_links, workspace_name,
                       saved_objects):
        dfu = DataFileUtil(self.callback_url)
        upload_info = dfu.file_to_shock({
            'file_path': report_dir,
            'pack': 'zip'
        })
        shock_id = upload_info['shock_id']

        report_params = {
            'message': 'JGI metagenome assembly report',
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': shock_id,
                'name': 'index.html',
                'description': 'assembly report'
            }],
            'file_links': file_links,
            'report_object_name': 'JGI_assembly_pipeline.' + str(uuid.uuid4()),
            'workspace_name': workspace_name,
            'objects_created': saved_objects
        }

        report_client = KBaseReport(self.callback_url)
        report = report_client.create_extended_report(report_params)
        return {'report_ref': report['ref'], 'report_name': report['name']}
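
        # Hedged usage note (not part of the source module): 'direct_html_link_index': 0
        # tells KBaseReport to render html_links[0], and 'name': 'index.html' is the entry
        # file inside the zipped report_dir. A call might look like (values hypothetical):
        #   self._upload_report('/kb/module/work/tmp/report_dir', file_links=[],
        #                       workspace_name='my_ws',
        #                       saved_objects=[{'ref': assembly_ref,
        #                                       'description': 'assembled contigs'}])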
Example #3
 def test_shock_copy_node(self):
     test_phrase = "Hi there!"
     path_to_temp_file = "/kb/module/work/tmp/temp_copy_" + str(
         time.time()) + ".fq"
     self.textToFile(test_phrase, path_to_temp_file)
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                        token=self.ctx['token'])
     attributes = {'foo': 'bar'}
     shock_id = dfu.file_to_shock({
         'file_path': path_to_temp_file,
         'attributes': attributes
     })['shock_id']
     # Check what's saved
     os.remove(path_to_temp_file)
     node_info = dfu.shock_to_file({
         'shock_id': shock_id,
         'file_path': path_to_temp_file
     })
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file))
     self.assertEqual(
         node_info.get('attributes'), attributes,
         "Unexpected attributes in node info: " + str(node_info))
     # Let's copy shock node
     shock_id2 = dfu.copy_shock_node({'shock_id': shock_id})['shock_id']
     path_to_temp_file2 = "/kb/module/work/tmp/temp_copy2_" + str(
         time.time()) + ".fq"
     node_info2 = dfu.shock_to_file({
         'shock_id': shock_id2,
         'file_path': path_to_temp_file2
     })
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
     self.assertEqual(
         node_info2.get('attributes'), attributes,
         "Unexpected attributes in node info: " + str(node_info2))
Example #4
 def test_shock_handle_ws(self):
     test_phrase = "Hi there!"
     path_to_temp_file = "/kb/module/work/tmp/temp_" + str(
         time.time()) + ".fq"
     self.textToFile(test_phrase, path_to_temp_file)
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                        token=self.ctx['token'])
     uploaded = dfu.file_to_shock({
         'file_path': path_to_temp_file,
         'make_handle': 1
     })
     fhandle = uploaded['handle']
     self.assertTrue('hid' in fhandle, "Handle: " + str(fhandle))
     data = {'hid': fhandle['hid']}
     obj_name = 'TestObject.1'
     info = self.getWsClient().save_objects({
         'workspace': self.getWsName(),
         'objects': [{
             'type': 'Empty.AHandle',
             'data': data,
             'name': obj_name
         }]
     })[0]
     self.assertEqual(info[1], obj_name)
     ref = self.getWsName() + '/' + obj_name
     handle_data = self.getWsClient().get_objects([{'ref': ref}])[0]['data']
     self.assertTrue('hid' in handle_data, "Data: " + str(handle_data))
     hid = handle_data['hid']
     path_to_temp_file2 = "/kb/module/work/tmp/temp2_" + str(
         time.time()) + ".fq"
     dfu.shock_to_file({'handle_id': hid, 'file_path': path_to_temp_file2})
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
Example #5
 def load_test_genome_with_cache(self, filename, gbff_cache_filename):
     """ cache filename needs to in scratch space """
     with open(filename, 'r') as file:
         data_str = file.read()
     data = json.loads(data_str)
     # save to ws
     save_info = {
         'workspace': self.getWsName(),
         'objects': [{
             'type': 'KBaseGenomes.Genome',
             'data': data,
             'name': 'e_coli'
         }]
     }
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     shock_file = dfu.file_to_shock({
         'file_path': gbff_cache_filename,
         'make_handle': 1
     })
     data['genbank_handle_ref'] = shock_file['handle']['hid']
     # save to ws
     save_info['objects'][0]['name'] = 'e_coli_with_genbank'
     result = self.ws.save_objects(save_info)
     info = result[0]
     ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
     print('created test genome with gbff cache: ' + ref + ' from file ' +
           filename)
     return ref
Example #6
def make_fake_alignment(callback_url, dummy_file, name, reads_ref, genome_ref,
                        ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqAlignment object and returns a ref to it.
    callback_url: needed for DataFileUtil,
    dummy_file: path to some dummy "alignment" file (make it small - needs to be uploaded to shock)
    name: the name of the object
    reads_ref: a reference to a valid (probably fake) reads library
    genome_ref: a reference to a valid (also probably fake) genome
    ws_name: the name of the workspace to save this object to
    ws_client: a Workspace client tuned to the server of your choice
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    fake_alignment = {
        "file": dummy_shock_info['handle'],
        "library_type": "fake",
        "read_sample_id": reads_ref,
        "condition": "fake",
        "genome_id": genome_ref
    }
    return make_fake_object(fake_alignment, "KBaseRNASeq.RNASeqAlignment",
                            name, ws_name, ws_client)
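
# Hedged usage sketch for make_fake_alignment (all values are placeholders; reads_ref and
# genome_ref would come from other test fixtures, while make_fake_object and ws_client are
# the helper and Workspace client described in the docstring above):
#   alignment_ref = make_fake_alignment(os.environ['SDK_CALLBACK_URL'], 'data/dummy.bam',
#                                       'fake_alignment_1', reads_ref, genome_ref,
#                                       ws_name, ws_client)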
Example #7
def make_fake_expression(callback_url, dummy_file, name, genome_ref,
                         annotation_ref, alignment_ref, ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqExpression object and returns a ref to it.
    genome_ref: reference to a genome object
    annotation_ref: reference to a KBaseRNASeq.GFFAnnotation
    alignment_ref: reference to a KBaseRNASeq.RNASeqAlignment
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    exp = {
        "id": "fake",
        "type": "fake",
        "numerical_interpretation": "fake",
        "expression_levels": {
            "feature_1": 0,
            "feature_2": 1,
            "feature_3": 2
        },
        "genome_id": genome_ref,
        "annotation_id": annotation_ref,
        "mapped_rnaseq_alignment": {
            "id1": alignment_ref
        },
        "condition": "",
        "tool_used": "none",
        "tool_version": "0.0.0",
        "file": dummy_shock_info['handle']
    }
    return make_fake_object(exp, "KBaseRNASeq.RNASeqExpression", name, ws_name,
                            ws_client)
Example #8
    def test_simple_upload(self):
        # fetch the test files and set things up
        genomeFileUtil = self.getImpl()
        gbk_path = "data/GCF_000005845.2_ASM584v2_genomic.gbff"

        ### Test for a Local Function Call
        print('attempting upload via local function directly')
        ws_obj_name = 'MyGenome'
        result = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file' : { 'path': gbk_path },
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name
            })[0]
        pprint(result)
        self.assertIsNotNone(result['genome_ref'])
        target_dir = os.path.join("/kb/module/work/tmp", "GCF_000005845")
        download_genome_to_json_files(self.getContext()['token'], result['genome_ref'],
                                      target_dir)
        #self.assertEqual(0, len(compare_genome_json_files(target_dir, 
        #                                                  os.path.join("/kb/module/test/data", 
        #                                                               "GCF_000005845"))))
        # todo: add test that result is correct

        ### Test for upload from SHOCK - upload the file to shock first
        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'], 
                                token=self.__class__.ctx['token'],
                                service_ver='dev')
        shutil.copy(gbk_path, self.__class__.cfg['scratch'])
        shock_id = data_file_cli.file_to_shock({
            'file_path': os.path.join(self.__class__.cfg['scratch'], gbk_path.split("/")[-1])
        })['shock_id']
        ws_obj_name2 = 'MyGenome.2'
        result2 = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file': {'shock_id':shock_id},
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name2,
            })[0]
        pprint(result2)
        self.assertIsNotNone(result2['genome_ref'])
        # todo: add test that result is correct

        ### Test for upload via FTP- use something from genbank
        print('attempting upload through ftp url')
        ws_obj_name3 = 'MyGenome.3'
        result3 = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file':{'ftp_url': self.__class__.TEST_ECOLI_FILE_FTP},
                'workspace_name': self.getWsName(),
                'genome_name': ws_obj_name3,
            })[0]
        pprint(result3)

        self.assertIsNotNone(result3['genome_ref'])
Example #9
 def package_folder(self, folder_path, zip_file_name, zip_file_description):
     ''' Simple utility for packaging a folder and saving to shock '''
     dfu = DataFileUtil(self.callback_url)
     output = dfu.file_to_shock({
         'file_path': folder_path,
         'make_handle': 0,
         'pack': 'zip'
     })
     return {
         'shock_id': output['shock_id'],
         'name': zip_file_name,
         'description': zip_file_description
     }
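
     # Hedged usage note (not from the source class): the returned dict matches the
     # shock_id/name/description shape KBaseReport expects in 'html_links' and 'file_links'
     # (see the report examples above), e.g. (paths hypothetical):
     #   html_links = [self.package_folder(html_dir, 'index.html', 'HTML summary report')]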
Example #10
    def test_basic_upload_and_download(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'MyNewAssembly'
        result = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'file': {
                    'path': fasta_path
                },
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name
            })
        pprint(result)
        self.check_fasta_file(ws_obj_name, fasta_path)

        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
        ws_obj_name2 = 'MyNewAssembly.2'
        result2 = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'shock_id': shock_id,
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name2
            })
        pprint(result2)
        self.check_fasta_file(ws_obj_name2, fasta_path)

        print('attempting upload via ftp url')
        ftp_url = 'ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_8_collection/acaryochloris_marina_mbic11017/dna/Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz'
        ws_obj_name3 = 'MyNewAssembly.3'
        result3 = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'ftp_url': ftp_url,
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name3
            })
        pprint(result3)
        # todo: add checks here on ws object

        ws_obj_name3 = 'MyNewAssembly.3'
        result4 = assemblyUtil.export_assembly_as_fasta(
            self.getContext(),
            {'input_ref': self.getWsName() + '/' + ws_obj_name3})
        pprint(result4)
Example #11
 def test_handles(self):
     wsName = self.generatePesudoRandomWorkspaceName()
     self.ws.set_permissions({
         'workspace': wsName,
         'new_permission': 'w',
         'users': [self.ctx2['user_id']]
     })
     temp_shock_file = "/kb/module/work/tmp/shock1.txt"
     with open(temp_shock_file, "w") as f1:
         f1.write("Test Shock Handle")
     token1 = self.ctx['token']
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token1)
     handle1 = dfu.file_to_shock({
         'file_path': temp_shock_file,
         'make_handle': 1
     })['handle']
     hid1 = handle1['hid']
     genome_name = "Genome.1"
     ws2 = Workspace(self.cfg['workspace-url'], token=token1)
     ws2.save_objects({
         'workspace': wsName,
         'objects': [{
             'name': genome_name,
             'type': 'KBaseGenomes.Genome',
             'data': {
                 'id': "qwerty",
                 'scientific_name': "Qwerty",
                 'domain': "Bacteria",
                 'genetic_code': 11,
                 'genbank_handle_ref': hid1
             }
         }]
     })
     genome = self.impl.get_genome_v1(
         self.ctx2, {'genomes': [{
             'ref': wsName + '/' + genome_name
         }]})[0]['genomes'][0]['data']
     self.impl.save_one_genome_v1(self.ctx2, {
         'workspace': wsName,
         'name': genome_name,
         'data': genome
     })[0]
     genome = self.impl.get_genome_v1(
         self.ctx2, {'genomes': [{
             'ref': wsName + '/' + genome_name
         }]})[0]['genomes'][0]['data']
     self.assertTrue('genbank_handle_ref' in genome)
     hid2 = genome['genbank_handle_ref']
     self.assertNotEqual(hid1, hid2)
Example #12
def make_fake_annotation(callback_url, dummy_file, name, ws_name, ws_client):
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    annotation = {
        "handle": dummy_shock_info['handle'],
        "size": 0,
        "genome_id": "not_a_real_genome",
        "genome_scientific_name": "Genomus falsus"
    }
    return make_fake_object(annotation, "KBaseRNASeq.GFFAnnotation", name,
                            ws_name, ws_client)
Example #13
def upload_file_to_shock(logger,
                         filePath,
                         make_handle=True,
                         shock_service_url=None,
                         #attributes = '{}',
                         ssl_verify=True,
                         token=None):
    """
    Use HTTP multi-part POST to save a file to a SHOCK instance.
    """

    
    # shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    #return dfu.file_to_shock({"file_path":filePath, "attributes": json.dumps(attributes), "make_handle" : make_handle})
    return dfu.file_to_shock({"file_path": filePath, "make_handle": make_handle})
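
# Hedged usage sketch (values hypothetical; the logger and shock_service_url arguments are
# accepted by the signature but unused by the DataFileUtil-based body above):
#   upload_info = upload_file_to_shock(None, '/kb/module/work/tmp/reads.fq',
#                                      make_handle=True, token=my_token)
#   shock_id = upload_info['shock_id']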
Example #14
    def test_basic_upload_and_download(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'MyNewAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name
                                                        })
        pprint(result)
        self.check_fasta_file(ws_obj_name, fasta_path)


        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
        ws_obj_name2 = 'MyNewAssembly.2'
        result2 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'shock_id': shock_id,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name2
                                                         })
        pprint(result2)
        self.check_fasta_file(ws_obj_name2, fasta_path)

        print('attempting upload via ftp url')
        ftp_url = 'ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_8_collection/acaryochloris_marina_mbic11017/dna/Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz'
        ws_obj_name3 = 'MyNewAssembly.3'
        result3 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'ftp_url': ftp_url,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name3
                                                         })
        pprint(result3)
        # todo: add checks here on ws object

        ws_obj_name3 = 'MyNewAssembly.3'
        result4 = assemblyUtil.export_assembly_as_fasta(self.getContext(),
                                                        {'input_ref': self.getWsName() + '/' + ws_obj_name3})
        pprint(result4)
Example #15
 def package_folder(self, folder_path, zip_file_name, zip_file_description):
     ''' Simple utility for packaging a folder and saving to shock '''
     if folder_path == self.scratch:
         raise ValueError(
             "cannot package folder that is not a subfolder of scratch")
     dfu = DataFileUtil(self.callback_url)
     if not os.path.exists(folder_path):
         raise ValueError("cannot package folder that doesn't exist: " +
                          folder_path)
     output = dfu.file_to_shock({
         'file_path': folder_path,
         'make_handle': 0,
         'pack': 'zip'
     })
     return {
         'shock_id': output['shock_id'],
         'name': zip_file_name,
         'description': zip_file_description
     }
Example #16
    def _put_cached_index(self, assembly_info, index_files_basename, output_dir, ws_for_cache):

        if not ws_for_cache:
            print('WARNING: bowtie2 index cannot be cached because "ws_for_cache" field not set')
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({'file_path': output_dir,
                                        'make_handle': 1,
                                        'pack': 'targz'})

            bowtie2_index = {'handle': result['handle'], 'size': result['size'],
                             'assembly_ref': assembly_info['ref'],
                             'index_files_basename': index_files_basename}

            ws = Workspace(self.ws_url)
            save_params = {'objects': [{'hidden': 1,
                                        'provenance': self.provenance,
                                        'name': os.path.basename(output_dir),
                                        'data': bowtie2_index,
                                        'type': 'KBaseRNASeq.Bowtie2IndexV2'
                                        }]
                           }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print('WARNING: exception encountered when trying to cache the index files:')
            print(traceback.format_exc())
            print('END WARNING: exception encountered when trying to cache the index files')

        return False
Example #17
def create_report(callback_url, scratch, workspace_name, result_data):
    """
    Create KBase extended report object for the output html
    """
    html = create_html_tables(result_data)
    dfu = DataFileUtil(callback_url)
    report_name = 'fastANI_report_' + str(uuid.uuid4())
    report_client = KBaseReport(callback_url)
    html_dir = os.path.join(scratch, report_name)
    os.mkdir(html_dir)
    # Move all pdfs into the html directory
    for result in result_data:
        if os.path.exists(result['viz_path']):
            os.rename(result['viz_path'],
                      os.path.join(html_dir, result['viz_filename']))
    with open(os.path.join(html_dir, "index.html"), 'w') as file:
        file.write(html)
    shock = dfu.file_to_shock({
        'file_path': html_dir,
        'make_handle': 0,
        'pack': 'zip'
    })
    html_file = {
        'shock_id': shock['shock_id'],
        'name': 'index.html',
        'label': 'html_files',
        'description': 'FastANI HTML report'
    }
    report = report_client.create_extended_report({
        'direct_html_link_index': 0,
        'html_links': [html_file],
        'report_object_name': report_name,
        'workspace_name': workspace_name
    })
    return {'report_name': report['name'], 'report_ref': report['ref']}
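
# Hedged usage sketch for create_report (values hypothetical): each result_data entry must
# carry the 'viz_path' and 'viz_filename' keys consumed above, plus whatever
# create_html_tables expects.
#   report_info = create_report(callback_url, scratch, 'my_ws',
#                               [{'viz_path': '/tmp/pair1.pdf', 'viz_filename': 'pair1.pdf'}])
#   print(report_info['report_ref'])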
Example #18
class ExpressionUtils:
    '''
    Module Name:
    ExpressionUtils

    Module Description:
    A KBase module: ExpressionUtils

This module is intended for use by Assemblers to upload RNASeq Expression files
(gtf, fpkm and ctab). It generates the ctab files and tpm data if they are absent.
The expression files are uploaded as a single compressed file. This module also generates
expression levels and tpm expression levels from the input files and saves them in the
workspace object. Once uploaded, the expression files can be downloaded into an output directory.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.1.1"
    GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git"
    GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99"

    #BEGIN_CLASS_HEADER

    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_ALIGNMENT_REF = 'alignment_ref'

    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_ANNOTATION_ID = 'annotation_id'
    PARAM_IN_BAM_FILE_PATH = 'bam_file_path'
    PARAM_IN_DESCRIPTION = 'description'
    PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level'
    PARAM_IN_PROC_COMMENTS = 'processing_comments'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'
    PARAM_IN_ORIG_MEDIAN = 'original_median'
    PARAM_IN_EXT_SRC_DATE = 'external_source_date'
    PARAM_IN_TRANSCRIPTS = 'transcripts'
    PARAM_IN_SRC = 'source'

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id
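
        # Illustration (not from the original module) of the os.path.split behaviour the
        # checks above rely on; both halves of destination_ref must be non-empty:
        #   os.path.split('my_ws/my_obj')  ->  ('my_ws', 'my_obj')
        #   os.path.split('my_obj')        ->  ('', 'my_obj')      # rejected: no workspace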

    def _proc_upload_expression_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR,
            self.PARAM_IN_ALIGNMENT_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        source_dir = params.get(self.PARAM_IN_SRC_DIR)

        if not (os.path.isdir(source_dir)):
            raise ValueError('Source directory does not exist: ' + source_dir)

        if not os.listdir(source_dir):
            raise ValueError('Source directory is empty: ' + source_dir)

        return ws_name_id, obj_name_id, source_dir

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _get_genome_ref(self, assembly_or_genome_ref, params):
        if self.PARAM_IN_GENOME_REF in params and params[
                self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self,
                               source_dir,
                               genome_ref,
                               transcripts=False):

        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        id_col = 5 if transcripts else 0
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):

        source_dir = params.get(self.PARAM_IN_SRC_DIR)
        if len(glob.glob(source_dir + '/*.ctab')) < 5:

            self.__LOGGER.info(' =======  Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files, found missing"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
               params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    raise ValueError('bam file does not exist in {}'.format(alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'accepted_hits.bam, accepted_hits_sorted.bam or other bam file not found in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
        )
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def upload_expression(self, ctx, params):
        """
        Uploads the expression  *
        :param params: instance of type "UploadExpressionParams" (*   
           Required input parameters for uploading a reads expression data
           string   destination_ref        -   object reference of expression
           data. The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id string   source_dir             -  
           directory with the files to be uploaded string   alignment_ref    
           -   alignment workspace object reference *) -> structure:
           parameter "destination_ref" of String, parameter "source_dir" of
           String, parameter "alignment_ref" of String, parameter
           "genome_ref" of String, parameter "annotation_id" of String,
           parameter "bam_file_path" of String, parameter "transcripts" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
           1)), parameter "data_quality_level" of Long, parameter
           "original_median" of Double, parameter "description" of String,
           parameter "platform" of String, parameter "source" of String,
           parameter "external_source_date" of String, parameter
           "processing_comments" of String
        :returns: instance of type "UploadExpressionOutput" (*     Output
           from upload expression    *) -> structure: parameter "obj_ref" of
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_expression

        self.__LOGGER.info('Starting upload expression, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params(
            ctx, params)

        alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF)
        try:
            alignment_obj = self.dfu.get_objects(
                {'object_refs': [alignment_ref]})['data'][0]
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        alignment = alignment_obj['data']
        assembly_or_genome_ref = alignment['genome_id']

        genome_ref = self._get_genome_ref(assembly_or_genome_ref, params)

        expression_levels, tpm_expression_levels = self._get_expression_levels(
            source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS))

        self._gen_ctab_files(params, alignment_ref)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': source_dir,
            'make_handle': 1,
            'pack': 'zip'
        })
        """
        move the zipfile created in the source directory one level up
        """
        path, dir = os.path.split(source_dir)
        zipfile = dir + '.zip'
        if os.path.isfile(os.path.join(source_dir, zipfile)):
            shutil.move(os.path.join(source_dir, zipfile),
                        os.path.join(path, zipfile))

        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        expression_data = {
            'numerical_interpretation': 'FPKM',
            'genome_id': genome_ref,
            'mapped_rnaseq_alignment': {
                alignment['read_sample_id']: alignment_ref
            },
            'condition': alignment['condition'],
            'file': file_handle,
            'expression_levels': expression_levels,
            'tpm_expression_levels': tpm_expression_levels
        }
        additional_params = [
            self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION,
            self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM,
            self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID,
            self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE,
            self.PARAM_IN_SRC
        ]

        for opt_param in additional_params:
            if opt_param in params and params[opt_param] is not None:
                expression_data[opt_param] = params[opt_param]

        extra_provenance_input_refs = list()
        extra_provenance_input_refs.append(
            params.get(self.PARAM_IN_ALIGNMENT_REF))
        if self.PARAM_IN_GENOME_REF in params and params.get(
                self.PARAM_IN_GENOME_REF) is not None:
            extra_provenance_input_refs.append(
                params.get(self.PARAM_IN_GENOME_REF))

        self.__LOGGER.info('===========   Adding extra_provenance_refs')
        self.__LOGGER.info(str(extra_provenance_input_refs))
        self.__LOGGER.info('==========================================')

        res = self.dfu.save_objects({
            "id": ws_name_id,
            "objects": [{
                "type": "KBaseRNASeq.RNASeqExpression",
                "data": expression_data,
                "name": obj_name_id,
                "extra_provenance_input_refs": extra_provenance_input_refs
            }]
        })[0]

        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        print(returnVal)
        #END upload_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_expression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadExpressionParams" (*
           Required input parameters for downloading expression string
           source_ref         -       object reference of expression source.
           The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id *) -> structure: parameter "source_ref" of
           String
        :returns: instance of type "DownloadExpressionOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_expression

        self.__LOGGER.info('Running download_expression with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id': expression[0]['data']['file']['id'],
            'file_path': output_dir,
            'unpack': 'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_expression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string   source_ref         - 
           object reference of expression source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_expression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_expression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_expression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_expressionMatrix(self, ctx, params):
        """
        :param params: instance of type "getExprMatrixParams" (* Following
           are the required input parameters to get Expression Matrix *) ->
           structure: parameter "workspace_name" of String, parameter
           "output_obj_name" of String, parameter "expressionset_ref" of
           String
        :returns: instance of type "getExprMatrixOutput" -> structure:
           parameter "exprMatrix_FPKM_ref" of String, parameter
           "exprMatrix_TPM_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_expressionMatrix
        fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix(
            params)

        returnVal = {
            'exprMatrix_FPKM_ref': fpkm_ref,
            'exprMatrix_TPM_ref': tpm_ref
        }
        #END get_expressionMatrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_expressionMatrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #19
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
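        # Hedged illustration of typical params (values hypothetical):
        #   {
        #       'staging_file_subdir_path': 'subdir_1/subdir_2/assembly.fasta',
        #       'assembly_name': 'MyNewAssembly',
        #       'workspace_name': 'my_workspace'
        #   }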
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.iteritems():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)
        result_file.close()

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the imported assembly will be stored to
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        objects_created = [{
            'ref': obj_ref,
            'description': 'Imported Assembly'
        }]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Example #20
class ProkkaUtils:

    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key + " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if (qualifier and len(qualifier) > 0) else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{"ref": "KBaseOntology/seed_subsystem_ontology"}])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4: -1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: The path to the fasta file with renamed contigs the number of contigs,
        the mapping from old ids to new ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence, id=new_id, description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple("renamed_assembly",
                                      "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath, contig_counter, new_ids_to_old,
                                records)
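
    # Illustrative sketch (not in the original module): create_renamed_assembly maps each
    # original record id to a sequential "contig_N" id. For a hypothetical input with
    # records "NC_000913.3" and "plasmid_A":
    #
    #   renamed = self.create_renamed_assembly("/path/to/assembly.fa")
    #   renamed.new_ids_to_old   # {"contig_1": "NC_000913.3", "contig_2": "plasmid_A"}
    #   renamed.filepath         # "/path/to/assembly.fa_renamed.fna"
    #
    # The original id is preserved in each renamed record's description as "(old_id)".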

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The  directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = ["perl", "/kb/prokka/bin/prokka", "--outdir", output_dir, "--prefix",
                           "mygenome", "--kingdom", kingdom]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError("gram parameter is not supported in current Prokka installation")
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(["--mincontiglen", str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir
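
    # Illustrative sketch (hypothetical params, not from the original source): with
    # params = {"kingdom": "Bacteria", "metagenome": 1, "mincontiglen": 200},
    # run_prokka would assemble a command roughly like
    #
    #   perl /kb/prokka/bin/prokka --outdir /kb/module/work/tmp/temp_<uuid> \
    #       --prefix mygenome --kingdom Bacteria --gcode 0 --metagenome \
    #       --mincontiglen 200 <subject_fasta_filepath>
    #
    # and return the output directory that retrieve_prokka_results below reads from.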

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results", "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode()).hexdigest()
                    feature = {"id": fid, "location": location, "type": "gene",
                               "aliases": aliases, "md5": md5, "dna_sequence": dna,
                               "dna_sequence_length": len(dna),
                               }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                         "evidence": [evidence],
                                                         "term_name": sso_item["name"],
                                                         "ontology_ref": self.sso_ref,
                                                         "term_lineage": []}
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {"id": cds_id, "location": location, "md5": md5, "parent_gene": fid,
                               "parent_mrna": mrna_id, "function": (product if product else ""),
                               "ontology_terms": {}, "protein_translation": prot,
                               "protein_translation_length": prot_len, "aliases": aliases}
                        mrna = {"id": mrna_id, "location": location, "md5": md5,
                                "parent_gene": fid, "cds": cds_id}
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(genes_with_sso) + "\n"
        report += "Average protein length: " + str(int(sum(prot_lengths) /
                                                       float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly", "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {"id": sso_item["id"],
                                                             "evidence": [evidence],
                                                             "term_name": sso_item["name"],
                                                             "ontology_ref": self.sso_ref,
                                                             "term_lineage": []}

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(self.scratch,
                                                 "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] + "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty.")

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index = len(genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified', 'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [ontology_event_index]
        return feature
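
    # Illustrative sketch (hypothetical SSO id): given new_ontology =
    # {"SSO:000001234": {"id": "SSO:000001234", ...}} and ontology_event_index = 2,
    # new_genome_ontologies leaves the feature with
    #   feature["ontology_terms"]["SSO"] == {"SSO:000001234": [2]}
    # i.e. new-style genomes record a list of ontology event indices per term, while
    # old_genome_ontologies above keeps the full term dictionaries.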

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """

        :param annotation_args: genome_data, new_annotations from prokka, and the output_genome_name
        :type
        :return:
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {"current_functions": len(genome_data["data"]["features"]), "new_functions": 0,
                 "found_functions": 0, "new_ontologies": 0}

        function_report_filepath = os.path.join(self.scratch, "function_report")
        ontology_report_filepath = os.path.join(self.scratch, "ontology_report")
        func_r = open(function_report_filepath, "w")
        onto_r = open(ontology_report_filepath, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i]["function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [new_function]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)
                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)
            if current_function:
                func_r.write(json.dumps([fid, [current_function], [new_function]]) + "\n")
            else:
                func_r.write(json.dumps([fid, current_functions, [new_function]]) + "\n")

            onto_r.write(json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        info = self.gfu.save_one_genome({"workspace": self.output_workspace,
                                         "name": annotation_args["output_genome_name"],
                                         "data": genome_data["data"],
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        annotated_genome = namedtuple("annotated_genome",
                                      "genome_ref function_report_filepath ontology_report_filepath stats")

        return annotated_genome(genome_ref, function_report_filepath, ontology_report_filepath,
                                stats)

    def upload_file(self, filepath, message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path": filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" + output_file_shock_id)
        return {"shock_id": output_file_shock_id,
                "name": os.path.basename(filepath),
                "label": os.path.basename(filepath),
                "description": message}

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [self.upload_file(genome.ontology_report_filepath),
                      self.upload_file(genome.function_report_filepath)]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n"
                          ).format(genome_ref, stats["current_functions"], stats["new_functions"],
                                   stats["new_ontologies"])

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "file_links": file_links,
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": self.output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(genome_data=genome_data,
                                                                     new_annotations=new_annotations,
                                                                     output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps are: download the assembly as a fasta file,
        rename the contigs, run Prokka against the contigs, parse the results, and finally
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params, "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref": assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(gff_filepath=prokka_results.gff_filepath,
                                                       cds_to_dna=prokka_results.cds_to_dna,
                                                       cds_to_prot=prokka_results.cds_to_prot,
                                                       new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = 'Unknown'
        if 'scientific_name' in params and params['scientific_name']:
            scientific_name = params['scientific_name']
        domain = "Bacteria"
        if 'kingdom' in params and params['kingdom']:
            domain = params['kingdom']
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']

        genome = {"id": "Unknown",
                  "features": annotated_assembly.features,
                  "scientific_name": scientific_name,
                  "domain": domain,
                  "genetic_code": gcode,
                  "assembly_ref": assembly_ref,
                  "cdss": annotated_assembly.cdss,
                  "mrnas": annotated_assembly.mrnas,
                  "source": "PROKKA annotation pipeline",
                  "gc_content": assembly_info.gc_content,
                  "dna_size": assembly_info.dna_size,
                  "reference_annotation": 0}

        info = self.gfu.save_one_genome({"workspace": output_workspace,
                                         "name": output_genome_name,
                                         "data": genome,
                                         "provenance": self.ctx.provenance()})["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report(
            {"message": report_message,
             "objects_created": [{"ref": genome_ref, "description": "Annotated genome"}],
             "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
             "workspace_name": output_workspace
             })

        return {"output_genome_ref": genome_ref, "report_name": report_info["name"],
                "report_ref": report_info["ref"]}
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')

        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print e

        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx
    
    
    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path


    def delete_shock_node(self, node_id):
        # Note: the original referenced undefined cls.token / cls.shockURL here;
        # this version assumes the test config provides a 'shock-url' entry.
        header = {'Authorization': 'Oauth {0}'.format(self.ctx['token'])}
        requests.delete(self.cfg['shock-url'] + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):

        print "upload ref data to shock staging"
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path =  self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'#'Phage_gene_catalog.tar.gz'#''PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/"+input_file_name# os.path.join(tmp_dir, input_file_name)

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print "file_path " + source_file_path+"\t"+target_file_path

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print "Testing "+target_file_path
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})
        
        print str(ret1)
        shock_id = ret1['shock_id']
        
        print "shock_id "+shock_id
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        #self.delete_shock_node(shock_id)


    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N))

    def test_virsorter_ok(self):
        self.upload_assembly()


        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print "upload_reads self.testwsname[0] " + self.testwsname[0]

        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print "ERROR"
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass

        print "self.testwsname "+ str(self.testwsname)
        params = {}
        params['assembly_ref'] =  str(self.testobjref[0])#str(self.testwsname[0])+"/"+ #"16589/2/1"#""#'16589/2/1'#self.testobjref
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        #testresult = [
        #    {'blah': 'blah', 'bleh': 'bleh'}]

        testresult = [{'report_ref': result[0]['report_ref'], 'report_name': result[0]['report_name']}]


        self.assertEqual(sorted(result), sorted(testresult))


    def upload_assembly(self):
        if not self.testobjref:

            print "upload_assembly start"
    
            indata = 'U00096.2.fa'#_first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata)
            print "ftarget " + ftarget
            ret = shutil.copy('../test_data/' + indata, ftarget)
    
            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
    
            print "upload_assembly self.testwsname[0] " + self.testwsname[0]
    
            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass
    
            try:
                print "attempt upload"
                print "ftarget " + ftarget
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                     'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
        
                print "upload_assembly"
                print ref
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
        
                ##print self.testobjdata[0]
    
            except Exception as e:
                print e
                pass
    
            print "self.testobjref[0]"
            print self.testobjref
            print self.testobjref[0]
Example #22
0
class variation_importer_utils:
    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat':
            random.uniform(-90, 90),
            'lon':
            random.uniform(-180, 180),
            'elevation':
            random.uniform(0, 100),
            'description':
            "".join([random.choice(string.ascii_letters) for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self,
                             location_filepath,
                             genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs
        if not (self._compare(locations.iloc[:, 0].astype(str).tolist(),
                              genotypes)):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not (self._compare(col_names[0:3], expected_columns)):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population
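
    # Illustrative sketch of the tab-delimited location file assumed by _generate_population
    # (sample values are hypothetical):
    #
    #   id        latitude   longitude   elevation   description
    #   sample_1  52.37      4.89        10.5        canal bank
    #   sample_2  -33.87     151.21      58.0        harbour
    #
    # The first three columns are required and must match the VCF genotype ids;
    # 'elevation' defaults to 0.0 and 'description' to "None provided." when absent.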

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(self.scratch,
                                             'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1.  No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        validation_output_files = [
            f for f in os.listdir(validation_output_dir) if f.endswith('.txt')
        ]
        if not validation_output_files:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")

        validation_output_filepath = os.path.join(validation_output_dir,
                                                  validation_output_files[0])

        log("Validator output filepath: {}".format(validation_output_filepath))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # Map each contig key to its contig_id (note: the full contig dicts in
        # raw_contigs, not this mapping, are what is returned below)
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']
        return raw_contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted."
                    )
                raise ValueError(
                    "Invalid VCF.  ##fileformat line in meta is improperly formatted."
                )
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")

                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])
        return version, contigs, genotypes
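
    # Illustrative sketch (hypothetical header lines): for a VCF starting with
    #   ##fileformat=VCFv4.2
    #   ##contig=<ID=Chr1>
    #   #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_1 sample_2
    # _get_version_contigs_genotypes would return version 4.2, contigs ['Chr1']
    # (the parser assumes each ##contig line carries only an ID field), and the
    # genotype ids ['sample_1', 'sample_2'] taken from column 10 onward of #CHROM.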

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store']
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not file.endswith(tuple(excluded_extensions)):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results, variation_file_path):

        stats_results = self._generate_variation_stats(
            params['additional_output_type'], variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()
        objects = []
        if (variation_results['valid_variation_file']):
            objects = [{
                'ref':
                variation_results['variation_obj_ref'],
                'description':
                'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
            _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:

                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.  It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += '<h4>The following contigs were not found in the reference genome {}.  The possible contigs have been written to the file {}.  Please see the associated links to download.</h4>'.format(
                        variation_results.get('genome_ref'),
                        'valid_contigs.txt')
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was included in the VCF file header!  Please recreate the VCF file with each contig described in the meta description </h4>'
                report = report.replace('Validation_Results',
                                        validation_content)

                if (stats_output.get('stats_file_dir')):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                                        <table>
                                            <tr>
                                                <th>Number of SNPs</th>
                                                <th>Number of Genotypes </th>
                                            </tr>
                                        '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if (stats_output.get('stats_img_dir')):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = os.path.basename(image)
                        caption = image.replace('.png', '')
                        image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                            '></a><a target="_blank"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)

                else:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results", image_content)
        except Exception as e:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)
        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except:
            raise ValueError('Error uploading HTML to shock')

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, additional_output_type,
                                  variation_filepath):
        """
            :param commments go here
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)

        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')

        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes" : []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if (tokens[0] == 'Error'):
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif (tokens[0] == 'Warning'):
                plink_output['warnings'].append(line)
                print(line)
            # elif(tokens[0] == 'Note'):
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()
        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime.  Please see log file."
                )

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory, variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make function to do Rscript calls.
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()

            if z.returncode != 0:
                log("Error creating HWE zoom file.")

        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("MAF command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()

        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }

    def _save_variation_to_ws(self, workspace_name, variation_obj,
                              variation_filepath, kinship_matrix):
        ws_id = self.dfu.ws_name_to_id(workspace_name)
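        # file_to_shock with pack='gzip' compresses the VCF before upload;
        # make_handle=1 also registers a handle for the resulting Shock node.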
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': 'TestVariationImporterName'
            }]
        })[0]
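        # info is a workspace object_info tuple: [0] = object id, [4] = version,
        # [6] = workspace id, which together form the ws/obj/ver reference below.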

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """
            :param params: dict containing all input parameters.
        """

        returnVal = {}
        valid_vcf_file = True

        try:
            vcf_filepath = self.pretend_download_staging_file(
                params['staging_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area: {}".format(
                params['staging_file_subdir_path'], e))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['location_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area: {}".format(
                params['location_file_subdir_path'], e))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes(
            vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if (vcf_version < 4.1):
            log("VCF file is version {}.  Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}.  Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))
        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs.keys():
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], variation_object, vcf_filepath,
                kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal
class CufflinksUtils:
    CUFFLINKS_TOOLKIT_PATH = '/opt/cufflinks/'
    GFFREAD_TOOLKIT_PATH = '/opt/cufflinks/'

    def __init__(self, config):
        """
        :param config: SDK module configuration dict (service urls, scratch
                       directory and auth token)
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass

    def parse_FPKMtracking_calc_TPM(self, filename):
        """
        Generates TPM from FPKM
        :return:
        """
        fpkm_dict = {}
        tpm_dict = {}
        gene_col = 0
        fpkm_col = 9
        sum_fpkm = 0.0
        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                gene_id = larr[gene_col]
                if gene_id != "":
                    fpkm = float(larr[fpkm_col])
                    sum_fpkm = sum_fpkm + fpkm
                    fpkm_dict[gene_id] = math.log(fpkm + 1, 2)
                    tpm_dict[gene_id] = fpkm

        if sum_fpkm == 0.0:
            log("Warning: Unable to calculate TPM values as sum of FPKM values is 0"
                )
        else:
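            # TPM_g = FPKM_g / sum(FPKM) * 1e6, stored as log2(TPM + 1) to
            # match the log2(FPKM + 1) scale used above.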
            for g in tpm_dict:
                tpm_dict[g] = math.log((tpm_dict[g] / sum_fpkm) * 1e6 + 1, 2)

        return fpkm_dict, tpm_dict

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_cufflinks_params(self, params):
        """
        _validate_run_cufflinks_params:
                Raises an exception if params are invalid
        """

        log('Start validating run_cufflinks params')

        # check for required parameters
        for p in ['alignment_object_ref', 'workspace_name', 'genome_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)

            raise ValueError(error_msg)

    def _run_gffread(self, gff_path, gtf_path):
        """
        _run_gffread: run gffread script

        ref: http://cole-trapnell-lab.github.io/cufflinks/file_formats/#the-gffread-utility
        """
        log('converting gff to gtf')
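        # gffread flags (see the reference above): -E reports potential GFF
        # problems, -T writes GTF output, -o names the output file.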
        command = self.GFFREAD_TOOLKIT_PATH + '/gffread '
        command += "-E {0} -T -o {1}".format(gff_path, gtf_path)

        self._run_command(command)

    def _create_gtf_annotation_from_genome(self, genome_ref):
        """
         Create reference annotation file from genome
        """
        ref = self.ws.get_object_subset([{
            'ref':
            genome_ref,
            'included': ['contigset_ref', 'assembly_ref']
        }])
        contig_id = None
        if 'contigset_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['contigset_ref']
        elif 'assembly_ref' in ref[0]['data']:
            contig_id = ref[0]['data']['assembly_ref']
        if contig_id is None:
            raise ValueError(
                "Genome at {0} does not have a reference to an assembly object".
                format(genome_ref))
        print(contig_id)
        log("Generating GFF file from Genome")
        try:
            ret = self.au.get_assembly_as_fasta({'ref': contig_id})
            output_file = ret['path']
            mapping_filename = c_mapping.create_sanitized_contig_ids(
                output_file)
            os.remove(output_file)
            # get the GFF
            ret = self.gfu.genome_to_gff({'genome_ref': genome_ref})
            genome_gff_file = ret['file_path']
            c_mapping.replace_gff_contig_ids(genome_gff_file,
                                             mapping_filename,
                                             to_modified=True)
            gtf_ext = ".gtf"

            if not genome_gff_file.endswith(gtf_ext):
                gtf_path = os.path.splitext(genome_gff_file)[0] + '.gtf'
                self._run_gffread(genome_gff_file, gtf_path)
            else:
                gtf_path = genome_gff_file

            log("gtf file : " + gtf_path)
        except Exception:
            raise ValueError(
                "Generating GTF file from Genome object failed: {}".format(
                    "".join(traceback.format_exc())))
        return gtf_path

    def _get_gtf_file(self, alignment_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch
        alignment_data = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]['data']

        genome_ref = alignment_data.get('genome_id')
        # genome_name = self.ws.get_object_info([{"ref": genome_ref}], includeMetadata=None)[0][1]
        # ws_gtf = genome_name+"_GTF_Annotation"

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_gtf_file_from_genome_ref(self, genome_ref):
        """
        _get_gtf_file: get the reference annotation file (in GTF or GFF3 format)
        """
        result_directory = self.scratch

        genome_data = self.ws.get_objects2({'objects': [{
            'ref': genome_ref
        }]})['data'][0]['data']

        gff_handle_ref = genome_data.get('gff_handle_ref')

        if gff_handle_ref:
            log('getting reference annotation file from genome')
            annotation_file = self.dfu.shock_to_file({
                'handle_id': gff_handle_ref,
                'file_path': result_directory,
                'unpack': 'unpack'
            })['file_path']
        else:
            annotation_file = self._create_gtf_annotation_from_genome(
                genome_ref)

        return annotation_file

    def _get_input_file(self, alignment_ref):
        """
        _get_input_file: get input BAM file from Alignment object
        """

        bam_file_dir = self.rau.download_alignment(
            {'source_ref': alignment_ref})['destination_dir']

        files = os.listdir(bam_file_dir)
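        # Prefer a '*_sorted.bam' from the downloaded alignment; fall back to
        # any other '.bam' if no sorted file is present.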
        bam_file_list = [
            file for file in files if re.match(r'.*\_sorted\.bam', file)
        ]
        if not bam_file_list:
            bam_file_list = [
                file for file in files if re.match(r'.*(?<!sorted)\.bam', file)
            ]

        if not bam_file_list:
            raise ValueError('Cannot find .bam file from alignment {}'.format(
                alignment_ref))

        bam_file_name = bam_file_list[0]

        bam_file = os.path.join(bam_file_dir, bam_file_name)

        return bam_file

    def _generate_command(self, params):
        """
        _generate_command: generate cufflinks command
        """
        cufflinks_command = '/opt/cufflinks/cufflinks'
        cufflinks_command += (' -q --no-update-check -p ' +
                              str(params.get('num_threads', 1)))
        if 'max_intron_length' in params and params[
                'max_intron_length'] is not None:
            cufflinks_command += (' --max-intron-length ' +
                                  str(params['max_intron_length']))
        if 'min_intron_length' in params and params[
                'min_intron_length'] is not None:
            cufflinks_command += (' --min-intron-length ' +
                                  str(params['min_intron_length']))
        if 'overhang_tolerance' in params and params[
                'overhang_tolerance'] is not None:
            cufflinks_command += (' --overhang-tolerance ' +
                                  str(params['overhang_tolerance']))

        cufflinks_command += " -o {0} -G {1} {2}".format(
            params['result_directory'], params['gtf_file'],
            params['input_file'])
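        # -o sets the Cufflinks output directory; -G quantifies against the
        # supplied reference GTF instead of assembling novel transcripts.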

        log('Generated cufflinks command: {}'.format(cufflinks_command))

        return cufflinks_command

    def _process_rnaseq_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing RNASeqAlignment object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        if '/' not in params['genome_ref']:
            params['genome_ref'] = params['workspace_name'] + '/' + params[
                'genome_ref']

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_rnaseq_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params['gtf_file'],
            params['expression_suffix'])

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _process_kbasesets_alignment_object(self, params):
        """
        _process_alignment_object: process KBaseRNASeq.RNASeqAlignment type input object
        """
        log('start processing KBaseSets object\nparams:\n{}'.format(
            json.dumps(params, indent=1)))
        alignment_ref = params.get('alignment_ref')

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        params['result_directory'] = str(result_directory)

        # input files
        params['input_file'] = self._get_input_file(alignment_ref)
        if not params.get('gtf_file'):
            params['gtf_file'] = self._get_gtf_file(alignment_ref)

        command = self._generate_command(params)
        self._run_command(command)

        expression_obj_ref = self._save_kbasesets_expression(
            result_directory, alignment_ref, params.get('workspace_name'),
            params.get('genome_ref'), params.get('gtf_file'),
            params.get('expression_suffix'))

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_obj_ref,
            'alignment_ref': alignment_ref
        }

        expression_name = self.ws.get_object_info([{
            "ref": expression_obj_ref
        }],
                                                  includeMetadata=None)[0][1]

        widget_params = {
            "output": expression_name,
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_html_report(self, result_directory, obj_ref):
        """
        _generate_html_report: generate html summary report
        """
        log('Start generating html report')
        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory, 'report.html')

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]

        expression_object_type = expression_object.get('info')[2]

        Overview_Content = ''
        if re.match('KBaseRNASeq.RNASeqExpression-\d.\d',
                    expression_object_type):
            Overview_Content += '<p>Generated Expression Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d.\d',
                      expression_object_type):
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data'][
                    'sample_expression_ids']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref
                    }], includeMetadata=None)[0][1]
                Overview_Content += '<p>{}</p>'.format(expression_name)
        elif re.match('KBaseSets.ExpressionSet-\d.\d', expression_object_type):
            pprint(expression_object)
            Overview_Content += '<p>Generated Expression Set Object:</p><p>{}</p>'.format(
                expression_object.get('info')[1])
            Overview_Content += '<br><p>Generated Expression Object:</p>'
            for expression_ref in expression_object['data']['items']:
                expression_name = self.ws.get_object_info(
                    [{
                        "ref": expression_ref['ref']
                    }], includeMetadata=None)[0][1]
                condition = expression_ref['label']
                Overview_Content += '<p>condition:{0}; expression_name: {1}</p>'.format(
                    condition, expression_name)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', Overview_Content)
                result_file.write(report_template)

        html_report.append({
            'path':
            result_file_path,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Cufflinks App'
        })
        return html_report

    def _save_rnaseq_expression(self, result_directory, alignment_ref,
                                workspace_name, genome_ref, gtf_file,
                                expression_suffix):
        """
        _save_rnaseq_expression: save Expression object to workspace
        """
        log('start saving Expression object')
        alignment_object_name = self.ws.get_object_info(
            [{
                "ref": alignment_ref
            }], includeMetadata=None)[0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_kbasesets_expression(self, result_directory, alignment_ref,
                                   workspace_name, genome_ref, gtf_file,
                                   expression_suffix):
        """
        _save_kbasesets_expression: save Expression object to workspace using ExpressionUtils
        and SetAPI
        """
        log('start saving Expression object')

        alignment_info = self.ws.get_object_info3(
            {'objects': [{
                "ref": alignment_ref
            }]})
        alignment_object_name = alignment_info['infos'][0][1]

        # set expression name
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_ref = self.eu.upload_expression({
            'destination_ref':
            workspace_name + '/' + expression_name,
            'source_dir':
            result_directory,
            'alignment_ref':
            alignment_ref,
            'tool_used':
            self.tool_used,
            'tool_version':
            self.tool_version
        })['obj_ref']

        return expression_ref

    def _save_rnaseq_expression_set(self, alignment_expression_map,
                                    alignment_set_ref, workspace_name,
                                    expression_set_name):
        """
        _save_rnaseq_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _save_kbasesets_expression_set(self, alignment_expression_map,
                                       alignment_set_ref, workspace_name,
                                       expression_set_name):
        """
        _save_kbasesets_expression_set: save ExpressionSet object to workspace
        """
        log('start saving ExpressionSet object')
        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        expression_set_data = self._generate_expression_set_data(
            alignment_expression_map, alignment_set_ref, expression_set_name)

        object_type = 'KBaseRNASeq.RNASeqExpressionSet'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': expression_set_data,
                'name': expression_set_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        expression_set_ref = str(dfu_oi[6]) + '/' + str(dfu_oi[0]) + '/' + str(
            dfu_oi[4])

        return expression_set_ref

    def _generate_report(self,
                         obj_ref,
                         workspace_name,
                         result_directory,
                         exprMatrix_FPKM_ref=None,
                         exprMatrix_TPM_ref=None):
        """
        _generate_report: generate summary report
        """

        log('creating report')

        output_files = self._generate_output_file_list(result_directory)
        output_html_files = self._generate_html_report(result_directory,
                                                       obj_ref)

        expression_object = self.ws.get_objects2(
            {'objects': [{
                'ref': obj_ref
            }]})['data'][0]
        expression_info = expression_object['info']
        expression_data = expression_object['data']

        expression_object_type = expression_info[2]
        if re.match('KBaseRNASeq.RNASeqExpression-\d+.\d+',
                    expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseRNASeq.RNASeqExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'Expression generated by Cufflinks'
            }]
        elif re.match('KBaseSets.ExpressionSet-\d+.\d+',
                      expression_object_type):
            objects_created = [{
                'ref':
                obj_ref,
                'description':
                'ExpressionSet generated by Cufflinks'
            }]
            items = expression_data['items']
            for item in items:
                objects_created.append({
                    'ref':
                    item['ref'],
                    'description':
                    'Expression generated by Cufflinks'
                })
            objects_created.append({
                'ref':
                exprMatrix_FPKM_ref,
                'description':
                'FPKM ExpressionMatrix generated by Cufflinks'
            })
            objects_created.append({
                'ref':
                exprMatrix_TPM_ref,
                'description':
                'TPM ExpressionMatrix generated by Cufflinks'
            })

        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'file_links': output_files,
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 366,
            'report_object_name': 'kb_cufflinks_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _parse_FPKMtracking(self, filename, metric):
        result = {}
        pos1 = 0
        if metric == 'FPKM':
            pos2 = 7
        elif metric == 'TPM':
            pos2 = 8
        else:
            raise ValueError('Unsupported metric: {}'.format(metric))
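        # Values are stored as log2(value + 1); rows whose value cannot be
        # parsed default to log2(1) = 0.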

        with open(filename) as f:
            next(f)
            for line in f:
                larr = line.split("\t")
                if larr[pos1] != "":
                    try:
                        result[larr[pos1]] = math.log(float(larr[pos2]) + 1, 2)
                    except ValueError:
                        result[larr[pos1]] = math.log(1, 2)

        return result

    def _generate_output_file_list(self, result_directory):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('Start packing result files')
        output_files = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(output_directory, 'cufflinks_result.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(result_directory):
                for file in files:
                    if not (file.endswith('.DS_Store')):
                        zip_file.write(
                            os.path.join(root, file),
                            os.path.join(os.path.basename(root), file))

        output_files.append({
            'path': result_file,
            'name': os.path.basename(result_file),
            'label': os.path.basename(result_file),
            'description': 'File(s) generated by Cufflinks App'
        })

        return output_files

    def _generate_expression_data(self, result_directory, alignment_ref,
                                  gtf_file, workspace_name, expression_suffix):
        """
        _generate_expression_data: generate Expression object with cufflinks output files
        """
        alignment_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_ref
            }]})['data'][0]

        # set expression name
        alignment_object_name = alignment_data_object['info'][1]
        if re.match('.*_[Aa]lignment$', alignment_object_name):
            expression_name = re.sub('_[Aa]lignment$', expression_suffix,
                                     alignment_object_name)
        else:  # assume user specified suffix
            expression_name = alignment_object_name + expression_suffix

        expression_data = {
            'id': expression_name,
            'type': 'RNA-Seq',
            'numerical_interpretation': 'FPKM',
            'processing_comments': 'log2 Normalized',
            'tool_used': self.tool_used,
            'tool_version': self.tool_version
        }
        alignment_data = alignment_data_object['data']

        condition = alignment_data.get('condition')
        expression_data.update({'condition': condition})

        genome_id = alignment_data.get('genome_id')
        expression_data.update({'genome_id': genome_id})

        read_sample_id = alignment_data.get('read_sample_id')
        expression_data.update(
            {'mapped_rnaseq_alignment': {
                read_sample_id: alignment_ref
            }})

        exp_dict, tpm_exp_dict = self.parse_FPKMtracking_calc_TPM(
            os.path.join(result_directory, 'genes.fpkm_tracking'))

        expression_data.update({'expression_levels': exp_dict})

        expression_data.update({'tpm_expression_levels': tpm_exp_dict})

        handle = self.dfu.file_to_shock({
            'file_path': result_directory,
            'pack': 'zip',
            'make_handle': True
        })['handle']
        expression_data.update({'file': handle})

        return expression_data

    def _generate_expression_set_data(self, alignment_expression_map,
                                      alignment_set_ref, expression_set_name):
        """
        _generate_expression_set_data: generate ExpressionSet object with cufflinks output files
        """
        alignment_set_data_object = self.ws.get_objects2(
            {'objects': [{
                'ref': alignment_set_ref
            }]})['data'][0]

        alignment_set_data = alignment_set_data_object['data']

        expression_set_data = {
            'tool_used': self.tool_used,
            'tool_version': self.tool_version,
            'id': expression_set_name,
            'alignmentSet_id': alignment_set_ref,
            'genome_id': alignment_set_data.get('genome_id'),
            'sampleset_id': alignment_set_data.get('sampleset_id')
        }

        sample_expression_ids = []
        mapped_expression_objects = []
        mapped_expression_ids = []

        for alignment_expression in alignment_expression_map:
            alignment_ref = alignment_expression.get('alignment_ref')
            expression_ref = alignment_expression.get('expression_obj_ref')
            sample_expression_ids.append(expression_ref)
            mapped_expression_ids.append({alignment_ref: expression_ref})
            alignment_name = self.ws.get_object_info(
                [{
                    "ref": alignment_ref
                }], includeMetadata=None)[0][1]
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_ref
                }], includeMetadata=None)[0][1]
            mapped_expression_objects.append({alignment_name: expression_name})

        expression_set_data['sample_expression_ids'] = sample_expression_ids
        expression_set_data[
            'mapped_expression_objects'] = mapped_expression_objects
        expression_set_data['mapped_expression_ids'] = mapped_expression_ids

        return expression_set_data

    def _process_alignment_set_object(self, params, alignment_object_type):
        """
        _process_alignment_set_object: process KBaseRNASeq.RNASeqAlignmentSet type input object
                                        and KBaseSets.ReadsAlignmentSet type object
        """
        log('start processing KBaseRNASeq.RNASeqAlignmentSet object or KBaseSets.ReadsAlignmentSet object'
            '\nparams:\n{}'.format(json.dumps(params, indent=1)))

        alignment_set_ref = params.get('alignment_set_ref')

        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            params['gtf_file'] = self._get_gtf_file(alignment_set_ref)
        else:
            if not '/' in params['genome_ref']:
                params['genome_ref'] = params['workspace_name'] + '/' + params[
                    'genome_ref']

            params['gtf_file'] = self._get_gtf_file_from_genome_ref(
                params['genome_ref'])

        alignment_set = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            0,
            'include_set_item_ref_paths':
            1
        })
        mul_processor_params = []
        for alignment in alignment_set["data"]["items"]:
            alignment_ref = alignment['ref_path']
            alignment_upload_params = params.copy()
            alignment_upload_params['alignment_ref'] = alignment_ref
            mul_processor_params.append(alignment_upload_params)
            # use the following when you want to run the cmd sequentially
            # self._process_kbasesets_alignment_object(mul_processor_params[0])

        cpus = min(params.get('num_threads', 1), multiprocessing.cpu_count())
        pool = Pool(ncpus=cpus)
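        # pool.map runs _process_kbasesets_alignment_object once per alignment
        # in parallel, using at most 'cpus' workers.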
        log('running _process_alignment_object with {} cpus'.format(cpus))
        alignment_expression_map = pool.map(
            self._process_kbasesets_alignment_object, mul_processor_params)

        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        expression_items = list()
        for proc_alignment_return in alignment_expression_map:
            expression_obj_ref = proc_alignment_return.get(
                'expression_obj_ref')
            alignment_ref = proc_alignment_return.get('alignment_ref')
            alignment_info = self.ws.get_object_info3({
                'objects': [{
                    "ref": alignment_ref
                }],
                'includeMetadata':
                1
            })
            condition = alignment_info['infos'][0][10]['condition']
            expression_items.append({
                "ref": expression_obj_ref,
                "label": condition,
            })
            expression_name = self.ws.get_object_info(
                [{
                    "ref": expression_obj_ref
                }], includeMetadata=None)[0][1]
            self._run_command('cp -R {} {}'.format(
                proc_alignment_return.get('result_directory'),
                os.path.join(result_directory, expression_name)))

        expression_set = {
            "description": "generated by kb_cufflinks",
            "items": expression_items
        }

        expression_set_info = self.set_api.save_expression_set_v1({
            "workspace":
            params['workspace_name'],
            "output_object_name":
            params['expression_set_name'],
            "data":
            expression_set
        })

        returnVal = {
            'result_directory': result_directory,
            'expression_obj_ref': expression_set_info['set_ref']
        }

        widget_params = {
            "output": params.get('expression_set_name'),
            "workspace": params.get('workspace_name')
        }
        returnVal.update(widget_params)

        return returnVal

    def _generate_output_object_name(self, params, alignment_object_type,
                                     alignment_object_name):
        """
        Generates the output object name based on input object type and name and stores it in
        params with key equal to 'expression' or 'expression_set' based on whether the input
        object is an alignment or alignment_set.

        :param params: module input params
        :param alignment_object_type: input alignment object type
        :param alignment_object_name: input alignment object name
        :param alignment_object_data: input alignment object data
        """
        expression_set_suffix = params['expression_set_suffix']
        expression_suffix = params['expression_suffix']

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment$', alignment_object_name):
                params['expression_name'] = re.sub('_[Aa]lignment$',
                                                   expression_suffix,
                                                   alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_name'] = alignment_object_name + expression_suffix
        if re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*',
                    alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):
                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix
        if re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            if re.match('.*_[Aa]lignment_[Ss]et$', alignment_object_name):

                # set expression set name
                params['expression_set_name'] = re.sub('_[Aa]lignment_[Ss]et$',
                                                       expression_set_suffix,
                                                       alignment_object_name)
            else:  # assume user specified suffix
                params[
                    'expression_set_name'] = alignment_object_name + expression_set_suffix

    def _save_expression_matrix(self, expressionset_ref, workspace_name):
        """
        _save_expression_matrix: save FPKM and TPM ExpressionMatrix
        """

        log('start saving ExpressionMatrix object')

        expression_set_name = self.ws.get_object_info(
            [{
                "ref": expressionset_ref
            }], includeMetadata=None)[0][1]

        output_obj_name_prefix = re.sub('_*[Ee]xpression_*[Ss]et', '',
                                        expression_set_name)

        upload_expression_matrix_params = {
            'expressionset_ref': expressionset_ref,
            'output_obj_name': output_obj_name_prefix,
            'workspace_name': workspace_name
        }

        expression_matrix_refs = self.eu.get_expressionMatrix(
            upload_expression_matrix_params)

        return expression_matrix_refs

    def run_cufflinks_app(self, params):
        log('--->\nrunning CufflinksUtil.run_cufflinks_app\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_run_cufflinks_params(params)

        alignment_object_ref = params.get('alignment_object_ref')
        alignment_object_info = self.ws.get_object_info3(
            {"objects": [{
                "ref": alignment_object_ref
            }]})['infos'][0]

        alignment_object_type = alignment_object_info[2]
        alignment_object_name = alignment_object_info[1]

        # get output object name
        self._generate_output_object_name(params, alignment_object_type,
                                          alignment_object_name)

        log('--->\nalignment object type: \n' +
            '{}'.format(alignment_object_type))

        if re.match('^KBaseRNASeq.RNASeqAlignment-\d*', alignment_object_type):
            params.update({'alignment_ref': alignment_object_ref})
            returnVal = self._process_rnaseq_alignment_object(params)
            report_output = self._generate_report(
                returnVal.get('expression_obj_ref'),
                params.get('workspace_name'),
                returnVal.get('result_directory'))
            returnVal.update(report_output)
        elif re.match('^KBaseRNASeq.RNASeqAlignmentSet-\d*', alignment_object_type) or \
             re.match('^KBaseSets.ReadsAlignmentSet-\d*', alignment_object_type):
            params.update({'alignment_set_ref': alignment_object_ref})
            returnVal = self._process_alignment_set_object(
                params, alignment_object_type)
            expression_matrix_refs = self._save_expression_matrix(
                returnVal['expression_obj_ref'], params.get('workspace_name'))
            returnVal.update(expression_matrix_refs)

            report_output = self._generate_report(
                returnVal['expression_obj_ref'], params.get('workspace_name'),
                returnVal['result_directory'],
                expression_matrix_refs['exprMatrix_FPKM_ref'],
                expression_matrix_refs['exprMatrix_TPM_ref'])
            returnVal.update(report_output)
        else:
            raise ValueError(
                'Unsupported alignment object type.\nObject info:\n{}'.format(
                    alignment_object_info))

        return returnVal
Example #24
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" (Genome is a
           KBase genome Featureset is a KBase featureset Promoter_length is
           the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "fastapath" of
           String, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs

        #TODO: Things to fix in here...
        #      Use MotifUtils to parse output and create object
        #      create new function for report ?

        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']

        #promoterFastaFilePath = self.get_promoter_for_gene(ctx,params)[0]
        promoterFastaFilePath = params['fastapath']

        MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
        MEU.run_meme_command(MEMEMotifCommand)
        meme_out_path = '/kb/module/work/tmp/meme_out/meme.txt'
        meme_params = {
            'ws_name': params['workspace_name'],
            'path': meme_out_path,
            'obj_name': params['obj_name']
        }
        MOU = MotifUtils(self.callback_url)
        dfu = DataFileUtil(self.callback_url)
        locDict = {}
        if 'SS_ref' in params:
            get_ss_params = {'object_refs': [params['SS_ref']]}
            SS = dfu.get_objects(get_ss_params)['data'][0]['data']
            for s in SS['sequences']:
                if s['source'] is not None:
                    locDict[s['sequence_id']] = {
                        'contig': s['source']['location'][0][0],
                        'start': str(s['source']['location'][0][1])
                    }
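        # locDict records each promoter sequence's contig and start position;
        # it is passed to MotifUtils as 'absolute_locations' below.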
        if len(locDict.keys()) > 0:
            meme_params['absolute_locations'] = locDict
        meme_params['min_len'] = motMin
        meme_params['max_len'] = motMax
        obj_ref = MOU.UploadFromMEME(meme_params)['obj_ref']
        #memeMotifList = MEU.parse_meme_output()

        #HERE:
        #we've got object ref
        #we've got html building functions
        #build report, setup return,
        #make report and return it

        #buildReportFromMotifSet()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
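        # Millisecond timestamp keeps the HTML output directory unique within
        # the shared scratch folder.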
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        #lineCount = 0
        #with open(promoterFastaFilePath,'r') as pFile:
        #    for line in pFile:
        #        lineCount += 1
        #numFeat = lineCount/2
        #with open(promoterFastaFilePath,'r') as pFile:
        #    fileStr = pFile.read()
        #promHtmlStr = '<html><body> '  + fileStr + ' </body></html>'
        #with open(htmlDir + '/promoters.html','w') as promHTML:
        #    promHTML.write(promHtmlStr)
        #JsonPath = '/kb/module/work/tmp'

        dfu = DataFileUtil(self.callback_url)
        get_obj_params = {'object_refs': [obj_ref]}
        memeMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
        MakeReport(htmlDir, memeMotifSet)
        #buildReportFromMotifSet(memeMotifSet,htmlDir,'meme')

        #TODO: Here replace the makereport with a call to motifset utils
        #subprocess.call(['python','/kb/module/lib/identify_promoter/Utils/makeReport.py',JsonPath + '/meme_out/meme.json',htmlDir + '/meme.html',str(numFeat)])
        #fullMotifList = []
        #for m in memeMotifList:
        #    fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by MEME'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_link_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            #'name': 'promoter_download.zip',
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def genome_annotation_to_genbank(self, ctx, params):
        """
        :param params: instance of type "GenomeAnnotationToGenbankParams"
           (genome_ref -- Reference to the GenomeAnnotation or Genome object
           in KBase in any ws supported format OR genome_name +
           workspace_name -- specify the genome name and workspace name of
           what you want.  If genome_ref is defined, these args are ignored.
           new_genbank_file_name -- specify the output name of the genbank
           file, optional save_to_shock -- set to 1 or 0, if 1 then output is
           saved to shock. default is zero) -> structure: parameter
           "genome_ref" of String, parameter "genome_name" of String,
           parameter "workspace_name" of String, parameter
           "new_genbank_file_name" of String, parameter "save_to_shock" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenbankFile" -> structure: parameter
           "path" of String, parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN genome_annotation_to_genbank

        print('genome_annotation_to_genbank -- parameters = ')
        pprint(params)

        service_endpoints = {
            "workspace_service_url": self.workspaceURL, 
            "shock_service_url": self.shockURL,
            "handle_service_url": self.handleURL
        }

        # parse/validate parameters.  could do a better job here.
        genome_ref = None
        if 'genome_ref' in params and params['genome_ref'] is not None:
            genome_ref = params['genome_ref']
        else:
            if 'genome_name' not in params:
                raise ValueError('genome_ref and genome_name are not defined.  One of those is required.')
            if 'workspace_name' not in params:
                raise ValueError('workspace_name is not defined.  This is required if genome_name is specified' +
                    ' without a genome_ref')
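            # 'workspace_name/object_name' is an accepted form of workspace
            # reference when an explicit genome_ref is not supplied.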
            genome_ref = params['workspace_name'] + '/' + params['genome_name']

        # do a quick lookup of object info- could use this to do some validation.  Here we need it to provide
        # a nice output file name if it is not set...  We should probably catch errors here and print out a nice
        # message - usually this would mean the ref was bad.
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':genome_ref}],'includeMetadata':0, 'ignoreErrors':0})[0]
        print('resolved object to:')
        pprint(info)

        if 'new_genbank_file_name' not in params or params['new_genbank_file_name'] is None:
            new_genbank_file_name = info[1] + ".gbk"
        else:
            new_genbank_file_name = params['new_genbank_file_name']


        # construct a working directory to hand off to the data_api
        working_directory =  os.path.join(self.sharedFolder, 'genome-download-'+str(uuid.uuid4()))
        os.makedirs(working_directory)
        output_file_destination = os.path.join(working_directory,new_genbank_file_name)

        # do it
        print('calling: doekbase.data_api.downloaders.GenomeAnnotation.downloadAsGBK');
        GenomeAnnotation.downloadAsGBK(
                            genome_ref,
                            service_endpoints,
                            ctx['token'],
                            output_file_destination,
                            working_directory)

        # if we need to upload to shock, well then do that too.
        file = {}
        if 'save_to_shock' in params and params['save_to_shock'] == 1:
            dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
            file['shock_id'] = dfUtil.file_to_shock({
                                    'file_path': output_file_destination,
                                    'gzip': 0,
                                    'make_handle': 0
                                    # 'attributes': {}  # we can set shock attributes if we want
                                })['shock_id']
        else:
            file['path'] = output_file_destination

        #END genome_annotation_to_genbank

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method genome_annotation_to_genbank return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]
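# A minimal sketch (not part of the module above) of the two ways the docstring says
# the target genome can be addressed in the params for genome_annotation_to_genbank.
# All names and reference strings below are placeholders.

# address the genome by direct object reference
example_params_by_ref = {
    'genome_ref': '1234/5/6',               # hypothetical ws/obj/ver reference
    'new_genbank_file_name': 'MyGenome.gbk',
    'save_to_shock': 0                      # 0 -> result carries 'path'; 1 -> result carries 'shock_id'
}

# or address it by name + workspace (ignored if 'genome_ref' is also supplied)
example_params_by_name = {
    'genome_name': 'MyGenome',
    'workspace_name': 'my_workspace',
    'save_to_shock': 1
}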
Example #26
    def run_Gblocks(self, ctx, params):
        """
        Method for trimming MSAs of either DNA or PROTEIN sequences
        **
        **        input_type: MSA
        **        output_type: MSA
        :param params: instance of type "Gblocks_Params" (Gblocks Input
           Params) -> structure: parameter "workspace_name" of type
           "workspace_name" (** The workspace object refs are of form: ** ** 
           objects = ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "desc" of String, parameter "input_ref" of type
           "data_obj_ref", parameter "output_name" of type "data_obj_name",
           parameter "trim_level" of Long, parameter "min_seqs_for_conserved"
           of Long, parameter "min_seqs_for_flank" of Long, parameter
           "max_pos_contig_nonconserved" of Long, parameter "min_block_len"
           of Long, parameter "remove_mask_positions_flag" of Long
        :returns: instance of type "Gblocks_Output" (Gblocks Output) ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN run_Gblocks
        console = []
        invalid_msgs = []
        self.log(console, 'Running run_Gblocks with params=')
        self.log(console, "\n" + pformat(params))
        report = ''
        #        report = 'Running run_Gblocks with params='
        #        report += "\n"+pformat(params)

        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'input_ref' not in params:
            raise ValueError('input_ref parameter is required')
        if 'output_name' not in params:
            raise ValueError('output_name parameter is required')

        #### Get the input_ref MSA object
        ##
        try:
            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            objects = ws.get_objects([{'ref': params['input_ref']}])
            data = objects[0]['data']
            info = objects[0]['info']
            input_name = info[1]
            input_type_name = info[2].split('.')[1].split('-')[0]

        except Exception as e:
            raise ValueError(
                'Unable to fetch input_ref object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()

        if input_type_name == 'MSA':
            MSA_in = data
            row_order = []
            default_row_labels = dict()
            if 'row_order' in MSA_in.keys():
                row_order = MSA_in['row_order']
            else:
                row_order = sorted(MSA_in['alignment'].keys())

            if 'default_row_labels' in MSA_in.keys():
                default_row_labels = MSA_in['default_row_labels']
            else:
                for row_id in row_order:
                    default_row_labels[row_id] = row_id
            if len(row_order) < 2:
                self.log(
                    invalid_msgs, "must have multiple records in MSA: " +
                    params['input_ref'])

            # export features to FASTA file
            input_MSA_file_path = os.path.join(self.scratch,
                                               input_name + ".fasta")
            self.log(console, 'writing fasta file: ' + input_MSA_file_path)
            records = []
            for row_id in row_order:
                #self.log(console,"row_id: '"+row_id+"'")  # DEBUG
                #self.log(console,"alignment: '"+MSA_in['alignment'][row_id]+"'")  # DEBUG
                # using SeqIO makes multiline sequences.  (Gblocks doesn't care, but FastTree doesn't like multiline, and I don't care enough to change code)
                #record = SeqRecord(Seq(MSA_in['alignment'][row_id]), id=row_id, description=default_row_labels[row_id])
                #records.append(record)
                #SeqIO.write(records, input_MSA_file_path, "fasta")
                records.extend(['>' + row_id, MSA_in['alignment'][row_id]])
            with open(input_MSA_file_path, 'w', 0) as input_MSA_file_handle:
                input_MSA_file_handle.write("\n".join(records) + "\n")

            # Determine whether nuc or protein sequences
            #
            NUC_MSA_pattern = re.compile(
                "^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$")
            all_seqs_nuc = True
            for row_id in row_order:
                #self.log(console, row_id+": '"+MSA_in['alignment'][row_id]+"'")
                if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) == None:
                    all_seqs_nuc = False
                    break

        # Missing proper input_type
        #
        else:
            raise ValueError('Cannot yet handle input_ref type of: ' +
                             input_type_name)

        # DEBUG: check the MSA file contents
#        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA_LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA_LINE: '"+line+"'")

        # validate input data
        #
        N_seqs = 0
        L_first_seq = 0
        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
            for line in input_MSA_file_handle:
                if line.startswith('>'):
                    N_seqs += 1
                    continue
                if L_first_seq == 0:
                    for c in line:
                        if c != '-' and c != ' ' and c != "\n":
                            L_first_seq += 1
        # min_seqs_for_conserved
        if 'min_seqs_for_conserved' in params and params[
                'min_seqs_for_conserved'] != None and int(
                    params['min_seqs_for_conserved']) != 0:
            if int(params['min_seqs_for_conserved']) < int(0.5 * N_seqs) + 1:
                self.log(
                    invalid_msgs, "Min Seqs for Conserved Pos (" +
                    str(params['min_seqs_for_conserved']) +
                    ") must be >= N/2+1 (N=" + str(N_seqs) + ", N/2+1=" +
                    str(int(0.5 * N_seqs) + 1) + ")\n")
            if int(params['min_seqs_for_conserved']) > int(
                    params['min_seqs_for_flank']):
                self.log(
                    invalid_msgs, "Min Seqs for Conserved Pos (" +
                    str(params['min_seqs_for_conserved']) +
                    ") must be <= Min Seqs for Flank Pos (" +
                    str(params['min_seqs_for_flank']) + ")\n")

        # min_seqs_for_flank
        if 'min_seqs_for_flank' in params and params[
                'min_seqs_for_flank'] != None and int(
                    params['min_seqs_for_flank']) != 0:
            if int(params['min_seqs_for_flank']) > N_seqs:
                self.log(
                    invalid_msgs, "Min Seqs for Flank Pos (" +
                    str(params['min_seqs_for_flank']) + ") must be <= N (N=" +
                    str(N_seqs) + ")\n")

        # max_pos_contig_nonconserved
        if 'max_pos_contig_nonconserved' in params and params[
                'max_pos_contig_nonconserved'] != None and int(
                    params['max_pos_contig_nonconserved']) != 0:
            if int(params['max_pos_contig_nonconserved']) < 0:
                self.log(
                    invalid_msgs, "Max Num Non-Conserved Pos (" +
                    str(params['max_pos_contig_nonconserved']) +
                    ") must be >= 0" + "\n")
            if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(
                    params['max_pos_contig_nonconserved']) >= 32000:
                self.log(
                    invalid_msgs, "Max Num Non-Conserved Pos (" +
                    str(params['max_pos_contig_nonconserved']) +
                    ") must be <= L first seq (" + str(L_first_seq) +
                    ") and < 32000\n")

        # min_block_len
        if 'min_block_len' in params and params[
                'min_block_len'] != None and int(params['min_block_len']) != 0:
            if int(params['min_block_len']) < 2:
                self.log(
                    invalid_msgs, "Min Block Len (" +
                    str(params['min_block_len']) + ") must be >= 2" + "\n")
            if int(params['min_block_len']) > L_first_seq or int(
                    params['min_block_len']) >= 32000:
                self.log(
                    invalid_msgs,
                    "Min Block Len (" + str(params['min_block_len']) +
                    ") must be <= L first seq (" + str(L_first_seq) +
                    ") and < 32000\n")

        # trim_level
        if 'trim_level' in params and params['trim_level'] != None and int(
                params['trim_level']) != 0:
            if int(params['trim_level']) < 0 or int(params['trim_level']) > 2:
                self.log(
                    invalid_msgs, "Trim Level (" + str(params['trim_level']) +
                    ") must be >= 0 and <= 2" + "\n")

        if len(invalid_msgs) > 0:

            # load the method provenance from the context object
            self.log(console, "SETTING PROVENANCE")  # DEBUG
            provenance = [{}]
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = []
            provenance[0]['input_ws_objects'].append(params['input_ref'])
            provenance[0]['service'] = 'kb_gblocks'
            provenance[0]['method'] = 'run_Gblocks'

            # report
            report += "FAILURE\n\n" + "\n".join(invalid_msgs) + "\n"
            reportObj = {'objects_created': [], 'text_message': report}

            reportName = 'gblocks_report_' + str(uuid.uuid4())
            report_obj_info = ws.save_objects({
                #                'id':info[6],
                'workspace':
                params['workspace_name'],
                'objects': [{
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,
                    'provenance': provenance
                }]
            })[0]

            self.log(console, "BUILDING RETURN OBJECT")
            returnVal = {
                'report_name':
                reportName,
                'report_ref':
                str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' +
                str(report_obj_info[4])
                #                          'output_ref': None
            }
            self.log(console, "run_Gblocks DONE")
            return [returnVal]

        ### Construct the command
        #
        #  e.g.
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks
        #
        gblocks_cmd = [self.GBLOCKS_bin]

        # check for necessary files
        if not os.path.isfile(self.GBLOCKS_bin):
            raise ValueError("no such file '" + self.GBLOCKS_bin + "'")
        if not os.path.isfile(input_MSA_file_path):
            raise ValueError("no such file '" + input_MSA_file_path + "'")
        if not os.path.getsize(input_MSA_file_path) > 0:
            raise ValueError("empty file '" + input_MSA_file_path + "'")

        # DEBUG
#        with open(input_MSA_file_path,'r',0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA LINE: '"+line+"'")

        # set the output path
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Gblocks names output blocks MSA by appending "-gb" to input file
        #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb')
        output_GBLOCKS_file_path = input_MSA_file_path + '-gb'
        output_aln_file_path = output_GBLOCKS_file_path

        # Gblocks is interactive and only accepts args from pipe input
        #if 'arg' in params and params['arg'] != None and params['arg'] != 0:
        #    fasttree_cmd.append('-arg')
        #    fasttree_cmd.append(val)

        # Run GBLOCKS, capture output as it happens
        #
        self.log(console, 'RUNNING GBLOCKS:')
        self.log(console, '    ' + ' '.join(gblocks_cmd))
        #        report += "\n"+'running GBLOCKS:'+"\n"
        #        report += '    '+' '.join(gblocks_cmd)+"\n"

        # Gblocks requires shell=True in order to see input data
        env = os.environ.copy()
        #joined_fasttree_cmd = ' '.join(fasttree_cmd)  # redirect out doesn't work with subprocess unless you join command first
        #p = subprocess.Popen([joined_fasttree_cmd], \
        p = subprocess.Popen(gblocks_cmd, \
                             cwd = self.scratch, \
                             stdin = subprocess.PIPE, \
                             stdout = subprocess.PIPE, \
                             stderr = subprocess.PIPE, \
                             shell = True, \
                             env = env)
        #                             executable = '/bin/bash' )

        # write commands to process
        #
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks

        p.stdin.write("o" + "\n")  # open MSA file
        p.stdin.write(input_MSA_file_path + "\n")

        if 'trim_level' in params and params['trim_level'] != None and int(
                params['trim_level']) != 0:
            p.stdin.write("b" + "\n")
            if int(params['trim_level']) >= 1:
                self.log(console, "changing trim level")
                p.stdin.write("5" + "\n")  # set to "half"
                if int(params['trim_level']) == 2:
                    self.log(console, "changing trim level")
                    p.stdin.write("5" + "\n")  # set to "all"
                elif int(params['trim_level']) > 2:
                    raise ValueError("trim_level (" +
                                     str(params['trim_level']) +
                                     ") was not between 0-2")
                p.stdin.write("m" + "\n")

        # flank must precede conserved because it acts as an upper bound for acceptable conserved values
        if 'min_seqs_for_flank' in params and params[
                'min_seqs_for_flank'] != None and int(
                    params['min_seqs_for_flank']) != 0:
            self.log(console, "changing min_seqs_for_flank")
            p.stdin.write("b" + "\n")
            p.stdin.write("2" + "\n")
            p.stdin.write(str(params['min_seqs_for_flank']) + "\n")
            p.stdin.write("m" + "\n")

        if 'min_seqs_for_conserved' in params and params[
                'min_seqs_for_conserved'] != None and int(
                    params['min_seqs_for_conserved']) != 0:
            self.log(console, "changing min_seqs_for_conserved")
            p.stdin.write("b" + "\n")
            p.stdin.write("1" + "\n")
            p.stdin.write(str(params['min_seqs_for_conserved']) + "\n")
            p.stdin.write("m" + "\n")

        if 'max_pos_contig_nonconserved' in params and params[
                'max_pos_contig_nonconserved'] != None and int(
                    params['max_pos_contig_nonconserved']) > -1:
            self.log(console, "changing max_pos_contig_nonconserved")
            p.stdin.write("b" + "\n")
            p.stdin.write("3" + "\n")
            p.stdin.write(str(params['max_pos_contig_nonconserved']) + "\n")
            p.stdin.write("m" + "\n")

        if 'min_block_len' in params and params[
                'min_block_len'] != None and params['min_block_len'] != 0:
            self.log(console, "changing min_block_len")
            p.stdin.write("b" + "\n")
            p.stdin.write("4" + "\n")
            p.stdin.write(str(params['min_block_len']) + "\n")
            p.stdin.write("m" + "\n")

        p.stdin.write("g" + "\n")  # get blocks
        p.stdin.write("q" + "\n")  # quit
        p.stdin.close()
        p.wait()

        # Read output
        #
        while True:
            line = p.stdout.readline()
            #line = p.stderr.readline()
            if not line: break
            self.log(console, line.replace('\n', ''))

        p.stdout.close()
        #p.stderr.close()
        p.wait()
        self.log(console, 'return code: ' + str(p.returncode))
        #        if p.returncode != 0:
        if p.returncode != 1:
            raise ValueError('Error running GBLOCKS, return code: ' +
                             str(p.returncode) + '\n\n' + '\n'.join(console))

        # Check that GBLOCKS produced output
        #
        if not os.path.isfile(output_GBLOCKS_file_path):
            raise ValueError("failed to create GBLOCKS output: " +
                             output_GBLOCKS_file_path)
        elif not os.path.getsize(output_GBLOCKS_file_path) > 0:
            raise ValueError("created empty file for GBLOCKS output: " +
                             output_GBLOCKS_file_path)

        # load the method provenance from the context object
        #
        self.log(console, "SETTING PROVENANCE")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = []
        provenance[0]['input_ws_objects'].append(params['input_ref'])
        provenance[0]['service'] = 'kb_gblocks'
        provenance[0]['method'] = 'run_Gblocks'

        # reformat output to single-line FASTA MSA and check that output is not empty (often happens when param combinations don't produce viable blocks)
        #
        output_fasta_buf = []
        id_order = []
        this_id = None
        ids = dict()
        alignment = dict()
        L_alignment = 0
        L_alignment_set = False
        with open(output_GBLOCKS_file_path, 'r',
                  0) as output_GBLOCKS_file_handle:
            for line in output_GBLOCKS_file_handle:
                line = line.rstrip()
                if line.startswith('>'):
                    this_id = line[1:]
                    output_fasta_buf.append(
                        '>' + re.sub('\s', '_', default_row_labels[this_id]))
                    id_order.append(this_id)
                    alignment[this_id] = ''
                    if L_alignment != 0 and not L_alignment_set:
                        L_alignment_set = True
                    continue
                output_fasta_buf.append(line)
                for c in line:
                    if c != ' ' and c != "\n":
                        alignment[this_id] += c
                        if not L_alignment_set:
                            L_alignment += 1
        if L_alignment == 0:
            self.log(
                invalid_msgs,
                "params produced no blocks.  Consider changing to less stringent values"
            )
        else:
            if 'remove_mask_positions_flag' in params and params[
                    'remove_mask_positions_flag'] != None and params[
                        'remove_mask_positions_flag'] != '' and params[
                            'remove_mask_positions_flag'] == 1:
                self.log(console, "removing mask positions")
                mask = []
                new_alignment = dict()
                for i in range(0, L_alignment):
                    mask.append('+')
                    if alignment[id_order[0]][i] == '-' \
                        or alignment[id_order[0]][i] == 'X' \
                        or alignment[id_order[0]][i] == 'x':
                        mask[i] = '-'
                for row_id in id_order:
                    new_alignment[row_id] = ''
                    for i, c in enumerate(alignment[row_id]):
                        if mask[i] == '+':
                            new_alignment[row_id] += c
                alignment = new_alignment

            L_alignment = len(alignment[id_order[0]])

            # write fasta with tidied ids
            output_MSA_file_path = os.path.join(
                output_dir, params['output_name'] + '.fasta')
            with open(output_MSA_file_path, 'w', 0) as output_MSA_file_handle:
                output_MSA_file_handle.write("\n".join(output_fasta_buf) +
                                             "\n")

        # Upload results
        #
        if len(invalid_msgs) == 0:
            self.log(console, "UPLOADING RESULTS")  # DEBUG

            # Didn't write file
            #            with open(output_MSA_file_path,'r',0) as output_MSA_file_handle:
            #                output_MSA_buf = output_MSA_file_handle.read()
            #            output_MSA_buf = output_MSA_buf.rstrip()
            #            self.log(console,"\nMSA:\n"+output_MSA_buf+"\n")

            # Build output_MSA structure
            #   first extract old info from MSA (labels, ws_refs, etc.)
            #
            MSA_out = dict()
            for key in MSA_in.keys():
                MSA_out[key] = MSA_in[key]

            # then replace with new info
            #
            MSA_out['alignment'] = alignment
            MSA_out['name'] = params['output_name']
            MSA_out['alignment_length'] = alignment_length = L_alignment
            MSA_name = params['output_name']
            MSA_description = ''
            if 'desc' in params and params['desc'] != None and params[
                    'desc'] != '':
                MSA_out['desc'] = MSA_description = params['desc']

            # Store MSA_out
            #
            new_obj_info = ws.save_objects({
                'workspace':
                params['workspace_name'],
                'objects': [{
                    'type': 'KBaseTrees.MSA',
                    'data': MSA_out,
                    'name': params['output_name'],
                    'meta': {},
                    'provenance': provenance
                }]
            })[0]

            # create CLW formatted output file
            max_row_width = 60
            id_aln_gap_width = 1
            gap_chars = ''
            for sp_i in range(id_aln_gap_width):
                gap_chars += ' '
            # DNA
            if all_seqs_nuc:
                strong_groups = {'AG': True, 'CTU': True}
                weak_groups = None
            # PROTEINS
            else:
                strong_groups = {
                    'AST': True,
                    'EKNQ': True,
                    'HKNQ': True,
                    'DENQ': True,
                    'HKQR': True,
                    'ILMV': True,
                    'FILM': True,
                    'HY': True,
                    'FWY': True
                }
                weak_groups = {
                    'ACS': True,
                    'ATV': True,
                    'AGS': True,
                    'KNST': True,
                    'APST': True,
                    'DGNS': True,
                    'DEKNQS': True,
                    'DEHKNQ': True,
                    'EHKNQR': True,
                    'FILMV': True,
                    'FHY': True
                }

            clw_buf = []
            clw_buf.append('CLUSTALW format of GBLOCKS trimmed MSA ' +
                           MSA_name + ': ' + MSA_description)
            clw_buf.append('')

            long_id_len = 0
            aln_pos_by_id = dict()
            for row_id in row_order:
                aln_pos_by_id[row_id] = 0
                row_id_disp = default_row_labels[row_id]
                if long_id_len < len(row_id_disp):
                    long_id_len = len(row_id_disp)

            full_row_cnt = alignment_length // max_row_width
            if alignment_length % max_row_width == 0:
                full_row_cnt -= 1
            for chunk_i in range(full_row_cnt + 1):
                for row_id in row_order:
                    row_id_disp = re.sub('\s', '_', default_row_labels[row_id])
                    for sp_i in range(long_id_len - len(row_id_disp)):
                        row_id_disp += ' '

                    aln_chunk_upper_bound = (chunk_i + 1) * max_row_width
                    if aln_chunk_upper_bound > alignment_length:
                        aln_chunk_upper_bound = alignment_length
                    aln_chunk = alignment[row_id][
                        chunk_i * max_row_width:aln_chunk_upper_bound]
                    for c in aln_chunk:
                        if c != '-':
                            aln_pos_by_id[row_id] += 1

                    clw_buf.append(row_id_disp + gap_chars + aln_chunk + ' ' +
                                   str(aln_pos_by_id[row_id]))

                # conservation line
                cons_line = ''
                for pos_i in range(chunk_i * max_row_width,
                                   aln_chunk_upper_bound):
                    col_chars = dict()
                    seq_cnt = 0
                    for row_id in row_order:
                        char = alignment[row_id][pos_i]
                        if char != '-':
                            seq_cnt += 1
                            col_chars[char] = True
                    if seq_cnt <= 1:
                        cons_char = ' '
                    elif len(col_chars.keys()) == 1:
                        cons_char = '*'
                    else:
                        strong = False
                        for strong_group in strong_groups.keys():
                            this_strong_group = True
                            for seen_char in col_chars.keys():
                                if seen_char not in strong_group:
                                    this_strong_group = False
                                    break
                            if this_strong_group:
                                strong = True
                                break
                        if not strong:
                            weak = False
                            if weak_groups != None:
                                for weak_group in weak_groups.keys():
                                    this_weak_group = True
                                    for seen_char in col_chars.keys():
                                        if seen_char not in weak_group:
                                            this_weak_group = False
                                            break
                                    if this_weak_group:
                                        weak = True
                        if strong:
                            cons_char = ':'
                        elif weak:
                            cons_char = '.'
                        else:
                            cons_char = ' '
                    cons_line += cons_char

                lead_space = ''
                for sp_i in range(long_id_len):
                    lead_space += ' '
                lead_space += gap_chars

                clw_buf.append(lead_space + cons_line)
                clw_buf.append('')

            # write clw to file
            clw_buf_str = "\n".join(clw_buf) + "\n"
            output_clw_file_path = os.path.join(output_dir,
                                                input_name + '-MSA.clw')
            with open(output_clw_file_path, "w", 0) as output_clw_file_handle:
                output_clw_file_handle.write(clw_buf_str)

            # upload GBLOCKS FASTA output to SHOCK for file_links
            dfu = DFUClient(self.callbackURL)
            try:
                output_upload_ret = dfu.file_to_shock({
                    'file_path': output_aln_file_path,
                    # DEBUG
                    #                                                      'make_handle': 0,
                    #                                                      'pack': 'zip'})
                    'make_handle': 0
                })
            except:
                raise ValueError('error loading aln_out file to shock')

            # upload GBLOCKS CLW output to SHOCK for file_links
            try:
                output_clw_upload_ret = dfu.file_to_shock({
                    'file_path': output_clw_file_path,
                    # DEBUG
                    #                                                      'make_handle': 0,
                    #                                                      'pack': 'zip'})
                    'make_handle': 0
                })
            except:
                raise ValueError('error loading clw_out file to shock')

            # make HTML reports
            #
            # HERE

            # build output report object
            #
            self.log(console, "BUILDING REPORT")  # DEBUG

            reportName = 'gblocks_report_' + str(uuid.uuid4())
            reportObj = {
                'objects_created': [{
                    'ref':
                    params['workspace_name'] + '/' + params['output_name'],
                    'description':
                    'GBLOCKS MSA'
                }],
                #'message': '',
                'message':
                clw_buf_str,
                'direct_html':
                '',
                #'direct_html_index': None,
                'file_links': [],
                'html_links': [],
                'workspace_name':
                params['workspace_name'],
                'report_object_name':
                reportName
            }
            reportObj['file_links'] = [{
                'shock_id':
                output_upload_ret['shock_id'],
                'name':
                params['output_name'] + '-GBLOCKS.FASTA',
                'label':
                'GBLOCKS-trimmed MSA FASTA'
            }, {
                'shock_id':
                output_clw_upload_ret['shock_id'],
                'name':
                params['output_name'] + '-GBLOCKS.CLW',
                'label':
                'GBLOCKS-trimmed MSA CLUSTALW'
            }]

            # save report object
            #
            SERVICE_VER = 'release'
            reportClient = KBaseReport(self.callbackURL,
                                       token=ctx['token'],
                                       service_ver=SERVICE_VER)
            #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
            report_info = reportClient.create_extended_report(reportObj)

        else:  # len(invalid_msgs) > 0
            reportName = 'gblocks_report_' + str(uuid.uuid4())
            report += "FAILURE:\n\n" + "\n".join(invalid_msgs) + "\n"
            reportObj = {'objects_created': [], 'text_message': report}

            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            report_obj_info = ws.save_objects({
                #'id':info[6],
                'workspace':
                params['workspace_name'],
                'objects': [{
                    'type': 'KBaseReport.Report',
                    'data': reportObj,
                    'name': reportName,
                    'meta': {},
                    'hidden': 1,
                    'provenance': provenance
                }]
            })[0]

            report_info = dict()
            report_info['name'] = report_obj_info[1]
            report_info['ref'] = str(report_obj_info[6]) + '/' + str(
                report_obj_info[0]) + '/' + str(report_obj_info[4])

        # done
        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        self.log(console, "run_Gblocks DONE")
        #END run_Gblocks

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method run_Gblocks return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
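# A self-contained sketch of the stdin-menu pattern run_Gblocks uses above: Gblocks is
# interactive, so the wrapper pipes menu keystrokes ("o" + path to open the MSA, "g" to
# get blocks, "q" to quit) and collects the console output.  The binary and MSA paths
# below are placeholders; unlike the method above, this sketch uses communicate() rather
# than streaming individual stdin writes.
import subprocess

def drive_gblocks(gblocks_bin, msa_fasta_path):
    menu = "o\n" + msa_fasta_path + "\ng\nq\n"   # open file, get blocks, quit
    p = subprocess.Popen([gblocks_bin],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate(menu)
    # Gblocks writes the trimmed alignment next to the input as '<input>-gb'
    return msa_fasta_path + '-gb', stdout

# trimmed_path, console_out = drive_gblocks('/usr/local/bin/Gblocks', '/tmp/my_msa.fasta')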
Example #27
class MutualInfoUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _validate_run_flux_mutual_information_analysis_params(self, params):
        """
        _validate_run_flux_mutual_information_analysis_params:
                validates params passed to run_flux_mutual_information_analysis method
        """

        log('start validating validate_run_flux_mutual_information_analysis params')

        # check for required parameters
        for p in ['fbamodel_id', 'compounds', 'media_id', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _get_file_from_ws(self, workspace, obj_name):
        try:
            file_path = self.ws.get_objects([{
                'name': obj_name,
                'workspace': workspace
            }])[0]
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' +
                             workspace + '/' + obj_name + ')' + str(e))
        return file_path

    def _make_media_files(self, ws_name, base, compounds):
        """
        Build and store media objects for each combination of compound added to the base media.
        :param base: The base media file
        :param compounds: the set of compound to test
        :return: A list of media ids and a matrix with each media combination defined
        """
        base_media = self._get_file_from_ws(ws_name, base)['data']

        media_ids = [base_media['id']]
        new_media_list = []
        media_matrix = [[""] + compounds]
        media_matrix.append([base_media['id']] + [0] * len(compounds))
        for n_comp in range(1, len(compounds) + 1):
            for combo in combinations(compounds, n_comp):
                new_media_id = base_media['id'] + '_v%s' % len(media_matrix)
                media_ids.append(new_media_id)
                media_matrix.append(
                    [new_media_id] +
                    [1 if comp in combo else 0 for comp in compounds])
                new_media = deepcopy(base_media)
                new_media['id'] = new_media_id
                new_media['name'] = new_media_id
                for new_comp in combo:
                    new_media['mediacompounds'].append({
                        'compound_ref':
                        '48/1/1/compounds/id/%s' % new_comp.split('_')[0],
                        'concentration':
                        1.0,
                        'maxFlux':
                        1000,
                        'minFlux':
                        -1000
                    })
                new_media_list.append(new_media)

        print("Made %s Media Files" % len(media_ids) - 1)
        info = self.ws.save_objects({
            'workspace':
            ws_name,
            "objects": [{
                "type": "KBaseBiochem.Media",
                "data": media,
                "name": media['name']
            } for media in new_media_list]
        })
        print(info)
        return media_ids, media_matrix
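
    # Illustrative shape of the return values (assumed base media id 'm0' and compounds
    # ['cpd_A', 'cpd_B']; these names are placeholders):
    #
    #   media_ids    -> ['m0', 'm0_v2', 'm0_v3', 'm0_v4']
    #   media_matrix -> [['',      'cpd_A', 'cpd_B'],
    #                    ['m0',     0,       0     ],
    #                    ['m0_v2',  1,       0     ],
    #                    ['m0_v3',  0,       1     ],
    #                    ['m0_v4',  1,       1     ]]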

    def _generate_html_report(self, result_directory, mutual_info_dict):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'mutual_information_report.html')

        shutil.copy(os.path.join(result_directory, 'MI_plot.png'),
                    os.path.join(output_directory, 'MI_plot.png'))

        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'
        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Mutual Information App'
        })

        return html_report

    def _generate_report(self, result_directory, mutual_info_dict, params):
        """
        _generate_report: generate summary report
        """

        log('creating report')
        output_html_files = self._generate_html_report(result_directory,
                                                       mutual_info_dict)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name':
            'MutualInfomation_report_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_mutual_info(self, media_matrix, fba_file):

        df1 = pd.read_csv(fba_file)
        df1.as_matrix()

        #----Input validation of Media/FBAs with Binary Matrix FBAs------
        # 1.0 Number of rows in the Media.csv file = (number of columns - 1)
        #   1.0. If they differ: throw an ERROR reporting a mismatched number of FBAs between the media and the binary matrix.
        # 1.1 Check whether the elements in the Media.csv file contain only binary values (i.e. 0 and 1)
        #   1.1. If the elements are different: throw an ERROR reporting inappropriate input values
        # 1.2 Check whether the compounds in the Media.csv file match the number of FBAs
        #   1.2. If the compounds differ from the number of FBAs: throw an ERROR reporting inappropriate input values

        s_df1 = df1.shape
        s_df2 = media_matrix.shape

        Temp_df2 = np.array(media_matrix.values)
        # Create matrix with only the elements remove first column and all the rows
        Temp_df2 = Temp_df2[0:, 1:]

        Bin_val_check = np.array_equal(Temp_df2, Temp_df2.astype(bool))
        num_compounds = (s_df2[1]) - 1

        if ((s_df1[1] - 1) != s_df2[0]) or (Bin_val_check != True) or (int(
                math.log(s_df2[0], 2)) != num_compounds):
            print('invalid input values')

        #-----All possible combinations of the chemical compounds----------------------
        # 2.0 Separating m0 from the rest of the labels

        Temp1_df2 = media_matrix

        cols = Temp1_df2.columns
        for i in range(1, len(cols)):
            Temp1_df2.loc[Temp1_df2[cols[i]] == 1, cols[i]] = cols[i]

        print Temp1_df2

        # 2.1 Creating a dictionary for all FBAs except m0
        print len(Temp1_df2)
        mydict = {}
        for x in range(0, len(Temp1_df2)):
            for i in range(1, s_df2[1]):
                currentvalue = Temp1_df2.iloc[x, i]
                currentid = Temp1_df2.iloc[x, 0]
                mydict.setdefault(currentid, [])
                if currentvalue > 0:
                    mydict[currentid].append(currentvalue)

        # Add the first key as m0
        media_0_name = 'm0'
        mydict[media_0_name] = "['0']"
        #Sort the keys
        mydict = collections.OrderedDict(natsort.natsorted(mydict.items()))
        print mydict

        for k, v in mydict.iteritems():
            print k, v

        # List of Compounds combination in the list
        my_combi_list = []
        Compounds_Combi = list(range(1, num_compounds + 1))
        for L in range(0, len(Compounds_Combi) + 1):
            for subset in itertools.combinations(Compounds_Combi, L):
                my_combi_list.append(list(subset))
        print my_combi_list

        # Created a dictionary where the keys:
        # list of compounds combination
        # values are corresponding FBAs list in df2
        result_dict = {}
        for element in my_combi_list[1:]:
            for k, v in mydict.iteritems():
                if set(v).issubset(set(map(lambda x: str(x), element))):
                    key = ','.join(map(lambda x: str(x), element))
                    if result_dict.get(key):
                        media_list = result_dict[key]
                        media_list.append(k)
                        media_list = list(set(media_list))
                        result_dict.update({key: media_list})
                    else:
                        result_dict.update({key: [media_0_name, k]})
        print result_dict

        # Created a dictionary where the keys are:
        # list of compounds combination
        # values are compounds combination FBAs with df1 vaules
        All_Comp_Combi_dic = {}
        for column, value in result_dict.items():
            All_Comp_Combi_dic.update({column: df1.get(value)})

        #To print an item from the All_Comp_Combi_dic
        df = (pd.DataFrame(All_Comp_Combi_dic.items()))

        #print df[0]
        #print df[1][7]

        MI_dict = {}
        for k in range(0, len(df[0])):
            drop_rows_df = df[1][k].drop_duplicates(keep="first")
            drop_columns_df = drop_rows_df.T.drop_duplicates(keep="first").T
            remove = []
            removed = {}
            cols = df[1][k].columns
            for i in range(len(cols) - 1):
                duplicated = []
                v = df[1][k][cols[i]].values
                for j in range(i + 1, len(cols)):
                    if np.array_equal(v, df[1][k][cols[j]].values):
                        remove.append(cols[j])
                        duplicated.append(cols[j])
                if duplicated and cols[i] not in remove:
                    removed.update({cols[i]: duplicated})
                count = {}
                for key, value in removed.items():
                    count.update({key: len(value)})

                #print v

                # print drop_columns_df
                values = count.values()
                # print values
                values = map(lambda x: x + 1, values)
                # print values
                d = {x: values.count(x) for x in values}

                #-------Mutual Information (MI) calculation-------------
                FBAs = len(df[1][k].columns)
                pure_entropy = math.log(FBAs, 2)
                #print pure_entropy

                # If No duplicates exist and list "value" is empty
                if not values:
                    #print("List is empty")
                    No_duplicate_FBAs = len(drop_columns_df.columns)
                    conditional_entropy = -1 * (No_duplicate_FBAs * (
                        (1 / No_duplicate_FBAs) *
                        ((1 / 1) * math.log(1.0 / 1.0, 2))))
                    Mutual_Info = pure_entropy - conditional_entropy
                    #print('Mutual Info:', Mutual_Info)

                if values:
                    # If duplicates exist and list "value" is not empty
                    conditional_entropy = 0
                    for key in d:
                        #print key, d[key]
                        Temp = -1 * d[key] * (key / float(FBAs)) * key * (
                            1.0 / key) * math.log(1.0 / key, 2)
                        conditional_entropy = Temp + conditional_entropy
                    #print "%3f" %Temp
                    Mutual_Info = pure_entropy - conditional_entropy

                MI_dict[df[0][k]] = Mutual_Info

        #Sorted MI_dict
        MI_dict = sorted(MI_dict.items(), key=lambda x: (-len(x[0]), x[0]))
        MI_dict = OrderedDict(MI_dict)

        print("Plot MI_dict")
        plt.bar(range(len(MI_dict)),
                MI_dict.values(),
                align='center',
                alpha=0.5,
                width=0.7)
        plt.xticks(range(len(MI_dict)), MI_dict.keys(), rotation='vertical')
        plt.xlabel('Compound Combinations')
        plt.ylabel('Mutual Information (in Bits)')
        plt.title("Organism:XYZ")
        fig1 = plt.gcf()
        fig1.savefig(os.path.join(self.scratch, 'MI_plot.png'), dpi=100)

        return MI_dict
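# A worked sketch of the entropy arithmetic in _generate_mutual_info above, on a toy
# case chosen for illustration: 4 FBA columns of which exactly two are identical, so
# deduplication leaves one group of size 2 plus two singletons.
import math

FBAs = 4
pure_entropy = math.log(FBAs, 2)          # H = log2(4) = 2.0 bits

# conditional entropy: only duplicate groups contribute (singletons add log2(1) = 0);
# group_sizes mirrors the dict 'd' above: {group size: number of groups of that size}
group_sizes = {2: 1}
conditional_entropy = 0.0
for size, count in group_sizes.items():
    conditional_entropy += -1 * count * (size / float(FBAs)) * math.log(1.0 / size, 2)

mutual_info = pure_entropy - conditional_entropy   # 2.0 - 0.5 = 1.5 bits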
Example #28
    def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
        """
        :param params: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
           (KButil_Build_InSilico_Metagenomes_with_Grinder() ** **  Use
           Grinder to generate in silico shotgun metagenomes) -> structure:
           parameter "workspace_name" of type "workspace_name" (** The
           workspace object refs are of form: ** **    objects =
           ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "input_refs" of type "data_obj_ref", parameter
           "output_name" of type "data_obj_name", parameter "desc" of String,
           parameter "num_reads_per_lib" of Long, parameter
           "population_percs" of String, parameter "read_len_mean" of Long,
           parameter "read_len_stddev" of Double, parameter "pairs_flag" of
           Long, parameter "mate_orientation" of String, parameter
           "insert_len_mean" of Long, parameter "insert_len_stddev" of
           Double, parameter "mutation_dist" of String, parameter
           "mutation_ratio" of String, parameter "qual_good" of Long,
           parameter "qual_bad" of Long, parameter "len_bias_flag" of Long,
           parameter "random_seed" of Long
        :returns: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

        #### STEP 0: basic init
        ##
        console = []
        invalid_msgs = []
        report_text = ''
        self.log(console,
                 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
        self.log(console, "\n" + pformat(params))

        # Auth
        token = ctx['token']
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # API Clients
        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'
        wsClient = workspaceService(self.workspaceURL, token=token)
        readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                       token=ctx['token'])  # SDK local
        #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=ctx['token'])  # for dynamic service
        auClient = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
        dfu = DFUClient(self.callbackURL)

        # param checks
        required_params = [
            'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
            'population_percs', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'mate_orientation', 'insert_len_mean',
            'insert_len_stddev', 'mutation_dist', 'mutation_ratio',
            'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in required_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # cast unpredictable numerical params to str (they are mostly used in string context)
        numerical_params = [
            'num_reads_per_lib', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good',
            'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in numerical_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                continue
            params[arg] = str(params[arg])

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = []
        for input_ref in params['input_refs']:
            provenance[0]['input_ws_objects'].append(input_ref)

        # set the output paths
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html_output_dir = os.path.join(output_dir, 'html')
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)

        #### STEP 1: Parse population_percs and write to file
        ##
        abundance_str = params['population_percs'].strip()
        abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
        abundance_config_num_libs = 0
        abundance_config_num_libs_set = False
        grinder_genome_ids = []
        header = []
        out_buf = []

        for row in abundance_str.split("\n"):
            cols = re.split(r'\s+', row)
            if cols[0].upper() == "GENOME":
                for col in cols:
                    if col == '':
                        continue
                    header.append(col)
                continue
            grinder_genome_ids.append(cols[0])
            self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
            out_row = []
            for col in cols:
                if col == '':
                    continue
                elif col == '%':
                    continue
                elif col.endswith('%'):
                    col = col.rstrip('%')
                out_row.append(col)
            out_buf.append("\t".join(out_row))
            num_samples = len(out_row) - 1  # first col is genome id
            if not abundance_config_num_libs_set:
                abundance_config_num_libs_set = True
                abundance_config_num_libs = num_samples
            elif num_samples != abundance_config_num_libs:
                invalid_msgs.append(
                    "inconsistent number of samples in population_percs input field"
                )
        # data validation
        if abundance_config_num_libs == 0:
            invalid_msgs.append(
                "unable to find sample percentages in population_percs input field"
            )
        sample_sums = []
        for row_i, abund_row_str in enumerate(out_buf):
            abund_row = abund_row_str.split()
            for sample_i, abund in enumerate(abund_row[1:]):
                if row_i == 0:
                    sample_sums.append(0)
                #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i))  # DEBUG
                sample_sums[sample_i] += float(abund)
        for sample_i, sample_sum in enumerate(sample_sums):
            if sample_sum < 99.5 or sample_sum > 100.5:
                self.log(
                    invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                    header[sample_i + 1] +
                    " proportions is not summing to 100.0. Summing to: " +
                    str(sample_sum))

        if len(invalid_msgs) == 0:
            with open(abundance_file_path, 'w') as abundance_fh:
                for out_line in out_buf:
                    abundance_fh.write(out_line + "\n")
            # DEBUG
            with open(abundance_file_path, 'r') as abundance_fh:
                for out_line in abundance_fh.readlines():
                    out_line = out_line.rstrip()
                    self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

        #### STEP 2: get genome scaffold sequences
        ##
        if len(invalid_msgs) == 0:
            genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
            read_buf_size = 65536
            write_buf_size = 65536
            accepted_input_types = ["KBaseGenomes.Genome"]
            genome_refs = params['input_refs']
            genome_obj_names = []
            genome_sci_names = []
            assembly_refs = []

            for i, input_ref in enumerate(genome_refs):
                # genome obj info
                try:
                    [
                        OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I,
                        SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I,
                        META_I
                    ] = range(11)  # object_info tuple
                    input_obj_info = wsClient.get_object_info_new(
                        {'objects': [{
                            'ref': input_ref
                        }]})[0]
                    input_obj_type = re.sub(
                        r'-[0-9]+\.[0-9]+$', "",
                        input_obj_info[TYPE_I])  # remove trailing version
                    genome_obj_names.append(input_obj_info[NAME_I])

                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     input_ref + ')' + str(e))
                if input_obj_type not in accepted_input_types:
                    raise ValueError("Input object of type '" +
                                     input_obj_type +
                                     "' not accepted.  Must be one of " +
                                     ", ".join(accepted_input_types))

                # genome obj data
                try:
                    genome_obj = wsClient.get_objects([{
                        'ref': input_ref
                    }])[0]['data']
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception as e:
                    raise ValueError("unable to fetch genome: " + input_ref +
                                     ": " + str(e))

                # Get assembly_refs
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] is None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] is None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    self.log(console, msg)
                    self.log(invalid_msgs, msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] is not None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] is not None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['contigset_ref'])

        # get fastas for scaffolds
        if len(invalid_msgs) == 0:
            contig_file_paths = []

            for genome_i, input_ref in enumerate(genome_refs):
                contig_file = auClient.get_assembly_as_fasta({
                    'ref':
                    assembly_refs[genome_i]
                }).get('path')
                sys.stdout.flush()
                contig_file_path = dfu.unpack_file({'file_path':
                                                    contig_file})['file_path']
                contig_file_paths.append(contig_file_path)

            # reformat FASTA IDs for Grinder
            with open(genomes_src_db_file_path, 'w',
                      write_buf_size) as genomes_src_db_fh:
                for genome_i, contig_file_path in enumerate(contig_file_paths):
                    #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path)  # DEBUG
                    #contig_ids = []
                    with open(contig_file_path, 'r',
                              read_buf_size) as contig_fh:
                        genome_seq = ''
                        contig_seq = ''
                        contig_seqs = []
                        for contig_line in contig_fh.readlines():
                            contig_line = contig_line.rstrip()
                            if contig_line.startswith('>'):
                                #contig_id = contig_line.strip()[1:].split(' ')[0]
                                #contig_ids.append(contig_id)
                                #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n")
                                if contig_seq != '':
                                    contig_seqs.append(contig_seq)
                                    contig_seq = ''
                                    continue
                            else:
                                #genomes_src_db_fh.write(contig_line)
                                contig_seq += contig_line
                        if contig_seq != '':
                            contig_seqs.append(contig_seq)
                            contig_seq = ''

                    # write joined contigs to file
                    genome_seq = "NNNNNNNNNN".join(
                        contig_seqs
                    )  # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins
                    genome_seq = genome_seq.upper(
                    )  # grinder might require upper case?
                    genomes_src_db_fh.write(">" +
                                            grinder_genome_ids[genome_i] +
                                            "\n")
                    genomes_src_db_fh.write(genome_seq + "\n")
                    genome_seq = ''
                    contig_seqs = []

                    # DEBUG
                    #for contig_id in contig_ids:
                    #    self.log(console, "\tCONTIG_ID: "+contig_id)  # DEBUG
            # DEBUG
            toggle = 0
            with open(genomes_src_db_file_path, 'r',
                      write_buf_size) as genomes_src_db_fh:
                for contig_line in genomes_src_db_fh.readlines():
                    contig_line = contig_line.rstrip()
                    if contig_line.startswith('>'):
                        self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                        genome_id = contig_line[1:]
                        toggle = 0
                    elif toggle == 0:
                        #elif genome_id == 'G3':
                        self.log(
                            console,
                            'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                        toggle += 1

        #### STEP 3: Run Grinder
        ##
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if params.get('random_seed') is not None \
               and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))
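            # For illustration only, the assembled command string might look
            # like the following (hypothetical binary path and values):
            #   grinder -base_name MyMetagenome -output_dir /scratch/output.123 \
            #           -reference_file genomes.fna -abundance_file my_abundances.txt \
            #           -total_reads 1000000 -num_libraries 3 \
            #           -read_dist 150 normal 10 -insert_dist 500 normal 50 \
            #           -mate_orientation FR -length_bias 0 \
            #           -mutation_dist poly4 3e-3 3.3e-8 -mutation_ratio 91 9 \
            #           -fastq_output 1 -qual_levels 30 10 \
            #           -exclude_chars NX -unidirectional 0 -random_seed 42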

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True,
                                          universal_newlines=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                if not line:
                    break
                outputlines.append(line)
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console,
                     'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')

            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []
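            # Scrape Grinder's stdout: on lines mentioning 'Community structure'
            # or 'FASTQ file', the file path is expected as the 4th
            # whitespace-separated token (index 3); all other lines are kept
            # for the report text.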
            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" +
                             struct_file_path.split('/')[-1])  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)
            report_text += "\n".join(report_text_buf)

        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):

                if not os.path.isfile (fastq_file_path) \
                   or os.path.getsize (fastq_file_path) == 0:

                    raise ValueError("empty read lib generated: " +
                                     fastq_file_path)
                else:

                    # lib obj name
                    if len(fastq_file_paths) == 1:
                        # single library: no per-sample suffix needed
                        output_obj_name = params['output_name']
                    else:
                        if str(params['pairs_flag']) == '1':
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".PairedEndLib"
                        else:
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(
                        console,
                        'Uploading simulated reads library: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname':
                        str(params['workspace_name']),
                        'name':
                        output_obj_name,
                        'fwd_file':
                        fastq_file_path,
                        'interleaved':
                        interleaved,
                        'sequencing_tech':
                        sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({
                        'ref': lib_obj_ref,
                        'label': output_obj_name
                    })
            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description':
                    "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items':
                    readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['workspace_name'],
                    'output_object_name':
                    readsSet_obj_name,
                    'data':
                    readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            'message': '',  # create_extended_report() uses 'message'
            'direct_html': '',
            #'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objs
            if readsSet_obj_ref is not None:
                reportObj['objects_created'].append({
                    'ref':
                    readsSet_obj_ref,
                    'desc':
                    params['output_name'] + " ReadsSet"
                })
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref':
                    lib_obj_ref,
                    'desc':
                    lib_obj_names[lib_obj_i]
                })
            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({
                        'file_path': data_path,
                        #'pack': 'zip'})
                        'make_handle': 0
                    })
                except Exception as e:
                    raise ValueError('error uploading ' + data_path +
                                     ' file to shock: ' + str(e))
                reportObj['file_links'].append({
                    'shock_id':
                    upload_ret['shock_id'],
                    'name':
                    struct_file_names[data_i],
                    'label':
                    struct_file_names[data_i]
                })

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError ('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name']+' HTML'
                                    }
                                   ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value '
                + 'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
class RNASeqDownloaderUtils:
    def __init__(self, config):
        log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' %
            config)
        self.scratch = config['scratch']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url, token=self.token)
        self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)

    def download_RNASeq(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: one of ['RNASeqAlignment', 
                              'RNASeqExpression', 
                              'RNASeqDifferentialExpression']

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        # Download RNASeq zip file
        # RNASeq Alignment, Expression and DifferentialExpression
        # share the same object_data/handle_data structure
        returnVal = self._download_rna_seq_zip(params.get('input_ref'))

        return returnVal

    def download_RNASeq_Alignment(self, params):
        """
        download_RNASeq_Alignment: download RNASeq Alignment zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: 'RNASeqAlignment'
        download_file_type: one of 'bam', 'sam' or 'bai'

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq_Alignment:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        input_ref = params.get('input_ref')
        returnVal = dict()

        download_file_type = params.get('download_file_type')
        if download_file_type == 'bam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadBAI': True
            })['destination_dir']
            shock_id = self._upload_dir_to_shock(destination_dir)
        elif download_file_type == 'sam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadSAM': True,
                'downloadBAI': True
            })['destination_dir']
            files = os.listdir(destination_dir)
            bam_files = [x for x in files if re.match(r'.*\.bam', x)]
            for bam_file in bam_files:
                log('removing file: {}'.format(bam_file))
                os.remove(os.path.join(destination_dir, bam_file))
            shock_id = self._upload_dir_to_shock(destination_dir)
        else:
            raise ValueError(
                "Unexpected download_file_type: '%s'; only 'bam' and 'sam' "
                "are handled here ('bai' files are bundled with both)" %
                download_file_type)

        returnVal['shock_id'] = shock_id

        return returnVal

    def validate_download_rna_seq_alignment_parameters(self, params):
        """
        validate_download_rna_seq_alignment_parameters: 
                        validates params passed to download_rna_seq_alignment method
    
        """

        # check required parameters
        for p in ['input_ref', 'rna_seq_type']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # check supportive RNASeq types
        valid_rnaseq_types = [
            'RNASeqAlignment', 'RNASeqExpression',
            'RNASeqDifferentialExpression'
        ]
        if params['rna_seq_type'] not in valid_rnaseq_types:
            raise ValueError('Unexpected RNASeq type: %s' %
                             params['rna_seq_type'])

    def _download_rna_seq_zip(self, input_ref):
        """
        _download_rna_seq_zip: download RNASeq's archive zip file

        returns:
        shock_id: Shock ID of stored zip file

        """

        # get object data
        object_data = self._get_object_data(input_ref)
        log('---> getting object data\n object_data: %s' %
            json.dumps(object_data, indent=1))

        # get handle data
        handle = self._get_handle_data(object_data)
        log('---> getting handle data\n handle data: %s' %
            json.dumps(handle, indent=1))

        # make tmp directory for downloading
        dstdir = os.path.join(self.scratch, 'tmp')
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)

        # download original zip file and save to tmp directory
        handle_id = handle.get('hid')
        original_zip_file_path = self._download_original_zip_file(
            handle_id, dstdir)

        log('---> loading %s to shock' % original_zip_file_path)
        shock_id = self._upload_to_shock(original_zip_file_path)

        log('---> removing folder: %s' % dstdir)
        shutil.rmtree(dstdir)

        returnVal = {"shock_id": shock_id}

        return returnVal

    def _get_object_data(self, input_ref):
        """
        _get_object_data: get object_data using DataFileUtil

        """

        get_objects_params = {
            'object_refs': [input_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        return object_data

    def _get_handle_data(self, object_data):
        """
        _get_handle_data: get Handle from object_data

        """

        try:
            handle = object_data.get('data')[0].get('data').get('file')
        except:
            error_msg = "Unexpected object format. Refer to DataFileUtil.get_objects definition\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)

        if handle is None:
            error_msg = "object_data does NOT have Handle(file key)\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)
        elif handle.get('hid') is None:
            error_msg = "Handle does have NOT HandleId(hid key)\n"
            error_msg += "handle_data:\n%s" % json.dumps(handle, indent=1)
            raise ValueError(error_msg)
        else:
            return handle

    def _download_original_zip_file(self, handle_id, dstdir):
        """
        _download_original_zip_file: download original archive .zip file using DataFileUtil
        
        """

        shock_to_file_params = {'handle_id': handle_id, 'file_path': dstdir}
        original_zip_file = self.dfu.shock_to_file(shock_to_file_params)

        original_zip_file_path = original_zip_file.get('file_path')

        return original_zip_file_path

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': file_path}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id
Example #30
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - "
            "%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        # note: the KIDL 'boolean' is an int (0/1), so test truthiness here
        if params.get(self.PARAM_IN_VALIDATE):
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get('downloadBAI', False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get('downloadSAM', False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref - 
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Example #31
0
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of type "get_promoter_for_gene_output_params" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs

        # TODO: have these utility calls return their output paths
        for key, value in params.items():
            print(key)
        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']
        promoterFastaFilePath = self.get_promoter_for_gene(ctx, params)[0]

        gibbsCommandList = []
        for i in range(motMin, motMax + 1, 2):
            gibbsCommandList.append(
                GU.build_gibbs_command(promoterFastaFilePath, i))

        for g in gibbsCommandList:
            GU.run_gibbs_command(g)
        #gibbsCommand = GU.build_gibbs_command(promoterFastaFilePath)
        #GU.run_gibbs_command(gibbsCommand)
        #print(promoterFastaFilePath)
        homerMotifCommand = HU.build_homer_motif_command(promoterFastaFilePath)
        homerLocationCommand = HU.build_homer_location_command(
            promoterFastaFilePath)
        os.mkdir(self.shared_folder + '/homer_out')
        #print(homerMotifCommand)
        HU.run_homer_command(homerMotifCommand)
        HU.run_homer_command(homerLocationCommand)

        MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
        MEU.run_meme_command(MEMEMotifCommand)

        gibbsMotifList = GU.parse_gibbs_output(motMin, motMax)
        homerMotifList = HU.parse_homer_output()
        memeMotifList = MEU.parse_meme_output()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        lineCount = 0
        with open(promoterFastaFilePath, 'r') as pFile:
            for line in pFile:
                lineCount += 1
        numFeat = lineCount // 2
        with open(promoterFastaFilePath, 'r') as pFile:
            fileStr = pFile.read()
        promHtmlStr = '<html><body> ' + fileStr + ' </body></html>'
        with open(htmlDir + '/promoters.html', 'w') as promHTML:
            promHTML.write(promHtmlStr)
        JsonPath = '/kb/module/work/tmp'
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/gibbs.json', htmlDir + '/gibbs.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/homer_out/homer.json', htmlDir + '/homer.html',
            str(numFeat)
        ])
        subprocess.call([
            'python', '/kb/module/lib/identify_promoter/Utils/makeReport.py',
            JsonPath + '/meme_out/meme.json', htmlDir + '/meme.html',
            str(numFeat)
        ])
        fullMotifList = []
        for h in homerMotifList:
            add = True
            for g in gibbsMotifList:
                if h['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            for m in memeMotifList:
                if m['Iupac_signature'] == h['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(h)
        for g in gibbsMotifList:
            add = True
            for m in memeMotifList:
                if m['Iupac_signature'] == g['Iupac_signature']:
                    add = False
                    break
            if add:
                fullMotifList.append(g)
        for m in memeMotifList:
            fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)
        dfu = DataFileUtil(self.callback_url)
        parsed = ['gibbs.html', 'homer.html', 'meme.html', 'promoters.html']
        indexHtmlStr = '<html>'
        #use js to load the page content
        for p in parsed:
            indexHtmlStr += '<head><script src="https://ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min.js"></script> <script> $(function(){$("#' + p.replace(
                '.html', '_content') + '").load("' + p + '"); });</script> '
        indexHtmlStr += """<style>
            body {font-family: Arial;}

            /* Style the tab */
            .tab {
            overflow: hidden;
    border: 1px solid #ccc;
    background-color: #f1f1f1;
}

/* Style the buttons inside the tab */
.tab button {
    background-color: inherit;
    float: left;
    border: none;
    outline: none;
    cursor: pointer;
    padding: 14px 16px;
    transition: 0.3s;
    font-size: 17px;
}

/* Change background color of buttons on hover */
.tab button:hover {
    background-color: #ddd;
}

/* Create an active/current tablink class */
.tab button.active {
    background-color: #ccc;
}

/* Style the tab content */
.tabcontent {
    display: none;
    padding: 6px 12px;
    border: 1px solid #ccc;
    border-top: none;
}
</style></head> """
        indexHtmlStr += '<body>'
        #adding tabs
        indexHtmlStr += '<div class="tab">\n'
        for p in parsed:
            indexHtmlStr += '<button class="tablinks" onclick="openReport(event, \'' + p.replace(
                '.html', '_content') + '\')">' + p.replace('.html',
                                                           '') + '</button>'
        indexHtmlStr += '</div>'
        for p in parsed:
            indexHtmlStr += '<div id="' + p.replace(
                '.html', '_content') + '" class="tabcontent"></div>'
        indexHtmlStr += """<script>
function openReport(evt, reportName) {
    var i, tabcontent, tablinks;
    tabcontent = document.getElementsByClassName("tabcontent");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }
    tablinks = document.getElementsByClassName("tablinks");
    for (i = 0; i < tablinks.length; i++) {
        tablinks[i].className = tablinks[i].className.replace(" active", "");
    }
    document.getElementById(reportName).style.display = "block";
    evt.currentTarget.className += " active";
}
</script>"""

        #for p in parsed:
        #    indexHtmlStr += '<a href="' + p + '">' + p.replace('.html','') +' Output</a>\n'
        #indexHtmlStr += '</body></html>'
        with open(htmlDir + '/index.html', 'w') as html_handle:
            html_handle.write(str(indexHtmlStr))

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['FeatureSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(fullMotifList, MSO)
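        #NOTE: parseMotifList is assumed here to populate MSO['Motifs'] from the
        #merged fullMotifList built above (assumption from usage, not verified)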
        objname = 'MotifSet' + str(
            int((datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000))

        #Pass motif set into this
        save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGwasData.MotifSet',
            'data': MSO,
            'name': objname
        }]

        info = dfu.save_objects(save_objects_params)[0]
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref':
                motif_set_ref,
                'description':
                'Motif Set generated by identify promoter'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            #'name': 'promoter_download.zip',
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class MutualInfoUtil:
    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)
        self.scratch = config['scratch']

    def _mkdir_p(self, path):
        """
		_mkdir_p: make directory for given path
		"""
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
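    # Note (not part of the original code): on Python 3 this helper is equivalent
    # to os.makedirs(path, exist_ok=True); the try/except form above also works
    # on Python 2.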

    def test_dfu(self):
        output_directory = self.scratch
        #output_directory = "/kb/module/test1/"
        #os.mkdir(output_directory)
        #self._mkdir_p(output_directory)

        test_file = os.path.join(output_directory, 'index.html')
        with open(test_file, 'w') as file:
            file.write("test!")
        print("OUTPUT DIR")
        print(output_directory)
        print(os.listdir(output_directory))
        print("file_to_shock")
        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
            #'pack': 'zip'
        })
        print(report_shock_id)
        return

    def _validate_run_flux_mutual_information_analysis_params(self, params):
        """
		_validate_run_flux_mutual_information_analysis_params:
				validates params passed to run_flux_mutual_information_analysis method
		"""

        log('start validating run_flux_mutual_information_analysis params')

        # check for required parameters
        for p in ['fbamodel_id', 'compounds', 'media_id', 'workspace_name']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _get_file_from_ws(self, ref):
        try:
            file_path = self.ws.get_objects2({'objects': [{'ref': ref}]})
            file_path = file_path['data'][0]
        except Exception as e:
            raise ValueError('Unable to get object from workspace: (' + ref +
                             ')' + str(e))
        return file_path

    def _make_media_files(self, ws_name, base, compounds):
        """
		Build and store media objects for each combination of compounds added to the base media.
		:param base: The base media file
		:param compounds: the set of compounds to test
		:return: A list of media ids and a matrix with each media combination defined
		"""

        ref = ws_name + "/" + base
        if base.find("/") != -1:
            ref = base

        output = self._get_file_from_ws(ref)
        base_media = output['data']
        base = output['info'][1]
        myuuid = str(uuid.uuid4())
        media_ids = [base]
        new_media_list = []
        media_matrix = [[""] + compounds]
        media_matrix.append([[base] + [0] * len(compounds)])
        for n_comp in range(1, len(compounds) + 1):
            for combo in combinations(compounds, n_comp):
                new_media_id = base + '_v%s' % len(media_matrix)
                media_ids.append(new_media_id)
                media_matrix.append(
                    [new_media_id] +
                    [1 if comp in combo else 0 for comp in compounds])
                new_media = deepcopy(base_media)
                new_media['id'] = new_media_id
                new_media['name'] = new_media_id
                for new_comp in combo:
                    new_media['mediacompounds'].append({
                        'compound_ref':
                        '48/1/1/compounds/id/%s' % new_comp.split('_')[0],
                        'concentration':
                        1.0,
                        'maxFlux':
                        1000,
                        'minFlux':
                        -1000
                    })
                new_media_list.append(new_media)

        print("Made %s Media Files" % (len(media_ids) - 1))
        info = self.ws.save_objects({
            'workspace':
            ws_name,
            "objects": [{
                "hidden": 1,
                "type": "KBaseBiochem.Media",
                "data": media,
                "name": myuuid + "-" + media['name']
            } for media in new_media_list]
        })
        #print(info)
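        # Illustrative sketch (not from the original source) of the matrix built
        # above for two hypothetical compounds ['cpd00027', 'cpd00013'] and a base
        # media named 'base':
        #   ["", "cpd00027", "cpd00013"]    header row
        #   [["base", 0, 0]]                base row (note the extra list nesting)
        #   ["base_v2", 1, 0]
        #   ["base_v3", 0, 1]
        #   ["base_v4", 1, 1]
        # i.e. one row per subset of the compounds, with 1/0 flags marking which
        # compounds were added to the base media.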
        return media_ids, media_matrix, myuuid

    def _run_fba(self, workspace_name, media_id_list, fbamodel_id, myuuid,
                 base_media):
        print('running fba')
        fba_tool_obj = fba_tools(self.callback_url, service_ver='dev')
        new_media_list = []
        for media in media_id_list:
            if media == base_media:
                new_media_list.append(workspace_name + "/" + media)
            else:
                new_media_list.append(workspace_name + "/" + myuuid + "-" +
                                      media)

        fba_tool_obj.run_flux_balance_analysis({
            "max_c_uptake":
            60,  #"max_c_uptake": 6, // previously default is 6 later set to 60
            "workspace": workspace_name,
            "fbamodel_id": fbamodel_id,
            "fba_output_id": fbamodel_id + ".mifba",
            "fbamodel_workspace": workspace_name,
            "media_id_list": new_media_list,
            "target_reaction": "bio1",
            "minimize_flux": 1
        })
        output = self.ws.get_objects2({
            'objects': [{
                'ref': workspace_name + "/" + fbamodel_id + '.mifba'
            }]
        })

        #json.dump(output, open(self.scratch+'/fba.json', 'w'))

        fba = output['data'][0]['data']
        biomass_data = "FBAs,Biomass\n"
        secretion_file = "," + ','.join(media_id_list) + "\n"
        full_secretion_file = "," + ','.join(media_id_list) + "\n"
        full_flux_file = "," + ','.join(media_id_list) + "\n"
        flux_file = "," + ','.join(media_id_list) + "\n"
        objectives = fba['other_objectives']
        for i in range(0, len(objectives)):
            biomass_data = biomass_data + media_id_list[i] + "," + str(
                objectives[i]) + "\n"

        flux_vars = fba['FBAReactionVariables']
        for var in flux_vars:
            id = var['modelreaction_ref'].split("/").pop()
            flux_file = flux_file + id
            full_flux_file = full_flux_file + id
            fluxes = var['other_values']
            for i in range(0, len(objectives)):
                if objectives[i] == 0:
                    full_flux_file = full_flux_file + ",0"
                    flux_file = flux_file + ",0"
                else:
                    full_flux_file = full_flux_file + "," + str(fluxes[i])
                    if abs(fluxes[i]) < 1e-7:
                        flux_file = flux_file + ",0"
                    else:
                        flux_file = flux_file + ",1"
            flux_file = flux_file + "\n"
            full_flux_file = full_flux_file + "\n"

        secretion_vars = fba['FBACompoundVariables']
        for var in secretion_vars:
            id = var['modelcompound_ref'].split("/").pop()
            secretion_file = secretion_file + id
            full_secretion_file = full_secretion_file + id
            fluxes = var['other_values']
            for i in range(0, len(objectives)):
                if objectives[i] == 0:
                    full_secretion_file = full_secretion_file + ",0"
                    secretion_file = secretion_file + ",0"
                else:
                    full_secretion_file = full_secretion_file + "," + str(
                        fluxes[i])
                    if abs(fluxes[i]) < 1e-7:
                        secretion_file = secretion_file + ",0"
                    elif fluxes[i] < 0:
                        secretion_file = secretion_file + ",-1"
                    else:
                        secretion_file = secretion_file + ",1"
            secretion_file = secretion_file + "\n"
            full_secretion_file = full_secretion_file + "\n"

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        biomass_path = os.path.join(output_directory, 'biomass.csv')
        secretion_path = os.path.join(output_directory, 'secretion.csv')
        flux_path = os.path.join(output_directory, 'flux.csv')
        full_secretion_path = os.path.join(output_directory,
                                           'full_secretion.csv')
        full_flux_path = os.path.join(output_directory, 'full_flux.csv')

        with open(biomass_path, 'w') as biomass_f:
            biomass_f.write(biomass_data)

        with open(secretion_path, 'w') as secretion_f:
            secretion_f.write(secretion_file)

        with open(flux_path, 'w') as flux_f:
            flux_f.write(flux_file)

        with open(full_secretion_path, 'w') as full_secretion_f:
            full_secretion_f.write(full_secretion_file)

        with open(full_flux_path, 'w') as full_flux_f:
            full_flux_f.write(full_flux_file)
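        # Illustrative layout of the CSV files written above (all values made up):
        #   biomass.csv    FBAs,Biomass
        #                  base_media,0.87
        #   flux.csv       ,base_media,base_media_v2,...
        #                  rxn00001_c0,1,0,...       (1 = |flux| > 1e-7, else 0)
        #   secretion.csv  ,base_media,base_media_v2,...
        #                  cpd00029_e0,-1,0,...      (sign of the compound flux, thresholded at 1e-7)
        #   full_flux.csv / full_secretion.csv hold the raw flux values instead.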

        return [
            biomass_path, secretion_path, flux_path, full_secretion_path,
            full_flux_path
        ]

    def _make_index_html(self, result_file_path, mutual_info_dict):
        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'

        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'
        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)
        return
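    # Minimal sketch of what report_template.html is assumed to contain; the only
    # thing the code above relies on is the literal placeholder paragraph that is
    # replaced with the overview table:
    #
    #   <html><body>
    #     <p>Overview_Content</p>
    #   </body></html>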

    def _generate_html_report(self, result_directory, mutual_info_dict):
        """
		_generate_html_report: generate html summary report
		"""
        #scratch, uui, datafileutil, file_to_shock, shockId, extended report

        log('start generating html report')

        html_report = list()

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))

        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'mutual_information_report.html')

        shutil.copy(os.path.join(result_directory, 'MI_plot.png'),
                    os.path.join(output_directory, 'MI_plot.png'))

        overview_content = ''
        overview_content += '<table><tr><th>Mutual Information for various chemical compound combinations'
        overview_content += ' Object</th></tr>'
        overview_content += '<tr><th>Input Chemical Compound Combination</th>'
        overview_content += '<th>Mutual Information (in Bits)</th>'
        overview_content += '</tr>'

        for k, v in mutual_info_dict.items():
            overview_content += '<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
        overview_content += '</table>'

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Overview_Content</p>', overview_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
        })['shock_id']

        #report_shock_id = self.dfu.file_to_shock({'file_path': output_directory,
        #										  'pack': 'zip'})['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Mutual Information App'
        })

        return html_report

    def _generate_report(self, result_directory, mutual_info_dict,
                         workspace_name):
        """
		_generate_report: generate summary report
		"""
        print('-->I am here *************')
        uuidStr = str(uuid.uuid4())
        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        test_file = os.path.join(output_directory, "index.html")
        self._make_index_html(test_file, mutual_info_dict[1])
        #shutil.copy2(os.path.join(os.path.dirname(__file__), 'data', 'index.html'), output_directory)

        # shutil.copy('/kb/module/data/index.html', result_directory + '/' + uuidStr + '/index.html')
        json.dump(mutual_info_dict[0],
                  open(os.path.join(output_directory, 'pdata.json'), 'w'))
        #shutil.copy('pdata.json', result_directory + '/' + uuidStr + '/pdata.json')

        # DataFileUtils to shock
        print(output_directory)
        print(os.listdir(output_directory))
        report_shock_result = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'targz'
        })
        #report_shock_result = self.dfu.file_to_shock({'file_path': output_directory,
        #											 'pack': 'zip'})

        report_shock_id = report_shock_result['shock_id']
        print(report_shock_result)

        report_file = {
            'name': 'index.html',
            'description': 'the report',
            'shock_id': report_shock_id
        }
        log('creating report')
        #output_html_files = self._generate_html_report(result_directory,
        #											   mutual_info_dict)
        report_params = {
            'message': '',
            'workspace_name': workspace_name,
            'html_links': [report_file],
            'file_links': [],
            'direct_html_link_index': 0,
            'html_window_height': 333,
            'report_object_name': 'MutualInfomation_report_' + uuidStr
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

######### @@@@@@@ALL THREE MUTUAL INFORMATION CALCULATION START FROM HERE@@@@@@@#############

    def _generate_mutual_info(self, media_matrix, fba_file, mi_options):

        #print('this is fba_file')
        #print(fba_file)
        df1 = pd.read_csv(fba_file[0])
        df1.values

        #df1.as_matrix()
        #print('-->printing df1')# **** rm
        #print(df1.to_string())# **** rm
        #print(type(df1))  # **** rm
        #print('-->printing media_matrix')
        #print(media_matrix)

        df3 = pd.DataFrame(columns=media_matrix[0][1:])
        for i in range(1, len(media_matrix)):
            if i == 1:
                df3.loc[media_matrix[i][0][0]] = media_matrix[i][0][1:]
            else:
                df3.loc[media_matrix[i][0]] = media_matrix[i][1:]

        #print('-->*************OK')
        #print(df3)

        #----Input validation of Media/FBAs with Binary Matrix FBAs------
        # 1.0 Number of rows in the Media.csv file = (Number of columns - 1)
        #   1.0. If they differ: throw an ERROR reporting a mismatched number of FBAs between the media and the binary matrix.
        # 1.1 Check whether the elements in the Media.csv file contain only binary values (i.e. 0 and 1)
        #   1.1. If other elements are present: throw an ERROR reporting inappropriate input values
        # 1.2 Check whether the compounds in the Media.csv file match the number of FBAs
        #   1.2. If the compounds differ from the number of FBAs: throw an ERROR reporting inappropriate input values

        media_matrix = df3
        s_df1 = df1.shape
        s_df2 = media_matrix.shape
        #print(media_matrix,type(media_matrix))

        Temp_df2 = np.array(media_matrix.values)
        #print('-->******')
        #print(Temp_df2)
        # Create matrix with only the elements remove first column and all the rows
        Temp_df2 = Temp_df2[0:, 1:]

        Bin_val_check = np.array_equal(Temp_df2, Temp_df2.astype(bool))
        #num_compounds = (s_df2[1])-1
        num_compounds = s_df2[1]

        if ((s_df1[1] - 1) != s_df2[0]) or (Bin_val_check != True) or (int(
                math.log(s_df2[0], 2)) != num_compounds):
            raise ValueError('invalid input values: the media matrix must be binary '
                             'and consistent with the number of FBAs')

        #-----All possible combination of the chemical compounds----------------------
        # 2.0 Separating m0 from the rest of the labels

        Temp1_df2 = media_matrix
        #print('-->*************OK')
        #print(Temp1_df2)
        cols = Temp1_df2.columns
        for i in range(0, len(cols)):
            Temp1_df2.loc[Temp1_df2[cols[i]] == 1, cols[i]] = cols[i]
        #print('-->*************OK')
        #print (Temp1_df2)

        # 2.1 Creating a dictionary for all FBAs except m0
        #print(len(Temp1_df2))
        #print('--->*********')
        #print(Temp1_df2)

        mydict = {}
        for x in range(0, len(Temp1_df2)):
            for i in range(0, s_df2[1]):
                currentvalue = Temp1_df2.iloc[x, i]
                currentid = Temp1_df2.index[x]
                mydict.setdefault(currentid, [])
                if currentvalue != 0:
                    mydict[currentid].append(currentvalue)
                # Add the first key as m0
        media_0_name = Temp1_df2.index[0]
        mydict[media_0_name] = ["0"]
        # Sort the keys
        mydict = collections.OrderedDict(natsort.natsorted(mydict.items()))
        #print ('--> ********')
        compoundslist = Temp1_df2.columns.get_values()
        compoundslist = compoundslist.tolist()
        #print(compoundslist)
        #print('all possible combination')
        #print(len(compoundslist))

        # List of Compounds combination in the list
        my_combi_list = []
        for L in range(0, len(compoundslist) + 1):
            for subset in itertools.combinations(compoundslist, L):
                my_combi_list.append(list(subset))

        my_combi_list[0] = [0]
        # print(my_combi_list)
        '''
		for k, v in mydict.iteritems():
			#print('--> ********')
			print(k, v)
		'''

        # Created a dictionary where the keys:
        # list of compounds combination
        # values are corresponding FBAs list in df2
        result_dict = {}
        for element in my_combi_list[1:]:
            for k, v in mydict.iteritems():
                if set(v).issubset(set(map(lambda x: str(x), element))):
                    key = ','.join(map(lambda x: str(x), element))
                    if result_dict.get(key):
                        media_list = result_dict[key]
                        media_list.append(k)
                        media_list = list(set(media_list))
                        result_dict.update({key: media_list})
                    else:
                        result_dict.update({key: [media_0_name, k]})

        # Sort the keys
        result_dict['0'] = [media_0_name]
        result_dict = collections.OrderedDict(
            natsort.natsorted(result_dict.items()))
        # print(result_dict)
        #print('-->I am here **** OK')
        #print(result_dict)
        #print (df1)

        # Created a dictionary where the keys are:
        # list of compounds combination
        # values are compounds combination FBAs with df1 vaules
        All_Comp_Combi_dic = {}
        for column, value in result_dict.items():
            All_Comp_Combi_dic.update({column: df1.get(value)})

        # print('-->All_Comp_Combi_dic******')
        # print (All_Comp_Combi_dic)
        # print(result_dict)

        # To print an item from the All_Comp_Combi_dic
        df = (pd.DataFrame(All_Comp_Combi_dic.items()))
        #print('--> printing df')
        #print(df[0].to_string())
        #print(df[1][7])

        ######### INTRACELLULAR FLUX MUTUAL INFORMATION CALCULATION #############
        if mi_options == "flux":
            print('Intracellular flux')
            MI_dict = {}
            for k in range(0, len(df[0])):
                drop_rows_df = df[1][k].drop_duplicates(keep="first")
                drop_columns_df = drop_rows_df.T.drop_duplicates(
                    keep="first").T
                remove = []
                removed = {}
                count_values = {}
                cols = df[1][k].columns
                for i in range(len(cols) - 1):
                    duplicated = []
                    v = df[1][k][cols[i]].values
                    for j in range(i + 1, len(cols)):
                        if np.array_equal(v, df[1][k][cols[j]].values):
                            remove.append(cols[j])
                            duplicated.append(cols[j])
                    if duplicated and cols[i] not in remove:
                        removed.update({cols[i]: duplicated})
                    count = {}
                    for key, value in removed.items():
                        count.update({key: len(value)})

                    #print v

                    # print drop_columns_df
                    count_values = count.values()
                    # print count_values
                    count_values = map(lambda x: x + 1, count_values)
                    # print count_values
                    d = {x: count_values.count(x) for x in count_values}
                #print('-->count_values')
                #print(count_values)

                #-------Mutual Information (MI) calculation-------------
                FBAs = len(df[1][k].columns)
                pure_entropy = math.log(FBAs, 2)
                #print (pure_entropy) (-->ok rm)

                # If No duplicates exist and list "value" is empty
                if not count_values:
                    #print("List is empty")
                    No_duplicate_FBAs = len(drop_columns_df.columns)
                    conditional_entropy = -1 * (No_duplicate_FBAs * (
                        (1 / No_duplicate_FBAs) *
                        ((1 / 1) * math.log(1.0 / 1.0, 2))))
                    Mutual_Info = pure_entropy - conditional_entropy
                    #print('Mutaul Info:', Mutual_Info)

                if count_values:
                    # If duplicates exist and list "value" is not empty
                    conditional_entropy = 0
                    for key in d:
                        #print key, d[key]
                        Temp = -1 * d[key] * (key / float(FBAs)) * key * (
                            1.0 / key) * math.log(1.0 / key, 2)
                        conditional_entropy = Temp + conditional_entropy
                    #print "%3f" %Temp
                    Mutual_Info = pure_entropy - conditional_entropy

                MI_dict[df[0][k]] = Mutual_Info
                MI_dict['0'] = 0.0
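            # Worked toy example of the calculation above (illustrative only):
            # suppose a compound combination has 8 FBAs whose flux profiles
            # collapse into duplicate groups of sizes {4, 2, 1, 1}; then
            #   pure entropy        = log2(8) = 3 bits
            #   conditional entropy = (4/8)*log2(4) + (2/8)*log2(2) = 1.25 bits
            #   mutual information  = 3 - 1.25 = 1.75 bits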

            #Sorted MI_dict
            MI_dict = sorted(MI_dict.items(), key=lambda x: (-len(x[0]), x[0]))
            MI_dict = OrderedDict(MI_dict)
            #print(MI_dict)

            #print('-->rest')
            #print(compoundslist)
            #print(num_compounds)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

                # for x, y in zip(x_groups, y_groups):
                # data.append(go.Bar(x=x, y=y, name='test'))

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)

            #print (pdata)
            #json.dump(pdata, open(self.scratch+'/pdata.json', 'w'))
            return [pdata, MI_dict]
            #return MI_dict

######### INPUT COMPONENTS AND BIOMASS FLUX MUTUAL INFORMATION CALCULATION #############
        if mi_options == "biomass":
            # Load the file contain the information of FBAs(media) along with corresponding Biomass (growth)
            print('biomass flux')
            df2 = pd.read_csv(fba_file[1])
            df2.values
            #print(df)

            MI_dict_biomass = {}
            for r in range(0, len(df[0])):
                reaction_states = df[1][r].head(1000)

                def get_groups(flux_df):
                    groups = collections.defaultdict(list)
                    unique = flux_df.aggregate(lambda x: hash(str(x.values)))
                    for k, v in unique[0:].iteritems():
                        groups[v].append(k)
                    return dict([(i, g)
                                 for i, g in enumerate(groups.values())])

                n_group = collections.defaultdict(int)
                groups = get_groups(reaction_states)

                for group in groups.values():
                    n_group[len(group)] += 1

                groups_count = {}
                for key, values in groups.items():
                    groups_count[key] = len(values)
                    # print groups_count

                # Take first FBA label of every group
                group_id = {}
                for k, v in groups.items():
                    group_id.update({k: groups.values()[k][0]})

                # Obtain the Biomass of each Group
                cols_df = group_id.values()
                cols_df2 = df2.columns
                #print (cols_df)

                # Dictionary of first FBA label of every group and its corresponding number of members
                groups_label_count = {}
                for k, v in groups_count.items():
                    groups_label_count.update({cols_df[k]: v})
                #print('groups_label_count')
                #print(groups_label_count)

                def get_cond_count(re_group):
                    media_cond = 0
                    for media in re_group['FBAs']:
                        media_cond += groups_label_count[media]
                    return media_cond

                # Extract FBA Groups biomass inside df2
                Groups_Biomass = df2[df2['FBAs'].isin(cols_df)]
                #print('-->I am here')
                #print(Groups_Biomass)

                # Regroup based on the biomass values
                re_group = Groups_Biomass.groupby('Biomass')
                biomass_FBAs_groups = re_group.aggregate(get_cond_count)

                biomass_FBAs_label_groups = Groups_Biomass.groupby(
                    "Biomass", sort=True).sum()
                #print(biomass_FBAs_label_groups)

                #print (biomass_FBAs_label_groups)

                Summery = pd.merge(left=biomass_FBAs_label_groups,
                                   left_index=True,
                                   right=biomass_FBAs_groups,
                                   right_index=True,
                                   how='inner')
                Data_4_CondMI = Summery.groupby('FBAs_y').count()
                Data_4_CondMI = Data_4_CondMI.to_dict(orient='dict')
                for k, v in Data_4_CondMI.items():
                    Data_4_CondMI = v

                Num_of_FBAs = Data_4_CondMI.keys()
                Count_Num_of_FBAs = Data_4_CondMI.values()

                # -------Mutual Information (MI) calculation Stage II (input compounds with respect to BIOMASS)-------------
                # Pure Entropy
                FBAs = len(df[1][r].columns)
                pure_entropy = math.log(FBAs, 2)

                conditional_entropy = 0.0
                for l in range(0, len(Count_Num_of_FBAs)):
                    temp = -1 * Count_Num_of_FBAs[l] * (
                        Num_of_FBAs[l] / float(FBAs)) * Num_of_FBAs[l] * (
                            1.0 / float(Num_of_FBAs[l]) *
                            (math.log(1.0 / float(Num_of_FBAs[l]), 2)))
                    conditional_entropy += temp

                Mutual_Info_Biomass = pure_entropy - conditional_entropy
                # print('Mutaul Info:', Mutual_Info_Biomass)

                #print(Mutual_Info_Biomass)
                MI_dict_biomass.update({df[0][r]: Mutual_Info_Biomass})

                #print(MI_dict_biomass)

            # Sorted MI_dict_biomass
            MI_dict_biomass = sorted(MI_dict_biomass.items(),
                                     key=lambda x: (-len(x[0]), x[0]))
            MI_dict_biomass = OrderedDict(MI_dict_biomass)

            #print(MI_dict_biomass)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict_biomass.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)
            return [pdata, MI_dict_biomass]

######### INPUT COMPONENTS AND BIOMASS, SECRETION FLUX MUTUAL INFORMATION CALCULATION #############

        if mi_options == "secretion":
            #Load the file contain the information of FBAs(media) along with corresponding Biomass (growth)
            print('secretion flux')
            df4 = pd.read_csv(fba_file[2], header=0, index_col=0)

            df4.index.name = 'FBAs'
            df4 = df4.T

            dfbiomass = pd.read_csv(fba_file[1])
            aa = dfbiomass['Biomass'].values.tolist()
            # print(len(aa))
            df4['Biomass'] = aa
            # print(df4.shape)
            compoundslist_b_u_s = list(df4.columns.values)
            #print(compoundslist_b_u_s)

            MI_dict_b_u_s = {}
            for r in range(0, len(df[0])):
                reaction_states = df[1][r].head(1000)

                def get_groups(flux_df):
                    groups = collections.defaultdict(list)
                    unique = flux_df.aggregate(lambda x: hash(str(x.values)))
                    for k, v in unique[0:].iteritems():
                        groups[v].append(k)
                    return dict([(i, g)
                                 for i, g in enumerate(groups.values())])

                n_group = collections.defaultdict(int)
                groups = get_groups(reaction_states)
                for group in groups.values():
                    n_group[len(group)] += 1
                #print(n_group)
                #print(groups)

                groups_count = {}
                for key, values in groups.items():
                    groups_count[key] = len(values)
                # print(groups_count)

                # Take first FBA label of every group
                group_id = {}
                for k, v in groups.items():
                    group_id.update({k: groups.values()[k][0]})

                # Obtain the Biomass of each Group
                cols_df = group_id.values()
                cols_df4 = df4.columns

                # Dictionary of first FBA label of every group and its corresponding number of members
                groups_label_count = {}
                for k, v in groups_count.items():
                    groups_label_count.update({cols_df[k]: v})

                #print(groups_label_count)

                # Extract FBA Groups biomass inside df4
                df5 = df4.reset_index()
                Groups_Biomass = df5[df5['index'].isin(cols_df)]
                #print(Groups_Biomass)

                # Regroup based on the biomass values
                re_group = Groups_Biomass.groupby(compoundslist_b_u_s)
                #print(re_group)

                my_list = []
                for index, values in re_group:
                    my_list.append(values['index'].values)

                #print(my_list)

                B_U_S_dict = {}
                for media in my_list:
                    if len(media) > 1:
                        media_cond = 0
                        for i in range(0, len(media)):
                            media_cond += groups_label_count[media[i]]
                        B_U_S_dict.update({str(media)[1:-1]: media_cond})
                        #final_my_dict.update({tuple(media.tolist()):media_cond})
                    else:
                        B_U_S_dict.update({
                            str(media)[1:-1]:
                            groups_label_count[str(tuple(
                                media.tolist()))[1:-1][:-1][1:-1]]
                        })

                B_U_S_dict = {k: v for k, v in B_U_S_dict.iteritems()}
                #print(B_U_S_dict)

                Summery = pd.DataFrame(B_U_S_dict.items(),
                                       columns=['index_x', 'index_y'])

                Data_4_CondMI = Summery.groupby('index_y').count()
                Data_4_CondMI = Data_4_CondMI.to_dict(orient='dict')

                #print(Data_4_CondMI)
                for k, v in Data_4_CondMI.items():
                    Data_4_CondMI = v

                Num_of_FBAs = Data_4_CondMI.keys()
                Count_Num_of_FBAs = Data_4_CondMI.values()
                #print(Num_of_FBAs)
                #print(Count_Num_of_FBAs)
                #print('-->***<---')

                # -------Mutual Information (MI) calculation Stage II (input compounds with respect to Biomass, Uptake and Secretion)-------------
                # Pure Entropy
                FBAs = len(df[1][r].columns)
                pure_entropy = math.log(FBAs, 2)

                conditional_entropy = 0.0
                for l in range(0, len(Count_Num_of_FBAs)):
                    temp = -1 * Count_Num_of_FBAs[l] * (
                        Num_of_FBAs[l] / float(FBAs)) * Num_of_FBAs[l] * (
                            1.0 / float(Num_of_FBAs[l]) *
                            (math.log(1.0 / float(Num_of_FBAs[l]), 2)))
                    conditional_entropy += temp

                Mutual_Info_B_U_S = pure_entropy - conditional_entropy
                # print('Mutaul Info:', Mutual_Info_B_U_S)

                MI_dict_b_u_s.update({df[0][r]: Mutual_Info_B_U_S})

            # Sorted MI_dict_biomass
            MI_dict_b_u_s = sorted(MI_dict_b_u_s.items(),
                                   key=lambda x: (-len(x[0]), x[0]))
            MI_dict_b_u_s = OrderedDict(MI_dict_b_u_s)

            #print(MI_dict_b_u_s)

            x_groups = [[] for x in range(num_compounds)]
            y_groups = [[] for x in range(num_compounds)]
            names = [[] for x in range(num_compounds)]
            Comp_Mapping = [[] for x in range(num_compounds)]

            for key, val in MI_dict_b_u_s.iteritems():
                del_count = key.count(',')
                x_groups[del_count].append(key)
                y_groups[del_count].append(val)

            # for x, y in zip(x_groups, y_groups):
            # data.append(go.Bar(x=x, y=y, name='test'))

            pdata = []
            for i in range(0, len(x_groups)):
                names[i] = str(i + 1) + ' Compound Combination'
                Comp_Mapping = str(i + 1) + '-' + compoundslist[i]

                record = {}
                record["x"] = []
                for e in x_groups[i]:
                    record["x"].append("c" + e)
                record["y"] = y_groups[i]
                record["names"] = names[i]
                record["Comp_Mapping"] = Comp_Mapping
                pdata.append(record)

            return [pdata, MI_dict_b_u_s]
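# Hedged usage sketch (not part of the original example): the configuration keys
# read by MutualInfoUtil.__init__ above. All URLs, tokens and paths below are
# placeholders.
if __name__ == '__main__':
    _example_config = {
        'workspace-url': 'https://kbase.us/services/ws',
        'SDK_CALLBACK_URL': 'http://localhost:9999',
        'KB_AUTH_TOKEN': 'fake-token-for-illustration',
        'shock-url': 'https://kbase.us/services/shock-api',
        'scratch': '/kb/module/work/tmp',
    }
    mi_util = MutualInfoUtil(_example_config)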
Example #33
0
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)
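            # e.g. a 1000 bp contig with 480 G+C bases gets gc_content = 0.48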

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list))).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})
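    # Example of the returned structure (all values below are made up):
    #   {'shock_id': '6c2a8e6f-...', 'handle': {'hid': 'KBH_12345', ...},
    #    'node_file_name': 'contigs.fa', 'size': '1048576'}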


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
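# Hedged usage sketch (not part of the original example): driving the importer
# above from a script. The callback URL, scratch path and params are placeholders;
# ctx is not used by import_fasta, so None is passed here.
if __name__ == '__main__':
    importer = FastaToAssembly('http://localhost:9999', '/kb/module/work/tmp')
    assembly_info = importer.import_fasta(None, {
        'file': {'path': '/kb/module/work/tmp/contigs.fa'},
        'workspace_name': 'my_workspace',
        'assembly_name': 'MyAssembly',
        'min_contig_length': 500,
    })
    print(assembly_info)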
Example #34
0
    def get_promoter_for_gene(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_promoter_for_gene
        #code goes here
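        # Example params (illustrative values only):
        #   {'workspace_name': 'my_workspace',
        #    'genome_ref': '12345/6/7',
        #    'featureSet_ref': '12345/8/1',
        #    'promoter_length': 250}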
        dfu = DataFileUtil(self.callback_url)
        #objectRefs = {'object_refs':[params['genome_ref'],params['featureSet_ref']]}
        objectRefs = {'object_refs': [params['featureSet_ref']]}
        ws = Workspace('https://appdev.kbase.us/services/ws')
        ws_name = params['workspace_name']
        subset = ws.get_object_subset([{
            'included':
            ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
            'ref':
            params['genome_ref']
        }])
        features = subset[0]['data']['features']
        aref = subset[0]['data']['assembly_ref']
        objects = dfu.get_objects(objectRefs)
        #genome = objects['data'][0]['data']
        #featureSet = objects['data'][1]['data']
        featureSet = objects['data'][0]['data']
        assembly_ref = {'ref': aref}
        #print assembly_ref
        #with open(self.shared_folder + '/genome.json','w') as f:
        #    json.dump(genome,f)
        #with open(self.shared_folder + '/featureSet.json','w') as f:
        #    json.dump(featureSet,f)
        #with open('/kb/module/work/asssembly.json','w') as f:
        #    json.dump(assembly,f)
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

        #pprint(fasta_file)
        #loop over featureSet
        #find matching feature in genome
        #get record, start, orientation, length
        #TODO: add some error checking logic to the bounds of the promoter
        prom = ""
        featureFound = False
        for feature in featureSet['elements']:
            #print(feature)
            #print(featureSet['elements'][feature])
            featureFound = False
            for f in features:
                #print f['id']
                #print feature
                if f['id'] == feature:
                    attributes = f['location'][0]
                    featureFound = True
                    #print('found match ' + feature)
                    #print(f['location'])
                    break
            if featureFound:
                for record in SeqIO.parse(fasta_file['path'], 'fasta'):
                    #for record in SeqIO.parse('/kb/module/work/Gmax_189_genome_assembly.fa', 'fasta'):
                    #print(record.id)
                    #print(attributes[0])
                    if record.id == attributes[0]:
                        #print('adding to prom string')
                        #print(attributes[0])
                        if attributes[2] == '+':
                            #print('1')
                            #might need to offset by 1?
                            end = attributes[1]
                            start = end - params['promoter_length']
                            if start < 0:
                                start = 0
                            promoter = record.seq[start:end].upper()
                            #HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        elif attributes[2] == '-':
                            start = attributes[1]
                            end = start + params['promoter_length']
                            if end > len(record.seq):
                                end = len(record.seq)
                            promoter = record.seq[start:end].upper()
                            complement = {
                                'A': 'T',
                                'C': 'G',
                                'G': 'C',
                                'T': 'A',
                                'N': 'N'
                            }
                            # reverse complement; ambiguity codes fall back to N
                            promoter = ''.join([
                                complement.get(base, 'N')
                                for base in promoter[::-1]
                            ])
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        else:
                            print('Error: unrecognized strand ' + str(attributes[2]) + ' for feature ' + feature)
            else:
                print('Could not find feature ' + feature + ' in genome')
        promOutputPath = '/kb/module/work/tmp/promFile.fa'
        #print('prom string\n' + str(prom))
        with open(promOutputPath, 'w') as promFile:
            promFile.write(str(prom))
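        # The file just written is plain FASTA, one '>feature_id' record per
        # extracted promoter. Editorial sanity check (not in the original
        # flow): count what was actually captured.
        n_promoters = sum(1 for _ in SeqIO.parse(promOutputPath, 'fasta'))
        print('Wrote ' + str(n_promoters) + ' promoter sequences to ' + promOutputPath)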

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        html_output_dir = os.path.join(self.shared_folder,
                                       'output_html.' + str(timestamp))
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)
        html_file = 'promoter.html'
        output_html_file_path = os.path.join(html_output_dir, html_file)

        html_report_lines = '<html><body>'
        html_report_lines += '<pre>' + prom + '</pre>'
        html_report_lines += '</body></html>'

        with open(output_html_file_path, 'w') as html_handle:
            html_handle.write(str(html_report_lines))

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': html_output_dir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))
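        # With 'pack': 'zip', file_to_shock zips the whole html_output_dir
        # before upload; the returned dict's 'shock_id' is what the report's
        # html_links entry references below (editorial note).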

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': '',
            'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [{
                'shock_id': html_upload_ret['shock_id'],
                'name': html_file,
                'label': 'View'
            }],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }
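        # Editorial note: because the generated page is tiny, the same HTML
        # could be embedded inline via reportObj['direct_html'] instead of the
        # Shock-backed html_links entry above, e.g.
        #     reportObj['direct_html'] = html_report_lines
        # The original Shock-upload path is kept unchanged here.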

        report = KBaseReport(self.callback_url, token=ctx['token'])
        report_info = report.create_extended_report(reportObj)
        # The extended report is created for its side effect; the value this
        # method actually returns is the path to the promoter FASTA file.
        #TODO: get rid of this html maybe and move into find_motifs
        output = promOutputPath

        #END get_promoter_for_gene

        # At some point might do deeper type checking...
        if not isinstance(output, basestring):
            raise ValueError('Method get_promoter_for_gene return value ' +
                             'output is not type basestring as required.')
        # return the results
        return [output]