Example #1
    def test_GenomeInterface_own_handle(self):
        # no handle in genome
        genome = {'missing_genbank_handle_ref': 'hid'}
        origin_genome = genome.copy()
        self.genome_interface._own_handle(genome, 'genbank_handle_ref')
        self.assertDictEqual(origin_genome, genome)

        # user unauthorized
        temp_shock_file = "/kb/module/work/tmp/shock1.txt"
        with open(temp_shock_file, "w") as f1:
            f1.write("Test Shock Handle")
        token2 = self.ctx2['token']
        dfu2 = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token2)
        shock_ret = dfu2.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})
        self.nodes_to_delete.append(shock_ret['shock_id'])
        hid = shock_ret['handle']['hid']

        genome = {'genbank_handle_ref': hid}
        with self.assertRaisesRegex(ValueError, 
                                     'Error getting ACLs for Shock node'):
            self.genome_interface._own_handle(genome, 'genbank_handle_ref')

        # same user
        shock_ret = self.dfu.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})
        self.nodes_to_delete.append(shock_ret['shock_id'])
        hid = shock_ret['handle']['hid']
        genome = {'genbank_handle_ref': hid}
        origin_genome = genome.copy()
        self.genome_interface._own_handle(genome, 'genbank_handle_ref')
        self.assertDictEqual(origin_genome, genome)

        # different user
        self.wsClient.set_permissions({'workspace': self.wsName, 'new_permission': 'w',
                                       'users': [self.ctx2['user_id']]})

        token2 = self.ctx2['token']
        dfu2 = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token2)
        shock_ret = dfu2.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})
        node = shock_ret['shock_id']
        self.nodes_to_delete.append(node)
        hid = shock_ret['handle']['hid']
    
        # grant user1 read access to node
        user1 = self.ctx['user_id']
        acl = 'read'
        url = self.shockURL + '/node/' + node + '/acl'
        url += '/' + acl + '?users=' + urllib.parse.quote(user1)
        auth_header = {'Authorization': 'OAuth {}'.format(token2)}
        req = requests.put(url, headers=auth_header, allow_redirects=True)
        if not req.ok:
            err = json.loads(req.content)['error'][0]
            print('response error: {}'.format(err))

        genome = {'genbank_handle_ref': hid}
        origin_genome = genome.copy()
        self.genome_interface._own_handle(genome, 'genbank_handle_ref')
        self.assertNotEqual(origin_genome['genbank_handle_ref'], 
                            genome['genbank_handle_ref'])
Example #2
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
         function for creating html report
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'VariationReport' + str(uuid.uuid4())
        report = KBaseReport(callback_url)

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'Variation HTML report'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #3
def create_report(callback_url, scratch, workspace_name, result_file):

    html = create_html_tables(result_file)

    dfu = DataFileUtil(callback_url)
    report_name = 'fastANI_report_' + str(uuid.uuid4())
    report_client = KBaseReport(callback_url)
    html_dir = os.path.join(scratch, report_name)
    os.mkdir(html_dir)

    with open(os.path.join(html_dir, "index.html"), 'w') as file:
        file.write(html)

    shock = dfu.file_to_shock({
        'file_path': html_dir,
        'make_handle': 0,
        'pack': 'zip'
    })
    html_file = {
        'shock_id': shock['shock_id'],
        'name': 'index.html',
        'label': 'html_files',
        'description': 'FastANI HTML report'
    }
    report = report_client.create_extended_report({
        'direct_html_link_index': 0,
        'html_links': [html_file],
        'report_object_name': report_name,
        'workspace_name': workspace_name
    })

    return {'report_name': report['name'], 'report_ref': report['ref']}
Example #4
    def _upload_report(self, report_dir, file_links, workspace_name,
                       saved_objects):
        dfu = DataFileUtil(self.callback_url)
        upload_info = dfu.file_to_shock({
            'file_path': report_dir,
            'pack': 'zip'
        })
        shock_id = upload_info['shock_id']

        report_params = {
            'message': 'JGI metagenome assembly report',
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': shock_id,
                'name': 'index.html',
                'description': 'assembly report'
            }],
            'file_links': file_links,
            'report_object_name': 'JGI_assembly_pipeline.' + str(uuid.uuid4()),
            'workspace_name': workspace_name,
            'objects_created': saved_objects
        }

        report_client = KBaseReport(self.callback_url)
        report = report_client.create_extended_report(report_params)
        return {'report_ref': report['ref'], 'report_name': report['name']}
Example #5
 def test_simple_shock_upload(self):
     ### Test for upload from SHOCK - upload the file to shock first
     print('attempting upload through shock')
     gbk_path = "data/e_coli/GCF_000005845.2_ASM584v2_genomic.gbff"
     data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     shutil.copy(gbk_path, self.cfg['scratch'])
     shock_id = data_file_cli.file_to_shock({
         'file_path':
         os.path.join(self.cfg['scratch'],
                      gbk_path.split("/")[-1])
     })['shock_id']
     print("Running test")
     ws_obj_name2 = 'MyGenome.2'
     result = self.getImpl().genbank_to_genome(
         self.getContext(), {
             'file': {
                 'shock_id': shock_id
             },
             'workspace_name': self.getWsName(),
             'genome_name': ws_obj_name2,
         })[0]
     self.assertIsNotNone(result['genome_ref'])
     self.assertTrue(
         int(result['genome_info'][10]['Number of Protein Encoding Genes'])
         > 0)
Example #6
 def test_handles(self):
     wsName = self.generatePesudoRandomWorkspaceName()
     self.ws.set_permissions({'workspace': wsName, 'new_permission': 'w',
                              'users': [self.ctx2['user_id']]})
     temp_shock_file = "/kb/module/work/tmp/shock1.txt"
     with open(temp_shock_file, "w") as f1:
         f1.write("Test Shock Handle")
     token1 = self.ctx['token']
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token1)
     handle1 = dfu.file_to_shock({'file_path': temp_shock_file, 'make_handle': 1})['handle']
     hid1 = handle1['hid']
     genome_name = "Genome.1"
     self.impl.save_one_genome_v1(self.ctx, {
         'workspace': wsName, 'name': genome_name, 'data': {
             'id': "qwerty", 'scientific_name': "Qwerty",
             'domain': "Bacteria", 'genetic_code': 11,
             'genbank_handle_ref': hid1}
         })
     genome = self.impl.get_genome_v1(self.ctx2, {'genomes': [{'ref': wsName + '/' + genome_name}
                                                              ]})[0]['genomes'][0]['data']
     self.impl.save_one_genome_v1(self.ctx2, {'workspace': wsName, 'name': genome_name,
                                              'data': genome})[0]
     genome = self.impl.get_genome_v1(self.ctx2, {'genomes': [{'ref': wsName + '/' + genome_name}
                                                              ]})[0]['genomes'][0]['data']
     self.assertTrue('genbank_handle_ref' in genome)
     hid2 = genome['genbank_handle_ref']
     self.assertNotEqual(hid1, hid2)
Example #7
def make_fake_expression(callback_url, dummy_file, name, genome_ref,
                         annotation_ref, alignment_ref, ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqExpression object and returns a ref to it.
    genome_ref: reference to a genome object
    annotation_ref: reference to a KBaseRNASeq.GFFAnnotation
    alignment_ref: reference to a KBaseRNASeq.RNASeqAlignment
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    exp = {
        "id": "fake",
        "type": "fake",
        "numerical_interpretation": "fake",
        "expression_levels": {
            "feature_1": 0,
            "feature_2": 1,
            "feature_3": 2
        },
        "genome_id": genome_ref,
        "annotation_id": annotation_ref,
        "mapped_rnaseq_alignment": {
            "id1": alignment_ref
        },
        "condition": "",
        "tool_used": "none",
        "tool_version": "0.0.0",
        "file": dummy_shock_info['handle']
    }
    return make_fake_object(exp, "KBaseRNASeq.RNASeqExpression", name, ws_name,
                            ws_client)
Example #8
def make_fake_alignment(callback_url, dummy_file, name, reads_ref, genome_ref,
                        ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqAlignment object and returns a ref to it.
    callback_url: needed for DataFileUtil,
    dummy_file: path to some dummy "alignment" file (make it small - needs to be uploaded to shock)
    name: the name of the object
    reads_ref: a reference to a valid (probably fake) reads library
    genome_ref: a reference to a valid (also probably fake) genome
    workspace_name: the name of the workspace to save this object
    workspace_client: a Workspace client tuned to the server of your choice
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    fake_alignment = {
        "file": dummy_shock_info['handle'],
        "library_type": "fake",
        "read_sample_id": reads_ref,
        "condition": "fake",
        "genome_id": genome_ref
    }
    return make_fake_object(fake_alignment, "KBaseRNASeq.RNASeqAlignment",
                            name, ws_name, ws_client)
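
The docstring above spells out what each argument is for; the sketch below shows how make_fake_alignment might be wired into a test setup. Everything in it (the import path, the workspace URL, the placeholder refs and the dummy file) is an assumption for illustration, not something taken from the examples on this page.

# Hedged usage sketch for make_fake_alignment; all literals are placeholders.
import os
from installed_clients.WorkspaceClient import Workspace  # import path may differ by SDK version

callback_url = os.environ['SDK_CALLBACK_URL']
ws_client = Workspace(os.environ.get('WS_URL', 'https://appdev.kbase.us/services/ws'))
ws_name = 'my_test_workspace'                  # assumed workspace
reads_ref = '12345/6/7'                        # placeholder ref to a fake reads library
genome_ref = '12345/8/9'                       # placeholder ref to a fake genome

dummy_file = '/kb/module/work/tmp/dummy.bam'   # keep it small; it is uploaded to shock
with open(dummy_file, 'w') as fh:
    fh.write('fake alignment data')

alignment_ref = make_fake_alignment(callback_url, dummy_file, 'fake_alignment_1',
                                    reads_ref, genome_ref, ws_name, ws_client)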
Example #9
def create_html_report(callback_url, scratch, workspace_name):
    '''
    function for creating html report
    '''
    output_dir = os.path.join(scratch, 'output')
    dfu = DataFileUtil(callback_url)
    report_name = 'METABOLIC_report_' + str(uuid.uuid4())
    report = KBaseReport(callback_url)
    copyfile(os.path.join(os.path.dirname(__file__), 'report_template.html'),
             os.path.join(output_dir, 'report_template.html'))

    report_shock_id = dfu.file_to_shock({
        'file_path': output_dir,
        'pack': 'zip'
    })['shock_id']

    html_file = {
        'shock_id': report_shock_id,
        'name': 'report_template.html',
        'label': 'report_template.html',
        'description': 'HTML report for METABOLIC'
    }

    report_info = report.create_extended_report({
        'direct_html_link_index': 0,
        'html_links': [html_file],
        'report_object_name': report_name,
        'workspace_name': workspace_name
    })
    return {
        'report_name': report_info['name'],
        'report_ref': report_info['ref']
    }
Example #10
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
        function for creating html report
        '''
        dfu = DataFileUtil(callback_url)
        report_name = 'kb_gsea_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)
        #copyfile(os.path.join(os.path.dirname(__file__), 'index.html'),
        #         os.path.join(output_dir, 'index.html'))

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #11
def package_directory(callback_url, dir_path, zip_file_name, zip_file_description):
    ''' Simple utility for packaging a folder and saving to shock '''
    dfu = DataFileUtil(callback_url)
    output = dfu.file_to_shock({'file_path': dir_path,
                                'make_handle': 0,
                                'pack': 'zip'})
    return {'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description}
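
A hedged usage sketch for package_directory: zip a scratch folder and attach it to an extended report. The import path, folder path, and workspace name are assumptions; note that the 'name' of an html_links entry should point at the entry HTML file inside the zip, so it is overridden here.

# Hedged sketch only; paths, names and the import location are assumptions.
import os
from installed_clients.KBaseReportClient import KBaseReport  # import path may differ by SDK version

callback_url = os.environ['SDK_CALLBACK_URL']
zipped = package_directory(callback_url, '/kb/module/work/tmp/html_out',
                           'html_out.zip', 'Zipped HTML output')

report = KBaseReport(callback_url)
report_info = report.create_extended_report({
    'html_links': [dict(zipped, name='index.html')],  # entry file inside the zip
    'direct_html_link_index': 0,
    'report_object_name': 'my_report',     # assumed object name
    'workspace_name': 'my_workspace'       # assumed workspace
})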
Example #12
    def _put_cached_index(self, assembly_info, index_files_basename,
                          output_dir, ws_for_cache):

        if not ws_for_cache:
            print(
                'WARNING: bowtie2 index cannot be cached because "ws_for_cache" field not set'
            )
            return False

        try:
            dfu = DataFileUtil(self.callback_url)
            result = dfu.file_to_shock({
                'file_path': output_dir,
                'make_handle': 1,
                'pack': 'targz'
            })

            bowtie2_index = {
                'handle': result['handle'],
                'size': result['size'],
                'assembly_ref': assembly_info['ref'],
                'index_files_basename': index_files_basename
            }

            ws = Workspace(self.ws_url)
            save_params = {
                'objects': [{
                    'hidden': 1,
                    'provenance': self.provenance,
                    'name': os.path.basename(output_dir),
                    'data': bowtie2_index,
                    'type': 'KBaseRNASeq.Bowtie2IndexV2'
                }]
            }
            if ws_for_cache.strip().isdigit():
                save_params['id'] = int(ws_for_cache)
            else:
                save_params['workspace'] = ws_for_cache.strip()
            save_result = ws.save_objects(save_params)
            print('Bowtie2IndexV2 cached to: ')
            pprint(save_result[0])
            return True

        except Exception:
            # if we fail in saving the cached object, don't worry
            print(
                'WARNING: exception encountered when trying to cache the index files:'
            )
            print(traceback.format_exc())
            print(
                'END WARNING: exception encountered when trying to cache the index files'
            )

        return False
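
_put_cached_index only writes the cache. As a companion, here is a hedged sketch of how the cached Bowtie2IndexV2 object might be read back and unpacked with DataFileUtil.shock_to_file; the method name and retrieval flow are assumptions, and only the field names mirror the object saved above.

    def _get_cached_index(self, cached_index_ref, output_dir):
        # Hedged sketch (not part of the original class): fetch the cached
        # Bowtie2IndexV2 object saved above and unpack its tarball.
        ws = Workspace(self.ws_url)
        cached = ws.get_objects2({'objects': [{'ref': cached_index_ref}]})['data'][0]['data']

        dfu = DataFileUtil(self.callback_url)
        dfu.shock_to_file({
            'handle_id': cached['handle']['hid'],  # handle stored by _put_cached_index
            'file_path': output_dir,
            'unpack': 'unpack'                     # untar and uncompress the targz archive
        })
        return cached['index_files_basename']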
Example #13
def read_sdf(file_path,
             inchi_path='/kb/module/data/Inchikey_IDs.json',
             mol2_file_dir=None,
             callback_url=None):

    inchi_dict = json.load(open(inchi_path))
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    sdf = AllChem.SDMolSupplier(file_path.encode('ascii', 'ignore'))
    compounds = []
    for i, mol in enumerate(sdf):
        user_id = mol.GetPropsAsDict().get('id')
        print('Found compound ID: {}'.format(user_id))
        handle_id = None
        if user_id and mol2_file_dir:
            mol2_file_path = None
            for root, dirs, files in os.walk(mol2_file_dir):
                for file in files:
                    if os.path.splitext(file)[0] == user_id:
                        logging.info(
                            'Found a matching mol2 file {} for compound {}'.
                            format(str(file), user_id))
                        mol2_file_path = os.path.join(root, str(file))

            if mol2_file_path:
                dfu = DataFileUtil(callback_url)
                handle_id = dfu.file_to_shock({
                    'file_path': mol2_file_path,
                    'make_handle': True
                })['handle']['hid']
            else:
                logging.warning(
                    'Unable to find a matching mol2 file for compound: {}'.
                    format(user_id))

        comp = _make_compound_info(mol)
        comp['name'] = mol.GetProp("_Name")
        comp['mol'] = AllChem.MolToMolBlock(mol)
        if comp['inchikey'] in inchi_dict:
            comp['kb_id'] = inchi_dict[comp['inchikey']]
        else:
            comp['kb_id'] = '%s_%s' % (file_name, i + 1)

        if user_id:
            comp['id'] = user_id
        else:
            comp['id'] = comp['kb_id']

        if handle_id:
            comp['mol2_handle_ref'] = handle_id
            comp['mol2_source'] = 'user uploaded'

        compounds.append(comp)
    return compounds
Example #14
    def test_basic_upload_and_download(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "trimmed.fasta"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'MyNewAssembly'
        result = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'file': {
                    'path': fasta_path
                },
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name,
                'taxon_ref': 'ReferenceTaxons/unknown_taxon',
            })
        pprint(result)
        self.check_fasta_file(ws_obj_name, fasta_path)
        # NOTE: early return; the shock and FTP upload checks below are skipped
        return

        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shock_id = data_file_cli.file_to_shock({'file_path':
                                                fasta_path})['shock_id']
        ws_obj_name2 = 'MyNewAssembly.2'
        result2 = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'shock_id': shock_id,
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name2
            })
        pprint(result2)
        self.check_fasta_file(ws_obj_name2, fasta_path)

        print('attempting upload via ftp url')
        ftp_url = 'ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_8_collection/acaryochloris_marina_mbic11017/dna/Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz'
        ws_obj_name3 = 'MyNewAssembly.3'
        result3 = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'ftp_url': ftp_url,
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name3
            })
        pprint(result3)
        # todo: add checks here on ws object

        ws_obj_name3 = 'MyNewAssembly.3'
        result4 = assemblyUtil.export_assembly_as_fasta(
            self.getContext(),
            {'input_ref': self.getWsName() + '/' + ws_obj_name3})
        pprint(result4)
Example #15
def make_fake_annotation(callback_url, dummy_file, name, ws_name, ws_client):
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    annotation = {
        "handle": dummy_shock_info['handle'],
        "size": 0,
        "genome_id": "not_a_real_genome",
        "genome_scientific_name": "Genomus falsus"
    }
    return make_fake_object(annotation, "KBaseRNASeq.GFFAnnotation", name,
                            ws_name, ws_client)
Example #16
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
         function for creating html report
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'kb_gsea_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)
        report_dir = "localhost"
        #htmlstring = "<a href=" + report_dir + "/jbrowse/index.html>report link</a>"
        htmlstring = "<a href='./jbrowse/index.html'>report link</a>"
        index_file_path = output_dir + "/index.html"
        with open(index_file_path, "wt") as html_file:
            html_file.write(htmlstring)
        # Source path
        #source = "/kb/module/deps/jbrowse"

        # Destination path
        #destination = output_dir +"/jbrowse"

        #dest = shutil.copytree(source, destination)
        #os.system("cp -r " + source +" "+ destination)

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #17
 def package_folder(self, folder_path, zip_file_name, zip_file_description):
     ''' Simple utility for packaging a folder and saving to shock '''
     if folder_path == self.scratch:
         raise ValueError("cannot package scratch itself.  folder path: "+folder_path)
     elif not folder_path.startswith(self.scratch):
         raise ValueError ("cannot package folder that is not a subfolder of scratch.  folder path: "+folder_path)
     dfu = DataFileUtil(self.callback_url)
     if not os.path.exists(folder_path):
         raise ValueError ("cannot package folder that doesn't exist: "+folder_path)
     output = dfu.file_to_shock({'file_path': folder_path,
                                 'make_handle': 0,
                                 'pack': 'zip'})
     return {'shock_id': output['shock_id'],
             'name': zip_file_name,
             'label': zip_file_description}
Example #18
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
        function for creating html report
        :param callback_url:
        :param output_dir:
        :param workspace_name:
        :return:
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'kb_variant_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)
        index_file_path = output_dir + "/index.html"
        htmlstring = self.create_enrichment_report("snpEff_genes.txt",
                                                   output_dir)

        try:
            with open(output_dir + "/index.html", "w") as html_file:
                html_file.write(htmlstring + "\n")
        except IOError:
            print("Unable to write " + index_file_path + " file on disk.")

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #19
def generate_product_report(callback_url,
                            workspace_name,
                            output_dir,
                            product_html_loc,
                            output_files,
                            output_objects=None):
    # check params
    if output_objects is None:
        output_objects = []

    # setup utils
    datafile_util = DataFileUtil(callback_url)
    report_util = KBaseReport(callback_url)

    # move html to main directory uploaded to shock so kbase can find it
    html_file = os.path.join(output_dir, 'product.html')
    os.rename(product_html_loc, html_file)
    report_shock_id = datafile_util.file_to_shock({
        'file_path': output_dir,
        'pack': 'zip'
    })['shock_id']
    html_report = [{
        'shock_id': report_shock_id,
        'name': os.path.basename(html_file),
        'label': os.path.basename(html_file),
        'description': 'DRAM product.'
    }]
    report = report_util.create_extended_report({
        'message': 'Here are the results from your DRAM run.',
        'workspace_name': workspace_name,
        'html_links': html_report,
        'direct_html_link_index': 0,
        'file_links': [
            value for key, value in output_files.items()
            if value['path'] is not None
        ],
        'objects_created': output_objects,
    })
    return report
Example #20
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
         function for creating html report
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'kb_gsea_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)

        htmlstring = "<a href='./jbrowse/index.html'>report link</a>"
        index_file_path = output_dir + "/index.html"

        try:
            with open(index_file_path, "wt") as html_file:
                n = html_file.write(htmlstring)
        except IOError:
            print("Unable to write " + index_file_path + " file on disk.")
        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
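
Each create_html_report variant above returns the same {'report_name', 'report_ref'} pair. As a hedged sketch, an app's entry point typically just forwards that pair; the method and helper names below are assumptions.

    def run_my_app(self, ctx, params):
        # Hypothetical entry point: build outputs, then attach the HTML report.
        output_dir = self._build_outputs(params)   # assumed helper that fills output_dir
        report = self.create_html_report(self.callback_url, output_dir,
                                         params['workspace_name'])
        return [{
            'report_name': report['report_name'],
            'report_ref': report['report_ref']
        }]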
Example #21
class htmlreportutils:
    def __init__(self):
        callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(callback_url)
        self.report = KBaseReport(callback_url)
        pass

    def create_html_report(self, output_dir, workspace_name, objects_created):
        '''
         function for creating html report
        '''

        report_name = 'VariationReport' + str(uuid.uuid4())

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'Variation HTML report'
        }

        report_info = self.report.create_extended_report({
            'objects_created': objects_created,
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #22
    def create_html_report(self, callback_url, output_dir, workspace_name):
        '''
         function for creating html report
        '''

        dfu = DataFileUtil(callback_url)
        report_name = 'kb_gsea_report_' + str(uuid.uuid4())
        report = KBaseReport(callback_url)

        htmlstring = self.format_files_to_html_report(output_dir)
        index_file_path = output_dir + "/index.html"
        with open(index_file_path, "wt") as html_file:
            html_file.write(htmlstring)

        report_shock_id = dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_file = {
            'shock_id': report_shock_id,
            'name': 'index.html',
            'label': 'index.html',
            'description': 'HTML report for GSEA'
        }

        report_info = report.create_extended_report({
            'direct_html_link_index': 0,
            'html_links': [html_file],
            'report_object_name': report_name,
            'workspace_name': workspace_name
        })
        return {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
Example #23
    def run_Gblocks(self, ctx, params):
        """
        Method for trimming MSAs of either DNA or PROTEIN sequences
        **
        **        input_type: MSA
        **        output_type: MSA
        :param params: instance of type "Gblocks_Params" (Gblocks Input
           Params) -> structure: parameter "workspace_name" of type
           "workspace_name" (** The workspace object refs are of form: ** ** 
           objects = ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "desc" of String, parameter "input_ref" of type
           "data_obj_ref", parameter "output_name" of type "data_obj_name",
           parameter "trim_level" of Long, parameter "min_seqs_for_conserved"
           of Long, parameter "min_seqs_for_flank" of Long, parameter
           "max_pos_contig_nonconserved" of Long, parameter "min_block_len"
           of Long, parameter "remove_mask_positions_flag" of Long
        :returns: instance of type "Gblocks_Output" (Gblocks Output) ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN run_Gblocks
        console = []
        invalid_msgs = []
        self.log(console,'Running run_Gblocks with params=')
        self.log(console, "\n"+pformat(params))
        report = ''
#        report = 'Running run_Gblocks with params='
#        report += "\n"+pformat(params)


        #### do some basic checks
        #
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'input_ref' not in params:
            raise ValueError('input_ref parameter is required')
        if 'output_name' not in params:
            raise ValueError('output_name parameter is required')


        #### Get the input_ref MSA object
        ##
        try:
            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            objects = ws.get_objects([{'ref': params['input_ref']}])
            data = objects[0]['data']
            info = objects[0]['info']
            input_name = info[1]
            input_type_name = info[2].split('.')[1].split('-')[0]

        except Exception as e:
            raise ValueError('Unable to fetch input_ref object from workspace: ' + str(e))
            #to get the full stack trace: traceback.format_exc()

        if input_type_name == 'MSA':
            MSA_in = data
            row_order = []
            default_row_labels = dict()
            if 'row_order' in MSA_in.keys():
                row_order = MSA_in['row_order']
            else:
                row_order = sorted(MSA_in['alignment'].keys())

            if 'default_row_labels' in MSA_in.keys():
                default_row_labels = MSA_in['default_row_labels']
            else:
                for row_id in row_order:
                    default_row_labels[row_id] = row_id
            if len(row_order) < 2:
                self.log(invalid_msgs,"must have multiple records in MSA: "+params['input_ref'])

            # export features to FASTA file
            input_MSA_file_path = os.path.join(self.scratch, input_name+".fasta")
            self.log(console, 'writing fasta file: '+input_MSA_file_path)
            records = []
            for row_id in row_order:
                #self.log(console,"row_id: '"+row_id+"'")  # DEBUG
                #self.log(console,"alignment: '"+MSA_in['alignment'][row_id]+"'")  # DEBUG
            # using SeqIO makes multiline sequences.  (Gblocks doesn't care, but FastTree doesn't like multiline, and I don't care enough to change code)
                #record = SeqRecord(Seq(MSA_in['alignment'][row_id]), id=row_id, description=default_row_labels[row_id])
                #records.append(record)
            #SeqIO.write(records, input_MSA_file_path, "fasta")
                records.extend(['>'+row_id,
                                MSA_in['alignment'][row_id]
                               ])
            with open(input_MSA_file_path,'w',0) as input_MSA_file_handle:
                input_MSA_file_handle.write("\n".join(records)+"\n")


            # Determine whether nuc or protein sequences
            #
            NUC_MSA_pattern = re.compile("^[\.\-_ACGTUXNRYSWKMBDHVacgtuxnryswkmbdhv \t\n]+$")
            all_seqs_nuc = True
            for row_id in row_order:
                #self.log(console, row_id+": '"+MSA_in['alignment'][row_id]+"'")
                if NUC_MSA_pattern.match(MSA_in['alignment'][row_id]) == None:
                    all_seqs_nuc = False
                    break

        # Missing proper input_type
        #
        else:
            raise ValueError('Cannot yet handle input_ref type of: '+input_type_name)


        # DEBUG: check the MSA file contents
#        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA_LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA_LINE: '"+line+"'")


        # validate input data
        #
        N_seqs = 0
        L_first_seq = 0
        with open(input_MSA_file_path, 'r', 0) as input_MSA_file_handle:
            for line in input_MSA_file_handle:
                if line.startswith('>'):
                    N_seqs += 1
                    continue
                if L_first_seq == 0:
                    for c in line:
                        if c != '-' and c != ' ' and c != "\n":
                            L_first_seq += 1
        # min_seqs_for_conserved
        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            if int(params['min_seqs_for_conserved']) < int(0.5*N_seqs)+1:
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be >= N/2+1 (N="+str(N_seqs)+", N/2+1="+str(int(0.5*N_seqs)+1)+")\n")
            if int(params['min_seqs_for_conserved']) > int(params['min_seqs_for_flank']):
                self.log(invalid_msgs,"Min Seqs for Conserved Pos ("+str(params['min_seqs_for_conserved'])+") must be <= Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+")\n")

        # min_seqs_for_flank
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            if int(params['min_seqs_for_flank']) > N_seqs:
                self.log(invalid_msgs,"Min Seqs for Flank Pos ("+str(params['min_seqs_for_flank'])+") must be <= N (N="+str(N_seqs)+")\n")

        # max_pos_contig_nonconserved
        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) != 0:
            if int(params['max_pos_contig_nonconserved']) < 0:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be >= 0"+"\n")
            if int(params['max_pos_contig_nonconserved']) > L_first_seq or int(params['max_pos_contig_nonconserved']) >= 32000:
                self.log(invalid_msgs,"Max Num Non-Conserved Pos ("+str(params['max_pos_contig_nonconserved'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # min_block_len
        if 'min_block_len' in params and params['min_block_len'] != None and int(params['min_block_len']) != 0:
            if int(params['min_block_len']) < 2:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be >= 2"+"\n")
            if int(params['min_block_len']) > L_first_seq or int(params['min_block_len']) >= 32000:
                self.log(invalid_msgs,"Min Block Len ("+str(params['min_block_len'])+") must be <= L first seq ("+str(L_first_seq)+") and < 32000\n")

        # trim_level
        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            if int(params['trim_level']) < 0 or int(params['trim_level']) > 2:
                self.log(invalid_msgs,"Trim Level ("+str(params['trim_level'])+") must be >= 0 and <= 2"+"\n")


        if len(invalid_msgs) > 0:

            # load the method provenance from the context object
            self.log(console,"SETTING PROVENANCE")  # DEBUG
            provenance = [{}]
            if 'provenance' in ctx:
                provenance = ctx['provenance']
            # add additional info to provenance here, in this case the input data object reference
            provenance[0]['input_ws_objects'] = []
            provenance[0]['input_ws_objects'].append(params['input_ref'])
            provenance[0]['service'] = 'kb_gblocks'
            provenance[0]['method'] = 'run_Gblocks'

            # report
            report += "FAILURE\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report_obj_info = ws.save_objects({
#                'id':info[6],
                'workspace':params['workspace_name'],
                'objects':[
                    {
                        'type':'KBaseReport.Report',
                        'data':reportObj,
                        'name':reportName,
                        'meta':{},
                        'hidden':1,
                        'provenance':provenance
                    }
                ]
            })[0]


            self.log(console,"BUILDING RETURN OBJECT")
            returnVal = { 'report_name': reportName,
                          'report_ref': str(report_obj_info[6]) + '/' + str(report_obj_info[0]) + '/' + str(report_obj_info[4])
#                          'output_ref': None
                          }
            self.log(console,"run_Gblocks DONE")
            return [returnVal]


        ### Construct the command
        #
        #  e.g.
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks
        #
        gblocks_cmd = [self.GBLOCKS_bin]

        # check for necessary files
        if not os.path.isfile(self.GBLOCKS_bin):
            raise ValueError("no such file '"+self.GBLOCKS_bin+"'")
        if not os.path.isfile(input_MSA_file_path):
            raise ValueError("no such file '"+input_MSA_file_path+"'")
        if not os.path.getsize(input_MSA_file_path) > 0:
            raise ValueError("empty file '"+input_MSA_file_path+"'")

        # DEBUG
#        with open(input_MSA_file_path,'r',0) as input_MSA_file_handle:
#            for line in input_MSA_file_handle:
#                #self.log(console,"MSA LINE: '"+line+"'")  # too big for console
#                self.log(invalid_msgs,"MSA LINE: '"+line+"'")


        # set the output path
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000)
        output_dir = os.path.join(self.scratch,'output.'+str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Gblocks names output blocks MSA by appending "-gb" to input file
        #output_GBLOCKS_file_path = os.path.join(output_dir, input_name+'-gb')
        output_GBLOCKS_file_path = input_MSA_file_path+'-gb'
        output_aln_file_path = output_GBLOCKS_file_path

        # Gblocks is interactive and only accepts args from pipe input
        #if 'arg' in params and params['arg'] != None and params['arg'] != 0:
        #    fasttree_cmd.append('-arg')
        #    fasttree_cmd.append(val)


        # Run GBLOCKS, capture output as it happens
        #
        self.log(console, 'RUNNING GBLOCKS:')
        self.log(console, '    '+' '.join(gblocks_cmd))
#        report += "\n"+'running GBLOCKS:'+"\n"
#        report += '    '+' '.join(gblocks_cmd)+"\n"

        # FastTree requires shell=True in order to see input data
        env = os.environ.copy()
        #joined_fasttree_cmd = ' '.join(fasttree_cmd)  # redirect out doesn't work with subprocess unless you join command first
        #p = subprocess.Popen([joined_fasttree_cmd], \
        p = subprocess.Popen(gblocks_cmd, \
                             cwd = self.scratch, \
                             stdin = subprocess.PIPE, \
                             stdout = subprocess.PIPE, \
                             stderr = subprocess.PIPE, \
                             shell = True, \
                             env = env)
#                             executable = '/bin/bash' )

        
        # write commands to process
        #
        #  for "0.5" gaps: cat "o\n<MSA_file>\nb\n5\ng\nm\nq\n" | Gblocks
        #  for "all" gaps: cat "o\n<MSA_file>\nb\n5\n5\ng\nm\nq\n" | Gblocks

        p.stdin.write("o"+"\n")  # open MSA file
        p.stdin.write(input_MSA_file_path+"\n")

        if 'trim_level' in params and params['trim_level'] != None and int(params['trim_level']) != 0:
            p.stdin.write("b"+"\n")
            if int(params['trim_level']) >= 1:
                self.log (console,"changing trim level")
                p.stdin.write("5"+"\n")  # set to "half"
                if int(params['trim_level']) == 2:
                    self.log (console,"changing trim level")
                    p.stdin.write("5"+"\n")  # set to "all"
                elif int(params['trim_level']) > 2:
                    raise ValueError ("trim_level ("+str(params['trim_level'])+") was not between 0-2")
                p.stdin.write("m"+"\n")

        # flank must precede conserved because it acts as upper bound for acceptable conserved values
        if 'min_seqs_for_flank' in params and params['min_seqs_for_flank'] != None and int(params['min_seqs_for_flank']) != 0:
            self.log (console,"changing min_seqs_for_flank")
            p.stdin.write("b"+"\n")
            p.stdin.write("2"+"\n")
            p.stdin.write(str(params['min_seqs_for_flank'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_seqs_for_conserved' in params and params['min_seqs_for_conserved'] != None and int(params['min_seqs_for_conserved']) != 0:
            self.log (console,"changing min_seqs_for_conserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("1"+"\n")
            p.stdin.write(str(params['min_seqs_for_conserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'max_pos_contig_nonconserved' in params and params['max_pos_contig_nonconserved'] != None and int(params['max_pos_contig_nonconserved']) > -1:
            self.log (console,"changing max_pos_contig_nonconserved")
            p.stdin.write("b"+"\n")
            p.stdin.write("3"+"\n")
            p.stdin.write(str(params['max_pos_contig_nonconserved'])+"\n")
            p.stdin.write("m"+"\n")

        if 'min_block_len' in params and params['min_block_len'] != None and params['min_block_len'] != 0:
            self.log (console,"changing min_block_len")
            p.stdin.write("b"+"\n")
            p.stdin.write("4"+"\n")
            p.stdin.write(str(params['min_block_len'])+"\n")
            p.stdin.write("m"+"\n")
        
        p.stdin.write("g"+"\n")  # get blocks
        p.stdin.write("q"+"\n")  # quit
        p.stdin.close()
        p.wait()


        # Read output
        #
        while True:
            line = p.stdout.readline()
            #line = p.stderr.readline()
            if not line: break
            self.log(console, line.replace('\n', ''))

        p.stdout.close()
        #p.stderr.close()
        p.wait()
        self.log(console, 'return code: ' + str(p.returncode))
#        if p.returncode != 0:
        if p.returncode != 1:
            raise ValueError('Error running GBLOCKS, return code: '+str(p.returncode) + 
                '\n\n'+ '\n'.join(console))

        # Check that GBLOCKS produced output
        #
        if not os.path.isfile(output_GBLOCKS_file_path):
            raise ValueError("failed to create GBLOCKS output: "+output_GBLOCKS_file_path)
        elif not os.path.getsize(output_GBLOCKS_file_path) > 0:
            raise ValueError("created empty file for GBLOCKS output: "+output_GBLOCKS_file_path)


        # load the method provenance from the context object
        #
        self.log(console,"SETTING PROVENANCE")  # DEBUG
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = []
        provenance[0]['input_ws_objects'].append(params['input_ref'])
        provenance[0]['service'] = 'kb_gblocks'
        provenance[0]['method'] = 'run_Gblocks'


        # reformat output to single-line FASTA MSA and check that output is not empty (often happens when param combinations don't produce viable blocks)
        #
        output_fasta_buf = []
        id_order = []
        this_id = None
        ids = dict()
        alignment = dict()
        L_alignment = 0;
        L_alignment_set = False
        with open(output_GBLOCKS_file_path,'r',0) as output_GBLOCKS_file_handle:
            for line in output_GBLOCKS_file_handle:
                line = line.rstrip()
                if line.startswith('>'):
                    this_id = line[1:]
                    output_fasta_buf.append ('>'+re.sub('\s','_',default_row_labels[this_id]))
                    id_order.append(this_id)
                    alignment[this_id] = ''
                    if L_alignment != 0 and not L_alignment_set:
                         L_alignment_set = True
                    continue
                output_fasta_buf.append (line)
                for c in line:
                    if c != ' ' and c != "\n":
                        alignment[this_id] += c
                        if not L_alignment_set:
                            L_alignment += 1
        if L_alignment == 0:
            self.log(invalid_msgs,"params produced no blocks.  Consider changing to less stringent values")
        else:
            if 'remove_mask_positions_flag' in params and params['remove_mask_positions_flag'] != None and params['remove_mask_positions_flag'] != '' and params['remove_mask_positions_flag'] == 1:
                self.log (console,"removing mask positions")
                mask = []
                new_alignment = dict()
                for i in range(0,L_alignment):
                    mask.append('+')
                    if alignment[id_order[0]][i] == '-' \
                        or alignment[id_order[0]][i] == 'X' \
                        or alignment[id_order[0]][i] == 'x':
                        mask[i] = '-'
                for row_id in id_order:
                    new_alignment[row_id] = ''
                    for i,c in enumerate(alignment[row_id]):
                         if mask[i] == '+':
                            new_alignment[row_id] += c
                alignment = new_alignment

            L_alignment = len(alignment[id_order[0]])

            # write fasta with tidied ids
            output_MSA_file_path = os.path.join(output_dir, params['output_name']+'.fasta');
            with open(output_MSA_file_path,'w',0) as output_MSA_file_handle:
                output_MSA_file_handle.write("\n".join(output_fasta_buf)+"\n")


        # Upload results
        #
        if len(invalid_msgs) == 0:
            self.log(console,"UPLOADING RESULTS")  # DEBUG

# Didn't write file
#            with open(output_MSA_file_path,'r',0) as output_MSA_file_handle:
#                output_MSA_buf = output_MSA_file_handle.read()
#            output_MSA_buf = output_MSA_buf.rstrip()
#            self.log(console,"\nMSA:\n"+output_MSA_buf+"\n")
        
            # Build output_MSA structure
            #   first extract old info from MSA (labels, ws_refs, etc.)
            #
            MSA_out = dict()
            for key in MSA_in.keys():
                 MSA_out[key] = MSA_in[key]

            # then replace with new info
            #
            MSA_out['alignment'] = alignment
            MSA_out['name'] = params['output_name']
            MSA_out['alignment_length'] = alignment_length = L_alignment
            MSA_name = params['output_name']
            MSA_description = ''
            if 'desc' in params and params['desc'] != None and params['desc'] != '':
                MSA_out['desc'] = MSA_description = params['desc']

            # Store MSA_out
            #
            new_obj_info = ws.save_objects({
                            'workspace': params['workspace_name'],
                            'objects':[{
                                    'type': 'KBaseTrees.MSA',
                                    'data': MSA_out,
                                    'name': params['output_name'],
                                    'meta': {},
                                    'provenance': provenance
                                }]
                        })[0]


            # create CLW formatted output file
            max_row_width = 60
            id_aln_gap_width = 1
            gap_chars = ''
            for sp_i in range(id_aln_gap_width):
                gap_chars += ' '
            # DNA
            if all_seqs_nuc:
                strong_groups = { 'AG': True,
                                  'CTU': True
                                  }
                weak_groups = None
            # PROTEINS
            else:
                strong_groups = { 'AST':  True,
                                  'EKNQ': True,
                                  'HKNQ': True,
                                  'DENQ': True,
                                  'HKQR': True,
                                  'ILMV': True,
                                  'FILM': True,
                                  'HY':   True,
                                  'FWY':  True
                                  }
                weak_groups = { 'ACS':    True,
                                'ATV':    True,
                                'AGS':    True,
                                'KNST':   True,
                                'APST':   True,
                                'DGNS':   True,
                                'DEKNQS': True,
                                'DEHKNQ': True,
                                'EHKNQR': True,
                                'FILMV':  True,
                                'FHY':    True
                                }
                
            clw_buf = []
            clw_buf.append ('CLUSTALW format of GBLOCKS trimmed MSA '+MSA_name+': '+MSA_description)
            clw_buf.append ('')

            long_id_len = 0
            aln_pos_by_id = dict()
            for row_id in row_order:
                aln_pos_by_id[row_id] = 0
                row_id_disp = default_row_labels[row_id]
                if long_id_len < len(row_id_disp):
                    long_id_len = len(row_id_disp)

            full_row_cnt = alignment_length // max_row_width
            if alignment_length % max_row_width == 0:
                full_row_cnt -= 1
            for chunk_i in range (full_row_cnt + 1):
                for row_id in row_order:
                    row_id_disp = re.sub('\s','_',default_row_labels[row_id])
                    for sp_i in range (long_id_len-len(row_id_disp)):
                        row_id_disp += ' '

                    aln_chunk_upper_bound = (chunk_i+1)*max_row_width
                    if aln_chunk_upper_bound > alignment_length:
                        aln_chunk_upper_bound = alignment_length
                    aln_chunk = alignment[row_id][chunk_i*max_row_width:aln_chunk_upper_bound]
                    for c in aln_chunk:
                        if c != '-':
                            aln_pos_by_id[row_id] += 1

                    clw_buf.append (row_id_disp+gap_chars+aln_chunk+' '+str(aln_pos_by_id[row_id]))

                # conservation line
                cons_line = ''
                for pos_i in range(chunk_i*max_row_width, aln_chunk_upper_bound):
                    col_chars = dict()
                    seq_cnt = 0
                    for row_id in row_order:
                        char = alignment[row_id][pos_i]
                        if char != '-':
                            seq_cnt += 1
                            col_chars[char] = True
                    if seq_cnt <= 1:
                        cons_char = ' '
                    elif len(col_chars.keys()) == 1:
                        cons_char = '*'
                    else:
                        strong = False
                        for strong_group in strong_groups.keys():
                            this_strong_group = True
                            for seen_char in col_chars.keys():
                                if seen_char not in strong_group:
                                    this_strong_group = False
                                    break
                            if this_strong_group:
                                strong = True
                                break
                        if not strong:
                            weak = False
                            if weak_groups != None:
                                for weak_group in weak_groups.keys():
                                    this_weak_group = True
                                    for seen_char in col_chars.keys():
                                        if seen_char not in weak_group:
                                            this_weak_group = False
                                            break
                                    if this_weak_group:
                                        weak = True
                        if strong:
                            cons_char = ':'
                        elif weak:
                            cons_char = '.'
                        else:
                            cons_char = ' '
                    cons_line += cons_char

                lead_space = ''
                for sp_i in range(long_id_len):
                    lead_space += ' '
                lead_space += gap_chars

                clw_buf.append(lead_space+cons_line)
                clw_buf.append('')

            # write clw to file
            clw_buf_str = "\n".join(clw_buf)+"\n"
            output_clw_file_path = os.path.join(output_dir, input_name+'-MSA.clw');
            with open (output_clw_file_path, "w", 0) as output_clw_file_handle:
                output_clw_file_handle.write(clw_buf_str)


            # upload GBLOCKS FASTA output to SHOCK for file_links
            dfu = DFUClient(self.callbackURL)
            try:
                output_upload_ret = dfu.file_to_shock({'file_path': output_aln_file_path,
                                                       'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading aln_out file to shock') from e

            # upload GBLOCKS CLW output to SHOCK for file_links
            try:
                output_clw_upload_ret = dfu.file_to_shock({'file_path': output_clw_file_path,
                                                           'make_handle': 0})
            except Exception as e:
                raise ValueError('error loading clw_out file to shock') from e


            # make HTML reports
            # TODO: HTML report generation not yet implemented


            # build output report object
            #
            self.log(console,"BUILDING REPORT")  # DEBUG

            reportName = 'gblocks_report_'+str(uuid.uuid4())
            reportObj = {
                'objects_created':[{'ref':params['workspace_name']+'/'+params['output_name'],
                                    'description':'GBLOCKS MSA'}],
                #'message': '',
                'message': clw_buf_str,
                'direct_html': '',
                #'direct_html_link_index': 0,
                'file_links': [],
                'html_links': [],
                'workspace_name': params['workspace_name'],
                'report_object_name': reportName
                }
            reportObj['file_links'] = [{'shock_id': output_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.FASTA',
                                        'label': 'GBLOCKS-trimmed MSA FASTA'
                                        },
                                       {'shock_id': output_clw_upload_ret['shock_id'],
                                        'name': params['output_name']+'-GBLOCKS.CLW',
                                        'label': 'GBLOCKS-trimmed MSA CLUSTALW'
                                        }]

            # save report object
            #
            SERVICE_VER = 'release'
            reportClient = KBaseReport(self.callbackURL, token=ctx['token'], service_ver=SERVICE_VER)
            #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
            report_info = reportClient.create_extended_report(reportObj)                                       

        else:  # len(invalid_msgs) > 0
            reportName = 'gblocks_report_'+str(uuid.uuid4())
            report += "FAILURE:\n\n"+"\n".join(invalid_msgs)+"\n"
            reportObj = {
                'objects_created':[],
                'text_message':report
                }

            ws = workspaceService(self.workspaceURL, token=ctx['token'])
            report_obj_info = ws.save_objects({
                    #'id':info[6],
                    'workspace':params['workspace_name'],
                    'objects':[
                        {
                            'type':'KBaseReport.Report',
                            'data':reportObj,
                            'name':reportName,
                            'meta':{},
                            'hidden':1,
                            'provenance':provenance
                            }
                        ]
                    })[0]

            report_info = dict()
            report_info['name'] = report_obj_info[1]
            report_info['ref'] = str(report_obj_info[6])+'/'+str(report_obj_info[0])+'/'+str(report_obj_info[4])


        # done
        returnVal = { 'report_name': report_info['name'],
                      'report_ref': report_info['ref']
                      }

        self.log(console,"run_Gblocks DONE")
        #END run_Gblocks

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method run_Gblocks return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
class BaseModule:
    def __init__(self, config, version, name):
        self.config = config
        if "SDK_CALLBACK_URL" in os.environ:
            self.callback_url = os.environ['SDK_CALLBACK_URL']
            self.dfu = DataFileUtil(self.callback_url)
        self.version = version
        self.name = name
        self.scratch_folder = config['scratch']
        logging.basicConfig(format='%(created)s %(levelname)s: %(message)s',
                            level=logging.INFO)
        self.clear_context()
        self.report_html = None

    def validate_args(self, params, required, defaults):
        for item in required:
            if item not in params:
                raise ValueError('Required argument ' + item + ' is missing!')
        for key in defaults:
            if key not in params:
                params[key] = defaults[key]
        return params
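
    # A minimal usage sketch (parameter names hypothetical): validate_args fills in
    # defaults and raises ValueError on missing required keys, e.g.
    #   params = self.validate_args({'genome_ref': '1/2/3'},
    #                               ['genome_ref'],
    #                               {'min_length': 100})
    #   # params is now {'genome_ref': '1/2/3', 'min_length': 100}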

    def clear_context(self):
        self.report_info = None
        self.ctx = None
        self.workspace = None  # reset so finalize_call is safe before initialize_call
        self.output_type = None
        self.output_id = None
        self.wsclient = None

    def finalize_call(self, output):
        if self.report_info is not None:
            output['report_name'] = self.report_info['name']
            output['report_ref'] = self.report_info['ref']
        if self.workspace is not None:
            output['workspace_name'] = self.workspace
            output['ws'] = self.workspace
        if self.output_type is not None:
            output['type'] = self.output_type
            output['obj'] = self.output_id
        return output

    def initialize_call(self,
                        ctx,
                        workspace=None,
                        output_type=None,
                        output_id=None):
        self.clear_context()
        self.workspace = workspace
        self.ctx = ctx
        self.output_type = output_type
        self.output_id = output_id
        self.objects_created = []
        self.wsclient = Workspace(self.config["workspace-url"],
                                  token=self.ctx['token'])

    def add_created_object(self, ref, description):
        self.objects_created.append({"ref": ref, "description": description})

    def create_report(self, context, template_file=None, height=500):
        html_report_folder = os.path.join(self.scratch_folder, 'htmlreport')
        os.makedirs(html_report_folder, exist_ok=True)

        with open(os.path.join(html_report_folder, 'view.html'), 'w') as f:
            self.report_html = self.build_report(context, template_file)
            f.write(self.report_html)

        report_shock_id = ""
        if self.config["save_report_to_kbase"] == "1":
            report_shock_id = self.dfu.file_to_shock({
                'file_path': html_report_folder,
                'pack': 'zip'
            })['shock_id']

        html_output = {'name': 'view.html', 'shock_id': report_shock_id}
        report_params = {
            'objects_created': self.objects_created,
            'workspace_name': self.workspace,
            'html_links': [html_output],
            'direct_html_link_index': 0,
            'html_window_height': height,
            'report_object_name': self.name + '_report_' + str(uuid.uuid4())
        }
        if self.config["save_report_to_kbase"] == "1":
            report = KBaseReport(self.callback_url, token=self.ctx['token'])
            self.report_info = report.create_extended_report(report_params)
        return self.report_html

    def build_report(self, context, template_file=None):
        if template_file is None:
            template_file = self.config["template_file"]
        # Split the template path into its directory and file name
        template_dir, filename = os.path.split(template_file)
        env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_dir),
                                 autoescape=jinja2.select_autoescape(
                                     ['html', 'xml']))
        # Return string of html
        return env.get_template(filename).render(context)
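
# A minimal sketch (not part of the original module) of how an app class might
# build on BaseModule; the subclass name, method name, parameters, config keys
# and template path below are all hypothetical.
class MyModule(BaseModule):
    def __init__(self, config):
        super().__init__(config, version="0.0.1", name="MyModule")

    def run_my_method(self, ctx, params):
        # validate inputs, then set up per-call state
        params = self.validate_args(params, ['workspace_name'], {'min_length': 100})
        self.initialize_call(ctx, workspace=params['workspace_name'],
                             output_type='KBaseGenomes.Genome', output_id='my_output')
        # ... do the real work here, registering any created objects ...
        # self.add_created_object('1/2/3', 'My output object')
        self.create_report({'params': params},
                           template_file='/kb/module/data/my_template.html')
        return self.finalize_call({})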
Exemple #25
0
class BiomUtil:
    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _process_params(self, params):
        logging.info('start validating import_matrix_from_biom params')

        # check for required parameters
        for p in [
                'obj_type', 'matrix_name', 'workspace_id', 'scale',
                'amplicon_type', 'sequencing_technology',
                'sequencing_instrument', 'target_gene', 'target_subfragment',
                'taxon_calling'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # check sequencing_technology and sequencing_instrument matching
        sequencing_technology = params.get('sequencing_technology')
        sequencing_instrument = params.get('sequencing_instrument')
        if sequencing_technology not in SEQ_INSTRUMENTS_MAP:
            raise ValueError('Unexpected sequencing technology: {}'.format(
                sequencing_technology))
        expected_instruments = SEQ_INSTRUMENTS_MAP.get(sequencing_technology)
        if sequencing_instrument not in expected_instruments:
            raise ValueError(
                'Please select sequencing instrument among {} for {}'.format(
                    expected_instruments, sequencing_technology))

        # check target_gene and target_subfragment matching
        target_gene = params.get('target_gene')
        target_subfragment = list(set(params.get('target_subfragment')))
        params['target_subfragment'] = target_subfragment

        if target_gene not in TARGET_GENE_SUBFRAGMENT_MAP:
            raise ValueError('Unexpected target gene: {}'.format(target_gene))
        expected_subfragments = TARGET_GENE_SUBFRAGMENT_MAP.get(target_gene)
        if not set(target_subfragment) <= set(expected_subfragments):
            raise ValueError(
                'Please select target subfragments among {} for {}'.format(
                    expected_subfragments, target_gene))

        # check taxon_calling
        taxon_calling = params.get('taxon_calling')
        taxon_calling_method = list(
            set(taxon_calling.get('taxon_calling_method')))
        params['taxon_calling_method'] = taxon_calling_method

        if 'denoising' in taxon_calling_method:
            denoise_method = taxon_calling.get('denoise_method')
            sequence_error_cutoff = taxon_calling.get('sequence_error_cutoff')

            if not (denoise_method and sequence_error_cutoff):
                raise ValueError(
                    'Please provide denoise_method and sequence_error_cutoff')

            params['denoise_method'] = denoise_method
            params['sequence_error_cutoff'] = sequence_error_cutoff

        if 'clustering' in taxon_calling_method:
            clustering_method = taxon_calling.get('clustering_method')
            clustering_cutoff = taxon_calling.get('clustering_cutoff')

            if not (clustering_method and clustering_cutoff):
                raise ValueError(
                    'Please provide clustering_method and clustering_cutoff')

            params['clustering_method'] = clustering_method
            params['clustering_cutoff'] = clustering_cutoff

        obj_type = params.get('obj_type')
        if obj_type not in self.matrix_types:
            raise ValueError('Unknown matrix object type: {}'.format(obj_type))

        scale = params.get('scale')
        if scale not in SCALE_TYPES:
            raise ValueError('Unknown scale type: {}'.format(scale))

        biom_file = None
        tsv_file = None
        fasta_file = None
        metadata_keys = list(DEFAULT_META_KEYS)  # copy so the module-level default is not mutated below

        input_local_file = params.get('input_local_file', False)

        if params.get('taxonomic_abundance_tsv') and params.get(
                'taxonomic_fasta'):
            tsv_file = params.get('taxonomic_abundance_tsv')
            fasta_file = params.get('taxonomic_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')

            metadata_keys_str = params.get('metadata_keys')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        elif params.get('biom_fasta'):
            biom_fasta = params.get('biom_fasta')
            biom_file = biom_fasta.get('biom_file_biom_fasta')
            fasta_file = biom_fasta.get('fasta_file_biom_fasta')

            if not (biom_file and fasta_file):
                raise ValueError('missing BIOM or FASTA file')

            if not input_local_file:
                biom_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    biom_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')
            mode = 'biom_fasta'
        elif params.get('tsv_fasta'):
            tsv_fasta = params.get('tsv_fasta')
            tsv_file = tsv_fasta.get('tsv_file_tsv_fasta')
            fasta_file = tsv_fasta.get('fasta_file_tsv_fasta')

            if not (tsv_file and fasta_file):
                raise ValueError('missing TSV or FASTA file')

            if not input_local_file:
                tsv_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    tsv_file
                }).get('copy_file_path')

                fasta_file = self.dfu.download_staging_file({
                    'staging_file_subdir_path':
                    fasta_file
                }).get('copy_file_path')

            metadata_keys_str = tsv_fasta.get('metadata_keys_tsv_fasta')
            if metadata_keys_str:
                metadata_keys += [
                    x.strip() for x in metadata_keys_str.split(',')
                ]
            mode = 'tsv_fasta'
        else:
            raise ValueError('missing valid file group type in parameters')

        return (biom_file, tsv_file, fasta_file, mode,
                list(set(metadata_keys)))
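
    # _process_params accepts one of three file-group shapes (the staging paths
    # below are hypothetical examples):
    #   {'taxonomic_abundance_tsv': 'amplicons.tsv', 'taxonomic_fasta': 'amplicons.fasta',
    #    'metadata_keys': 'taxonomy, consensus_sequence', ...}
    #   {'biom_fasta': {'biom_file_biom_fasta': 'table.biom',
    #                   'fasta_file_biom_fasta': 'seqs.fasta'}, ...}
    #   {'tsv_fasta': {'tsv_file_tsv_fasta': 'table.tsv',
    #                  'fasta_file_tsv_fasta': 'seqs.fasta',
    #                  'metadata_keys_tsv_fasta': 'taxonomy'}, ...}
    # and returns (biom_file, tsv_file, fasta_file, mode, metadata_keys).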

    def _validate_fasta_file(self, df, fasta_file):
        logging.info('start validating FASTA file')
        try:
            fastq_dict = SeqIO.index(fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        matrix_ids = df.index
        file_ids = fastq_dict.keys()

        unmatched_ids = set(matrix_ids) - set(file_ids)

        if unmatched_ids:
            raise ValueError(
                'FASTA file is missing OTU id(s): {}'.format(unmatched_ids))

    def _file_to_amplicon_data(self,
                               biom_file,
                               tsv_file,
                               fasta_file,
                               mode,
                               refs,
                               matrix_name,
                               workspace_id,
                               scale,
                               description,
                               metadata_keys=None):

        amplicon_data = refs

        if mode.startswith('biom'):
            logging.info('start parsing BIOM file for matrix data')
            table = biom.load_table(biom_file)
            observation_metadata = table._observation_metadata
            sample_metadata = table._sample_metadata

            matrix_data = {
                'row_ids': table._observation_ids.tolist(),
                'col_ids': table._sample_ids.tolist(),
                'values': table.matrix_data.toarray().tolist()
            }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row", observation_metadata,
                                           matrix_data, matrix_name, refs,
                                           workspace_id))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
            for k in ('create_date', 'generated_by'):
                val = getattr(table, k)
                if not val:
                    continue
                if isinstance(val, bytes):
                    amplicon_data['attributes'][k] = val.decode('utf-8')
                else:
                    amplicon_data['attributes'][k] = str(val)
        elif mode.startswith('tsv'):
            observation_metadata = None
            sample_metadata = None
            try:
                logging.info('start parsing TSV file for matrix data')
                reader = pd.read_csv(tsv_file, sep=None, iterator=True)
                inferred_sep = reader._engine.data.dialect.delimiter
                df = pd.read_csv(tsv_file, sep=inferred_sep, index_col=0)
            except Exception:
                raise ValueError(
                    'Cannot parse file. Please provide a valid TSV file')
            else:
                self._validate_fasta_file(df, fasta_file)
                metadata_df = None
                if metadata_keys:
                    shared_metadata_keys = list(
                        set(metadata_keys) & set(df.columns))
                    if mode == 'tsv' and 'consensus_sequence' not in shared_metadata_keys:
                        raise ValueError(
                            'TSV file does not include consensus_sequence')
                    if shared_metadata_keys:
                        metadata_df = df[shared_metadata_keys]
                        df.drop(columns=shared_metadata_keys, inplace=True)
                try:
                    df = df.astype(float)
                except ValueError:
                    err_msg = 'Found non-numeric values. The matrix may contain only numeric values.\n'
                    err_msg += 'Please list any non-numeric column names in the Metadata Keys field'
                    raise ValueError(err_msg)
                df.fillna(0, inplace=True)
                df.index = df.index.astype('str')
                df.columns = df.columns.astype('str')
                matrix_data = {
                    'row_ids': df.index.tolist(),
                    'col_ids': df.columns.tolist(),
                    'values': df.values.tolist()
                }

            logging.info('start building attribute mapping object')
            amplicon_data.update(
                self.get_attribute_mapping("row",
                                           observation_metadata,
                                           matrix_data,
                                           matrix_name,
                                           refs,
                                           workspace_id,
                                           metadata_df=metadata_df))
            amplicon_data.update(
                self.get_attribute_mapping("col", sample_metadata, matrix_data,
                                           matrix_name, refs, workspace_id))

            amplicon_data['attributes'] = {}
        else:
            raise ValueError(
                'error parsing _file_to_amplicon_data, mode: {}'.format(mode))

        amplicon_data.update({'data': matrix_data})

        amplicon_data['search_attributes'] = [
            f'{k}|{v}' for k, v in amplicon_data['attributes'].items()
        ]

        amplicon_data['scale'] = scale
        if description:
            amplicon_data['description'] = description

        return amplicon_data

    def get_attribute_mapping(self,
                              axis,
                              metadata,
                              matrix_data,
                              matrix_name,
                              refs,
                              workspace_id,
                              metadata_df=None):
        mapping_data = {}
        axis_ids = matrix_data[f'{axis}_ids']
        if refs.get('sample_set_ref') and axis == 'col':
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._sample_set_to_attribute_mapping(
                    axis_ids, refs.get('sample_set_ref'), name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif refs.get(f'{axis}_attributemapping_ref'):
            am_data = self.dfu.get_objects(
                {'object_refs':
                 [refs[f'{axis}_attributemapping_ref']]})['data'][0]['data']
            unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
            if unmatched_ids:
                name = "Column" if axis == 'col' else "Row"
                raise ValueError(
                    f"The following {name} IDs from the uploaded matrix do not match "
                    f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                    f"\nPlease verify the input data or upload an excel file with a"
                    f"{name} mapping tab.")
            else:
                mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._metadata_to_attribute_mapping(
                    axis_ids, metadata, name, workspace_id)
            # if coming from biom file, metadata and axis IDs are guaranteed to match
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}
        elif metadata_df is not None:
            name = matrix_name + "_{}_attributes".format(axis)
            mapping_data[
                f'{axis}_attributemapping_ref'] = self._meta_df_to_attribute_mapping(
                    axis_ids, metadata_df, name, workspace_id)
            mapping_data[f'{axis}_mapping'] = {x: x for x in axis_ids}

        return mapping_data

    def _meta_df_to_attribute_mapping(self, axis_ids, metadata_df, obj_name,
                                      ws_id):
        data = {'ontology_mapping_method': "TSV file", 'instances': {}}
        metadata_df = metadata_df.astype(str)
        attribute_keys = metadata_df.columns.tolist()
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in attribute_keys]

        if 'taxonomy' in attribute_keys:
            data['attributes'].append({
                'attribute': 'parsed_user_taxonomy',
                'source': 'upload'
            })

        for axis_id in axis_ids:
            data['instances'][axis_id] = metadata_df.loc[axis_id].tolist()
            if 'taxonomy' in attribute_keys:
                parsed_user_taxonomy = None
                taxonomy_index = attribute_keys.index('taxonomy')
                taxonomy_str = metadata_df.loc[axis_id].tolist(
                )[taxonomy_index]
                parsed_user_taxonomy = self.taxon_util.process_taxonomic_str(
                    taxonomy_str)
                data['instances'][axis_id].append(parsed_user_taxonomy)

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _sample_set_to_attribute_mapping(self, axis_ids, sample_set_ref,
                                         obj_name, ws_id):

        am_data = self.sampleservice_util.sample_set_to_attribute_mapping(
            sample_set_ref)

        unmatched_ids = set(axis_ids) - set(am_data['instances'].keys())
        if unmatched_ids:
            name = "Column"
            raise ValueError(
                f"The following {name} IDs from the uploaded matrix do not match "
                f"the supplied {name} attribute mapping: {', '.join(unmatched_ids)}"
                f"\nPlease verify the input data or upload an excel file with a"
                f"{name} mapping tab.")

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": am_data,
                "name": obj_name
            }]
        })[0]

        return f'{info[6]}/{info[0]}/{info[4]}'

    def _metadata_to_attribute_mapping(self, instances, metadata, obj_name,
                                       ws_id):
        data = {'ontology_mapping_method': "BIOM file", 'instances': {}}
        sample_set = metadata[0:min(len(metadata), 25)]
        metadata_keys = sorted(
            set((k for m_dict in sample_set for k in m_dict)))
        data['attributes'] = [{
            'attribute': key,
            'source': 'upload'
        } for key in metadata_keys]
        for inst, meta in zip(instances, metadata):
            data['instances'][inst] = [
                str(meta[attr]) for attr in metadata_keys
            ]

        logging.info(
            'start saving AttributeMapping object: {}'.format(obj_name))
        info = self.dfu.save_objects({
            "id":
            ws_id,
            "objects": [{
                "type": "KBaseExperiments.AttributeMapping",
                "data": data,
                "name": obj_name
            }]
        })[0]
        return f'{info[6]}/{info[0]}/{info[4]}'

    def _generate_visualization_content(self, output_directory, heatmap_dir,
                                        data_df, top_heatmap_dir, top_percent,
                                        display_count):

        row_data_summary = data_df.T.describe().round(2).to_string()
        col_data_summary = data_df.describe().round(2).to_string()

        tab_def_content = ''
        tab_content = ''

        viewer_name = 'data_summary'
        tab_def_content += '''\n<div class="tab">\n'''
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += ''' id="defaultOpen"'''
        tab_def_content += '''>Matrix Statistics</button>\n'''

        tab_content += '''\n<div id="{}" class="tabcontent" style="overflow:auto">'''.format(
            viewer_name)
        tab_content += '''\n<h5>Amplicon Matrix Size: {} x {}</h5>'''.format(
            len(data_df.index), len(data_df.columns))
        tab_content += '''\n<h5>Row Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(row_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '''\n<br>'''
        tab_content += '''\n<hr style="height:2px;border-width:0;color:gray;background-color:gray">'''
        tab_content += '''\n<br>'''
        tab_content += '''\n<h5>Column Aggregating Statistics</h5>'''
        html = '''\n<pre class="tab">''' + str(col_data_summary).replace(
            "\n", "<br>") + "</pre>"
        tab_content += html
        tab_content += '\n</div>\n'

        if top_heatmap_dir:
            viewer_name = 'TopHeatmapViewer'
            tab_def_content += '''\n<button class="tablinks" '''
            tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
                viewer_name)
            tab_def_content += '''>Top {}% ({} Rows) Heatmap</button>\n'''.format(
                round(top_percent, 2), display_count)

            heatmap_report_files = os.listdir(top_heatmap_dir)

            heatmap_index_page = None
            for heatmap_report_file in heatmap_report_files:
                if heatmap_report_file.endswith('.html'):
                    heatmap_index_page = heatmap_report_file

                shutil.copy2(
                    os.path.join(top_heatmap_dir, heatmap_report_file),
                    output_directory)

            if heatmap_index_page:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                msg = 'Top {} percent of matrix sorted by sum of abundance values.'.format(
                    round(top_percent, 2))
                tab_content += '''<p style="color:red;" >{}</p>'''.format(msg)

                tab_content += '\n<iframe height="1300px" width="100%" '
                tab_content += 'src="{}" '.format(heatmap_index_page)
                tab_content += 'style="border:none;"></iframe>'
                tab_content += '\n</div>\n'
            else:
                tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                    viewer_name)
                tab_content += '''\n<p style="color:red;" >'''
                tab_content += '''Heatmap is too large to be displayed.</p>\n'''
                tab_content += '\n</div>\n'

        viewer_name = 'MatrixHeatmapViewer'
        tab_def_content += '''\n<button class="tablinks" '''
        tab_def_content += '''onclick="openTab(event, '{}')"'''.format(
            viewer_name)
        tab_def_content += '''>Matrix Heatmap</button>\n'''

        heatmap_report_files = os.listdir(heatmap_dir)

        heatmap_index_page = None
        for heatmap_report_file in heatmap_report_files:
            if heatmap_report_file.endswith('.html'):
                heatmap_index_page = heatmap_report_file

            shutil.copy2(os.path.join(heatmap_dir, heatmap_report_file),
                         output_directory)

        if heatmap_index_page:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)
            tab_content += '\n<iframe height="1300px" width="100%" '
            tab_content += 'src="{}" '.format(heatmap_index_page)
            tab_content += 'style="border:none;"></iframe>'
            tab_content += '\n</div>\n'
        else:
            tab_content += '''\n<div id="{}" class="tabcontent">'''.format(
                viewer_name)
            tab_content += '''\n<p style="color:red;" >'''
            tab_content += '''Heatmap is too large to be displayed.</p>\n'''
            tab_content += '\n</div>\n'

        tab_def_content += '\n</div>\n'
        return tab_def_content + tab_content

    def _generate_heatmap_html_report(self, data):

        logging.info('Start generating heatmap report page')

        data_df = pd.DataFrame(data['values'],
                               index=data['row_ids'],
                               columns=data['col_ids'])
        result_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_directory)
        tsv_file_path = os.path.join(
            result_directory, 'heatmap_data_{}.tsv'.format(str(uuid.uuid4())))
        data_df.to_csv(tsv_file_path)

        if data_df.index.size < 10000:
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'cluster_data':
                True
            })['html_dir']
        else:
            logging.info(
                'Original matrix is too large. Skipping data clustering in the report.'
            )
            heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'cluster_data':
                False
            })['html_dir']
        top_heatmap_dir = None
        top_percent = 100
        display_count = 200  # rough count of rows to display
        if len(data_df.index) > 1000:
            top_percent = min(display_count / data_df.index.size * 100, 100)
            top_heatmap_dir = self.report_util.build_heatmap_html({
                'tsv_file_path':
                tsv_file_path,
                'sort_by_sum':
                True,
                'top_percent':
                top_percent
            })['html_dir']

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info(
            'Start generating html report in {}'.format(output_directory))

        html_report = list()

        self._mkdir_p(output_directory)
        result_file_path = os.path.join(output_directory,
                                        'matrix_viewer_report.html')

        visualization_content = self._generate_visualization_content(
            output_directory, heatmap_dir, data_df, top_heatmap_dir,
            top_percent, display_count)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'templates',
                                 'matrix_template.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>Visualization_Content</p>', visualization_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_directory,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Import Amplicon Matrix App'
        })
        return html_report

    def _generate_report(self,
                         matrix_obj_ref,
                         new_row_attr_ref,
                         new_col_attr_ref,
                         workspace_id,
                         data=None):
        """
        _generate_report: generate summary report
        """

        objects_created = [{
            'ref': matrix_obj_ref,
            'description': 'Imported Amplicon Matrix'
        }]

        if new_row_attr_ref:
            objects_created.append({
                'ref':
                new_row_attr_ref,
                'description':
                'Imported Amplicons(Row) Attribute Mapping'
            })

        if new_col_attr_ref:
            objects_created.append({
                'ref':
                new_col_attr_ref,
                'description':
                'Imported Samples(Column) Attribute Mapping'
            })

        if data:
            output_html_files = self._generate_heatmap_html_report(data)

            report_params = {
                'message':
                '',
                'objects_created':
                objects_created,
                'workspace_id':
                workspace_id,
                'html_links':
                output_html_files,
                'direct_html_link_index':
                0,
                'html_window_height':
                1400,
                'report_object_name':
                'import_matrix_from_biom_' + str(uuid.uuid4())
            }

        else:
            report_params = {
                'message':
                '',
                'objects_created':
                objects_created,
                'workspace_id':
                workspace_id,
                'report_object_name':
                'import_matrix_from_biom_' + str(uuid.uuid4())
            }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.report_util = kb_GenericsReport(self.callback_url)
        self.data_util = DataUtil(config)
        self.sampleservice_util = SampleServiceUtil(config)
        self.attr_util = AttributesUtil(config)
        self.matrix_util = MatrixUtil(config)
        self.taxon_util = TaxonUtil(config)
        self.matrix_types = [
            x.split(".")[1].split('-')[0]
            for x in self.data_util.list_generic_types()
        ]
        self.taxon_wsname = config['taxon-workspace-name']
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.taxon_cache = dict()

    def fetch_sequence(self, matrix_ref):
        logging.info('start to fetch consensus sequence')

        input_matrix_obj = self.dfu.get_objects({'object_refs':
                                                 [matrix_ref]})['data'][0]
        input_matrix_info = input_matrix_obj['info']
        matrix_name = input_matrix_info[1]
        matrix_type = input_matrix_info[2]
        matrix_data = input_matrix_obj['data']

        if 'KBaseMatrices.AmpliconMatrix' not in matrix_type:
            raise ValueError('Unexpected data type: {}'.format(matrix_type))

        handle = matrix_data.get('sequencing_file_handle')
        if not handle:
            raise ValueError(
                'Missing sequencing_file_handle from the matrix object')

        output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
        logging.info('Start generating consensus sequence file in {}'.format(
            output_directory))
        self._mkdir_p(output_directory)

        matrix_fasta_file = self.dfu.shock_to_file({
            'handle_id': handle,
            'file_path': self.scratch
        }).get('file_path')

        try:
            logging.info('start parsing FASTA file')
            fastq_dict = SeqIO.index(matrix_fasta_file, "fasta")
        except Exception:
            raise ValueError(
                'Cannot parse file. Please provide a valid FASTA file')

        row_ids = matrix_data['data']['row_ids']

        fasta_file_path = os.path.join(
            output_directory, matrix_name + '_consensus_sequence.fasta')

        with open(fasta_file_path, 'w') as f:
            for row_id in row_ids:
                consensus_sequence = str(fastq_dict.get(row_id).seq)
                f.write('>' + str(row_id) + '\n')
                f.write(consensus_sequence + '\n')

        return fasta_file_path

    def import_matrix_from_biom(self, params):
        """
        arguments:
        obj_type: matrix object type, e.g. AmpliconMatrix (one of the supported KBaseMatrices types)
        matrix_name: matrix object name
        workspace_id: workspace id matrix object to be saved to
        input_shock_id: file shock id
        or
        input_file_path: absolute file path
        or
        input_staging_file_path: staging area file path

        optional arguments:
        col_attributemapping_ref: column AttributeMapping reference
        row_attributemapping_ref: row AttributeMapping reference
        genome_ref: genome reference
        matrix_obj_ref: Matrix reference
        """

        (biom_file, tsv_file, fasta_file, mode,
         metadata_keys) = self._process_params(params)

        workspace_id = params.get('workspace_id')
        matrix_name = params.get('matrix_name')
        obj_type = params.get('obj_type')
        scale = params.get('scale')
        description = params.get('description')
        refs = {k: v for k, v in params.items() if "_ref" in k}

        amplicon_data = self._file_to_amplicon_data(biom_file, tsv_file,
                                                    fasta_file, mode, refs,
                                                    matrix_name, workspace_id,
                                                    scale, description,
                                                    metadata_keys)

        for key in [
                'amplicon_type', 'amplification', 'extraction', 'target_gene',
                'target_subfragment', 'pcr_primers', 'library_kit',
                'library_layout', 'library_screening_strategy',
                'sequencing_center', 'sequencing_date',
                'sequencing_technology', 'sequencing_instrument',
                'sequencing_quality_filter_cutoff', 'read_length_cutoff',
                'read_pairing', 'barcode_error_rate',
                'chimera_detection_and_removal', 'taxon_calling_method',
                'denoise_method', 'sequence_error_cutoff', 'clustering_method',
                'clustering_cutoff', 'sample_set_ref', 'reads_set_ref'
        ]:
            if params.get(key):
                amplicon_data[key] = params[key]

        new_row_attr_ref = None
        if not params.get('row_attributemapping_ref'):
            new_row_attr_ref = amplicon_data.get('row_attributemapping_ref')

        new_col_attr_ref = None
        if not params.get('col_attributemapping_ref'):
            new_col_attr_ref = amplicon_data.get('col_attributemapping_ref')

        if fasta_file:
            logging.info(
                'start saving consensus sequence file to shock: {}'.format(
                    fasta_file))
            handle_id = self.dfu.file_to_shock({
                'file_path': fasta_file,
                'make_handle': True
            })['handle']['hid']
            amplicon_data['sequencing_file_handle'] = handle_id

        logging.info('start saving Matrix object: {}'.format(matrix_name))
        matrix_obj_ref = self.data_util.save_object({
            'obj_type':
            'KBaseMatrices.{}'.format(obj_type),
            'obj_name':
            matrix_name,
            'data':
            amplicon_data,
            'workspace_id':
            workspace_id
        })['obj_ref']

        if params.get('sample_set_ref'):
            self.matrix_util._link_matrix_to_samples(matrix_obj_ref,
                                                     amplicon_data,
                                                     params['sample_set_ref'])

        returnVal = {'matrix_obj_ref': matrix_obj_ref}

        report_output = self._generate_report(matrix_obj_ref,
                                              new_row_attr_ref,
                                              new_col_attr_ref,
                                              workspace_id,
                                              data=amplicon_data['data'])

        returnVal.update(report_output)

        return returnVal
Exemple #26
0
class DataUtil:

    @staticmethod
    def _find_between(s, start, end):
        """
        _find_between: find string in between start and end
        """

        return re.search('{}(.*){}'.format(start, end), s).group(1)
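
    # For example, _find_between('KBaseMatrices.AmpliconMatrix-1.2', r'\.', r'\-')
    # returns 'AmpliconMatrix' (the text between the first '.' and the '-').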

    def _find_constraints(self, obj_type):
        """
        _find_constraints: retrieve constraints (@contains, rowsum, unique, conditionally_required)
        """

        type_info = self.wsClient.get_type_info(obj_type)
        type_desc = type_info.get('description')
        constraints = {}

        for tag in ('contains', 'rowsum', 'unique', 'conditionally_required'):
            constraints[tag] = [line.strip().split()[1:] for line in type_desc.split("\n")
                                if line.startswith(f'@{tag}')]

        return constraints
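
    # As an illustration (the annotation syntax below is hypothetical), a type
    # description containing the lines
    #   @unique data.col_ids
    #   @contains data.col_ids values(col_mapping)
    # would be parsed into
    #   {'unique': [['data.col_ids']],
    #    'contains': [['data.col_ids', 'values(col_mapping)']],
    #    'rowsum': [], 'conditionally_required': []}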

    def _filter_constraints(self, constraints, data):
        """filters out constraints with missing keys"""
        contains_constraints = constraints.get('contains')

        filtered_constraints = []
        for contains_constraint in contains_constraints:
            in_values = contains_constraint[1:]
            missing_key = True
            for in_value in in_values:
                if in_value.startswith('values'):
                    search_value = re.search('{}(.*){}'.format(r'\(', r'\)'), in_value).group(1)
                    unique_list = search_value.split('.')
                    key = unique_list[0]
                elif ':' in in_value:
                    key = in_value.split(':')[0]
                else:
                    unique_list = in_value.split('.')
                    key = unique_list[0]

                if key in data:
                    missing_key = False
                    break

            if missing_key:
                filtered_constraints.append(contains_constraint)

        for x in filtered_constraints:
            contains_constraints.remove(x)

        return constraints

    def _retrieve_value(self, data, value):
        """Parse the provided 'data' object to retrieve the item in 'value'."""
        logging.info('Getting value for {}'.format(value))
        retrieve_data = []
        m_data = DotMap(data)
        if value.startswith('set('):
            retrieve_data = value[4:-1].split(",")
        elif value.startswith('values('):  # TODO: nested values e.g. values(values(ids))
            search_value = re.search('{}(.*){}'.format(r'\(', r'\)'), value).group(1)
            unique_list = search_value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp.values())
        elif ':' in value:
            obj_ref = getattr(m_data, value.split(':')[0])
            if obj_ref:
                included = value.split(':')[1]
                included = '/' + included.replace('.', '/')
                ref_data = self.wsClient.get_objects2({'objects': [{'ref': obj_ref,
                                                       'included': [included]}]})['data'][0]['data']
                m_ref_data = DotMap(ref_data)
                if ref_data:
                    if '*' not in included:
                        for key in included.split('/')[1:]:
                            m_ref_data = getattr(m_ref_data, key)
                    else:
                        keys = included.split('/')[1:]
                        m_ref_data = [x.get(keys[2]) for x in ref_data.get(keys[0])]  # TODO: only works for 2 level nested data like '/features/[*]/id'

                retrieve_data = list(m_ref_data)
        else:
            unique_list = value.split('.')
            m_data_cp = m_data.copy()
            for attr in unique_list:
                m_data_cp = getattr(m_data_cp, attr)
            retrieve_data = list(m_data_cp)

        logging.info('Retrieved value (first 20):\n{}\n'.format(retrieve_data[:20]))

        return retrieve_data
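
    # Supported 'value' forms, with illustrative results (field names hypothetical):
    #   'set(a,b,c)'           -> ['a', 'b', 'c']
    #   'values(col_mapping)'  -> the values of data['col_mapping']
    #   'data.row_ids'         -> data['data']['row_ids']
    #   'genome_ref:features.[*].id'
    #                          -> the 'id' field of each feature in the object
    #                             referenced by data['genome_ref']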

    def _validate(self, constraints, data):
        """
        _validate: validate data
        """

        validated = True
        failed_constraints = defaultdict(list)

        unique_constraints = constraints.get('unique')
        for unique_constraint in unique_constraints:
            retrieved_value = self._retrieve_value(data, unique_constraint[0])
            if len(set(retrieved_value)) != len(retrieved_value):
                validated = False
                failed_constraints['unique'].append(unique_constraint[0])

        contains_constraints = constraints.get('contains')
        for contains_constraint in contains_constraints:
            value = contains_constraint[0]
            in_values = contains_constraint[1:]
            retrieved_in_values = []
            for in_value in in_values:
                retrieved_in_values += self._retrieve_value(data, in_value)
            if not (set(self._retrieve_value(data, value)) <= set(retrieved_in_values)):
                validated = False
                failed_constraints['contains'].append(" ".join(contains_constraint))

        conditional_constraints = constraints.get('conditionally_required')
        for conditional_constraint in conditional_constraints:
            trigger = conditional_constraint[0]
            required_keys = conditional_constraint[1:]
            if trigger in data:
                missing_keys = [key for key in required_keys if key not in data]
                if missing_keys:
                    validated = False
                    failed_constraints['conditionally_required'].append(
                        (trigger, required_keys, missing_keys))

        return validated, failed_constraints

    @staticmethod
    def _mkdir_p(path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    @staticmethod
    def _raise_validation_error(params, validate):
        """Raise a meaningful error message for failed validation"""
        logging.error('Data failed type checking')
        failed_constraints = validate.get('failed_constraints')
        error_msg = ['Object {} failed type checking:'.format(params.get('obj_name'))]
        if failed_constraints.get('unique'):
            unique_values = failed_constraints.get('unique')
            error_msg.append('Object should have unique field: {}'.format(unique_values))
        if failed_constraints.get('contains'):
            contained_values = failed_constraints.get('contains')
            for contained_value in contained_values:
                subset_value = contained_value.split(' ')[0]
                super_value = ' '.join(contained_value.split(' ')[1:])
                if 'col_mapping' in super_value:
                    error_msg.append('Column attribute mapping instances should contain all '
                                     'column index from original data')

                if 'row_mapping' in super_value:
                    error_msg.append('Row attribute mapping instances should contain all row '
                                     'index from original data')

                error_msg.append('Object field [{}] should contain field [{}]'.format(
                    super_value,
                    subset_value))
        for failure in failed_constraints.get('conditionally_required', []):
            error_msg.append('If object field "{}" is present then object field(s) {} should '
                             'also be present. Object is missing {}'.format(*failure))
        raise ValueError('\n'.join(error_msg))

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']
        self.serviceWizardURL = config['srv-wiz-url']
        self.wsClient = workspaceService(self.ws_url, token=self.token)
        self.dfu = DataFileUtil(self.callback_url)
        self.generics_service = GenericsService(self.serviceWizardURL)
        self.ws_large_data = WsLargeDataIO(self.callback_url)

    def list_generic_types(self, params=None):
        """
        *Not yet exposed in spec*
        list_generic_types: lists the current valid generics types

        arguments:
            none

        return:
            A list of generic types in the current environment
        """
        returnVal = [x['type_def'] for module in GENERICS_MODULES
                     for x in self.wsClient.get_all_type_info(module)]
        return returnVal

    def fetch_data(self, params):
        """
        fetch_data: fetch generics data as pandas dataframe for a generics data object

        arguments:
        obj_ref: generics object reference

        optional arguments:
        generics_module: the generics data module to be retrieved from
                        e.g. for an given data type like below:
                        typedef structure {
                          FloatMatrix2D data;
                          condition_set_ref condition_set_ref;
                        } SomeGenericsMatrix;
                        generics_module should be
                        {'data': 'FloatMatrix2D',
                         'condition_set_ref': 'condition_set_ref'}

        return:
        data_matrix: a pandas dataframe in json format
        """
        for p in ['obj_ref']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        return self.generics_service.fetch_data(params)

    def validate_data(self, params):
        """
        validate_data: validate data

        arguments:
        obj_type: obj type e.g.: 'KBaseMatrices.ExpressionMatrix-1.1'
        data: obj data to be validated

        return:
        validated: True or False
        """

        constraints = self._find_constraints(params.get('obj_type'))
        data = params.get('data')

        constraints = self._filter_constraints(constraints, data)

        validated, failed_constraints = self._validate(constraints, data)

        return {'validated': validated,
                'failed_constraints': failed_constraints}

    def save_object(self, params):
        """
        save_object: validate data constraints and save matrix object

        arguments:
        obj_type: saving object data type
        obj_name: saving object name
        data: data to be saved
        workspace_name: workspace name matrix object to be saved to

        return:
        obj_ref: object reference
        """
        logging.info('Starting validating and saving object data')

        obj_type = params.get('obj_type').split('-')[0]

        module_name = obj_type.split('.')[0]
        type_name = obj_type.split('.')[1]

        types = self.wsClient.get_module_info({'mod': module_name}).get('types')

        for module_type in types:
            if self._find_between(module_type, r'\.', r'\-') == type_name:
                obj_type = module_type
                break

        data = dict((k, v) for k, v in params.get('data').items() if v)
        validate = self.validate_data({'obj_type': obj_type,
                                       'data': data})

        if not validate.get('validated'):
            self._raise_validation_error(params, validate)

        # make sure users with shared object have access to the handle file upon saving
        handle = data.get('sequencing_file_handle')
        if handle:
            output_directory = os.path.join(self.scratch, str(uuid.uuid4()))
            logging.info('Downloading consensus sequence file in {}'.format(output_directory))
            self._mkdir_p(output_directory)
            matrix_fasta_file = self.dfu.shock_to_file({
                'handle_id': handle,
                'file_path': self.scratch}).get('file_path')
            logging.info('Saving consensus sequence file to shock: {}'.format(matrix_fasta_file))
            handle_id = self.dfu.file_to_shock({'file_path': matrix_fasta_file,
                                                'make_handle': True})['handle']['hid']
            data['sequencing_file_handle'] = handle_id

        # cast data
        int_data_names = ['sequencing_quality_filter_cutoff', 'read_length_cutoff']
        for data_name in int_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to int'.format(data_name))
                    data[data_name] = int(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected data type {}. '.format(data_name)
                    err_msg += 'Data type {} requires {} to be an integer value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        float_data_names = ['barcode_error_rate', 'sequence_error_cutoff', 'clustering_cutoff']
        for data_name in float_data_names:
            if data_name in data:
                try:
                    logging.info('Casting {} to float'.format(data_name))
                    data[data_name] = float(data[data_name])
                except Exception as e:
                    err_msg = 'Unexpected value for {}. '.format(data_name)
                    err_msg += 'Data type {} requires {} to be a float value. '.format(
                        obj_type, data_name)
                    err_msg += 'Provided [{}] {} instead'.format(
                        type(data[data_name]), data[data_name])
                    raise ValueError(err_msg) from e

        ws_name_id = params.get('workspace_id')
        workspace_name = params.get('workspace_name')
        if not ws_name_id:
            if not isinstance(workspace_name, int):
                ws_name_id = self.dfu.ws_name_to_id(workspace_name)
            else:
                ws_name_id = workspace_name

        try:
            logging.info('Starting saving object via DataFileUtil')
            info = self.dfu.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data": data,
                    "name": params.get('obj_name')
                }]
            })[0]
        except Exception:
            logging.info('Saving object via DataFileUtil failed')
            logging.info('Starting saving object via WsLargeDataIO')
            data_path = os.path.join(self.scratch,
                                     params.get('obj_name') + "_" + str(uuid.uuid4()) + ".json")
            json.dump(data, open(data_path, 'w'))

            info = self.ws_large_data.save_objects({
                "id": ws_name_id,
                "objects": [{
                    "type": obj_type,
                    "data_json_file": data_path,
                    "name": params.get('obj_name')
                }]
            })[0]

        return {"obj_ref": "%s/%s/%s" % (info[6], info[0], info[4])}
Exemple #27
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.3.6"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "75ef2c24694c056dfca71859d6f344ccff7d4725"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file

        How we compute these stats (a bitwise flag-decoding sketch follows this method):

        For each segment (line) in SAM/BAM file:
            we take the first element as `reads_id`
                    the second element as `flag`

            if the last bit (0x1) of flag is `1`:
                we treat this segment as paired end reads
            otherwise:
                we treat this segment as single end reads

            For single end reads:
                if the 3rd last bit (0x4) of flag is `1`:
                    we increment unmapped_reads_count
                else:
                    we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_reads_count + the number of distinct mapped `reads_id`s

            For paired end reads:
                if the 7th last bit (0x40) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_left_reads_count
                    else:
                        we treat this `reads_id` as mapped

                if the 8th last bit (0x80) of flag is `1`:
                    if the 3rd last bit (0x4) of flag is `1`:
                        we increment unmapped_right_reads_count
                    else:
                        we treat this `reads_id` as mapped

                for all mapped `reads_ids`:
                    if it appears only once:
                        we treat this `reads_id` as `singletons`
                    else:
                        we treat this `reads_id` as `multiple_alignments`

                lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + the number of distinct mapped `reads_id`s
        """
        path, file = os.path.split(bam_file)

        self.__LOGGER.info('Start to generate aligner stats')
        start_time = time.time()

        infile = pysam.AlignmentFile(bam_file, 'r')

        properly_paired = 0
        unmapped_reads_count = 0
        unmapped_left_reads_count = 0
        unmapped_right_reads_count = 0
        mapped_reads_ids = []
        mapped_left_reads_ids = []
        mapped_right_reads_ids = []
        paired = False
        for alignment in infile:
            seg = alignment.to_string().split('\t')
            reads_id = seg[0]
            flag = "0000000" + "{0:b}".format(int(seg[1]))

            if flag[-1] == '1':
                paired = True

            if paired:  # process paired end sequence

                if flag[-7] == '1':  # first sequence of a pair
                    if flag[-3] == '1':
                        unmapped_left_reads_count += 1
                    else:
                        mapped_left_reads_ids.append(reads_id)

                if flag[-8] == '1':  # second sequence of a pair
                    if flag[-3] == '1':
                        unmapped_right_reads_count += 1
                    else:
                        mapped_right_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1
            else:  # process single end sequence
                if flag[-3] == '1':
                    unmapped_reads_count += 1
                else:
                    mapped_reads_ids.append(reads_id)

                if flag[-2] == '1':
                    properly_paired += 1

        infile.close()

        if paired:
            mapped_reads_ids = mapped_left_reads_ids + mapped_right_reads_ids
            unmapped_reads_count = unmapped_left_reads_count + unmapped_right_reads_count
            properly_paired = properly_paired // 2

        mapped_reads_ids_counter = Counter(mapped_reads_ids)
        mapped_reads_count = len(mapped_reads_ids_counter)

        singletons = list(mapped_reads_ids_counter.values()).count(1)
        multiple_alignments = mapped_reads_count - singletons

        total_reads = unmapped_reads_count + mapped_reads_count

        try:
            alignment_rate = round(
                float(mapped_reads_count) / total_reads * 100, 3)
        except ZeroDivisionError:
            alignment_rate = 0

        if alignment_rate > 100:
            alignment_rate = 100.0

        elapsed_time = time.time() - start_time
        self.__LOGGER.info('Used: {}'.format(
            time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

        stats_data = {
            "alignment_rate": alignment_rate,
            "mapped_reads": mapped_reads_count,
            "multiple_alignments": multiple_alignments,
            "properly_paired": properly_paired,
            "singletons": singletons,
            "total_reads": total_reads,
            "unmapped_reads": unmapped_reads_count
        }
        return stats_data
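
    # A minimal sketch (hypothetical, not part of the original class): the same SAM
    # flag checks performed above in _get_aligner_stats, expressed as bitwise tests
    # on the integer flag instead of string slicing.
    @staticmethod
    def _decode_sam_flag_sketch(flag):
        """Illustrative only: decode the SAM flag bits used by _get_aligner_stats."""
        return {
            'paired': bool(flag & 0x1),           # flag[-1]: read is paired
            'properly_paired': bool(flag & 0x2),  # flag[-2]: mapped in proper pair
            'unmapped': bool(flag & 0x4),         # flag[-3]: segment unmapped
            'first_in_pair': bool(flag & 0x40),   # flag[-7]: first segment of a pair
            'second_in_pair': bool(flag & 0x80),  # flag[-8]: second segment of a pair
        }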

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - "
            "%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment
                How we compute BAM stats:
                For each segment (line) in SAM/BAM file:
                    we take the first element as `reads_id`
                            the second element as `flag`
                    if the last bit (0x1) of flag is `1`:
                        we treat this segment as paired end reads
                    otherwise:
                        we treat this segment as single end reads
                    For single end reads:
                        if the 3rd last bit (0x4) of flag is `1`:
                            we increment unmapped_reads_count
                        else:
                            we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_reads_count + the number of distinct mapped `reads_id`s
                    For paired end reads:
                        if the 7th last bit (0x40) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_left_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        if the 8th last bit (0x80) of flag is `1`:
                            if the 3rd last bit (0x4) of flag is `1`:
                                we increment unmapped_right_reads_count
                            else:
                                we treat this `reads_id` as mapped
                        for all mapped `reads_ids`:
                            if it appears only once:
                                we treat this `reads_id` as `singletons`
                            else:
                                we treat this `reads_id` as `multiple_alignments`
                        lastly, total_reads = unmapped_left_reads_count + unmapped_right_reads_count + the number of distinct mapped `reads_id`s
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_BAI, False):
                bai_file = file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get(self.PARAM_IN_DOWNLOAD_SAM, False):
                sam_file = file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref -
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
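

# A minimal usage sketch (hypothetical, not part of the module above): calling
# upload_alignment and then download_alignment on a ReadsAlignmentUtils instance.
# The refs, file path and context object are placeholders.
def reads_alignment_usage_sketch(rau, ctx):
    upload_ret = rau.upload_alignment(ctx, {
        'destination_ref': 'my_workspace/my_alignment',  # placeholder ref
        'file_path': '/kb/module/work/tmp/aligned.sam',  # placeholder path
        'condition': 'control',
        'read_library_ref': '1/2/3',                     # placeholder reads ref
        'assembly_or_genome_ref': '4/5/6',               # placeholder genome ref
    })[0]
    download_ret = rau.download_alignment(ctx, {
        'source_ref': upload_ret['obj_ref'],
        'downloadSAM': 1,
        'downloadBAI': 1,
    })[0]
    return download_ret['destination_dir'], download_ret['stats']
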
class ProkkaUtils:
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key +
                             " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if (qualifier and len(qualifier) > 0) else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{
            "ref":
            "KBaseOntology/seed_subsystem_ontology"
        }])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(
            sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref
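
    # A minimal sketch (hypothetical, not part of the original class): the EC-number
    # extraction used above in download_seed_data, applied to an illustrative SSO
    # term name.
    @staticmethod
    def _extract_ec_sketch(sso_name="Alcohol dehydrogenase (EC 1.1.1.1)"):
        if "(EC " in sso_name and sso_name.endswith(")"):
            return sso_name[sso_name.index("(EC ") + 4:-1].strip()  # -> "1.1.1.1"
        return None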

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            #raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: A tuple with the path to the fasta file with renamed contigs, the number of contigs,
        the mapping from new ids to old ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence,
                               id=new_id,
                               description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple(
            "renamed_assembly",
            "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath,
                                contig_counter, new_ids_to_old, records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = [
            "perl", "/kb/prokka/bin/prokka", "--metagenome", "--outdir",
            output_dir, "--prefix", "mygenome", "--kingdom", kingdom
        ]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(
                ["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError(
                "gram parameter is not supported in current Prokka installation"
            )
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(
                ["--mincontiglen",
                 str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        # tbl2asn or some other non-essential prokka binary will fail, so suppress that
        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir
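
    # A minimal sketch (hypothetical, not part of the original class): the command
    # line run_prokka above would assemble for an illustrative params dict; the
    # output directory and fasta path are placeholders supplied by the caller.
    @staticmethod
    def _prokka_cmd_sketch(output_dir, fasta_path):
        params = {"kingdom": "Archaea", "gcode": 11}
        return [
            "perl", "/kb/prokka/bin/prokka", "--metagenome", "--outdir", output_dir,
            "--prefix", "mygenome", "--kingdom", params["kingdom"],
            "--gcode", str(params["gcode"]), fasta_path
        ]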

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: A tuple containing Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = output_dir + "/mygenome.faa"
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = output_dir + "/mygenome.ffn"
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = output_dir + "/mygenome.gff"
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results",
                                    "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KBaseGenomes.Genome object for
        genome annotation only.

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: A tuple with Genome features, cdss, mrnas and a report_message describing the genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(
                        qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(
                        qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    md5 = hashlib.md5(dna.encode("utf-8")).hexdigest()
                    feature = {
                        "id": fid,
                        "location": location,
                        "type": "gene",
                        "aliases": aliases,
                        "md5": md5,
                        "dna_sequence": dna,
                        "dna_sequence_length": len(dna),
                    }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {
                                "id": sso_item["id"],
                                "evidence": [evidence],
                                "term_name": sso_item["name"],
                                "ontology_ref": self.sso_ref,
                                "term_lineage": []
                            }
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {
                            "id": cds_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "parent_mrna": mrna_id,
                            "function": (product if product else ""),
                            "ontology_terms": {},
                            "protein_translation": prot,
                            "protein_translation_length": prot_len,
                            "aliases": aliases
                        }
                        mrna = {
                            "id": mrna_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "cds": cds_id
                        }
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(
            len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(
            non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(
            genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(
            genes_with_sso) + "\n"
        report += "Average protein length: " + str(
            int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly",
                                        "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": gid}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(
                            qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {
                                    "id": sso_item["id"],
                                    "evidence": [evidence],
                                    "term_name": sso_item["name"],
                                    "ontology_ref": self.sso_ref,
                                    "term_lineage": []
                                }

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(
            self.scratch, "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] +
                            "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty."
            )

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """
        Create a dict for the evidence field for the genome
        :return: evidence dict to be attached to ontology terms for the genome
        """
        time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        version = re.search(r"module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """
        Create ontology event fields for a genome object
        :param genome_data:  A genome object's data field
        :return: a named tuple containing the modified genome object and a new ontology event index
        """
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        ontology_event_index = 0

        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index += len(
                genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]

        genome_obj_modified = namedtuple('genome_obj_modified',
                                         'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        """
        Update the feature's ontologies for an old genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :return: The feature with the ontology updated, in the old style
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        """
        Update the feature's ontologies for a new genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :param ontology_event_index: Ontology index to update the feature with
        :return: the updated feature
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            id = new_ontology[key]["id"]
            if id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][id].append(
                    ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """
        Annotate the genome with new annotations for Genome ReAnnotation
        :param annotation_args:  genome_data from the genome obj, new_annotations from prokka, and the output_genome_name
        :return: A tuple containing the genome_ref, filepaths for the function and ontology summary, and stats about the annotations
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(
                genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {
            "current_functions": len(genome_data["data"]["features"]),
            "new_functions": 0,
            "found_functions": 0,
            "new_ontologies": 0
        }

        function_summary_fp = os.path.join(self.scratch, "function_report")
        ontology_summary_fp = os.path.join(self.scratch, "ontology_report")
        onto_r = open(ontology_summary_fp, "w")
        func_r = open(function_summary_fp, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        ontologies_present = {"SSO": {}}
        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function
                            and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i][
                        "function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [
                        new_function
                    ]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        # New style
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)

                        # Add to ontologies Present
                        for key in new_ontology.keys():
                            oid = new_ontology[key]["id"]
                            name = new_ontology[key].get("name", "Unknown")
                            ontologies_present["SSO"][oid] = name

                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)

            if current_function:
                func_r.write(
                    json.dumps([fid, [current_function], [new_function]]) +
                    "\n")
            else:
                func_r.write(
                    json.dumps([fid, current_functions, [new_function]]) +
                    "\n")

            onto_r.write(
                json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        if ontologies_present:
            if "ontologies_present" in genome_data["data"]:
                if "SSO" in genome_data["data"]["ontologies_present"]:
                    for key, value in ontologies_present["SSO"].items():
                        genome_data["data"]["ontologies_present"]["SSO"][
                            key] = value
                else:
                    genome_data["data"]["ontologies_present"]["SSO"] = \
                        ontologies_present["SSO"]

            else:
                genome_data["data"]["ontologies_present"] = ontologies_present

        info = self.gfu.save_one_genome({
            "workspace":
            self.output_workspace,
            "name":
            annotation_args["output_genome_name"],
            "data":
            genome_data["data"],
            "provenance":
            self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        annotated_genome = namedtuple(
            "annotated_genome",
            "genome_ref function_summary_filepath ontology_summary_filepath stats"
        )

        return annotated_genome(genome_ref, function_summary_fp,
                                ontology_summary_fp, stats)
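
    # Sketch of the `new_annotations` mapping consumed above (inferred from the
    # lookups in annotate_genome_with_new_annotations; the feature id, function
    # and SSO term are placeholders):
    #
    #   new_annotations = {
    #       "gene_1": {
    #           "function": "chromosomal replication initiator protein DnaA",
    #           "ontology_terms": {
    #               "SSO:0001": {"id": "SSO:0001", "name": "..."}
    #           }
    #       }
    #   }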

    def upload_file(self,
                    filepath,
                    message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path":
                                                       filepath})["shock_id"]
        print("Uploaded filepath" + filepath + "to shock and got id" +
              output_file_shock_id)
        return {
            "shock_id": output_file_shock_id,
            "name": os.path.basename(filepath),
            "label": os.path.basename(filepath),
            "description": message
        }

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [
            self.upload_file(genome.ontology_summary_filepath),
            self.upload_file(genome.function_summary_filepath)
        ]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n").format(
                              genome_ref, stats["current_functions"],
                              stats["new_functions"], stats["new_ontologies"])

        report_info = self.kbr.create_extended_report({
            "message":
            report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "file_links":
            file_links,
            "report_object_name":
            "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name":
            self.output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)
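
    # Minimal params sketch for annotate_genome (values are placeholders; any
    # Prokka-specific UI options consumed by run_prokka are omitted):
    #
    #   params = {
    #       "object_ref": "1234/5/6",
    #       "output_genome_name": "my_genome.reannotated",
    #       "output_workspace": "my_workspace"
    #   }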

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps are: download the assembly as a fasta file,
        rename the contigs, run Prokka against the contigs, parse the results, and finally
        create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params,
                                                   "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref":
                                                         assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = 'Unknown'
        if 'scientific_name' in params and params['scientific_name']:
            scientific_name = params['scientific_name']
        domain = "Bacteria"
        if 'kingdom' in params and params['kingdom']:
            domain = params['kingdom']
        gcode = 0
        if 'gcode' in params and params['gcode']:
            gcode = params['gcode']

        genome = {
            "id": "Unknown",
            "features": annotated_assembly.features,
            "scientific_name": scientific_name,
            "domain": domain,
            "genetic_code": gcode,
            "assembly_ref": assembly_ref,
            "cdss": annotated_assembly.cdss,
            "mrnas": annotated_assembly.mrnas,
            "source": "PROKKA annotation pipeline",
            "gc_content": assembly_info.gc_content,
            "dna_size": assembly_info.dna_size,
            "reference_annotation": 0
        }

        info = self.gfu.save_one_genome({
            "workspace": output_workspace,
            "name": output_genome_name,
            "data": genome,
            "provenance": self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report({
            "message":
            report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "report_object_name":
            "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name":
            output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }
Exemple #29
0
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')
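
    # Directory layout assumed by the check above and by _sra_to_fastq below
    # (fastq-dump is run with '--split-3 -T', which writes each read file into
    # its own subdirectory; paths are illustrative):
    #
    #   <tmp_dir>/<sra_name>/1/fastq  and  <tmp_dir>/<sra_name>/2/fastq   # paired end
    #   <tmp_dir>/<sra_name>/fastq                                        # single end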

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {
            'fwd_file': fwd_file,
            'rev_file': rev_file
        }
        return fastq_file_path


    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean')
           or params.get('insert_size_std_dev')
           or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" is Paried End Reads specific'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads

        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: import an SRA file from the user's staging area and upload it as reads via ReadsUtils.upload_reads

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'),
                                                   returnVal['obj_ref'])
        return returnVal
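
    # Example params sketch for import_sra_from_staging (values are placeholders):
    #
    #   params = {
    #       'staging_file_subdir_path': 'subdir_1/reads.sra',
    #       'sequencing_tech': 'Illumina',
    #       'name': 'reads.sra_reads',
    #       'workspace_name': 'my_workspace',
    #       'single_genome': 1
    #   }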

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: download SRA file(s) from a web source and upload them as reads via ReadsUtils.upload_reads

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                        download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))

            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}
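
    # Example params sketch for import_sra_from_web (URL and names are placeholders):
    #
    #   params = {
    #       'download_type': 'Direct Download',
    #       'workspace_name': 'my_workspace',
    #       'sra_urls_to_add': [{
    #           'file_url': 'https://example.org/reads.sra',
    #           'sequencing_tech': 'Illumina',
    #           'name': 'reads_from_web'
    #       }]
    #   }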

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params:
                    validates params passed to import_sra_from_web method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs: generated workspace object references. (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))

            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params, uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        pprint(params)

        tmp_dir = os.path.join(self.scratch, uuid_string)
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):

            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/table_panel.html'),
                      'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace('OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace('panel-collapse collapse', 'panel-collapse collapse in')

            objects_content += report_template
            base_percentages = ''
            for key, val in reads_data.get('base_percentages').items():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown')

            reads_single_genome = str(reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev', 'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(float(reads_data.get('number_of_duplicates') * 100) / \
                                                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)' \
                .format(str(reads_data.get('number_of_duplicates')),
                        dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(str(reads_data.get('qual_min')),
                                                                         str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.items():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.items():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###', overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###', stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template_sra/report_head.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('###TABLE_PANELS_CONTENT###',
                                                          objects_content)
                result_file.write(report_template)

        shutil.copytree(os.path.join(os.path.dirname(__file__), 'report_template_sra/bootstrap-3.3.7'),
                        os.path.join(tmp_dir, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(os.path.dirname(__file__), 'report_template_sra/jquery-3.2.1.min.js'),
                    os.path.join(tmp_dir, 'jquery-3.2.1.min.js'))

        matched_files = []
        for root, dirnames, filenames in os.walk(tmp_dir):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print(('Removing ' + gz_file))
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': tmp_dir,
                                                  'pack': 'zip'})['shock_id']
        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for Imported Reads'})
        return html_report
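
# Usage sketch (not part of the original module). It assumes the module's
# imports (os, json, etc.) and a KBase SDK job environment providing the
# callback service and staging area; the staging path, reads name and
# workspace below are placeholders.
if __name__ == '__main__':
    demo_config = {
        'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL', ''),
        'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN', ''),
        'scratch': '/kb/module/work/tmp'
    }
    importer = ImportSRAUtil(demo_config)
    reads = importer.import_sra_from_staging({
        'staging_file_subdir_path': 'example.sra',
        'sequencing_tech': 'Illumina',
        'name': 'example_reads',
        'workspace_name': 'my_workspace'
    })
    report = importer.generate_report([reads['obj_ref']],
                                      {'workspace_name': 'my_workspace',
                                       'uploaded_files': ['example.sra']})
    print(report)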
Exemple #30
0
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)
        self.max_contigs_for_report = 200

    def import_fasta_as_assembly_from_staging(self, params):
        """
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        """
        logging.info(
            '--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        generate_html_report: generate html summary report
        """
        logging.info('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ['<br/><table>\n']
        for key, val in assembly_overview_data.items():
            overview_content.append(f'<tr><td><b>{key}</b></td>')
            overview_content.append(f'<td>{val}</td></tr>\n')
        overview_content.append('</table>')

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', ''.join(overview_content))
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the assembly will be stored to

        """
        object_data = self.dfu.get_objects({'object_refs': [obj_ref]})

        report_params = {
            'workspace_name':
            params.get('workspace_name'),
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported Assembly'
            }],
            'report_object_name':
            f'kb_upload_assembly_report_{uuid.uuid4()}'
        }

        num_contigs = object_data['data'][0]['data']['num_contigs']
        if num_contigs > self.max_contigs_for_report:
            report_params['message'] = (
                "The uploaded assembly has too many contigs to display "
                "here. Click on the object for a dedicated viewer")
        else:
            output_html_files = self.generate_html_report(
                obj_ref, object_data, params)
            report_params.update({
                'html_links': output_html_files,
                'direct_html_link_index': 0,
                'html_window_height': 375,
            })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
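
# Usage sketch (not part of the original module). It assumes the module's
# imports and a KBase SDK job environment; the staging path, assembly name and
# workspace below are placeholders.
if __name__ == '__main__':
    demo_config = {
        'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL', ''),
        'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN', ''),
        'scratch': '/kb/module/work/tmp'
    }
    util = ImportAssemblyUtil(demo_config)
    params = {
        'staging_file_subdir_path': 'example_assembly.fasta',
        'assembly_name': 'example_assembly',
        'workspace_name': 'my_workspace'
    }
    saved = util.import_fasta_as_assembly_from_staging(params)
    print(util.generate_report(saved['obj_ref'], params))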
Exemple #31
0
    def create_fake_reads(self, ctx, params):
        """
        :param params: instance of type "CreateFakeReadsParams"
           (ws_id/ws_name - two alternative ways to set target workspace,
           obj_names - list of names for target workspace objects (of type
           'KBaseFile.SingleEndLibrary'), metadata - optional metadata.) ->
           structure: parameter "ws_id" of Long, parameter "ws_name" of
           String, parameter "obj_names" of list of String, parameter
           "metadata" of mapping from String to String
        :returns: instance of list of type "object_info" (Information about
           an object, including user provided metadata. obj_id objid - the
           numerical id of the object. obj_name name - the name of the
           object. type_string type - the type of the object. timestamp
           save_date - the save date of the object. obj_ver ver - the version
           of the object. username saved_by - the user that saved or copied
           the object. ws_id wsid - the workspace containing the object.
           ws_name workspace - the workspace containing the object. string
           chsum - the md5 checksum of the object. int size - the size of the
           object in bytes. usermeta meta - arbitrary user-supplied metadata
           about the object.) -> tuple of size 11: parameter "objid" of type
           "obj_id" (The unique, permanent numerical ID of an object.),
           parameter "name" of type "obj_name" (A string used as a name for
           an object. Any string consisting of alphanumeric characters and
           the characters |._- that is not an integer is acceptable.),
           parameter "type" of type "type_string" (A type string. Specifies
           the type and its version in a single string in the format
           [module].[typename]-[major].[minor]: module - a string. The module
           name of the typespec containing the type. typename - a string. The
           name of the type as assigned by the typedef statement. major - an
           integer. The major version of the type. A change in the major
           version implies the type has changed in a non-backwards compatible
           way. minor - an integer. The minor version of the type. A change
           in the minor version implies that the type has changed in a way
           that is backwards compatible with previous type definitions. In
           many cases, the major and minor versions are optional, and if not
           provided the most recent version will be used. Example:
           MyModule.MyType-3.1), parameter "save_date" of type "timestamp" (A
           time in the format YYYY-MM-DDThh:mm:ssZ, where Z is either the
           character Z (representing the UTC timezone) or the difference in
           time to UTC in the format +/-HHMM, eg: 2012-12-17T23:24:06-0500
           (EST time) 2013-04-03T08:56:32+0000 (UTC time)
           2013-04-03T08:56:32Z (UTC time)), parameter "version" of Long,
           parameter "saved_by" of type "username" (Login name of a KBase
           user account.), parameter "wsid" of type "ws_id" (The unique,
           permanent numerical ID of a workspace.), parameter "workspace" of
           type "ws_name" (A string used as a name for a workspace. Any
           string consisting of alphanumeric characters and "_", ".", or "-"
           that is not an integer is acceptable. The name may optionally be
           prefixed with the workspace owner's user name and a colon, e.g.
           kbasetest:my_workspace.), parameter "chsum" of String, parameter
           "size" of Long, parameter "meta" of type "usermeta" (User provided
           metadata about an object. Arbitrary key-value pairs provided by
           the user.) -> mapping from String to String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN create_fake_reads
        metadata = params.get('metadata')
        objects = []
        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        path_to_temp_file = "/kb/module/work/tmp/temp_" + str(
            time.time()) + ".fq"
        with open(path_to_temp_file, 'w') as f:
            f.write(' ')
        uploadedfile = dfu.file_to_shock({
            'file_path': path_to_temp_file,
            'make_handle': 1,
            'pack': 'gzip'
        })
        fhandle = uploadedfile['handle']
        os.remove(path_to_temp_file)
        data = {
            'lib': {
                'encoding': "ascii",
                'file': fhandle,
                'size': 1,
                'type': "fq"
            },
            'sequencing_tech': "Illumina",
            'single_genome': 1
        }
        for obj_name in params['obj_names']:
            objects.append({
                'type': 'KBaseFile.SingleEndLibrary',
                'data': data,
                'name': obj_name,
                'meta': metadata
            })
        so_params = {'objects': objects}
        if 'ws_id' in params:
            so_params['id'] = params['ws_id']
        elif 'ws_name' in params:
            so_params['workspace'] = params['ws_name']
        returnVal = self.ws(ctx).save_objects(so_params)
        #END create_fake_reads

        # At some point might do deeper type checking...
        if not isinstance(returnVal, list):
            raise ValueError('Method create_fake_reads return value ' +
                             'returnVal is not type list as required.')
        # return the results
        return [returnVal]
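
    # Example params sketch for create_fake_reads (workspace and object names
    # are placeholders):
    #
    #   params = {
    #       'ws_name': 'my_workspace',
    #       'obj_names': ['fake_reads_1', 'fake_reads_2'],
    #       'metadata': {'source': 'test'}
    #   }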