def export_genome_annotation_as_genbank(self, ctx, params):
    """
    A method designed especially for download, this calls 'genome_annotation_to_genbank'
    to do the work, but then packages the output with WS provenance and object info
    into a zip file and saves to shock.
    :param params: instance of type "ExportParams" -> structure: parameter
       "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_annotation_as_genbank

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError('Cannot export GenomeAnnotation - no "input_ref" field defined.')

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': params['input_ref']}],
                                   'includeMetadata': 0,
                                   'ignoreErrors': 0})[0]

    # export to a file
    file = self.genome_annotation_to_genbank(ctx, {
        'genome_ref': params['input_ref'],
        'new_genbank_file_name': info[1] + '.gbk'})[0]

    # create the output directory and move the file there
    export_package_dir = os.path.join(self.sharedFolder, info[1])
    os.makedirs(export_package_dir)
    shutil.move(file['path'],
                os.path.join(export_package_dir, os.path.basename(file['path'])))

    # package it up and be done
    dfUtil = DataFileUtil(self.callback_url)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}
    #END export_genome_annotation_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_annotation_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
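# --- Usage sketch (not part of the module) ---------------------------------
# A minimal, hypothetical caller for the export method above, e.g. from a unit
# test. 'impl' and 'ctx' stand for the SDK implementation instance and call
# context, and '12345/6/7' is a placeholder object reference, not real data.
# The returned shock node can be pulled back with DataFileUtil.shock_to_file,
# mirroring the package_for_download call above.
params = {'input_ref': '12345/6/7'}
[export_output] = impl.export_genome_annotation_as_genbank(ctx, params)

# Retrieve the packaged zip from shock for inspection.
dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
dfu.shock_to_file({'shock_id': export_output['shock_id'],
                   'file_path': '/kb/module/work/tmp/export.zip'})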
class Utils:

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear."""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters".format(
                ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                ont_abbrev = factor['factor_ont_id'].split(":")[0]
                factor['value'] = val
                conditions[key][ont_abbrev].append(copy.copy(factor))
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """Convert a user supplied file to a condition set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id': params['input_shock_id'],
                'file_path': self.scratch
            }).get('file_path')
        else:
            raise ValueError("Must supply either an input_shock_id or input_file_path")
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')
        cond_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id": params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": cond_set,
                "name": params['output_obj_name']
            }]
        })[0]
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """Converts condition set object data to a dataframe"""
        factors = pd.DataFrame(data['factors'])
        # inplace=True so the prettified column names are actually kept
        factors.rename(columns=lambda x: x.replace("ont", "ontology")
                       .capitalize().replace("_", " "), inplace=True)
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)
        return cs_df

    def _clusterset_data_to_df(self, data):
        """Converts cluster set object data to a dataframe"""
        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data(
            {'obj_ref': original_matrix_ref}).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [cluster.get('id_to_data_position').keys()
                        for cluster in clusters]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(id_names):  # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size
        cluster_id = 0
        for cluster in clusters:
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]
            for idx in item_idx:
                cluster_names[idx] = cluster_id
            cluster_id += 1

        data_df['cluster'] = cluster_names
        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]
        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Oops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            # raise with the assembled message, not the literal string "err_msg"
            raise ValueError(err_msg)
        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a condition set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")

        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError("Unable to find any condition columns in supplied file")

        factor_df.rename(columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
                         inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError("Unable to find a 'Factor' column in supplied file")

        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')
        condition_set['factors'] = [self._add_ontology_info(f) for f in factors]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Text to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {"lookup_in_keys": {"id": {"value": term}}},
            "access_filter": {"with_private": 0, "with_public": 1},
            "pagination": {"count": 1},
            "post_processing": {"skip_data": 1}
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {"ontology_ref": term['guid'].split(":")[1],
                "id": term['key_props']['id']}

    def _add_ontology_info(self, factor):
        """Searches KBase ontologies for terms matching the user supplied factors and units.
        Adds the references if found"""
        optionals = {"unit", "unit_ont_id", "unit_ont_ref"}
        factor = {k: v for k, v in factor.items()
                  if k not in optionals or v != ""}
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """Convert a condition set to a TSV file"""
        files = {}
        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'], _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)
        return _id, files

    def to_excel(self, params):
        """Convert a condition set to an Excel file"""
        files = {}
        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'], _id + ".xlsx")
        writer = pd.ExcelWriter(files['file_path'])
        if "KBaseExperiments.ConditionSet" in obj_type:
            df.to_excel(writer, "Conditions", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`
        writer.save()
        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })
        return {'shock_id': package_details['shock_id']}
class FeatureSetDownload: def __init__(self, config): self.cfg = config self.scratch = config['scratch'] self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL']) self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL']) self.ws = Workspace(config["workspace-url"]) @staticmethod def validate_params(params, expected={"workspace_name", "featureset_name"}): expected = set(expected) pkeys = set(params) if expected - pkeys: raise ValueError("Required keys {} not in supplied parameters" .format(", ".join(expected - pkeys))) def to_tsv(self, params): working_dir = os.path.join(self.scratch, 'featureset-download-'+str(uuid.uuid4())) os.makedirs(working_dir) header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function'] fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref']) files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)} writer = csv.DictWriter(open(files['file_path'], 'w'), header, delimiter='\t', lineterminator='\n') writer.writeheader() for feat in fs_dicts: writer.writerow(feat) return fs_name, files def make_featureset_dict(self, fs_ref): features = [] ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0] feat_set = ret['data'] fs_name = ret['info'][1] feat_by_genome = defaultdict(list) for k, v in feat_set['elements'].items(): feat_by_genome[v[0]].append(k) for genome, fids in feat_by_genome.items(): genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1] res = self.gsu.search({'ref': genome, 'structured_query': {'feature_id': fids}, 'sort_by': [['contig_id', 1]], 'start': 0, 'limit': len(fids) }) for feat in res['features']: features.append({'Feature Id': feat['feature_id'], 'Aliases': ", ".join(feat['aliases'].keys()), 'Genome': "{} ({})".format(genome_name, genome), 'Type': feat['feature_type'], 'Function': feat['function'] }) return fs_name, features def export(self, files, name, params): export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4())) os.makedirs(export_package_dir) for file in files: shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [params['featureset_ref']] }) return {'shock_id': package_details['shock_id']}
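# --- Usage sketch (not part of the module) ---------------------------------
# Hypothetical driver for FeatureSetDownload: '123/4/5' is a placeholder
# FeatureSet reference and 'config' the usual SDK config dict. to_tsv writes
# the TSV under scratch; export() expects an iterable of file paths, so the
# single path is wrapped in a list.
downloader = FeatureSetDownload(config)
params = {'featureset_ref': '123/4/5'}
fs_name, files = downloader.to_tsv(params)
shock = downloader.export([files['file_path']], fs_name, params)
print(shock['shock_id'])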
def export_genome_as_genbank(self, ctx, params):
    """
    :param params: instance of type "ExportParams" (input and output
       structure functions for standard downloaders) -> structure:
       parameter "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_as_genbank
    print('export_genome_as_genbank -- parameters = ')

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError(
            'Cannot run export_genome_as_genbank - no "input_ref" field defined.')

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.cfg.workspaceURL)
    info = ws.get_object_info_new({
        'objects': [{'ref': params['input_ref']}],
        'includeMetadata': 0,
        'ignoreErrors': 0
    })[0]

    genome_to_genbank_params = {'genome_ref': params['input_ref']}

    # export to file
    result = self.genome_to_genbank(ctx, genome_to_genbank_params)[0]['genbank_file']

    # create the output directory and move the file there
    export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
    os.makedirs(export_package_dir)
    shutil.move(
        result['file_path'],
        os.path.join(export_package_dir, os.path.basename(result['file_path'])))

    # package it up and be done
    dfUtil = DataFileUtil(self.cfg.callbackURL)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}
    print('export complete -- result = ')
    pprint(output)
    #END export_genome_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class PangenomeDownload: def __init__(self, config): self.cfg = config self.scratch = config['scratch'] self.pga = PanGenomeAPI(os.environ['SDK_CALLBACK_URL']) self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL']) @staticmethod def validate_params(params, expected={"workspace_name", "pangenome_name"}): expected = set(expected) pkeys = set(params) if expected - pkeys: raise ValueError("Required keys {} not in supplied parameters" .format(", ".join(expected - pkeys))) def to_tsv(self, params): files = {} working_dir = os.path.join(self.scratch, 'pangenome-download-'+str(uuid.uuid4())) os.makedirs(working_dir) pg_id, id_name_map, genome_df = self.make_genomes_df( params['pangenome_ref']) files['genomes_path'] = os.path.join(working_dir, pg_id + "_Genomes.tsv") genome_df.to_csv(files['genomes_path'], sep="\t") ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map) files['orthologs_path'] = os.path.join(working_dir, pg_id + "_Orthologs.tsv") ortho_df.to_csv(files['orthologs_path'], sep="\t") return pg_id, files def to_excel(self, params): files = {} working_dir = os.path.join(self.scratch, 'pangenome-download-' + str(uuid.uuid4())) os.makedirs(working_dir) pg_id, id_name_map, genome_df = self.make_genomes_df( params['pangenome_ref']) files['path'] = os.path.join(working_dir, pg_id + ".xlsx") writer = pandas.ExcelWriter(files['path']) genome_df.to_excel(writer, "Genomes") ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map) ortho_df.to_excel(writer, "Orthologs") writer.save() return pg_id, files def make_genomes_df(self, pg_ref): summary = self.pga.compute_summary_from_pangenome({ "pangenome_ref": pg_ref}) return summary['pangenome_id'], summary['genome_ref_name_map'], \ pandas.DataFrame(summary['shared_family_map']) def make_ortholog_df(self, pg_ref, id_name_map): pangen = self.dfu.get_objects({'object_refs': [pg_ref]} )['data'][0]['data'] ortho = {} for cluster in pangen['orthologs']: ortho[cluster['id']] = { "representative function": cluster.get('function', ""), "type": cluster.get("type", ""), "protein sequence": cluster.get("protein_translation", ""), } for gid, name in id_name_map.items(): ortho[cluster['id']][name] = ";".join( [x[0] for x in cluster['orthologs'] if x[2] == gid]) return pandas.DataFrame.from_dict(ortho, 'index')[ ["representative function", "type", "protein sequence"] + sorted([x for x in id_name_map.values()])] def export(self, files, name, params): export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4())) os.makedirs(export_package_dir) for file in files: shutil.move(file, os.path.join(export_package_dir, os.path.basename(file))) # package it up and be done package_details = self.dfu.package_for_download({ 'file_path': export_package_dir, 'ws_refs': [params['pangenome_ref']] }) return {'shock_id': package_details['shock_id']}
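# --- Usage sketch (not part of the module) ---------------------------------
# Hypothetical driver for PangenomeDownload; '77/1/1' is a placeholder
# Pangenome reference and 'config' the usual SDK config dict. to_tsv returns
# a dict with the genomes and orthologs TSV paths, which export() packages
# together into a single downloadable archive.
downloader = PangenomeDownload(config)
params = {'pangenome_ref': '77/1/1'}
pg_id, files = downloader.to_tsv(params)
shock = downloader.export(files.values(), pg_id, params)
print(shock['shock_id'])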
def export_genome_as_genbank(self, ctx, params):
    """
    :param params: instance of type "ExportParams" (input and output
       structure functions for standard downloaders) -> structure:
       parameter "input_ref" of String
    :returns: instance of type "ExportOutput" -> structure: parameter
       "shock_id" of String
    """
    # ctx is the context object
    # return variables are: output
    #BEGIN export_genome_as_genbank
    print('export_genome_as_genbank -- parameters = ')

    # validate parameters
    if 'input_ref' not in params:
        raise ValueError('Cannot run export_genome_as_genbank - no "input_ref" field defined.')

    # get WS metadata to get ws_name and obj_name
    ws = Workspace(url=self.cfg.workspaceURL)
    info = ws.get_object_info_new({'objects': [{'ref': params['input_ref']}],
                                   'includeMetadata': 0,
                                   'ignoreErrors': 0})[0]

    genome_to_genbank_params = {'genome_ref': params['input_ref']}

    # export to file (building from KBase Genome Object)
    result = self.genome_to_genbank(ctx, genome_to_genbank_params)[0]['genbank_file']

    # create the output directory and move the file there
    export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
    os.makedirs(export_package_dir)
    shutil.move(
        result['file_path'],
        os.path.join(export_package_dir, os.path.basename(result['file_path'])))

    # export original uploaded GenBank file if it existed.
    exporter = GenomeToGenbank(self.cfg)
    original_result_full = exporter.export_original_genbank(ctx, genome_to_genbank_params)
    if original_result_full is not None:
        original_result = original_result_full['genbank_file']
        shutil.move(
            original_result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(original_result['file_path'])))

    # Make warning file about genes only.
    warning_filename = "warning.txt"
    # open in text mode ('w'), since a str is written below
    with open(os.path.join(export_package_dir, warning_filename), 'w') as temp_file:
        temp_file.write('Please note: the KBase-derived GenBank file for annotated genome ' +
                        'objects currently only shows "gene" features. CDS and mRNA ' +
                        'feature types are not currently included in the GenBank download, ' +
                        'but are in the KBase Genome object. ' +
                        'We hope to address this issue in the future.\n\n' +
                        'This directory includes the KBase-derived GenBank file and also ' +
                        '(if you originally uploaded the genome from an annotated ' +
                        'GenBank file) the original GenBank input.')

    # package it up and be done
    dfUtil = DataFileUtil(self.cfg.callbackURL)
    package_details = dfUtil.package_for_download({
        'file_path': export_package_dir,
        'ws_refs': [params['input_ref']]
    })

    output = {'shock_id': package_details['shock_id']}
    print('export complete -- result = ')
    pprint(output)
    #END export_genome_as_genbank

    # At some point might do deeper type checking...
    if not isinstance(output, dict):
        raise ValueError('Method export_genome_as_genbank return value ' +
                         'output is not type dict as required.')
    # return the results
    return [output]
class AssemblyToFasta:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        """ Used almost exclusively for download only """
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot export Assembly - no "input_ref" field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })
        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        """ main function that accepts a ref to an object and writes a file """
        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects({'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path, assembly_object['data'])
        else:
            raise ValueError('Cannot write data to fasta; invalid WS type (' + ws_type +
                             '). Supported types are KBaseGenomes.ContigSet and ' +
                             'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            # pass an alphabet instance rather than the class itself
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet()),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        """ Writes a legacy KBaseGenomes.ContigSet object to a fasta file """
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        """ Downloads the fasta file referenced by a KBaseGenomeAnnotations.Assembly object """
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')
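# --- Usage sketch (not part of the module) ---------------------------------
# A small, self-contained run of the legacy ContigSet path using toy contig
# dicts rather than real workspace data; the callback URL and scratch paths
# are placeholders. The generator yields Biopython SeqRecords that SeqIO.write
# streams straight to a fasta file.
from Bio import SeqIO

toy_contigs = [
    {'id': 'contig_1', 'sequence': 'ATGCATGC', 'description': 'example contig'},
    {'id': 'contig_2', 'sequence': 'GGGTTTAA'},
]
converter = AssemblyToFasta('http://localhost:9999', '/kb/module/work/tmp')
SeqIO.write(converter.fasta_rows_generator_from_contigset(toy_contigs),
            '/kb/module/work/tmp/toy.fa', 'fasta')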
class ReadsAlignmentUtils: ''' Module Name: ReadsAlignmentUtils Module Description: A KBase module: ReadsAlignmentUtils This module is intended for use by Aligners and Assemblers to upload and download alignment files. The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files in sam and bai formats from the downloaded bam file. This utility also generates stats from the stored alignment. ''' ######## WARNING FOR GEVENT USERS ####### noqa # Since asynchronous IO can lead to methods - even the same method - # interrupting each other, you must be *very* careful when using global # state. A method could easily clobber the state set by another while # the latter method is running. ######################################### noqa VERSION = "0.0.1" GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git" GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a" #BEGIN_CLASS_HEADER PARAM_IN_FILE = 'file_path' PARAM_IN_SRC_REF = 'source_ref' PARAM_IN_DST_REF = 'destination_ref' PARAM_IN_CONDITION = 'condition' PARAM_IN_READ_LIB_REF = 'read_library_ref' PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref' PARAM_IN_ALIGNED_USING = 'aligned_using' PARAM_IN_ALIGNER_VER = 'aligner_version' PARAM_IN_ALIGNER_OPTS = 'aligner_opts' PARAM_IN_REPLICATE_ID = 'replicate_id' PARAM_IN_PLATFORM = 'platform' PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index' PARAM_IN_SAMPLESET_REF = 'sampleset_ref' PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id' PARAM_IN_DOWNLOAD_SAM = 'downloadSAM' PARAM_IN_DOWNLOAD_BAI = 'downloadBAI' PARAM_IN_VALIDATE = 'validate' INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]') INVALID_WS_NAME_RE = re.compile('[^\\w:._-]') def _get_file_path_info(self, file_path): """ Given a file path, returns the directory, file name, file base and file extension """ dir, file_name = os.path.split(file_path) file_base, file_ext = os.path.splitext(file_name) return dir, file_name, file_base, file_ext def _mkdir_p(self, path): """ _mkdir_p: make directory for given path """ if not path: return try: os.makedirs(path) except OSError as exc: if exc.errno == errno.EEXIST and os.path.isdir(path): pass else: raise def _check_required_param(self, in_params, param_list): """ Checks if each of the params in the list are in the input params """ for param in param_list: if (param not in in_params or not in_params[param]): raise ValueError('{} parameter is required'.format(param)) def _proc_ws_obj_params(self, ctx, params): """ Checks the validity of workspace and object params and returns them """ dst_ref = params.get(self.PARAM_IN_DST_REF) ws_name_id, obj_name_id = os.path.split(dst_ref) if not bool(ws_name_id.strip()) or ws_name_id == '/': raise ValueError("Workspace name or id is required in " + self.PARAM_IN_DST_REF) if not bool(obj_name_id.strip()): raise ValueError("Object name or id is required in " + self.PARAM_IN_DST_REF) if not isinstance(ws_name_id, int): try: ws_name_id = self.dfu.ws_name_to_id(ws_name_id) except DFUError as se: prefix = se.message.split('.')[0] raise ValueError(prefix) self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id)) return ws_name_id, obj_name_id def _get_ws_info(self, obj_ref): ws = Workspace(self.ws_url) try: info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0] except WorkspaceError as wse: self.__LOGGER.error('Logging workspace exception') self.__LOGGER.error(str(wse)) raise return info def _proc_upload_alignment_params(self, 
ctx, params): """ Checks the presence and validity of upload alignment params """ self._check_required_param(params, [ self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION, self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF ]) ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params) file_path = params.get(self.PARAM_IN_FILE) if not (os.path.isfile(file_path)): raise ValueError('File does not exist: ' + file_path) lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2] if lib_type.startswith('KBaseFile.SingleEndLibrary') or \ lib_type.startswith('KBaseFile.PairedEndLibrary') or \ lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \ lib_type.startswith('KBaseAssembly.PairedEndLibrary'): pass else: raise ValueError(self.PARAM_IN_READ_LIB_REF + ' parameter should be of type' + ' KBaseFile.SingleEndLibrary or' + ' KBaseFile.PairedEndLibrary or' + ' KBaseAssembly.SingleEndLibrary or' + ' KBaseAssembly.PairedEndLibrary') obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2] if obj_type.startswith('KBaseGenomes.Genome') or \ obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \ obj_type.startswith('KBaseGenomes.ContigSet'): pass else: raise ValueError(self.PARAM_IN_ASM_GEN_REF + ' parameter should be of type' + ' KBaseGenomes.Genome or' + ' KBaseGenomeAnnotations.Assembly or' + ' KBaseGenomes.ContigSet') return ws_name_id, obj_name_id, file_path, lib_type def _get_aligner_stats(self, bam_file): """ Gets the aligner stats from BAM file """ path, file = os.path.split(bam_file) return self.samtools.get_stats(file, path) def _validate(self, params): samt = SamTools(self.config, self.__LOGGER) if 'ignore' in params: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path, ignore=params['ignore']) else: path, file = os.path.split(params['file_path']) rval = samt.validate(ifile=file, ipath=path) return rval #END_CLASS_HEADER # config contains contents of config file in a hash or None if it couldn't # be found def __init__(self, config): #BEGIN_CONSTRUCTOR self.config = config self.__LOGGER = logging.getLogger('KBaseRNASeq') if 'log_level' in config: self.__LOGGER.setLevel(config['log_level']) else: self.__LOGGER.setLevel(logging.INFO) streamHandler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter( "%(asctime)s - %(filename)s - %(lineno)d - \ %(levelname)s - %(message)s") formatter.converter = time.gmtime streamHandler.setFormatter(formatter) self.__LOGGER.addHandler(streamHandler) self.__LOGGER.info("Logger was set") script_utils.check_sys_stat(self.__LOGGER) self.scratch = config['scratch'] self.callback_url = os.environ['SDK_CALLBACK_URL'] self.ws_url = config['workspace-url'] self.dfu = DataFileUtil(self.callback_url) self.samtools = SamTools(config) #END_CONSTRUCTOR pass def validate_alignment(self, ctx, params): """ :param params: instance of type "ValidateAlignmentParams" (* Input parameters for validating a reads alignment. For validation errors to ignore, see http://broadinstitute.github.io/picard/command-line-overview.html#V alidateSamFile) -> structure: parameter "file_path" of String, parameter "ignore" of list of String :returns: instance of type "ValidateAlignmentOutput" (* Results from validate alignment *) -> structure: parameter "validated" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)) """ # ctx is the context object # return variables are: returnVal #BEGIN validate_alignment rval = self._validate(params) if rval == 0: returnVal = {'validated': True} else: returnVal = {'validated': False} #END validate_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method validate_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def upload_alignment(self, ctx, params): """ Validates and uploads the reads alignment * :param params: instance of type "UploadAlignmentParams" (* Required input parameters for uploading a reads alignment string destination_ref - object reference of alignment destination. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id file_path - File with the path of the sam or bam file to be uploaded. If a sam file is provided, it will be converted to the sorted bam format before being saved read_library_ref - workspace object ref of the read sample used to make the alignment file condition - assembly_or_genome_ref - workspace object ref of genome assembly or genome object that was used to build the alignment *) -> structure: parameter "destination_ref" of String, parameter "file_path" of String, parameter "read_library_ref" of String, parameter "condition" of String, parameter "assembly_or_genome_ref" of String, parameter "aligned_using" of String, parameter "aligner_version" of String, parameter "aligner_opts" of mapping from String to String, parameter "replicate_id" of String, parameter "platform" of String, parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter "sampleset_ref" of type "ws_Sampleset_ref", parameter "mapped_sample_id" of mapping from String to mapping from String to String, parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "ignore" of list of String :returns: instance of type "UploadAlignmentOutput" (* Output from uploading a reads alignment *) -> structure: parameter "obj_ref" of String """ # ctx is the context object # return variables are: returnVal #BEGIN upload_alignment self.__LOGGER.info( 'Starting upload Reads Alignment, parsing parameters ') pprint(params) ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params( ctx, params) dir, file_name, file_base, file_ext = self._get_file_path_info( file_path) if self.PARAM_IN_VALIDATE in params and params[ self.PARAM_IN_VALIDATE] is True: if self._validate(params) == 1: raise Exception('{0} failed validation'.format(file_path)) bam_file = file_path if file_ext.lower() == '.sam': bam_file = os.path.join(dir, file_base + '.bam') self.samtools.convert_sam_to_sorted_bam(ifile=file_name, ipath=dir, ofile=bam_file) uploaded_file = self.dfu.file_to_shock({ 'file_path': bam_file, 'make_handle': 1 }) file_handle = uploaded_file['handle'] file_size = uploaded_file['size'] aligner_stats = self._get_aligner_stats(file_path) aligner_data = { 'file': file_handle, 'size': file_size, 'condition': params.get(self.PARAM_IN_CONDITION), 'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF), 'library_type': lib_type, 'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF), 'alignment_stats': aligner_stats } optional_params = [ self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER, self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID, self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX, self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID ] for opt_param in optional_params: if opt_param in params and params[opt_param] is not None: aligner_data[opt_param] = params[opt_param] self.__LOGGER.info('========= Adding extra_provenance_refs') self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF)) self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF)) self.__LOGGER.info('=======================================') res = self.dfu.save_objects({ "id": ws_name_id, "objects": [{ "type": "KBaseRNASeq.RNASeqAlignment", "data": aligner_data, "name": obj_name_id, "extra_provenance_input_refs": [ params.get(self.PARAM_IN_READ_LIB_REF), params.get(self.PARAM_IN_ASM_GEN_REF) ] }] })[0] self.__LOGGER.info('save complete') returnVal = { 'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4]) } self.__LOGGER.info('Uploaded object: ') self.__LOGGER.info(returnVal) #END upload_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method upload_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def download_alignment(self, ctx, params): """ Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats * :param params: instance of type "DownloadAlignmentParams" (* Required input parameters for downloading a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "ignore" of list of String :returns: instance of type "DownloadAlignmentOutput" (* The output of the download method. *) -> structure: parameter "destination_dir" of String, parameter "stats" of type "AlignmentStats" -> structure: parameter "properly_paired" of Long, parameter "multiple_alignments" of Long, parameter "singletons" of Long, parameter "alignment_rate" of Double, parameter "unmapped_reads" of Long, parameter "mapped_reads" of Long, parameter "total_reads" of Long """ # ctx is the context object # return variables are: returnVal #BEGIN download_alignment self.__LOGGER.info('Running download_alignment with params:\n' + pformat(params)) inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise # set the output dir uuid_str = str(uuid.uuid4()) output_dir = os.path.join(self.scratch, 'download_' + uuid_str) self._mkdir_p(output_dir) file_ret = self.dfu.shock_to_file({ 'shock_id': alignment[0]['data']['file']['id'], 'file_path': output_dir }) if zipfile.is_zipfile(file_ret.get('file_path')): with zipfile.ZipFile(file_ret.get('file_path')) as z: z.extractall(output_dir) for f in glob.glob(output_dir + '/*.zip'): os.remove(f) bam_files = glob.glob(output_dir + '/*.bam') uuid_prefix = uuid_str[:8] if len(bam_files) == 0: raise ValueError("Alignment object does not contain a bam file") for bam_file_path in bam_files: dir, file_name, file_base, file_ext = self._get_file_path_info( bam_file_path) if params.get(self.PARAM_IN_VALIDATE, False): validate_params = {'file_path': bam_file_path} if self._validate(validate_params) == 1: raise Exception( '{0} failed validation'.format(bam_file_path)) if params.get('downloadBAI', False): bai_file = uuid_prefix + '_' + file_base + '.bai' bai_file_path = os.path.join(output_dir, bai_file) self.samtools.create_bai_from_bam(ifile=file_name, ipath=output_dir, ofile=bai_file) if not os.path.isfile(bai_file_path): raise ValueError('Error creating {}'.format(bai_file_path)) if params.get('downloadSAM', False): sam_file = uuid_prefix + '_' + file_base + '.sam' sam_file_path = os.path.join(output_dir, sam_file) self.samtools.convert_bam_to_sam(ifile=file_name, ipath=output_dir, ofile=sam_file) if not os.path.isfile(sam_file_path): raise ValueError('Error creating {}'.format(sam_file_path)) returnVal = { 'destination_dir': output_dir, 'stats': alignment[0]['data']['alignment_stats'] } #END download_alignment # At some point might do deeper type checking... if not isinstance(returnVal, dict): raise ValueError('Method download_alignment return value ' + 'returnVal is not type dict as required.') # return the results return [returnVal] def export_alignment(self, ctx, params): """ Wrapper function for use by in-narrative downloaders to download alignments from shock * :param params: instance of type "ExportParams" (* Required input parameters for exporting a reads alignment string source_ref - object reference of alignment source. The object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the workspace name or id and obj_name_or_id is the object name or id *) -> structure: parameter "source_ref" of String, parameter "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A boolean - 0 for false, 1 for true. 
@range (0, 1)), parameter "validate" of type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1)), parameter "ignore" of list of String :returns: instance of type "ExportOutput" -> structure: parameter "shock_id" of String """ # ctx is the context object # return variables are: output #BEGIN export_alignment inref = params.get(self.PARAM_IN_SRC_REF) if not inref: raise ValueError('{} parameter is required'.format( self.PARAM_IN_SRC_REF)) if params.get(self.PARAM_IN_VALIDATE, False) or \ params.get('exportBAI', False) or \ params.get('exportSAM', False): """ Need to validate or convert files. Use download_alignment """ download_params = {} for key, val in params.iteritems(): download_params[key.replace('export', 'download')] = val download_retVal = self.download_alignment(ctx, download_params)[0] export_dir = download_retVal['destination_dir'] # package and load to shock ret = self.dfu.package_for_download({ 'file_path': export_dir, 'ws_refs': [inref] }) output = {'shock_id': ret['shock_id']} else: """ return shock id from the object """ try: alignment = self.dfu.get_objects({'object_refs': [inref]})['data'] except DFUError as e: self.__LOGGER.error( 'Logging stacktrace from workspace exception:\n' + e.data) raise output = {'shock_id': alignment[0]['data']['file']['id']} #END export_alignment # At some point might do deeper type checking... if not isinstance(output, dict): raise ValueError('Method export_alignment return value ' + 'output is not type dict as required.') # return the results return [output] def status(self, ctx): #BEGIN_STATUS returnVal = { 'state': "OK", 'message': "", 'version': self.VERSION, 'git_url': self.GIT_URL, 'git_commit_hash': self.GIT_COMMIT_HASH } #END_STATUS return [returnVal]
class Utils:

    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear."""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning("Unexpected parameter {} supplied".format(param))

    @staticmethod
    def validate_newick(newick):
        """Validates a Newick string by attempting to make a tree with ete3"""
        try:
            ete3.Tree(newick)
        except NewickError:
            return False
        return True

    def to_newick(self, params):
        """Convert a Tree to a Newick file"""
        files = {}
        res = self.dfu.get_objects({'object_refs': [params['input_ref']]})['data'][0]
        name = res['info'][1]
        if "KBaseTrees.Tree" not in res['info'][2]:
            raise ValueError("Supplied reference is not a Tree")
        files['file_path'] = os.path.join(params['destination_dir'], name + ".newick")
        with open(files['file_path'], 'w') as out_file:
            out_file.write(res['data']['tree'])
        return name, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch, name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file, os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })
        return {'shock_id': package_details['shock_id']}
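# --- Usage sketch (not part of the module) ---------------------------------
# A short, hypothetical newick round trip; the tree reference and destination
# directory are placeholders and 'config' is the usual SDK config dict.
# validate_newick can be used up front to sanity-check a newick string.
util = Utils(config)
is_valid = util.validate_newick('((A,B),(C,D));')  # True for a well-formed string
name, files = util.to_newick({'input_ref': '11/22/33',
                              'destination_dir': '/kb/module/work/tmp'})
shock = util.export(files['file_path'], name, '11/22/33')
print(shock['shock_id'])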