def test_split_biom(self):
    """Does 'split_biom' correctly split a biom file
    according to sample data properties?"""
    inputs = {'biom_file': None,
              'cluster': None,
              'otu_meta': None,
              'otu_table': ['otu_bananas.txt'],
              'prefix': None,
              'sample_data': None,
              'split': 'BODY_SITE',
              'tax_table': ['tax_bananas.txt'],
              'name': ['test'],
              'fp': (os.path.dirname(massoc.__file__)[:-6] + 'tests')}
    batch = Batch(deepcopy(testbiom), inputs)
    batch.split_biom()
    self.assertEqual(len(batch.otu), 3)
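# A minimal sketch of the operation exercised above, assuming the
# biom-format Table.partition API: split_biom is expected to bin samples
# by a sample metadata column (here 'BODY_SITE'). The helper below is
# illustrative only, not massoc's implementation; its name is ours.
def _split_by_sample_column(table, column='BODY_SITE'):
    """Return one sub-table per unique value of a sample metadata column."""
    # partition() calls the lambda for every (sample id, metadata) pair
    # and yields one (label, sub-table) tuple per unique label.
    return {label: subtable for label, subtable
            in table.partition(lambda id_, md: md[column], axis='sample')}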
def get_input(inputs, publish=False):
    """
    Takes all input and builds a dictionary of BIOM files.
    If tab-delimited files are supplied, these are combined
    into BIOM files; file names are used as keys.
    This is mostly a utility wrapper, as all BIOM-related functions
    are from biom-format.org.

    At the moment, rarefaction is performed after sample splitting;
    as a result, split files with uneven sequence counts
    are not rarefied to equal depths.

    All files are written to BIOM files, while a settings file is
    also written to disk for use by other massoc commands.

    :param inputs: Dictionary of inputs.
    :param publish: If True, publishes messages to be received by GUI.
    :return:
    """
    # handler to file
    # construct logger after filepath is provided
    _create_logger(inputs['fp'])
    if inputs['biom_file'] is not None:
        logger.info('BIOM file(s) to process: ' + ", ".join(inputs['biom_file']))
    if inputs['otu_table'] is not None:
        logger.info('Tab-delimited OTU table(s) to process: ' + ", ".join(inputs['otu_table']))
    if inputs['tax_table'] is not None:
        # use != here: "is not" tests object identity, not equality
        if len(inputs['otu_table']) != len(inputs['tax_table']):
            logger.error("Add a taxonomy table for every OTU table!", exc_info=True)
            raise ValueError("Add a taxonomy table for every OTU table!")
    if inputs['sample_data'] is not None:
        if len(inputs['otu_table']) != len(inputs['sample_data']):
            logger.error("Add a sample data table for every OTU table!", exc_info=True)
            raise ValueError("Add a sample data table for every OTU table!")
    if inputs['otu_meta'] is not None:
        if len(inputs['otu_table']) != len(inputs['otu_meta']):
            logger.error("Add a metadata table for every OTU table!", exc_info=True)
            raise ValueError("Add a metadata table for every OTU table!")
    filestore = {}
    if inputs['biom_file'] is None and inputs['network'] is None:
        if inputs['otu_table'] is None and inputs['network'] is None:
            logger.error("Please supply either a biom file, "
                         "a tab-delimited OTU table or a network!", exc_info=True)
            raise ValueError("Please supply either a biom file, "
                             "a tab-delimited OTU table or a network!")
    # Only process count files if present
    i = 0
    if inputs['name'] is None:
        inputs['name'] = list()
        inputs['name'].append('file_')
    if inputs['biom_file'] is not None:
        try:
            for x in inputs['biom_file']:
                biomtab = load_table(x)
                filestore[inputs['name'][i]] = biomtab
                i += 1
        except Exception:
            logger.error("Failed to import BIOM files.", exc_info=True)
    if inputs['otu_table'] is not None:
        try:
            j = 0  # j is used to match sample + tax data to OTU data
            for x in inputs['otu_table']:
                input_fp = x
                sample_metadata_fp = None
                observation_metadata_fp = None
                obs_data = None
                sample_data = None
                biomtab = load_table(input_fp)
                try:
                    sample_metadata_fp = inputs['sample_data'][j]
                    observation_metadata_fp = inputs['tax_table'][j]
                except (TypeError, KeyError):
                    # no matching sample / taxonomy files were supplied
                    pass
                if sample_metadata_fp is not None:
                    sample_f = open(sample_metadata_fp, 'r')
                    sample_data = MetadataMap.from_file(sample_f)
                    sample_f.close()
                    biomtab.add_metadata(sample_data, axis='sample')
                if observation_metadata_fp is not None:
                    obs_f = open(observation_metadata_fp, 'r')
                    obs_data = MetadataMap.from_file(obs_f)
                    obs_f.close()
                    # for taxonomy collapsing,
                    # metadata variable needs to be a complete list,
                    # not separate entries for each tax level
                    for b in list(obs_data):
                        tax = list()
                        for l in list(obs_data[b]):
                            tax.append(obs_data[b][l])
                            obs_data[b].pop(l, None)
                        obs_data[b]['taxonomy'] = tax
                    biomtab.add_metadata(obs_data, axis='observation')
                filestore[inputs['name'][j]] = biomtab
                j += 1
        except Exception:
            logger.warning("Failed to combine input files.", exc_info=True)
    bioms = Batch({'otu': filestore}, inputs)
    # it is possible that there are forbidden characters in the OTU identifiers
    # we can forbid people from using those, or replace those with an underscore
    if inputs['biom_file'] or inputs['otu_table']:
        for name in bioms.otu:
            biomfile = bioms.otu[name]
            taxon_ids = biomfile._observation_ids  # need to be careful with these operations
            taxon_index = biomfile._obs_index  # likely to corrupt BIOM file if done wrong
            new_ids = deepcopy(taxon_ids)
            new_indexes = deepcopy(taxon_index)
            for i in range(len(taxon_ids)):
                taxon_id = taxon_ids[i]
                new_id = taxon_id.replace(" ", "_")
                new_ids[i] = new_id
                new_indexes[new_id] = new_indexes.pop(taxon_id)
            biomfile._observation_ids = new_ids
            biomfile._obs_index = new_indexes
            bioms.otu[name] = biomfile
        logger.info('Collapsing taxonomy... ')
        bioms.collapse_tax()
        if inputs['cluster'] is not None:
            if publish:
                pub.sendMessage('update', msg='Clustering BIOM files...')
            logger.info('Clustering BIOM files... ')
            bioms.cluster_biom()
        # use != here: "is not" on strings tests identity, not equality
        if inputs['split'] is not None and inputs['split'] != 'TRUE':
            bioms.split_biom()
        if inputs['min'] is not None:
            if publish:
                pub.sendMessage('update', msg='Setting minimum mean abundance...')
            logger.info('Removing taxa below minimum count... ')
            bioms.prev_filter(mode='min')
        if inputs['prev'] is not None:
            if publish:
                pub.sendMessage('update', msg='Setting prevalence filter...')
            logger.info('Setting prevalence filter... ')
            bioms.prev_filter(mode='prev')
        if inputs['rar'] is not None:
            if publish:
                pub.sendMessage('update', msg='Rarefying counts...')
            logger.info('Rarefying counts... ')
            bioms.rarefy()
    bioms.inputs['procbioms'] = dict()
    if inputs['biom_file'] or inputs['otu_table']:
        if 'otu' not in bioms.inputs['levels']:
            # add OTU level always
            bioms.inputs['procbioms']['otu'] = dict()
            for name in bioms.inputs['name']:
                biomname = bioms.inputs['fp'] + '/' + name + '_' + 'otu' + '.hdf5'
                bioms.inputs['procbioms']['otu'][name] = biomname
        for level in bioms.inputs['levels']:
            bioms.inputs['procbioms'][level] = dict()
            for name in bioms.inputs['name']:
                biomname = bioms.inputs['fp'] + '/' + name + '_' + level + '.hdf5'
                bioms.inputs['procbioms'][level][name] = biomname
        all_bioms = {**bioms.otu, **bioms.genus, **bioms.family,
                     **bioms.order, **bioms.class_, **bioms.phylum}
        for biomfile in all_bioms:
            if all_bioms[biomfile].shape[0] == 1:
                logger.error("The current preprocessing steps resulted in "
                             "BIOM files with only 1 row.", exc_info=True)
    if inputs['network'] is not None:
        if publish:
            pub.sendMessage('update', msg='Checking previously generated networks...')
        logger.info('Checking previously generated networks...')
        filelist = deepcopy(inputs['network'])
        for file in filelist:
            network = _read_network(file)
            nodes = len(network.nodes)
            edges = len(network.edges)
            logger.info("This network has " + str(nodes) +
                        " nodes and " + str(edges) + " edges.")
            weight = nx.get_edge_attributes(network, 'weight')
            if len(weight) > 0:
                logger.info('This is a weighted network.')
            else:
                logger.info('This is an unweighted network.')
    try:
        if inputs['biom_file'] or inputs['otu_table']:
            bioms.write_bioms()
            logger.info('BIOM files written to disk. ')
    except Exception:
        logger.warning('Failed to write BIOM files to disk. ', exc_info=True)
    write_settings(bioms.inputs)
    logger.info('Settings file written to disk. ')
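# Illustrative sketch of a minimal get_input call (assumed usage, not an
# official massoc example). Every key read by get_input must be present;
# the file names and output directory below are hypothetical placeholders.
if __name__ == '__main__':
    example_inputs = {'biom_file': None,
                      'otu_table': ['otu_table.txt'],  # hypothetical count table
                      'tax_table': ['tax_table.txt'],  # hypothetical taxonomy table
                      'sample_data': None,
                      'otu_meta': None,
                      'network': None,
                      'name': ['example'],
                      'fp': 'output',  # hypothetical output directory
                      'cluster': None,
                      'split': None,
                      'min': None,
                      'prev': None,
                      'rar': None,
                      'levels': ['otu']}
    get_input(example_inputs)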