def convert(source_inv_fp, target_fp):
    """ Converter for ISA-Tab to SampleTab.

    :param source_inv_fp: File descriptor of input investigation file
    :param target_fp: File descriptor to write output SampleTab to
        (must be writeable)
    """
    ISA = isatab.load(source_inv_fp)
    sampletab.dump(ISA, target_fp)

def convert(work_dir, identifier_type=IdentifierType.name,
            validate_first=True, use_new_parser=False):
    i_files = glob.glob(os.path.join(work_dir, 'i_*.txt'))
    if validate_first:
        logger.info("Validating input ISA-Tab before conversion")
        if len(i_files) != 1:
            logger.fatal(
                "Could not resolve input investigation file, please check "
                "input ISA-Tab directory.")
            return
        with open(i_files[0], 'r', encoding='utf-8') as validate_fp:
            report = isatab.validate(fp=validate_fp, log_level=logging.ERROR)
            if len(report['errors']) > 0:
                logger.fatal(
                    "Could not proceed with conversion as there are some "
                    "fatal validation errors. Check log.")
                return
    if use_new_parser:
        logger.info("Using new parser to load...")
        with open(i_files[0], 'r', encoding='utf-8') as fp:
            ISA = isatab.load(fp)
            from isatools.isajson import ISAJSONEncoder
            logger.info("Using new ISA JSON encoder to dump...")
            return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
    else:
        converter = ISATab2ISAjson_v1(identifier_type)
        logger.info("Converting ISA-Tab to ISA JSON...")
        return converter.convert(work_dir)

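A minimal usage sketch for the converter above. The directory name and output filename are placeholders, not from the source:

# Hedged usage sketch: convert an ISA-Tab directory to ISA JSON and write
# the result to disk. 'my_isatab_dir' and 'isa.json' are placeholder names.
import json

isa_json = convert('my_isatab_dir', validate_first=True, use_new_parser=True)
if isa_json is not None:
    with open('isa.json', 'w', encoding='utf-8') as out_fp:
        json.dump(isa_json, out_fp, indent=4)
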
def get_isa_study(self, study_id, api_key, skip_load_tables=True,
                  study_location=None):
    """ Get an ISA-API Investigation object reading directly from the
    ISA-Tab files

    :param study_id: MTBLS study identifier
    :param api_key: User API key for accession check
    :param skip_load_tables: speed up reading by skipping loading of assay
        and sample tables
    :param study_location: filesystem location of the study
    :return: a tuple consisting of the ISA-Study obj, ISA-Investigation obj
        and path to the Study in the file system
    """
    if study_location is None:
        logger.info("Study location is not set, will load study from the "
                    "filesystem")
        std_path = self.wsc.get_study_location(study_id, api_key)
    else:
        logger.info("Study location is: " + study_location)
        std_path = study_location
    try:
        i_filename = glob.glob(os.path.join(std_path, "i_*.txt"))[0]
        fp = open(i_filename)
        # loading tables also loads Samples and Assays
        isa_inv = load(fp, skip_load_tables)
        isa_study = isa_inv.studies[0]
    except IndexError as e:
        logger.exception("Failed to find Investigation file for %s in %s",
                         study_id, std_path)
        logger.error(str(e))
        abort(400)
    except Exception as e:
        logger.exception("Failed to load Investigation file for %s in %s",
                         study_id, std_path)
        logger.error(str(e))
        abort(400)
    else:
        return isa_study, isa_inv, std_path

def load(mtbls_study_id):
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise IOError(
            "There was a problem retrieving the study %s" % mtbls_study_id)
    with open(glob.glob(os.path.join(tmp_dir, 'i_*.txt'))[0],
              encoding='utf-8') as f:
        ISA = isatab.load(f)
    shutil.rmtree(tmp_dir)
    return ISA

def create_isatab_archive(inv_fp, target_filename=None,
                          filter_by_measurement=None):
    """Function to create an ISArchive; option to select by assay
    measurement type

    Example usage:

    >>> create_isatab_archive(open('/path/to/i_investigation.txt'),
    ...                       target_filename='isatab.zip')
    >>> create_isatab_archive(open('/path/to/i.txt'),
    ...                       filter_by_measurement='transcription profiling')
    """
    if target_filename is None:
        target_filename = os.path.join(
            os.path.dirname(inv_fp.name), 'isatab.zip')
    ISA = isatab.load(inv_fp)
    all_files_in_isatab = []
    found_files = []
    for s in ISA.studies:
        if filter_by_measurement is not None:
            log.debug('Selecting %s', filter_by_measurement)
            selected_assays = [a for a in s.assays if
                               a.measurement_type.term ==
                               filter_by_measurement]
        else:
            selected_assays = s.assays
        for a in selected_assays:
            all_files_in_isatab += [d.filename for d in a.data_files]
    dirname = os.path.dirname(inv_fp.name)
    for fname in all_files_in_isatab:
        if os.path.isfile(os.path.join(dirname, fname)):
            found_files.append(fname)
    missing_files = [f for f in all_files_in_isatab if f not in found_files]
    if len(missing_files) == 0:
        log.debug('Do zip')
        with ZipFile(target_filename, mode='w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zip_file.write(inv_fp.name, arcname=os.path.basename(inv_fp.name))
            for s in ISA.studies:
                zip_file.write(os.path.join(dirname, s.filename),
                               arcname=s.filename)
                for a in selected_assays:
                    zip_file.write(os.path.join(dirname, a.filename),
                                   arcname=a.filename)
            for file in all_files_in_isatab:
                zip_file.write(os.path.join(dirname, file), arcname=file)
            log.debug(zip_file.namelist())
            return zip_file.namelist()
    else:
        log.debug('Not zipping')
        log.debug('Missing: %s', missing_files)
        return None

def convert(source_inv_fp, target_fp):
    """ Converter for ISA-Tab to SampleTab.

    :param source_inv_fp: File descriptor of input investigation file
    :param target_fp: File descriptor to write output SampleTab to
        (must be writeable)
    """
    log.info("loading isatab %s", source_inv_fp.name)
    ISA = isatab.load(source_inv_fp)
    log.info("dumping sampletab %s", target_fp.name)
    sampletab.dump(ISA, target_fp)

def convert(source_inv_fp, output_path):
    """ Converter for ISA-Tab to MAGE-TAB.

    :param source_inv_fp: File descriptor of input investigation file
    :param output_path: Path to the directory to write output MAGE-TAB
        files to
    """
    log.info("loading isatab %s", source_inv_fp.name)
    ISA = isatab.load(source_inv_fp)
    log.info("dumping magetab %s", output_path)
    magetab.dump(ISA, output_path)

def convert(idf_file_path):
    tmp = tempfile.mkdtemp()
    ISA = None
    try:
        magetab2isatab.convert(idf_file_path, output_path=tmp)
        with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
            ISA = isatab.load(isa_inv_fp)
    finally:
        shutil.rmtree(tmp)
    if ISA is not None:
        return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))

def get_isa_study(self, study_id, api_key, skip_load_tables=True,
                  study_location=None, failing_gracefully=False):
    """ Get an ISA-API Investigation object reading directly from the
    ISA-Tab files

    :param study_id: MTBLS study identifier
    :param api_key: User API key for accession check
    :param skip_load_tables: speed up reading by skipping loading of assay
        and sample tables
    :param study_location: filesystem location of the study
    :return: a tuple consisting of the ISA-Study obj, ISA-Investigation obj
        and path to the Study in the file system
    """
    if skip_load_tables == 'false':
        skip_load_tables = False
    if study_location is None:
        logger.info(
            "Study location is not set, will load study from the filesystem")
        std_path = self.wsc.get_study_location(study_id, api_key)
    else:
        logger.info("Study location is: " + study_location)
        std_path = study_location
    try:
        i_filename = glob.glob(os.path.join(std_path, "i_*.txt"))[0]
        fp = open(i_filename, encoding='utf-8', errors='ignore')
        # loading tables also loads Samples and Assays
        isa_inv = load(fp, skip_load_tables)
        # ToDo. Add MAF to isa_study
        isa_study = isa_inv.studies[0]
    except IndexError as e:
        logger.exception("Failed to find Investigation file for %s in %s",
                         study_id, std_path)
        logger.error(str(e))
        if failing_gracefully:
            return None, None, None
        else:
            abort(417)
    except Exception as e:
        logger.exception("Failed to load Investigation file for %s in %s",
                         study_id, std_path)
        logger.error(str(e))
        if failing_gracefully:
            return None, None, None
        else:
            abort(417)
    else:
        return isa_study, isa_inv, std_path

def test_unused_protocol_fixer(self):
    i_table_path = os.path.join(self._tmp_dir, 'BII-S-3', 'i_gilbert.txt')
    fixer = utils.IsaTabFixer(i_table_path)
    fixer.remove_unused_protocols()
    with open('{}.fix'.format(i_table_path)) as fixed_i_fp:
        investigation = isatab.load(fixed_i_fp)
        study = investigation.studies[-1]
        unused_protocol1 = \
            study.get_prot('reverse transcription - standard procedure 5')
        unused_protocol2 = \
            study.get_prot('sequence analysis - standard procedure 7')
        self.assertIsNone(unused_protocol1)
        self.assertIsNone(unused_protocol2)

def convert(source_idf_fp, technology_type=None, measurement_type=None):
    tmp = tempfile.mkdtemp()
    ISA = None
    try:
        magetab2isatab.convert(source_idf_fp=source_idf_fp, output_path=tmp,
                               technology_type=technology_type,
                               measurement_type=measurement_type)
        with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
            ISA = isatab.load(isa_inv_fp)
    finally:
        shutil.rmtree(tmp)
    if ISA is not None:
        return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))

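A hedged round-trip sketch for this MAGE-TAB to ISA JSON converter. The IDF path and the measurement/technology terms below are placeholders:

# Placeholder IDF path and annotation terms; adjust to your dataset.
with open('/path/to/my_experiment.idf.txt') as idf_fp:
    isa_json = convert(idf_fp,
                       technology_type='DNA microarray',
                       measurement_type='transcription profiling')
if isa_json is not None:
    print(sorted(isa_json.keys()))  # inspect the top-level ISA JSON keys
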
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from data/raw into
    cleaned data ready to be measured (saved in data/interim).

    Cleaning: We iterate through our raw data MTBLS metadata and run the
    ISA-Tab loader on each to check for loading errors. If the study is
    not loadable, we exclude the study metadata from this analysis.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    if len(glob(join(output_filepath, 'MTBLS*'))) > 0:
        logging.info('Output directory {} already contains MTBLS studies. '
                     'Skipping writing to data/interim. If this is not '
                     'expected, do you need to "make clean" first?'.format(
                         output_filepath))
        exit(0)
    for study_dir in tqdm(glob(join(input_filepath, 'MTBLS*'))):
        study_id = basename(study_dir)
        try:
            load(study_dir)
            copytree(study_dir, '{}/{}'.format(output_filepath, study_id))
        except Exception:
            logging.info('Excluding {}'.format(study_dir))

def remove_unused_protocols(self):
    """Removes unused protocols

    :return: None
    """
    investigation = isatab.load(os.path.dirname(self.path))
    for study in investigation.studies:
        unused_protocol_names = set(x.name for x in study.protocols)
        for process in study.process_sequence:
            try:
                unused_protocol_names.remove(process.executes_protocol.name)
            except KeyError:
                pass
        for assay in study.assays:
            for process in assay.process_sequence:
                try:
                    unused_protocol_names.remove(
                        process.executes_protocol.name)
                except KeyError:
                    pass
        print('Unused protocols: {}'.format(unused_protocol_names))
        print('Location of unused protocols: {}'.format(
            [pr.name in unused_protocol_names for pr in study.protocols]))
        # remove these protocols from study.protocols
        clean_protocols = [
            pr for pr in study.protocols
            if pr.name not in unused_protocol_names
        ]
        print('Clean protocol list: {}'.format(
            [pr.name for pr in clean_protocols]))
        study.protocols = clean_protocols
        print('Clean study.protocols: {}'.format(
            [pr.name for pr in study.protocols]))
    isatab.dump(investigation,
                output_path=os.path.dirname(self.path),
                i_file_name='{filename}.fix'.format(
                    filename=os.path.basename(self.path)),
                skip_dump_tables=True)

def main(input_filepath, output_filepath, max_files=-1):
    """ Runs data processing scripts to run measurements on the metadata
    found in data/interim to extract the loaded object sizes of the ISA
    objects, Pandas DataFrame objects, and the raw size of the files as
    reported on disk.

    We iterate through our processed data MTBLS metadata and run the
    ISA-Tab loader on each, and then extract the approximate size of the
    DAG portion of the metadata. This DAG metadata is analogous to each
    line of the table files that describes one path in the DAGs.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from processed data')
    if max_files > 0:
        logger.info('limiting to {} study folders'.format(max_files))
    if exists(output_filepath):
        if getsize(output_filepath) > 0:
            logger.info('Output file {} already contains data. '
                        'Skipping writing to data/processed. If this is not '
                        'expected, do you need to "make clean" first?'.format(
                            output_filepath))
            exit(0)
    with open(output_filepath, 'w') as output_file:
        output_file.write('studyid,fname,disk_size,df_size,isa_size\n')
        logger.info('studyid, fname, disk_size, df_size, isa_size')
        for study_dir in tqdm(
                glob(join(input_filepath, 'MTBLS*'))[:max_files]):
            try:
                isa = load(study_dir)
                for s in isa.studies:
                    fname = s.filename
                    df = pd.read_csv(join(study_dir, fname), sep='\t')
                    df_size = total_size(df, verbose=False)
                    disk_size = getsize(join(study_dir, fname))
                    isa_size = total_size(s.process_sequence, verbose=False)
                    output_file.write('"{}","{}",{},{},{}\n'.format(
                        s.identifier, fname, disk_size, df_size, isa_size))
                    for a in s.assays:
                        fname = a.filename
                        df = pd.read_csv(join(study_dir, fname), sep='\t')
                        df_size = total_size(df, verbose=False)
                        disk_size = getsize(join(study_dir, fname))
                        isa_size = total_size(a.process_sequence,
                                              verbose=False)
                        output_file.write('"{}","{}",{},{},{}\n'.format(
                            s.identifier, fname, disk_size, df_size,
                            isa_size))
                output_file.flush()
            except KeyboardInterrupt:
                exit(1)

def detect_isatab_process_pooling(fp):
    from isatools import isatab
    report = []
    ISA = isatab.load(fp)
    for study in ISA.studies:
        print("Checking {}".format(study.filename))
        pooling_list = detect_graph_process_pooling(study.graph)
        if len(pooling_list) > 0:
            report.append({study.filename: pooling_list})
        for assay in study.assays:
            print("Checking {}".format(assay.filename))
            pooling_list = detect_graph_process_pooling(assay.graph)
            if len(pooling_list) > 0:
                report.append({assay.filename: pooling_list})
    return report

def detect_isatab_process_pooling(fp):
    report = []
    ISA = isatab.load(fp)
    for study in ISA.studies:
        log.info('Checking {}'.format(study.filename))
        pooling_list = detect_graph_process_pooling(study.graph)
        if len(pooling_list) > 0:
            report.append({study.filename: pooling_list})
        for assay in study.assays:
            log.info('Checking {}'.format(assay.filename))
            pooling_list = detect_graph_process_pooling(assay.graph)
            if len(pooling_list) > 0:
                report.append({assay.filename: pooling_list})
    return report

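A short sketch of how the pooling detector above might be invoked; the investigation path is a placeholder:

# Placeholder investigation path; the returned report maps table filenames
# to the processes detected as pooling points.
with open('/path/to/i_investigation.txt') as fp:
    pooling_report = detect_isatab_process_pooling(fp)
for entry in pooling_report:
    print(entry)
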
def load(FP):  # loads IDF file
    # first cast to IDF
    inv_fp = cast_idf_to_inv(FP)
    df = pd.read_csv(inv_fp, names=range(0, 128), sep='\t',
                     engine='python').dropna(axis=1, how='all')
    df = df.T  # transpose
    df.reset_index(inplace=True)  # Reset index so it is accessible as column
    # If all was OK, promote this row to the column headers
    df.columns = df.iloc[0]
    # second set output s_ and a_ files
    sdrf_file = df["Comment[SDRF File]"].iloc[1]
    study_df, assay_df = split_tables(
        sdrf_path=os.path.join(os.path.dirname(FP.name), sdrf_file))
    study_df.columns = study_df.isatab_header
    assay_df.columns = assay_df.isatab_header
    # write out ISA files
    tmp = "/Users/dj/PycharmProjects/isa-api/tests/data/tmp"
    inv_fp.seek(0)
    # print("Writing i_investigation.txt to {}".format(tmp))
    print("Writing s_{0} to {1}".format(os.path.basename(sdrf_file), tmp))
    with open(os.path.join(tmp, "s_" + os.path.basename(sdrf_file)),
              "w") as s_fp:
        study_df.to_csv(path_or_buf=s_fp, mode='a', sep='\t',
                        encoding='utf-8', index=False)
    print("Writing a_{0} to {1}".format(os.path.basename(sdrf_file), tmp))
    with open(os.path.join(tmp, "a_" + os.path.basename(sdrf_file)),
              "w") as a_fp:
        assay_df.to_csv(path_or_buf=a_fp, mode='a', sep='\t',
                        encoding='utf-8', index=False)
    with open(os.path.join(tmp, "i_investigation.txt")) as tmp_inv_fp:
        # load the investigation file written to tmp (the original opened
        # this handle but loaded inv_fp, leaving tmp_inv_fp unused)
        ISA = isatab.load(tmp_inv_fp)
    return ISA

def getISAAssay(assayNum, studyNum, pathToISATABFile):
    """
    This function returns an Assay object given the assay and study numbers
    in an ISA file.
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the assay and study numbers you are
    interested in!

    :param assayNum: The Assay number (notice it's not a zero-based index).
    :type assayNum: int
    :param studyNum: The Study number (notice it's not a zero-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: str
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    import copy
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        std = isa.studies[studyNum - 1]
        return copy.deepcopy(std.assays[assayNum - 1])
    except FileNotFoundError as err:
        raise err

def exploreISA(pathToISATABFile, verbose=True):
    """
    This function loops through the ISATAB file and lists its Studies and
    their associated Assays.

    :param pathToISATABFile: The path to the ISATAB file.
    :type pathToISATABFile: str
    :param verbose: Whether (or not) to print out details of Studies and
        Assays (default: True)
    :type verbose: boolean
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    try:
        isa_tab_record = isatab.load(pathToISATABFile, skip_load_tables=True)
        if verbose:
            print('In this ISATAB file you have:')
            for idx, st in enumerate(isa_tab_record.studies):
                print('Study: ' + str(idx + 1))
                print('\tStudy Identifier: ' + st.identifier +
                      ', Study ID: ' + st.id +
                      ', Study Filename: ' + st.filename +
                      ', Study Title: ' + st.title)
                print('\tThis Study has the following Assays:')
                for ix, a in enumerate(st.assays):
                    print('\tAssay: ' + str(ix + 1))
                    print('\t\tAssay Filename: ' + a.filename +
                          ', Assay technology type: ' +
                          a.technology_type.term)
    except FileNotFoundError as err:
        raise err

def getISAStudy(studyNum, pathToISATABFile, noAssays=True):
    """
    This function returns a Study object given the study number in an ISA
    file.
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the study number you are interested in!

    :param studyNum: The Study number (notice it's not a zero-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: str
    :param noAssays: whether to remove all assays (i.e. return a copy of
        the study only)
    :type noAssays: boolean
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    import copy
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        st = copy.deepcopy(isa.studies[studyNum - 1])
        if noAssays:
            st.assays = []
        return st
    except FileNotFoundError as err:
        raise err

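The three helpers above compose naturally; a minimal sketch (the ISA-Tab directory path is a placeholder):

# Placeholder ISA-Tab directory; exploreISA prints the available numbering,
# which getISAStudy and getISAAssay then consume (1-based indices).
isatab_dir = '/path/to/isatab_dir'
exploreISA(isatab_dir)
study = getISAStudy(1, isatab_dir)     # first study, assays stripped
assay = getISAAssay(1, 1, isatab_dir)  # first assay of first study
print(study.title, assay.filename)
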
def get_metabolights_metadata(studyID, study_directory, investigation_file):
    from isatools import isatab
    from os.path import join
    with open(investigation_file, encoding='utf-8') as f:
        ISA_object = isatab.load(f)
    met_vars = get_factor_names_m(study_directory)  # Metadata classes
    # Get full metadata values:
    metadata_full = get_factors_summary_m(ISA_object)
    s = ","
    header = "sample," + s.join(met_vars)
    out_f = open(join(study_directory, 'metadata.csv'), 'w')
    out_f.write(header)
    out_f.write('\n')
    for metadata_sample in metadata_full:
        line = "\"" + metadata_sample["sample"] + "\""
        for met_var in met_vars:
            line = line + "," + "\"" + metadata_sample[met_var] + "\""
        out_f.write(line)
        out_f.write('\n')
    out_f.close()
    return join(study_directory, 'metadata.csv')

def main():
    """ The simplest check for ISA-Tab correctness is to attempt to load
    ISA-Tab files, and then count certain objects to get an idea if the
    parser created the correct number of each entity.
    """
    expected_values = {
        'num_study_sources': [3, 1],
        'num_study_samples': [12, 60]
    }
    study_number = 0
    for d in sorted(glob.glob("cmso*/isa")):
        print("attempting to load {}".format(d))
        i_files = glob.glob(os.path.join(d, 'i_*.txt'))
        if len(i_files) != 1:
            raise FileNotFoundError(
                "Could not find an investigation file in {}".format(d))
        with open(os.path.join(next(iter(i_files)))) as i_fp:
            isa_objects = isatab.load(i_fp)
            assert isa_objects is not None  # if not created, error
            # Some simple checks to sanity check that we loaded what we
            # expected
            num_study_sources = len(
                isa_objects.studies[-1].materials['sources'])
            num_study_samples = len(
                isa_objects.studies[-1].materials['samples'])
            print("loaded {} study sources".format(num_study_sources))
            print("loaded {} study samples".format(num_study_samples))
            assert num_study_sources == \
                expected_values["num_study_sources"][study_number]
            assert num_study_samples == \
                expected_values["num_study_samples"][study_number]
            print("{} load OK".format(d))
            study_number += 1

import json
import os

import requests
from isatools import isatab
from isatools.convert import isatab2json

directory_1 = '/Users/Philippe/Documents/Dropbox-Backup/Eurovis 2015 - Chronoglyph/ISATAB-datasets/BII-S-8_FP001RO-isatab-TEST'
inv_name_1 = 'i_fp001ro-investigation.txt'
isa_config_1 = '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-default_v2014-01-16/'
directory_2 = '/Users/Philippe/Documents/git/ISAdatasets/tab/MTBLS404/'
inv_name_2 = 'i_sacurine.txt'
# isa_config_2 = '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-seq_v2016-11-17-SRA1.5-august2014mod/'

try:
    # my_isa_read = isatab.load(open(os.path.join(
    #     '/Users/Philippe/Downloads/ISAcreator-1.7.11-all/isatab files/SRA_assembly_test',
    #     'i_investigation.txt')))
    my_isa_read = isatab.load(open(os.path.join(directory_1, inv_name_1)))
    print("reading in:", my_isa_read.studies)
    # my_json_report = isatab.validate(open(os.path.join(
    #     '/Users/Philippe/Downloads/ISAcreator-1.7.11-all/isatab files/SRA_assembly_test',
    #     'i_investigation.txt')),
    #     '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-seq_v2016-11-17-SRA1.5-august2014mod/')
    # my_json_report = isatab.validate(
    #     open(os.path.join(directory_1, inv_name_1)), isa_config_1)
    # print(my_json_report)
    try:
        isa_json = isatab2json.convert(directory_2)
    except Exception as excep:
        print(excep)
except IOError as e:
    print(e)

from isatools import isatab
from isatools.model import *
import sys
import json
import os

input_filepath = sys.argv[1]  # input path to ISA-Tab
output_filepath = sys.argv[2]  # output path to write ISA-Tab
mapping_filepath = sys.argv[3]  # path to mapping json file

ISA = isatab.load(input_filepath)

# only get first assay from first study obj
study = ISA.studies[0]

mapping = {}
with open(mapping_filepath) as fp:
    mapping = json.load(fp)

for assay in study.assays:
    # get mass spectrometry processes only
    ms_processes = [
        x for x in assay.process_sequence
        if x.executes_protocol.protocol_type.term == 'mass spectrometry'
    ]
    # insert the new parameter values
    for k, v in mapping.items():
        with open(os.path.join('MTBLS265-no-binary', 'json_meta',
                               v + '.json')) as fp2:
            mzml_meta = json.load(fp2)
            data_trans_meta = {

def replace_factor_with_protocol_parameter_value(self, factor_name,
                                                 protocol_ref):
    """Fixes a factor if it's supposed to be a Parameter Value

    :param factor_name: The factor that's incorrect
    :param protocol_ref: Protocol REF for the new Parameter Value
    :return: None
    """
    table_file_df = isatab.read_tfile(self.path)
    field_names = list(table_file_df.columns)
    clean_field_names = self.clean_isatab_field_names(field_names)
    factor_index = clean_field_names.index(
        'Factor Value[{factor_name}]'.format(factor_name=factor_name))
    with open(self.path) as tfile_fp:
        next(tfile_fp)
        line1 = next(tfile_fp)
        protocol_ref_index = list(
            map(lambda x: x[1:-1] if x[0] == '"' and x[-1] == '"' else x,
                line1.split('\t'))).index(protocol_ref)
    if protocol_ref_index < 0:
        raise IOError(
            'Could not find protocol ref matching {protocol_ref}'.format(
                protocol_ref=protocol_ref))
    if factor_index < len(field_names) and \
            'Term Source REF' in field_names[factor_index + 1] and \
            'Term Accession' in field_names[factor_index + 2]:
        log.debug('Moving Factor Value[{}] with term columns'.format(
            factor_name))
        # move Factor Value and Term Source REF and Term Accession columns
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        field_names.insert(protocol_ref_index + 3,
                           field_names[factor_index + 2 + 2])
        del field_names[factor_index + 3]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 2]  # del Term Source REF
        del field_names[factor_index + 2 + 1]  # del Term Accession
    elif factor_index < len(field_names) and \
            'Unit' in field_names[factor_index + 1] and \
            'Term Source REF' in field_names[factor_index + 2] and \
            'Term Accession' in field_names[factor_index + 3]:
        log.debug(
            'Moving Factor Value[{factor_name}] with unit term columns'
            .format(factor_name=factor_name))
        # move Factor Value and Unit as ontology annotation
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        field_names.insert(protocol_ref_index + 3,
                           field_names[factor_index + 2 + 2])
        field_names.insert(protocol_ref_index + 4,
                           field_names[factor_index + 3 + 3])
        del field_names[factor_index + 4]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 3]  # del Unit
        del field_names[factor_index + 2 + 2]  # del Term Source REF
        del field_names[factor_index + 3 + 1]  # del Term Accession
    elif factor_index < len(field_names) and \
            'Unit' in field_names[factor_index + 1]:
        log.debug(
            'Moving Factor Value[{factor_name}] with unit column'.format(
                factor_name=factor_name))
        # move Factor Value and Unit columns
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        del field_names[factor_index + 2]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 1]  # del Unit
    else:  # move only the Factor Value column
        log.debug('Moving Factor Value[{factor_name}]'.format(
            factor_name=factor_name))
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        del field_names[factor_index]  # del Factor Value[{}]
    table_file_df.columns = self.clean_isatab_field_names(field_names)
    # Rename Factor Value column to Parameter Value column
    field_names_modified = list(table_file_df.columns)
    field_names_modified[protocol_ref_index + 1] = \
        field_names_modified[protocol_ref_index + 1].replace(
            'Factor Value', 'Parameter Value')
    table_file_df.columns = self.clean_isatab_field_names(
        field_names_modified)
    investigation = isatab.load(os.path.dirname(self.path),
                                skip_load_tables=True)
    study = investigation.studies[-1]
    protocol = study.get_prot(protocol_ref)
    if protocol is None:
        raise ISAModelAttributeError(
            'No protocol with name {protocol_ref} was found'.format(
                protocol_ref=protocol_ref))
    protocol.add_param(factor_name)
    factor = study.get_factor(factor_name)
    if factor is None:
        raise ISAModelAttributeError(
            'No factor with name {factor_name} was found'.format(
                factor_name=factor_name))
    else:
        study.del_factor(name=factor_name, are_you_sure=True)
    study.filename = '{study_filename}.fix'.format(
        study_filename=study.filename)
    isatab.dump(investigation, output_path=os.path.dirname(self.path),
                i_file_name='i_Investigation.txt.fix',
                skip_dump_tables=True)
    with open(os.path.join(
            os.path.dirname(self.path),
            '{s_filename}.fix'.format(
                s_filename=os.path.basename(self.path))), 'w') as out_fp:
        table_file_df.to_csv(path_or_buf=out_fp, index=False, sep='\t',
                             encoding='utf-8')

def generate_study_design_report(self, get_num_study_groups=True,
                                 get_factors=True, get_num_levels=True,
                                 get_levels=True, get_study_groups=True):
    """Generates a study design report

    :return: JSON report
    """
    isa = isatab.load(self.path, skip_load_tables=False)
    study_design_report = []
    raw_data_file_prefix = ('Raw', 'Array', 'Free Induction Decay')
    for study in isa.studies:
        study_key = study.identifier if study.identifier != '' \
            else study.filename
        study_design_report.append({
            'study_key': study_key,
            'total_sources': len(study.sources),
            'total_samples': len(study.samples),
            'assays': []
        })
        with open(os.path.join(self.path, study.filename)) as s_fp:
            s_df = isatab.load_table(s_fp)
        for assay in study.assays:
            assay_key = '/'.join([
                assay.filename, assay.measurement_type.term,
                assay.technology_type.term, assay.technology_platform
            ])
            assay_report = {
                'assay_key': assay_key,
                'num_sources': len(assay.samples),
                'num_samples': len([
                    x for x in assay.data_files
                    if x.label.startswith(raw_data_file_prefix)
                ])
            }
            with open(os.path.join(self.path, assay.filename)) as a_fp:
                a_df = isatab.load_table(a_fp)
            merged_df = pd.merge(s_df, a_df, on='Sample Name')
            factor_cols = [
                x for x in merged_df.columns
                if x.startswith("Factor Value")
            ]
            if len(factor_cols) > 0:
                # add branch to get all if no FVs
                study_group_factors_df = \
                    merged_df[factor_cols].drop_duplicates()
                factors_list = [
                    x[13:-1] for x in study_group_factors_df.columns
                ]
                queries = []
                factors_and_levels = {}
                for i, row in study_group_factors_df.iterrows():
                    fvs = []
                    for x, y in zip(factors_list, row):
                        fvs.append(' == '.join([x, str(y)]))
                        try:
                            factor_and_levels = factors_and_levels[x]
                        except KeyError:
                            factors_and_levels[x] = set()
                            factor_and_levels = factors_and_levels[x]
                        factor_and_levels.add(str(y))
                    queries.append(' and '.join(fvs))
                assay_report['total_study_groups'] = len(queries)
                assay_report['factors_and_levels'] = []
                assay_report['group_summary'] = []
                for k, v in factors_and_levels.items():
                    assay_report['factors_and_levels'].append({
                        'factor': k,
                        'num_levels': len(v),
                    })
                for query in queries:
                    try:
                        columns = merged_df.columns
                        columns = recast_columns(columns=columns)
                        for i, column in enumerate(columns):
                            columns[i] = pyvar(column) if \
                                column.startswith(
                                    'Factor Value[') else column
                        merged_df.columns = columns
                        qlist = query.split(' and ')
                        fmt_query = []
                        for factor_query in qlist:
                            factor_value = factor_query.split(' == ')
                            fmt_query_part = \
                                "Factor_Value_{0}_ == '{1}'".format(
                                    pyvar(factor_value[0]), factor_value[1])
                            fmt_query.append(fmt_query_part)
                        fmt_query = ' and '.join(fmt_query)
                        log.debug('running query: {}'.format(fmt_query))
                        df2 = merged_df.query(fmt_query)
                        data_column = [
                            x for x in merged_df.columns
                            if x.startswith(raw_data_file_prefix)
                            and x.endswith('Data File')
                        ][0]
                        assay_report['group_summary'].append(
                            dict(study_group=query,
                                 sources=len(list(
                                     df2['Source Name'].drop_duplicates())),
                                 samples=len(list(
                                     df2['Sample Name'].drop_duplicates())),
                                 raw_files=len(list(
                                     df2[data_column].drop_duplicates()))))
                    except Exception as e:
                        print("error in query, {}".format(e))
            study_design_report[-1]['assays'].append(assay_report)
    return study_design_report

def create_isatab_archive(inv_fp, target_filename=None,
                          filter_by_measurement=None):
    """Function to create an ISArchive; option to select by assay
    measurement type

    :param inv_fp: A file-like buffer object pointing to an investigation
        file
    :param target_filename: Target ZIP file name
    :param filter_by_measurement: Select by measurement type
    :return: List of files zipped if successful, None if not successful
    """
    if target_filename is None:
        target_filename = os.path.join(os.path.dirname(inv_fp.name),
                                       'isatab.zip')
    ISA = isatab.load(inv_fp)
    all_files_in_isatab = []
    found_files = []
    for s in ISA.studies:
        if filter_by_measurement is not None:
            log.debug('Selecting %s', filter_by_measurement)
            selected_assays = [
                a for a in s.assays
                if a.measurement_type.term == filter_by_measurement
            ]
        else:
            selected_assays = s.assays
        for a in selected_assays:
            all_files_in_isatab += [d.filename for d in a.data_files]
    dirname = os.path.dirname(inv_fp.name)
    for fname in all_files_in_isatab:
        if os.path.isfile(os.path.join(dirname, fname)):
            found_files.append(fname)
    missing_files = [f for f in all_files_in_isatab if f not in found_files]
    if len(missing_files) == 0:
        log.debug('Do zip')
        with ZipFile(target_filename, mode='w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zip_file.write(inv_fp.name, arcname=os.path.basename(inv_fp.name))
            for s in ISA.studies:
                zip_file.write(os.path.join(dirname, s.filename),
                               arcname=s.filename)
                for a in selected_assays:
                    zip_file.write(os.path.join(dirname, a.filename),
                                   arcname=a.filename)
            for file in all_files_in_isatab:
                zip_file.write(os.path.join(dirname, file), arcname=file)
            log.debug(zip_file.namelist())
            return zip_file.namelist()
    else:
        log.debug('Not zipping')
        log.debug('Missing: %s', missing_files)
        return None

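A hedged usage sketch for create_isatab_archive; the paths and the measurement term are placeholders:

# Placeholder paths and filter term. A None result means at least one data
# file referenced by the selected assays was missing on disk.
with open('/path/to/i_investigation.txt') as inv_fp:
    zipped_files = create_isatab_archive(
        inv_fp, target_filename='isatab.zip',
        filter_by_measurement='transcription profiling')
if zipped_files is None:
    print('Archive not created: referenced data files are missing')
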
def load_investigation(investigation_file):
    f = open(investigation_file, 'r')
    investigation = ISATAB.load(f)
    return investigation

def test_batch_fixer(self):
    s_table_path = os.path.join(self._tmp_dir, 'BII-S-3', 's_BII-S-3.txt')
    settings = {
        s_table_path: {
            "factor": "dose",
            "protocol_ref": "environmental material collection - "
                            "standard procedure 1"
        }
    }
    utils.batch_fix_isatabs(settings)
    expected_field_names = [
        'Source Name', 'Characteristics[organism]', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[geographic location (country and/or sea,region)]',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[geographic location (longitude)]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[geographic location (latitude)]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[chlorophyll a concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[fucoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[peridinin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[butfucoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[hexfucoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[alloxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[zeaxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[lutein concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[chl-c3 concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[chl-c2 concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[prasinoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[neoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[violaxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[diadinoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[diatoxanthin concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[divinyl-chl-b concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[chl-b concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[divinyl-chl-a concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[chl-a concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[BB carotene concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[bacteria count]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[synechococcus count]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[small picoeukaryotes count]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[large picoeukaryotes count]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[nanoflagellates count]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[cryptophytes count]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[phosphate concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[nitrate concentration]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Characteristics[particulate organic nitrogen concentration]',
        'Unit', 'Term Source REF', 'Term Accession Number',
        'Characteristics[particulate organic carbon concentration]',
        'Unit', 'Term Source REF', 'Term Accession Number',
        'Characteristics[primary production depth integrated production '
        'to 3 m expressed_in mgC m-2 d-1]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[water salinity]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[fluorescence]', 'Term Source REF',
        'Term Accession Number',
        'Characteristics[water temperature at 3 meter depth]', 'Unit',
        'Term Source REF', 'Term Accession Number',
        'Protocol REF', 'Parameter Value[dose]', 'Term Source REF',
        'Term Accession Number',
        'Parameter Value[filter pore size]', 'Unit', 'Term Source REF',
        'Term Accession Number',
        'Sample Name',
        'Factor Value[compound]', 'Term Source REF',
        'Term Accession Number',
        'Factor Value[collection time]', 'Term Source REF',
        'Term Accession Number']
    # check the columns got moved in the study file
    with open(s_table_path + '.fix') as fixed_tab_fp:
        actual_field_names = list(
            map(lambda field_name: field_name.strip(),
                next(fixed_tab_fp).split('\t')))
        self.assertListEqual(actual_field_names, expected_field_names)
    # check the parameter got added to the protocol
    with open(os.path.dirname(
            s_table_path) + '/i_Investigation.txt.fix') as fixed_i_fp:
        investigation = isatab.load(fixed_i_fp)
        study = investigation.studies[-1]
        protocol = study.get_prot(
            'environmental material collection - standard procedure 1')
        param = protocol.get_param('dose')
        self.assertIsNotNone(param)

def load_investigation(investigation_file):
    f = utf8_text_file_open(investigation_file)
    investigation = ISATAB.load(f)
    return investigation

def modify_investigation(fp):
    """Load, edit, and dump an ISA-Tab 1.0 descriptor."""

    # Load an existing ISA-Tab investigation file. In this example, we load
    # an unpopulated i_investigation.txt file
    investigation = load(fp, skip_load_tables=True)
    investigation.identifier = "i1"
    investigation.title = "My Simple ISA Investigation"
    investigation.description = \
        "We could alternatively use the class constructor's parameters to " \
        "set some default values at the time of creation, however we want " \
        "to demonstrate how to use the object's instance variables to set " \
        "values."
    investigation.submission_date = "2016-11-03"
    investigation.public_release_date = "2016-11-03"
    study = Study(filename="s_study.txt")
    study.identifier = "s1"
    study.title = "My ISA Study"
    study.description = \
        "Like with the Investigation, we could use the class constructor " \
        "to set some default values, but have chosen to demonstrate in " \
        "this example the use of instance variables to set initial values."
    study.submission_date = "2016-11-03"
    study.public_release_date = "2016-11-03"
    investigation.studies[0] = study
    obi = OntologySource(name='OBI',
                         description="Ontology for Biomedical "
                                     "Investigations")
    investigation.ontology_source_references.append(obi)
    intervention_design = OntologyAnnotation(term_source=obi)
    intervention_design.term = "intervention design"
    intervention_design.term_accession = \
        "http://purl.obolibrary.org/obo/OBI_0000115"
    study.design_descriptors.append(intervention_design)

    # Other instance variables common to both Investigation and Study
    # objects include 'contacts' and 'publications', each with lists of
    # corresponding Person and Publication objects.
    contact = Person(first_name="Alice", last_name="Robertson",
                     affiliation="University of Life",
                     roles=[OntologyAnnotation(term='submitter')])
    study.contacts.append(contact)
    publication = Publication(title="Experiments with Elephants",
                              author_list="A. Robertson, B. Robertson")
    publication.pubmed_id = "12345678"
    publication.status = OntologyAnnotation(term="published")
    study.publications.append(publication)
    source = Source(name='source_material')
    study.sources.append(source)
    prototype_sample = Sample(name='sample_material', derives_from=[source])
    ncbitaxon = OntologySource(name='NCBITaxon', description="NCBI Taxonomy")
    characteristic_organism = Characteristic(
        category=OntologyAnnotation(term="Organism"),
        value=OntologyAnnotation(
            term="Homo Sapiens",
            term_source=ncbitaxon,
            term_accession="http://purl.bioontology.org/ontology/"
                           "NCBITAXON/9606"))
    prototype_sample.characteristics.append(characteristic_organism)
    # creates a batch of 3 samples
    study.samples = batch_create_materials(prototype_sample, n=3)
    sample_collection_protocol = Protocol(
        name="sample collection",
        protocol_type=OntologyAnnotation(term="sample collection"))
    study.protocols.append(sample_collection_protocol)
    sample_collection_process = Process(
        executes_protocol=sample_collection_protocol)
    for src in study.sources:
        sample_collection_process.inputs.append(src)
    for sam in study.samples:
        sample_collection_process.outputs.append(sam)
    study.process_sequence.append(sample_collection_process)
    assay = Assay(filename="a_assay.txt")
    extraction_protocol = Protocol(
        name='extraction',
        protocol_type=OntologyAnnotation(term="material extraction"))
    study.protocols.append(extraction_protocol)
    sequencing_protocol = Protocol(
        name='sequencing',
        protocol_type=OntologyAnnotation(term="material sequencing"))
    study.protocols.append(sequencing_protocol)
    for i, sample in enumerate(study.samples):
        extraction_process = Process(executes_protocol=extraction_protocol)
        extraction_process.inputs.append(sample)
        material = Material(name="extract-{}".format(i))
        material.type = "Extract Name"
        extraction_process.outputs.append(material)
        sequencing_process = Process(executes_protocol=sequencing_protocol)
        sequencing_process.name = "assay-name-{}".format(i)
        sequencing_process.inputs.append(extraction_process.outputs[0])
        datafile = DataFile(filename="sequenced-data-{}".format(i),
                            label="Raw Data File")
        sequencing_process.outputs.append(datafile)
        extraction_process.next_process = sequencing_process
        sequencing_process.prev_process = extraction_process
        assay.samples.append(sample)
        assay.data_files.append(datafile)
        assay.other_material.append(material)
        assay.process_sequence.append(extraction_process)
        assay.process_sequence.append(sequencing_process)
        assay.measurement_type = OntologyAnnotation(term="gene sequencing")
        assay.technology_type = OntologyAnnotation(
            term="nucleotide sequencing")
    study.assays.append(assay)

    # dumps() writes out the ISA as a string representation of the ISA-Tab,
    # but we are skipping writing tables
    return dumps(investigation, skip_dump_tables=True)
