def remove_unused_protocols(self):
    """Remove protocols never referenced by any process and re-dump the investigation.

    Loads the investigation from the directory containing ``self.path``, computes,
    per study, the set of protocol names not referenced by any study- or
    assay-level process, strips those protocols from ``study.protocols``, and
    writes the investigation back (tables are not re-dumped).

    :return: None
    """
    investigation = isatab.load(os.path.dirname(self.path))
    for study in investigation.studies:
        # Start by assuming every protocol is unused, then discard each one
        # that is actually executed by a process.
        unused_protocol_names = set(x.name for x in study.protocols)
        for process in study.process_sequence:
            # discard() is a no-op when the name was already removed,
            # replacing the original try/remove/except-KeyError dance.
            unused_protocol_names.discard(process.executes_protocol.name)
        for assay in study.assays:
            for process in assay.process_sequence:
                unused_protocol_names.discard(process.executes_protocol.name)
        print('Unused protocols: {}'.format(unused_protocol_names))
        # remove these protocols from study.protocols
        study.protocols = [
            protocol for protocol in study.protocols
            if protocol.name not in unused_protocol_names
        ]
    # BUG FIX: the original passed '(unknown).fix'.format(filename=...), a
    # no-op format with no placeholder; the keyword argument makes the
    # intended template '{filename}.fix'.
    isatab.dump(investigation,
                output_path=os.path.dirname(self.path),
                i_file_name='{filename}.fix'.format(
                    filename=os.path.basename(self.path)),
                skip_dump_tables=True)
def convert(source_idf_fp, output_path):
    """ Converter for MAGE-TAB to ISA-Tab

    Loads the MAGE-TAB content into an ISA model object and dumps it as
    ISA-Tab files.

    :param source_idf_fp: File descriptor of input IDF file
    :param output_path: Path to directory to write output ISA-Tab files to
    """
    ISA = magetab.load(source_idf_fp)
    isatab.dump(ISA, output_path)
def dropAssayFromStudy(assayNum, studyNum, pathToISATABFile):
    """
    This function removes an Assay from a study in an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the assay and study numbers you are interested in!
    :param assayNum: The Assay number (notice it's 1-based index).
    :type assayNum: int
    :param studyNum: The Study number (notice it's 1-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: string
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    import os
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        std = isa.studies[studyNum - 1]
        assays = std.assays
        # Compute the assay table path once instead of re-joining it for the
        # existence check and the removal.
        assay_path = os.path.join(pathToISATABFile, assays[assayNum - 1].filename)
        if os.path.isfile(assay_path):
            os.remove(assay_path)
        del assays[assayNum - 1]
        isatab.dump(isa_obj=isa, output_path=pathToISATABFile)
    except FileNotFoundError:
        # Bare re-raise preserves the original traceback ('raise err' is
        # redundant in Python 3).
        raise
def appendAssayToStudy(assay, studyNum, pathToISATABFile):
    """
    This function appends an Assay object to a study in an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the assay and study number you are interested in!
    :param assay: The Assay
    :type assay: ISA Assay object
    :param studyNum: The Study number (notice it's not zero-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: string
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    # BUG FIX: os.path is used below but 'os' was never imported in this
    # function (sibling helpers import it locally, suggesting no module-level
    # import to rely on).
    import os
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        std = isa.studies[studyNum - 1]
        # Make the assay filename unique by inserting the current assay count
        # before the extension, e.g. 'a_x.txt' -> 'a_x2.txt'.
        root, ext = os.path.splitext(os.path.basename(assay.filename))
        assay.filename = '{root}{count}{ext}'.format(
            root=root, count=len(std.assays), ext=ext)
        std.assays.append(assay)
        isatab.dump(isa_obj=isa, output_path=pathToISATABFile)
    except FileNotFoundError:
        raise
def dropStudyFromISA(studyNum, pathToISATABFile):
    """
    This function removes a study from an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the study number you are interested in!
    Warning: this function deletes the given study and all its associated assays
    :param studyNum: The Study number (notice it's 1-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: string
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    import os
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        study = isa.studies[studyNum - 1]
        # Delete every assay table file belonging to the study, then the
        # study table file itself; each path is computed once.
        for assay in study.assays:
            assay_path = os.path.join(pathToISATABFile, assay.filename)
            if os.path.isfile(assay_path):
                os.remove(assay_path)
        study_path = os.path.join(pathToISATABFile, study.filename)
        if os.path.isfile(study_path):
            os.remove(study_path)
        del isa.studies[studyNum - 1]
        isatab.dump(isa_obj=isa, output_path=pathToISATABFile)
    except FileNotFoundError:
        # Bare re-raise preserves the original traceback.
        raise
def convert(source_idf_fp, output_path, technology_type=None, measurement_type=None):
    """ Converter for MAGE-TAB to ISA-Tab

    Reads the IDF into a DataFrame to locate SDRF references, splits each SDRF
    into study and assay tables, writes them out as s_*/a_* files, then parses
    the IDF itself into an investigation file.

    :param source_idf_fp: File descriptor of input IDF file
    :param output_path: Path to directory to write output ISA-Tab files to
    :param technology_type: Optional technology type passed to the IDF parser
    :param measurement_type: Optional measurement type passed to the IDF parser
    """
    # IDF rows are key/value oriented; read with generous column allowance
    # (up to 128 columns), drop empty columns, then transpose so the IDF keys
    # become column headers.
    df = pd.read_csv(source_idf_fp, names=range(0, 128), sep='\t',
                     engine='python', encoding='utf-8',
                     comment='#').dropna(axis=1, how='all')
    df = df.T  # transpose
    df.reset_index(inplace=True)  # Reset index so it is accessible as column
    df.columns = df.iloc[0]  # If all was OK, promote this row to the column headers
    df = df.reindex(df.index.drop(0))  # drop the now-duplicated header row
    # second set output s_ and a_ files
    for _, row in df.iterrows():
        sdrf_file = row["SDRF File"]
        # Non-string values (NaN) mean this row carries no SDRF reference.
        if isinstance(sdrf_file, str):
            study_df, assay_df = magetab.split_tables(
                sdrf_path=os.path.join(os.path.dirname(source_idf_fp.name),
                                       sdrf_file))
            study_df.columns = study_df.isatab_header
            assay_df.columns = assay_df.isatab_header
            # write out ISA table files
            print("Writing s_{0} to {1}".format(os.path.basename(sdrf_file),
                                                output_path))
            with open(os.path.join(output_path,
                                   "s_" + os.path.basename(sdrf_file)),
                      "w") as s_fp:
                study_df.to_csv(path_or_buf=s_fp, mode='a', sep='\t',
                                encoding='utf-8', index=False)
            print("Writing a_{0} to {1}".format(os.path.basename(sdrf_file),
                                                output_path))
            with open(os.path.join(output_path,
                                   "a_" + os.path.basename(sdrf_file)),
                      "w") as a_fp:
                assay_df.to_csv(path_or_buf=a_fp, mode='a', sep='\t',
                                encoding='utf-8', index=False)
    print("Writing {0} to {1}".format("i_investigation.txt", output_path))
    # Rewind the IDF descriptor after the pandas read so parse_idf can reuse it.
    source_idf_fp.seek(0)
    ISA = magetab.parse_idf(source_idf_fp.name,
                            technology_type=technology_type,
                            measurement_type=measurement_type)
    isatab.dump(ISA, output_path=output_path, skip_dump_tables=True)
def appendStudytoISA(study, pathToISATABFile):
    """
    This function appends a Study object to an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file!
    :param study: The Study object.
    :type study: ISA Study object
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: string
    :raise FileNotFoundError: If pathToISATABFile does not contain file
        'i_Investigation.txt'.
    """
    from isatools import isatab
    import os
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        # Make the study filename unique by inserting the current study count
        # before the extension, e.g. 's_x.txt' -> 's_x2.txt' (consistent with
        # appendAssayToStudy).
        root, ext = os.path.splitext(os.path.basename(study.filename))
        study.filename = '{root}{count}{ext}'.format(
            root=root, count=len(isa.studies), ext=ext)
        isa.studies.append(study)
        isatab.dump(isa_obj=isa, output_path=pathToISATABFile)
    except FileNotFoundError:
        raise
def convert(source_sampletab_fp, target_dir):
    """ Converter for SampleTab to ISA-Tab.

    (The original docstring said "ISA-JSON to SampleTab", which contradicts
    the code: it loads SampleTab and dumps ISA-Tab.)

    :param source_sampletab_fp: File descriptor of input SampleTab file
    :param target_dir: Path to write out ISA-Tab files to
    """
    ISA = sampletab.load(source_sampletab_fp)
    isatab.dump(ISA, target_dir)
def write_isa_study(self, inv_obj, api_key, std_path,
                    save_investigation_copy=True, save_samples_copy=False,
                    save_assays_copy=False):
    """
    Write back an ISA-API Investigation object directly into ISA-Tab files
    :param inv_obj: ISA-API Investigation object
    :param api_key: User API key for accession check
    :param std_path: file system path to destination folder
    :param save_investigation_copy: Keep track of changes saving a copy of the unmodified i_*.txt file
    :param save_samples_copy: Keep track of changes saving a copy of the unmodified s_*.txt file
    :param save_assays_copy: Keep track of changes saving a copy of the unmodified a_*.txt and m_*.tsv files
    :return:
    """
    # NOTE: api_key is accepted but not used inside this method as written
    # here — presumably consumed by callers or a later accession check.
    # dest folder name is a timestamp
    update_path_suffix = app.config.get('UPDATE_PATH_SUFFIX')
    update_path = os.path.join(std_path, update_path_suffix)
    if save_investigation_copy or save_samples_copy or save_assays_copy:
        # Only create audit folder when requested
        dest_path = new_timestamped_folder(update_path)
        # make a copy before applying changes
        if save_investigation_copy:
            src_file = os.path.join(std_path, self.inv_filename)
            dest_file = os.path.join(dest_path, self.inv_filename)
            logger.info("Copying %s to %s", src_file, dest_file)
            copy_file(src_file, dest_file)
        if save_samples_copy:
            for sample_file in glob.glob(os.path.join(std_path, "s_*.txt")):
                sample_file_name = os.path.basename(sample_file)
                src_file = sample_file
                dest_file = os.path.join(dest_path, sample_file_name)
                logger.info("Copying %s to %s", src_file, dest_file)
                copy_file(src_file, dest_file)
        if save_assays_copy:
            for assay_file in glob.glob(os.path.join(std_path, "a_*.txt")):
                assay_file_name = os.path.basename(assay_file)
                src_file = assay_file
                dest_file = os.path.join(dest_path, assay_file_name)
                logger.info("Copying %s to %s", src_file, dest_file)
                copy_file(src_file, dest_file)
            # Save the MAF (metabolite assignment files live alongside the
            # assay tables and are audited together with them)
            for maf in glob.glob(os.path.join(std_path, "m_*.tsv")):
                maf_file_name = os.path.basename(maf)
                src_file = maf
                dest_file = os.path.join(dest_path, maf_file_name)
                logger.info("Copying %s to %s", src_file, dest_file)
                copy_file(src_file, dest_file)
    logger.info("Writing %s to %s", self.inv_filename, std_path)
    i_file_name = self.inv_filename
    # Full dump including study/assay tables (skip_dump_tables=False).
    dump(inv_obj, std_path, i_file_name=i_file_name, skip_dump_tables=False)
    return
def convert(json_fp, path, i_file_name='i_investigation.txt',
            config_dir=isajson.default_config_dir, validate_first=True):
    """ Converter for ISA JSON to ISA Tab. Currently only converts
    investigation file contents
    :param json_fp: File pointer to ISA JSON input
    :param path: Directory to ISA tab output
    :param i_file_name: Investigation file name, default is i_investigation.txt
    :param config_dir: Directory to config directory
    :param validate_first: Validate JSON before conversion, default is True

    Example usage:
        Read from a JSON and write to an investigation file, make sure to
        create/open relevant Python file objects.

        from isatools.convert import json2isatab
        json_file = open('BII-I-1.json', 'r')
        tab_file = open('i_investigation.txt', 'w')
        json2isatab.convert(json_file, path)
    """
    if validate_first:
        log.info("Validating input JSON before conversion")
        report = isajson.validate(fp=json_fp, config_dir=config_dir,
                                  log_level=logging.ERROR)
        if len(report['errors']) > 0:
            # Conversion is aborted (returns None) rather than raising, so
            # callers only get a log message on validation failure.
            log.fatal("Could not proceed with conversion as there are some "
                      "fatal validation errors. Check log.")
            return
    json_fp.seek(0)  # reset file pointer after validation
    log.info("Loading ISA-JSON from %s", json_fp.name)
    isa_obj = isajson.load(fp=json_fp)
    log.info("Dumping ISA-Tab to %s", path)
    log.debug("Using configuration from %s", config_dir)
    isatab.dump(isa_obj=isa_obj, output_path=path, i_file_name=i_file_name)
    # copy data files across from source directory where JSON is located
    log.info("Copying data files from source to target")
    # Everything that is not an ISA-Tab table (i_/s_/a_*.txt) or a JSON file
    # is treated as a data file and copied alongside the output.
    for file in [
        f for f in os.listdir(os.path.dirname(json_fp.name))
        if not (f.endswith('.txt') and (f.startswith('i_') or f.startswith(
            's_') or f.startswith('a_'))) and not (f.endswith('.json'))
    ]:
        filepath = os.path.join(os.path.dirname(json_fp.name), file)
        if os.path.isfile(filepath):
            log.debug("Copying %s to %s", filepath, path)
            shutil.copy(filepath, path)
def measure_minimal(n_rows):
    """Build a minimal one-study/one-assay investigation with ``n_rows``
    source->sample rows, dump it as ISA-Tab to the current directory, and
    return the elapsed process time in nanoseconds."""
    t0 = time.process_time_ns()

    investigation = Investigation()
    investigation.identifier = "i1"
    study = Study(filename="s_study.txt")
    study.identifier = "s1"
    investigation.studies.append(study)

    # One collection process per row: source_material-i -> sample_material-i.
    collection_protocol = Protocol(name="sample collection")
    study.protocols.append(collection_protocol)
    for row in range(n_rows):
        src = Source(name='source_material-{}'.format(row))
        smp = Sample(name="sample_material-{}".format(row))
        study.samples.append(smp)
        study.sources.append(src)
        collect = Process(executes_protocol=collection_protocol)
        collect.inputs.append(src)
        collect.outputs.append(smp)
        study.process_sequence.append(collect)

    # One sequencing process per sample, each emitting one raw data file.
    assay = Assay(filename="a_assay.txt")
    sequencing_protocol = Protocol(
        name='sequencing',
        protocol_type=OntologyAnnotation(term="material sequencing"))
    study.protocols.append(sequencing_protocol)
    for row, smp in enumerate(study.samples):
        sequencing = Process(executes_protocol=sequencing_protocol)
        sequencing.name = "assay-name-{}".format(row)
        sequencing.inputs.append(smp)
        raw_file = DataFile(filename="sequenced-data-{}".format(row),
                            label="Raw Data File",
                            generated_from=[smp])
        sequencing.outputs.append(raw_file)
        assay.samples.append(smp)
        assay.data_files.append(raw_file)
        assay.process_sequence.append(sequencing)
    study.assays.append(assay)

    isatab.dump(investigation, "./")
    return time.process_time_ns() - t0
def remove_unused_protocols(self):
    """Removes unused protocols

    Loads the investigation from the directory containing ``self.path``,
    drops, per study, every protocol that no study- or assay-level process
    executes, and re-dumps the investigation (tables untouched).

    :return: None
    """
    investigation = isatab.load(os.path.dirname(self.path))
    for study in investigation.studies:
        # Assume all protocols unused, then discard the ones actually
        # executed by a process (discard is a no-op for absent names,
        # replacing the try/remove/except-KeyError pattern).
        unused_protocol_names = set(x.name for x in study.protocols)
        for process in study.process_sequence:
            unused_protocol_names.discard(process.executes_protocol.name)
        for assay in study.assays:
            for process in assay.process_sequence:
                unused_protocol_names.discard(process.executes_protocol.name)
        print('Unused protocols: {}'.format(unused_protocol_names))
        print('Location of unused protocols: {}'.format(
            [pr.name in unused_protocol_names for pr in study.protocols]))
        # remove these protocols from study.protocols
        clean_protocols = [
            pr for pr in study.protocols
            if pr.name not in unused_protocol_names
        ]
        print('Clean protocol list: {}'.format(
            [pr.name for pr in clean_protocols]))
        study.protocols = clean_protocols
        print('Clean study.protocols: {}'.format(
            [pr.name for pr in study.protocols]))
    # BUG FIX: the original passed '(unknown).fix'.format(filename=...), a
    # no-op format with no placeholder; the keyword argument makes the
    # intended template '{filename}.fix'. Also removed a dead triple-quoted
    # block of commented-out list-building code.
    isatab.dump(investigation,
                output_path=os.path.dirname(self.path),
                i_file_name='{filename}.fix'.format(
                    filename=os.path.basename(self.path)),
                skip_dump_tables=True)
def convert(idf_file_path, output_path):
    """ Converter for MAGE-TAB to ISA-Tab

    Parses the IDF with MageTabParser, converts each referenced SDRF into ISA
    table files, and writes the investigation file.

    :param idf_file_path: File descriptor of input IDF file
    :param output_path: Path to directory to write output ISA-Tab files to
    """
    parser = MageTabParser()
    parser.parse_idf(idf_file_path)
    # SDRF references are carried as comments on the last parsed study; a
    # single comment may list several files separated by ';'.
    sdrf_files = [x.value for x in parser.ISA.studies[-1].comments
                  if 'SDRF File' in x.name]
    if len(sdrf_files) == 1:
        sdrf_files = sdrf_files[0].split(';')
        for sdrf_file in sdrf_files:
            table_files = parser.parse_sdrf_to_isa_table_files(
                os.path.join(os.path.dirname(idf_file_path), sdrf_file))
            for in_fp in table_files:
                log.info("Writing {0} to {1}".format(in_fp.name, output_path))
                with open(os.path.join(output_path, in_fp.name), 'w') as out_fp:
                    out_fp.write(in_fp.read())
    log.info("Writing {0} to {1}".format("i_investigation.txt", output_path))
    isatab.dump(parser.ISA, output_path=output_path, skip_dump_tables=True)
def _write_study_json(self, inv_obj, std_path, skip_dump_tables=True):
    """Serialize an ISA Investigation object to ISA-Tab under ``std_path``.

    Creates the destination directory if needed, then dumps the
    investigation file (and optionally the tables).

    :param inv_obj: ISA-API Investigation object to write
    :param std_path: destination folder (created if missing)
    :param skip_dump_tables: when True, only the investigation file is written
    :return: the value returned by dump()
    """
    logger.info("Writing %s to %s", self.inv_filename, std_path)
    # Python 3 idiom replacing the try/except-OSError errno.EEXIST dance;
    # still raises if std_path exists as a non-directory.
    os.makedirs(std_path, exist_ok=True)
    inv = dump(inv_obj, std_path, i_file_name=self.inv_filename,
               skip_dump_tables=skip_dump_tables)
    return inv
def replace_factor_with_protocol_parameter_value(self, factor_name,
                                                 protocol_ref):
    """Fixes a factor if it's supposed to be a Parameter Value

    Moves the 'Factor Value[factor_name]' column (plus any attached Unit /
    Term Source REF / Term Accession columns) so it follows the Protocol REF
    column for ``protocol_ref``, renames it to 'Parameter Value[...]',
    registers the parameter on the protocol in the investigation, deletes the
    factor from the study, and writes both the investigation and the table
    file out with a '.fix' suffix.

    :param factor_name: The factor that's incorrect
    :param protocol_ref: Protocol REF for the new Parameter Value
    :return: None
    """
    table_file_df = isatab.read_tfile(self.path)
    field_names = list(table_file_df.columns)
    clean_field_names = self.clean_isatab_field_names(field_names)
    # .index() raises ValueError if the factor column is absent.
    factor_index = clean_field_names.index(
        'Factor Value[{factor_name}]'.format(factor_name=factor_name))
    with open(self.path) as tfile_fp:
        next(tfile_fp)  # skip the header row
        line1 = next(tfile_fp)
        # Strip surrounding double quotes from each cell before locating the
        # protocol_ref value in the first data row; its column position gives
        # the insertion point.
        protocol_ref_index = list(
            map(lambda x: x[1:-1] if x[0] == '"' and x[-1] == '"' else x,
                line1.split('\t'))).index(protocol_ref)
    # NOTE(review): list.index raises ValueError rather than returning -1,
    # so this guard appears unreachable as written.
    if protocol_ref_index < 0:
        raise IOError(
            'Could not find protocol ref matching {protocol_ref}'.format(
                protocol_ref=protocol_ref))
    # The insert/del arithmetic below assumes the Factor Value columns sit
    # AFTER the Protocol REF column; each insert before factor_index shifts
    # the factor columns right, which the '+ 1 + 2'-style offsets compensate
    # for. TODO confirm against a table where the factor precedes the
    # protocol.
    if factor_index < len(field_names) and \
            'Term Source REF' in field_names[factor_index + 1] and \
            'Term Accession' in field_names[factor_index + 2]:
        log.debug('Moving Factor Value[{}] with term columns'.format(
            factor_name))
        # move Factor Value and Term Source REF and Term Accession columns
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        field_names.insert(protocol_ref_index + 3,
                           field_names[factor_index + 2 + 2])
        del field_names[factor_index + 3]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 2]  # del Term Source REF
        del field_names[factor_index + 2 + 1]  # del Term Accession
    elif factor_index < len(field_names) and \
            'Unit' in field_names[factor_index + 1] and \
            'Term Source REF' in field_names[factor_index + 2] and \
            'Term Accession' in field_names[factor_index + 3]:
        log.debug(
            'Moving Factor Value[{factor_name}] with unit term columns'.
            format(factor_name=factor_name))
        # move Factor Value and Unit as ontology annotation
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        field_names.insert(protocol_ref_index + 3,
                           field_names[factor_index + 2 + 2])
        field_names.insert(protocol_ref_index + 4,
                           field_names[factor_index + 3 + 3])
        del field_names[factor_index + 4]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 3]  # del Unit
        del field_names[factor_index + 2 + 2]  # del Term Source REF
        del field_names[factor_index + 3 + 1]  # del Term Accession
    elif factor_index < len(field_names) and \
            'Unit' in field_names[factor_index + 1]:
        log.debug(
            'Moving Factor Value[{factor_name}] with unit column'.format(
                factor_name=factor_name))
        # move Factor Value and Unit columns
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        field_names.insert(protocol_ref_index + 2,
                           field_names[factor_index + 1 + 1])
        del field_names[factor_index + 2]  # del Factor Value[{}]
        del field_names[factor_index + 1 + 1]  # del Unit
    else:
        # move only the Factor Value column
        log.debug('Moving Factor Value[{factor_name}]'.format(
            factor_name=factor_name))
        field_names.insert(protocol_ref_index + 1, field_names[factor_index])
        del field_names[factor_index]  # del Factor Value[{}]
    table_file_df.columns = self.clean_isatab_field_names(field_names)
    # Rename Factor Value column to Parameter Value column
    field_names_modified = list(table_file_df.columns)
    field_names_modified[protocol_ref_index + 1] = \
        field_names_modified[protocol_ref_index + 1].replace(
            'Factor Value', 'Parameter Value')
    table_file_df.columns = self.clean_isatab_field_names(
        field_names_modified)
    # Mirror the column change in the investigation: add the parameter to the
    # protocol and remove the now-redundant factor from the study.
    investigation = isatab.load(os.path.dirname(self.path),
                                skip_load_tables=True)
    study = investigation.studies[-1]
    protocol = study.get_prot(protocol_ref)
    if protocol is None:
        raise ISAModelAttributeError(
            'No protocol with name {protocol_ref} was found'.format(
                protocol_ref=protocol_ref))
    protocol.add_param(factor_name)
    factor = study.get_factor(factor_name)
    if factor is None:
        raise ISAModelAttributeError(
            'No factor with name {factor_name} was found'.format(
                factor_name=factor_name))
    else:
        study.del_factor(name=factor_name, are_you_sure=True)
    # Write everything with a '.fix' suffix so the originals are preserved.
    study.filename = '{study_filename}.fix'.format(
        study_filename=study.filename)
    isatab.dump(investigation, output_path=os.path.dirname(self.path),
                i_file_name='i_Investigation.txt.fix',
                skip_dump_tables=True)
    with open(
            os.path.join(
                os.path.dirname(self.path),
                '{s_filename}.fix'.format(
                    s_filename=os.path.basename(self.path))),
            'w') as out_fp:
        table_file_df.to_csv(path_or_buf=out_fp, index=False, sep='\t',
                             encoding='utf-8')
def create_from_galaxy_parameters(galaxy_parameters_file, target_dir):
    """Generate an ISA-Tab study skeleton from a Galaxy tool-parameters JSON.

    Reads the Galaxy parameter dict, builds a treatment sequence and a
    sample/assay plan from it, instantiates the ISA model objects via
    IsaModelObjectFactory, and dumps the resulting investigation to
    ``target_dir``.

    :param galaxy_parameters_file: open file object containing the Galaxy
        parameters JSON
    :param target_dir: existing directory to write the ISA-Tab output to
    :raise IOError: if the parameters file is missing, the target directory
        does not exist, or no sampling plan is specified
    """
    def _create_treatment_sequence(galaxy_parameters):
        # Build a TreatmentSequence from the 'treatment_plan' section;
        # supports 'full_factorial' and 'fractional_factorial' study types.
        treatment_plan = galaxy_parameters['treatment_plan']
        study_type = treatment_plan['study_type']['study_type_selector']
        log.debug(json.dumps(galaxy_parameters, indent=4))
        # The single/multiple flag lives in a different sub-dict depending on
        # the selected study type.
        try:
            single_or_multiple = treatment_plan['study_type']['balance'][
                'multiple_interventions']
        except KeyError:
            single_or_multiple = \
                treatment_plan['study_type']['multiple_interventions'][
                    'multiple_interventions_selector']
        if single_or_multiple == 'multiple':
            raise NotImplementedError(
                'Multiple treatments not yet implemented. Please select Single')
        if study_type == 'full_factorial':
            intervention_type = \
                treatment_plan['study_type']['multiple_interventions'][
                    'intervention_type']['intervention_type_selector']
            if intervention_type == 'chemical intervention':
                interventions = INTERVENTIONS['CHEMICAL']
            elif intervention_type == 'dietary intervention':
                interventions = INTERVENTIONS['DIET']
            elif intervention_type == 'behavioural intervention':
                interventions = INTERVENTIONS['BEHAVIOURAL']
            elif intervention_type == 'biological intervention':
                interventions = INTERVENTIONS['BIOLOGICAL']
            elif intervention_type == 'surgical intervention':
                interventions = INTERVENTIONS['SURGICAL']
            elif intervention_type == 'radiological intervention':
                # not in tool yet
                interventions = INTERVENTIONS['RADIOLOGICAL']
            else:  # default to chemical
                interventions = INTERVENTIONS['CHEMICAL']
            treatment_factory = TreatmentFactory(
                intervention_type=interventions, factors=BASE_FACTORS)
            # Treatment Sequence: comma-separated level lists for agent,
            # intensity and duration feed the three base factors.
            agent_levels = \
                treatment_plan['study_type']['multiple_interventions'][
                    'intervention_type']['agent'].split(',')
            for agent_level in agent_levels:
                treatment_factory.add_factor_value(BASE_FACTORS[0],
                                                   agent_level.strip())
            dose_levels = \
                treatment_plan['study_type']['multiple_interventions'][
                    'intervention_type']['intensity'].split(',')
            for dose_level in dose_levels:
                treatment_factory.add_factor_value(BASE_FACTORS[1],
                                                   dose_level.strip())
            duration_of_exposure_levels = treatment_plan[
                'study_type']['multiple_interventions']['intervention_type'][
                'duration'].split(',')
            for duration_of_exposure_level in duration_of_exposure_levels:
                treatment_factory.add_factor_value(
                    BASE_FACTORS[2], duration_of_exposure_level.strip())
            treatment_sequence = TreatmentSequence(
                ranked_treatments=treatment_factory
                .compute_full_factorial_design())
            group_size = int(
                galaxy_parameters['treatment_plan']['study_type'][
                    'multiple_interventions']['group_size'])
            # Each ranked treatment is a (treatment, rank) pair; the common
            # group size is stamped onto every treatment.
            for ranked_treatment in \
                    treatment_sequence.ranked_treatments:
                ranked_treatment[0].group_size = group_size
            return treatment_sequence
        elif study_type == 'fractional_factorial':
            intervention_type = \
                treatment_plan['study_type']['balance'][
                    'multiple_interventions']['intervention_type_selector']
            treatments = set()
            study_factors = [StudyFactor(name=x.strip()) for x in
                             treatment_plan['study_type'][
                                 'balance']['multiple_interventions'][
                                 'study_factors'].split(',')]
            # One explicit Treatment per declared study group, pairing the
            # declared factors with that group's factor values positionally.
            for group in \
                    treatment_plan['study_type']['balance'][
                        'multiple_interventions']['study_groups']:
                factor_values = ()
                for x, y in zip(study_factors,
                                [x.strip() for x in
                                 group['factor_values'].split(',')]):
                    factor_value = FactorValue(factor_name=x, value=y)
                    factor_values = factor_values + (factor_value,)
                # Balanced designs share one group size; otherwise each group
                # carries its own.
                if galaxy_parameters['treatment_plan']['study_type'][
                        'balance']['balanced_groups']:
                    group_size = int(
                        galaxy_parameters['treatment_plan']['study_type'][
                            'balance']['multiple_interventions']['group_size'])
                else:
                    group_size = int(group['group_size'])
                treatment = Treatment(treatment_type=intervention_type,
                                      factor_values=factor_values,
                                      group_size=group_size)
                treatments.add(treatment)
            treatment_sequence = TreatmentSequence(
                ranked_treatments=treatments)
            return treatment_sequence

    def _create_sample_plan(sample_assay_plan, sample_plan_record):
        # Register one sample type plus its assay plans on sample_assay_plan;
        # mutates and returns the shared plan object.
        def _create_nmr_assay_type(assay_plan_record):
            # Map an NMR record onto an AssayType with NMR topology modifiers.
            nmr_assay_type = AssayType(
                measurement_type='metabolite profiling',
                technology_type='nmr spectroscopy')
            nmr_top_mods = NMRTopologyModifiers()
            nmr_top_mods.technical_replicates = assay_plan_record[
                'assay_type']['acquisition_mode']['technical_replicates']
            nmr_top_mods.acquisition_modes.add(
                assay_plan_record['assay_type'][
                    'acquisition_mode']['acquisition_mode_selector'])
            nmr_top_mods.instruments.add('{} {}'.format(
                assay_plan_record['assay_type'][
                    'acquisition_mode']['nmr_instrument'],
                assay_plan_record['assay_type']['acquisition_mode']['magnet']))
            nmr_top_mods.pulse_sequences.add(
                assay_plan_record['assay_type'][
                    'acquisition_mode']['pulse_sequence']
            )
            nmr_top_mods.magnet_power = \
                assay_plan_record['assay_type']['acquisition_mode']['magnet']
            nmr_assay_type.topology_modifiers = nmr_top_mods
            return nmr_assay_type

        def _create_ms_assay_type(assay_plan_record):
            # Map an MS record onto an AssayType: sample fractions, injection
            # modes (LC/GC specifics), acquisitions and GC derivatizations.
            ms_assay_type = AssayType(
                measurement_type='metabolite profiling',
                technology_type='mass spectrometry')
            ms_assay_type.topology_modifiers = MSTopologyModifiers(
                sample_fractions=set(map(
                    lambda x: x['sample_fraction'],
                    assay_plan_record['assay_type']['sample_fractions'])))
            injection_modes = ms_assay_type.topology_modifiers.injection_modes
            if len(assay_plan_record['assay_type']['injections']) > 0:
                for inj_mod in assay_plan_record['assay_type']['injections']:
                    injection_mode = MSInjectionMode(
                        injection_mode=inj_mod[
                            'injection_mode']['injection_mode_selector'],
                        ms_instrument=inj_mod['injection_mode']['instrument']
                    )
                    # Chromatography details only apply to LC/GC injections;
                    # a column is only specified for LC.
                    if inj_mod['injection_mode'][
                            'injection_mode_selector'] in ('LC', 'GC'):
                        injection_mode.chromatography_instrument = inj_mod[
                            'injection_mode']['chromatography_instrument']
                    if inj_mod[
                            'injection_mode'][
                            'injection_mode_selector'] == 'LC':
                        injection_mode.chromatography_column = inj_mod[
                            'injection_mode']['chromatography_column']
                    injection_modes.add(injection_mode)
                    for acq_mod in inj_mod['injection_mode']['acquisitions']:
                        injection_mode.acquisition_modes.add(
                            MSAcquisitionMode(
                                acquisition_method=acq_mod['acquisition_mode'],
                                technical_repeats=acq_mod[
                                    'technical_replicates']
                            )
                        )
                    if inj_mod['injection_mode'][
                            'injection_mode_selector'] == 'GC':
                        for deriva in inj_mod['injection_mode'][
                                'derivatizations']:
                            derivatization = deriva['derivatization']
                            # Strings of the form 'term (SRC:ACC)' become
                            # ontology annotations.
                            if re.match('(.*?) \((.*?)\)', derivatization):
                                matches = next(iter(
                                    re.findall('(.*?) \((.*?)\)',
                                               derivatization)))
                                term, ontoid = matches[0], matches[1]
                                source_name, accession_id = \
                                    ontoid.split(':')[0], \
                                    ontoid.split(':')[1]
                                source = OntologySource(name=source_name)
                                derivatization = OntologyAnnotation(
                                    term=term, term_source=source,
                                    term_accession=accession_id)
                                injection_mode.derivatizations.add(
                                    derivatization)
            return ms_assay_type

        if sample_plan_record['material_type'] == 'user defined':
            sample_type = sample_plan_record['material_type']['sample_type_ud']
        else:
            sample_type = sample_plan_record['material_type']
        # 'term (SRC:ACC)'-formatted sample types become ontology annotations.
        if re.match('(.*?) \((.*?)\)', sample_type):
            matches = next(iter(re.findall('(.*?) \((.*?)\)', sample_type)))
            term, ontoid = matches[0], matches[1]
            source_name, accession_id = ontoid.split(':')[0], \
                ontoid.split(':')[1]
            source = OntologySource(name=source_name)
            sample_type = OntologyAnnotation(term=term, term_source=source,
                                             term_accession=accession_id)
        sample_assay_plan.add_sample_type(sample_type)
        sample_size = sample_plan_record['sample_collections']
        sample_assay_plan.add_sample_plan_record(sample_type, sample_size)
        for assay_plan_record in sample_plan_record['assay_plans']:
            tt = assay_plan_record['assay_type']['assay_type_selector']
            if tt == 'nmr':
                assay_type = _create_nmr_assay_type(assay_plan_record)
            elif tt == 'ms':
                assay_type = _create_ms_assay_type(assay_plan_record)
            else:
                raise NotImplementedError('Only MS and NMR assays supported')
            sample_assay_plan.add_assay_type(assay_type)
            sample_assay_plan.add_assay_plan_record(sample_type, assay_type)
        return sample_assay_plan

    def _inject_qcqa_plan(sample_assay_plan, qcqa_record):
        # Attach a QC/QA record (interval series or pre/post dilution series)
        # to the shared sample_assay_plan; mutates and returns it.
        qc_type = qcqa_record['qc_type']['qc_type_selector']
        if qc_type == 'interval_series':
            material_type = qcqa_record['material_type']
            if re.match('(.*?) \((.*?)\)', material_type):
                matches = next(iter(
                    re.findall('(.*?) \((.*?)\)', material_type)))
                term, ontoid = matches[0], matches[1]
                source_name, accession_id = ontoid.split(':')[0], \
                    ontoid.split(':')[1]
                source = OntologySource(name=source_name)
                material_type = OntologyAnnotation(
                    term=term, term_source=source,
                    term_accession=accession_id)
            sample_assay_plan.add_sample_qc_plan_record(
                material_type=material_type,
                injection_interval=qcqa_record[
                    'qc_type']['injection_frequency'])
        elif 'dilution_series' in qc_type:
            # e.g. 'pre_run_dilution_series' / 'post_run_dilution_series';
            # the comma-separated values become 'quantity' characteristics.
            values = [int(x) for x in qcqa_record[
                'qc_type']['values'].split(',')]
            material_type = qcqa_record['material_type']
            if re.match('(.*?) \((.*?)\)', material_type):
                matches = next(iter(
                    re.findall('(.*?) \((.*?)\)', material_type)))
                term, ontoid = matches[0], matches[1]
                source_name, accession_id = ontoid.split(':')[0], \
                    ontoid.split(':')[1]
                source = OntologySource(name=source_name)
                material_type = OntologyAnnotation(
                    term=term, term_source=source,
                    term_accession=accession_id)
            batch = SampleQCBatch(material=material_type)
            for value in values:
                batch.characteristic_values.append(
                    Characteristic(category=OntologyAnnotation(
                        term='quantity'), value=value)
                )
            if 'pre' in qc_type:
                sample_assay_plan.pre_run_batch = batch
            elif 'post' in qc_type:
                sample_assay_plan.post_run_batch = batch
        else:
            raise NotImplementedError('QC type not recognized!')
        return sample_assay_plan

    # pre-generation checks
    if galaxy_parameters_file:
        galaxy_parameters = json.load(galaxy_parameters_file)
        log.debug(json.dumps(galaxy_parameters, indent=4))
    else:
        raise IOError('Could not load Galaxy parameters file!')
    if target_dir:
        if not os.path.exists(target_dir):
            raise IOError('Target path does not exist!')
    if len(galaxy_parameters['sample_and_assay_planning']['sample_plans']) == 0:
        raise IOError('No Sampling plan specified')

    treatment_sequence = _create_treatment_sequence(galaxy_parameters)
    sample_assay_plan = SampleAssayPlan()
    for sample_plan_record in galaxy_parameters['sample_and_assay_planning'][
            'sample_plans']:
        _ = _create_sample_plan(sample_assay_plan, sample_plan_record)
    for qcqa_record in galaxy_parameters['qc_planning']['qc_plans']:
        _ = _inject_qcqa_plan(sample_assay_plan, qcqa_record)
    # Group size lives in one of two places depending on study type; fall
    # back to 0 when neither is present.
    try:
        sample_assay_plan.group_size = \
            int(galaxy_parameters['treatment_plan']['study_type'][
                'multiple_interventions']['group_size'])
    except KeyError:
        try:
            sample_assay_plan.group_size = \
                int(galaxy_parameters['treatment_plan']['study_type'][
                    'balance']['multiple_interventions']['group_size'])
        except KeyError:
            log.debug(
                'Group size not set for root plan as multiple intervention')
            sample_assay_plan.group_size = 0  # raises AttributeError
    study_info = galaxy_parameters['study_metadata']
    if len(sample_assay_plan.sample_plan) == 0:
        log.info('No sample plan defined')
    if len(sample_assay_plan.assay_plan) == 0:
        log.info('No assay plan defined')
    study_design = StudyDesign()
    study_design.add_single_sequence_plan(treatment_sequence,
                                          sample_assay_plan)
    isa_object_factory = IsaModelObjectFactory(study_design)
    if len(sample_assay_plan.sample_plan) == 0:
        s = Study()
    else:
        s = isa_object_factory.create_assays_from_plan()
    # Attach contact and descriptive metadata from the Galaxy form.
    c = Person()
    c.affiliation = study_info.get('affiliation')
    c.last_name = study_info.get('last_name')
    c.email = study_info['email']
    c.first_name = study_info['first_name']
    s.contacts = [c]
    s.description = study_info['description']
    s.filename = 's_study.txt'
    s.title = study_info['title']
    s.identifier = 'ISA-{}'.format(uuid.uuid4().hex[:8])
    s.comments = [
        Comment(name='Consent Information (ICO:0000011)',
                value=study_info['study_consent']),
        Comment(name='Data Use Requirement (DUO:0000017)',
                value=study_info['study_use_condition'])
    ]
    i = Investigation()
    i.contacts = [c]
    i.description = ""
    i.title = "Investigation"
    i.identifier = s.identifier
    i.studies = [s]
    try:
        i.ontology_source_references = s.ontology_source_references
    except AttributeError:
        pass
    i.ontology_source_references.append(OntologySource(name='ICO'))
    i.ontology_source_references.append(OntologySource(name='DUO'))

    def sanitize_filename(filename):
        # Replace anything outside [-A-Za-z0-9_.] with '_' so the name is
        # filesystem-safe.
        filename = str(filename).strip().replace(' ', '_')
        filename = re.sub(r'(?u)[^-\w.]', '_', filename)
        return filename

    i.filename = sanitize_filename(i.filename)
    for s in i.studies:
        s.filename = sanitize_filename(s.filename)
        for a in s.assays:
            a.filename = sanitize_filename(a.filename)
    isatab.dump(isa_obj=i, output_path=target_dir)
def test_isatab_bad_i_file_name(self):
    """An investigation file name lacking the 'i_' prefix must raise NameError."""
    bad_name = "investigation.txt"
    with self.assertRaises(NameError):
        isatab.dump(Investigation(), self._tmp_dir, i_file_name=bad_name)
def test_isatab_dump_source_sample_split(self):
    """Serialize a study where one source splits into four samples and
    compare the dumped study table against the reference file."""
    investigation = Investigation()
    # Ontology sources referenced by the characteristics below.
    uberon_src = OntologySource(
        name="UBERON",
        description="Uber Anatomy Ontology",
        version="216",
        file="http://data.bioontology.org/ontologies/UBERON",
    )
    taxon_src = OntologySource(
        name="NCBITAXON",
        description="National Center for Biotechnology Information (NCBI) Organismal Classification",
        version="2",
        file="http://data.bioontology.org/ontologies/NCBITAXON",
    )
    investigation.ontology_source_references.append(uberon_src)
    investigation.ontology_source_references.append(taxon_src)

    study = Study(filename="s_pool.txt")
    collection_protocol = Protocol(
        name="sample collection",
        protocol_type=OntologyAnnotation(term="sample collection"),
    )
    study.protocols.append(collection_protocol)

    # Shared characteristic categories (one object each, reused so the
    # dumper groups columns consistently).
    ref_descriptor = OntologyAnnotation(term="reference descriptor")
    material_type = OntologyAnnotation(term="material type")
    organism = OntologyAnnotation(term="organism")
    source = Source(name="source1")
    source.characteristics = [
        Characteristic(category=ref_descriptor, value="not applicable"),
        Characteristic(category=material_type, value="specimen"),
        Characteristic(
            category=organism,
            value=OntologyAnnotation(
                term="Human",
                term_source=taxon_src,
                term_accession="http://purl.bioontology.org/ontology/STY/T016",
            ),
        ),
    ]

    organism_part = OntologyAnnotation(term="organism part")

    def make_sample(name, tissue, accession):
        # Each pooled sample carries a single organism-part characteristic.
        sample = Sample(name=name)
        sample.characteristics.append(
            Characteristic(
                category=organism_part,
                value=OntologyAnnotation(
                    term=tissue,
                    term_source=uberon_src,
                    term_accession=accession,
                ),
            )
        )
        return sample

    samples = [
        make_sample("sample1", "liver", "http://purl.obolibrary.org/obo/UBERON_0002107"),
        make_sample("sample2", "heart", "http://purl.obolibrary.org/obo/UBERON_0000948"),
        make_sample("sample3", "blood", "http://purl.obolibrary.org/obo/UBERON_0000178"),
        make_sample("sample4", "blood", "http://purl.obolibrary.org/obo/UBERON_0000178"),
    ]

    collection = Process(executes_protocol=collection_protocol)
    collection.inputs = [source]
    collection.outputs = samples
    study.process_sequence = [collection]
    from isatools.model.v1 import _build_assay_graph
    study.graph = _build_assay_graph(study.process_sequence)
    investigation.studies = [study]

    isatab.dump(investigation, self._tmp_dir)
    self.assertTrue(
        assert_tab_content_equal(
            open(os.path.join(self._tmp_dir, "s_pool.txt")),
            open(os.path.join(self._tab_data_dir, "TEST-ISA-source-split",
                              "s_TEST-Template1-Splitting.txt")),
        )
    )
output.label]['entry_list'][-1]['value'] output.generated_from[-1].name = labels['Sample Name'][ 'value'] # set MS Assay Name to mzML metadata ms_process.name = labels['MS Assay Name']['value'] # add data transformation to describe conversion to mzML if data_trans_meta['Data Transformation Name']: if not study.get_prot('Conversion to mzML'): dt_prot = Protocol(name='Conversion to mzML', protocol_type=OntologyAnnotation( term='data transformation')) dt_prot.add_param('peak picking') dt_prot.add_param('software') dt_prot.add_param('software version') study.protocols.append(dt_prot) dt_prot = study.get_prot('Conversion to mzML') dt_process = Process(executes_protocol=dt_prot) dt_process.outputs = [ DerivedSpectralDataFile( filename=labels['Derived Spectral Data File'] ['entry_list'][-1]['value']) ] dt_process.inputs = ms_process.outputs plink(ms_process, dt_process) assay.process_sequence.append(dt_process) except IndexError: pass isatab.dump(ISA, output_filepath)
def test_isatab_bad_i_file_name(self):
    """Dumping with a file name not starting with 'i_' should raise NameError."""
    with self.assertRaises(NameError):
        isatab.dump(Investigation(),
                    self._tmp_dir,
                    i_file_name='investigation.txt')
def test_isatab_dump_source_sample_split(self):
    """Round-trip a one-source / four-sample split study through
    isatab.dump and compare the written study file to the reference."""
    inv = Investigation()
    uberon = OntologySource(
        name='UBERON', description="Uber Anatomy Ontology",
        version='216', file='http://data.bioontology.org/ontologies/UBERON')
    ncbitaxon = OntologySource(
        name='NCBITAXON',
        description="National Center for Biotechnology Information (NCBI) Organismal Classification",
        version='2', file='http://data.bioontology.org/ontologies/NCBITAXON')
    inv.ontology_source_references.append(uberon)
    inv.ontology_source_references.append(ncbitaxon)

    study = Study(filename='s_pool.txt')
    collection_protocol = Protocol(
        name='sample collection',
        protocol_type=OntologyAnnotation(term='sample collection'))
    study.protocols.append(collection_protocol)

    # The single source material and its three characteristics; the
    # category annotations are created once and shared.
    ref_descriptor_cat = OntologyAnnotation(term='reference descriptor')
    material_type_cat = OntologyAnnotation(term='material type')
    organism_cat = OntologyAnnotation(term='organism')
    src = Source(name='source1')
    src.characteristics = [
        Characteristic(category=ref_descriptor_cat, value='not applicable'),
        Characteristic(category=material_type_cat, value='specimen'),
        Characteristic(
            category=organism_cat,
            value=OntologyAnnotation(
                term='Human', term_source=ncbitaxon,
                term_accession='http://purl.bioontology.org/ontology/STY/T016')),
    ]

    # Build the four pooled samples data-driven; organism_part is shared.
    organism_part = OntologyAnnotation(term='organism part')
    tissue_specs = [
        ('sample1', 'liver', 'http://purl.obolibrary.org/obo/UBERON_0002107'),
        ('sample2', 'heart', 'http://purl.obolibrary.org/obo/UBERON_0000948'),
        ('sample3', 'blood', 'http://purl.obolibrary.org/obo/UBERON_0000178'),
        ('sample4', 'blood', 'http://purl.obolibrary.org/obo/UBERON_0000178'),
    ]
    pooled = []
    for sample_name, tissue, accession in tissue_specs:
        smp = Sample(name=sample_name)
        smp.characteristics.append(
            Characteristic(category=organism_part,
                           value=OntologyAnnotation(
                               term=tissue, term_source=uberon,
                               term_accession=accession)))
        pooled.append(smp)

    collection = Process(executes_protocol=collection_protocol)
    collection.inputs = [src]
    collection.outputs = pooled
    study.process_sequence = [collection]
    from isatools.model.v1 import _build_assay_graph
    study.graph = _build_assay_graph(study.process_sequence)
    inv.studies = [study]

    isatab.dump(inv, self._tmp_dir)
    expected_path = os.path.join(self._tab_data_dir, 'TEST-ISA-source-split',
                                 's_TEST-Template1-Splitting.txt')
    self.assertTrue(
        assert_tab_content_equal(
            open(os.path.join(self._tmp_dir, 's_pool.txt')),
            open(expected_path)))
def create_from_plan_parameters(galaxy_parameters_file,
                                sample_assay_plans_file, study_info_file,
                                treatment_plans_file, target_dir):
    """Create an ISA-Tab in *target_dir* from study-design plan JSON.

    Accepts either a single combined Galaxy parameters file, or the triple
    of sample/assay-plan, study-info and treatment-plan JSON files.
    Only single-treatment chemical-intervention studies are supported.

    :raises IOError: if neither parameter combination is supplied.
    :raises NotImplementedError: for unsupported study/treatment types.
    """
    decoder = SampleAssayPlanDecoder()
    if galaxy_parameters_file:
        # Galaxy supplies one combined JSON document; split it into the
        # three logical parts the rest of this function works with.
        galaxy_parameters = json.load(galaxy_parameters_file)
        sample_and_assay_plans, study_info, treatment_plan_params = \
            map_galaxy_to_isa_create_json(galaxy_parameters)
        plan = decoder.load(io.StringIO(json.dumps(sample_and_assay_plans)))
    elif sample_assay_plans_file and study_info_file and treatment_plans_file:
        plan = decoder.load(sample_assay_plans_file)
        study_info = json.load(study_info_file)
        treatment_plan_params = json.load(treatment_plans_file)
    else:
        raise IOError('Wrong parameters provided')

    # Validate the requested study design before building anything.
    study_type_cond = treatment_plan_params['study_type_cond']
    if study_type_cond['study_type'] != 'intervention':
        raise NotImplementedError('Only supports Intervention studies')
    one_or_more = study_type_cond['one_or_more']
    if one_or_more['single_or_multiple'] == 'multiple':
        raise NotImplementedError(
            'Multiple treatments not yet implemented. Please select Single')
    intervention = one_or_more['intervention_type']
    if intervention['select_intervention_type'] != 'chemical intervention':
        raise NotImplementedError(
            'Only Chemical Interventions supported at this time')

    treatment_factory = TreatmentFactory(
        intervention_type=INTERVENTIONS['CHEMICAL'], factors=BASE_FACTORS)
    # Agent / dose / duration levels arrive as comma-separated strings;
    # register each trimmed level against its base factor.
    for factor, field in ((BASE_FACTORS[0], 'agent'),
                          (BASE_FACTORS[1], 'intensity'),
                          (BASE_FACTORS[2], 'duration')):
        for level in intervention[field].split(','):
            treatment_factory.add_factor_value(factor, level.strip())

    treatment_sequence = TreatmentSequence(
        ranked_treatments=treatment_factory.compute_full_factorial_design())
    isa_object_factory = IsaModelObjectFactory(plan, treatment_sequence)
    s = isa_object_factory.create_assays_from_plan()

    contact = Person()
    contact.affiliation = study_info['study_pi_affiliation']
    contact.last_name = study_info['study_pi_last_name']
    contact.email = study_info['study_pi_email']
    contact.first_name = study_info['study_pi_first_name']
    s.contacts = [contact]
    s.description = study_info['study_description']
    s.filename = 's_study.txt'
    s.title = 'ISA created {}'.format(datetime.datetime.now().isoformat())
    s.identifier = 'ISA-{}'.format(uuid.uuid4().hex[:8])

    i = Investigation()
    i.contacts = [contact]
    i.description = s.description
    i.title = s.title
    i.identifier = s.identifier
    i.studies = [s]
    isatab.dump(isa_obj=i, output_path=target_dir,
                i_file_name='i_investigation.txt')

    # Touch every declared data file so the output directory is complete.
    for assay in s.assays:
        for data_file in assay.data_files:
            data_file_path = os.path.join(target_dir, data_file.filename)
            with open(data_file_path, 'a'):
                os.utime(data_file_path, None)
def main(arg):
    """Convert the trials held on a BRAPI endpoint (module-level SERVER)
    into ISA-Tab documents.

    For every trial an Investigation is created; for every BRAPI study in
    the trial an ISA Study (protocols, sources, samples, assays) is built
    and dumped, together with a Trait Definition File ("t_*" prefix) and
    one measurement data file ("d_*" prefix) per observation level.

    NOTE(review): the *arg* parameter is not used in the visible body.
    """
    client = BrapiClient(SERVER, logger)
    converter = BrapiToIsaConverter(logger, SERVER)
    # Iterating through the trials held in a BRAPI server:
    # for trial in client.get_trials(TRIAL_IDS):
    for trial in get_trials(client):
        logger.info('we start from a set of Trials')
        investigation = Investigation()
        output_directory = get_output_path(trial['trialName'])
        logger.info("Generating output in : " + output_directory)
        if 'contacts' in trial.keys():
            for brapicontact in trial['contacts']:
                # NOTE: BRAPI has just a 'name' attribute -> no separate
                # first/last name, so the name is split on the first space.
                ContactName = brapicontact['name'].split(' ')
                contact = Person(first_name=ContactName[0],
                                 last_name=ContactName[1],
                                 affiliation=brapicontact['institutionName'],
                                 email=brapicontact['email'])
                investigation.contacts.append(contact)
        # Iterating through the BRAPI studies associated to a given BRAPI trial:
        for brapi_study in trial['studies']:
            # Keeps track of germplasm accession numbers for data file generation.
            germplasminfo = {}
            brapi_study_id = brapi_study['studyDbId']
            obs_levels_in_study_and_var, obs_levels = \
                converter.obtain_brapi_obs_levels_and_var(brapi_study_id)
            # NB: this method always creates an ISA Assay Type.
            isa_study, investigation = converter.create_isa_study(
                brapi_study_id, investigation,
                obs_levels_in_study_and_var.keys())
            investigation.studies.append(isa_study)
            # Creating the main ISA protocols:
            sample_collection_protocol = Protocol(
                name="sample collection",
                protocol_type=OntologyAnnotation(term="sample collection"))
            isa_study.protocols.append(sample_collection_protocol)
            # !!!: fix isatab.py to access other protocol_type values to
            # enable Assay Tab serialization.
            # TODO: see https://github.com/ISA-tools/isa-api/blob/master/isatools/isatab.py#L886
            phenotyping_protocol = Protocol(
                name="phenotyping",
                protocol_type=OntologyAnnotation(
                    term="nucleic acid sequencing"))
            isa_study.protocols.append(phenotyping_protocol)
            # Getting the list of all germplasms used in the BRAPI isa_study:
            germplasms = client.get_study_germplasms(brapi_study_id)
            germ_counter = 0
            # Iterating through the germplasms considered as biosources.
            # For each of them, we retrieve their attributes and create ISA
            # characteristics.
            for germ in germplasms:
                # print("GERM:", germ['germplasmName'])  # germplasmDbId
                # WARNING: BRAPIv1 endpoints are not consistently using
                # these; depending on endpoints, attributes may have to be
                # swapped. get_germplasm_chars(germ)
                # Creating corresponding ISA biosources with ISA
                # characteristics built from the germplasm attributes.
                # ------------------------------------------------------
                source = Source(
                    name=germ['germplasmName'],
                    characteristics=converter.create_germplasm_chars(germ))
                if germ['germplasmDbId'] not in germplasminfo:
                    germplasminfo[germ['germplasmDbId']] = [germ['accessionNumber']]
                # Associating ISA sources to the ISA isa_study object.
                isa_study.sources.append(source)
                germ_counter = germ_counter + 1
            # Now dealing with BRAPI observation units and attempting to
            # create ISA samples.
            create_study_sample_and_assay(client, brapi_study_id, isa_study,
                                          sample_collection_protocol,
                                          phenotyping_protocol)
            # Writing isa_study to ISA-Tab format:
            # --------------------------------
            try:
                # isatools.isatab.dumps(investigation)  # dumps() writes out the ISA
                # !!!: fix isatab.py to access other protocol_type values to
                # enable Assay Tab serialization.
                # !!!: if the Assay Table is missing the 'Assay Name' field,
                # remember to check the protocol_type used !!!
                isatab.dump(isa_obj=investigation,
                            output_path=output_directory)
                logger.info('DONE!...')
            except IOError as ioe:
                # Best effort: a failed dump is logged, then the remaining
                # output files are still attempted.
                logger.info('CONVERSION FAILED!...')
                logger.info(str(ioe))
            try:
                variable_records = converter.create_isa_tdf_from_obsvars(
                    client.get_study_observed_variables(brapi_study_id))
                # Writing Trait Definition File:
                # ------------------------------
                write_records_to_file(this_study_id=str(brapi_study_id),
                                      this_directory=output_directory,
                                      records=variable_records,
                                      filetype="t_")
            except Exception as ioe:
                # Deliberately broad: one bad study must not abort the run.
                print(ioe)
            # Getting Variable Data and writing one Measurement Data File
            # per observation level.
            # -------------------------------------------------------
            for level, variables in obs_levels_in_study_and_var.items():
                try:
                    obsvarlist = []
                    for i in client.get_study_observation_units(brapi_study_id):
                        obsvarlist.append(i)
                    data_readings = converter.create_isa_obs_data_from_obsvars(
                        obsvarlist, list(variables), level, germplasminfo,
                        obs_levels)
                    logger.debug("Generating data files")
                    write_records_to_file(this_study_id=str(brapi_study_id),
                                          this_directory=output_directory,
                                          records=data_readings,
                                          filetype="d_",
                                          ObservationLevel=level)
                except Exception as ioe:
                    # Same best-effort policy as the TDF write above.
                    print(ioe)
def measure_real_world(n_rows):
    """Benchmark building and dumping a realistic plant-phenotyping ISA-Tab
    with *n_rows* plants.

    Reads 'growth_parameters.csv' and 'phenotyping_parameters.csv' from the
    working directory, writes the ISA-Tab into the current directory, and
    returns the elapsed process CPU time in nanoseconds.
    """
    starting_time = time.process_time_ns()
    investigation = Investigation()
    investigation.identifier = "i1"
    study = Study(filename="s_study.txt")
    study.identifier = "s1"
    investigation.studies.append(study)

    # Ontologies referenced by characteristics/units below.
    ontologies = {}
    ontologies["NCBITaxon"] = OntologySource(
        name="NCBITaxon",
        file="http://purl.obolibrary.org/obo/ncbitaxon",
        description="National Center for Biotechnology Information (NCBI) Organismal Classification"
    )
    ontologies["AGRO"] = OntologySource(
        name="AGRO",
        file="http://purl.obolibrary.org/obo/agro/releases/2018-05-14/agro.owl",
        description="Agronomy Ontology",
        version="2018-05-14")
    ontologies["UO"] = OntologySource(
        name="UO",
        file="http://data.bioontology.org/ontologies/UO",
        description="Units of Measurement Ontology",
        version="38802")
    investigation.ontology_source_references.extend(ontologies.values())

    # Factors: a 2x2 design (soil cover x plant movement).
    fa_soil_cover = StudyFactor(name="Soil Cover")
    fa_plant_movement = StudyFactor(name="Plant Movement")
    study.factors.extend([fa_soil_cover, fa_plant_movement])
    fav_covered = FactorValue(factor_name=fa_soil_cover, value="covered")
    fav_uncovered = FactorValue(factor_name=fa_soil_cover, value="uncovered")
    fav_rotating = FactorValue(factor_name=fa_plant_movement,
                               value="rotating")
    fav_stationary = FactorValue(factor_name=fa_plant_movement,
                                 value="stationary")

    # Protocols
    prot_phenotyping = Protocol(name="Phenotyping")
    prot_growth = Protocol(name="Growth")
    prot_watering = Protocol(name="Watering")
    study.protocols.append(prot_phenotyping)
    study.protocols.append(prot_growth)
    study.protocols.append(prot_watering)
    assay = Assay(filename="a_assay.txt")
    study.assays.append(assay)

    # Characteristics shared by every source (plant).
    common_characteristics = [
        Characteristic(
            category=OntologyAnnotation(term="Organism"),
            value=OntologyAnnotation(
                term="Arabidopsis thaliana",
                term_source=ontologies["NCBITaxon"],
                term_accession="http://purl.obolibrary.org/obo/NCBITaxon_3702")
        ),
        Characteristic(
            category=OntologyAnnotation(term="Genus"),
            value=OntologyAnnotation(
                term="Arabidopsis",
                term_source=ontologies["NCBITaxon"],
                term_accession="http://purl.obolibrary.org/obo/NCBITaxon_3701")
        ),
        Characteristic(category=OntologyAnnotation(term="Species"),
                       value=OntologyAnnotation(term="thaliana")),
        Characteristic(category=OntologyAnnotation(term="Infraspecific Name"),
                       value=OntologyAnnotation(term=" ")),
        Characteristic(
            category=OntologyAnnotation(term="Biological Material Latitude"),
            value=OntologyAnnotation(term="51.827721")),
        Characteristic(
            category=OntologyAnnotation(term="Biological Material Longitude"),
            value=OntologyAnnotation(term="11.27778")),
        Characteristic(
            category=OntologyAnnotation(term="Material Source ID"),
            value=OntologyAnnotation(
                term=
                "http://eurisco.ipk-gatersleben.de/apex/f?p=103:16:::NO::P16_EURISCO_ACC_ID:1668187"
            )),
        Characteristic(
            category=OntologyAnnotation(term="Seed Origin"),
            value=OntologyAnnotation(
                term="http://arabidopsis.info/StockInfo?NASC_id=22680")),
        Characteristic(
            category=OntologyAnnotation(term="Growth Facility"),
            value=OntologyAnnotation(term="small LemnaTec phytochamber")),
        Characteristic(
            category=OntologyAnnotation(term="Material Source Latitude"),
            value=OntologyAnnotation(term="51.827721")),
        Characteristic(
            category=OntologyAnnotation(term="Material Source Longitude"),
            value=OntologyAnnotation(term="11.27778"))
    ]
    sample_characteristic = Characteristic(
        category=OntologyAnnotation(term="Observation Unit Type"),
        value=OntologyAnnotation(term="plant"))

    # Growth Parameters, loaded from CSV.
    # Name => [Value, Value REF, Value Accession, Unit, Unit REF, Unit Accession]
    growth_parameters = {}
    with open("growth_parameters.csv") as gp:
        r = csv.DictReader(gp, delimiter=';')
        for row in r:
            # Keep every column after the name as the value/unit fields.
            growth_parameters[row["Parameter name"]] = list(
                row.values())[1:len(row)]
            prot_growth.parameters.append(
                ProtocolParameter(parameter_name=OntologyAnnotation(
                    term=row["Parameter name"])))
    growth_parameter_values = []
    for param in prot_growth.parameters:
        field_values = growth_parameters[param.parameter_name.term]
        if field_values[3]:
            # A unit is present; build it (with ontology ref when given).
            if field_values[4]:
                unit = OntologyAnnotation(
                    term=field_values[3],
                    term_accession=field_values[5],
                    term_source=ontologies[field_values[4]])
            else:
                unit = OntologyAnnotation(term=field_values[3])
            # If there is a unit, the value should be a number.
            value = float(field_values[0])
        else:
            unit = None
            if field_values[1]:
                value = OntologyAnnotation(
                    term=field_values[0],
                    term_accession=field_values[2],
                    term_source=ontologies[field_values[1]])
            else:
                value = OntologyAnnotation(term=field_values[0])
        growth_parameter_values.append(
            ParameterValue(category=param, value=value, unit=unit))

    # Write Study File: one source/sample pair per plant, alternating the
    # factor-value combinations between even and odd rows.
    for i in range(0, n_rows):
        source = Source(name='Plant_{}'.format(i))
        sample = Sample(name="1135FA-{}".format(i))
        study.samples.append(sample)
        study.sources.append(source)
        proc_growth = Process(executes_protocol=prot_growth)
        proc_growth.inputs.append(source)
        proc_growth.outputs.append(sample)
        study.process_sequence.append(proc_growth)
        source.characteristics.extend(common_characteristics)
        proc_growth.parameter_values.extend(growth_parameter_values)
        if i % 2 == 0:
            sample.factor_values.extend([fav_covered, fav_rotating])
        else:
            sample.factor_values.extend([fav_uncovered, fav_stationary])
        sample.characteristics.append(sample_characteristic)

    ## Read Phenotyping Parameters
    prot_phenotyping_parameters = {}
    with open("phenotyping_parameters.csv") as gp:
        r = csv.DictReader(gp, delimiter=';')
        for row in r:
            param = ProtocolParameter(parameter_name=OntologyAnnotation(
                term=row["Parameter name"]))
            prot_phenotyping_parameters[row["Parameter name"]] = param
            prot_phenotyping.parameters.append(param)
    prot_watering_parameters = {
        "Irrigation Type":
        ProtocolParameter(parameter_name=OntologyAnnotation(
            term="Irrigation Type")),
        "Volume":
        ProtocolParameter(parameter_name=OntologyAnnotation(term="Volume")),
    }
    prot_watering.parameters = prot_watering_parameters.values()
    datafile_comment = Comment(name="Image analysis tool", value="IAP")

    # Assay: per sample, a phenotyping process producing a raw image file,
    # chained into a watering process producing a derived data file.
    for i, sample in enumerate(study.samples):
        phenotyping_process = Process(executes_protocol=prot_phenotyping)
        phenotyping_process.inputs.append(sample)
        datafile = DataFile(
            filename=
            "{}FA_images/fluo/side/54/1135FA1001 side.fluo das_54 DEG_000 2011-10-12 11_09_36.png"
            .format(i),
            label="Raw Data File",
            generated_from=[sample])
        phenotyping_process.outputs.append(datafile)
        phenotyping_process.parameter_values.extend([
            ParameterValue(
                category=prot_phenotyping_parameters["Imaging Time"],
                value="28.09.2011 12:34:37"),
            ParameterValue(
                category=prot_phenotyping_parameters["Camera Configuration"],
                value="A_Fluo_Side_Big_Plant"),
            ParameterValue(
                category=prot_phenotyping_parameters["Camera Sensor"],
                value="FLUO"),
            ParameterValue(category=prot_phenotyping_parameters["Camera View"],
                           value="side"),
            ParameterValue(
                category=prot_phenotyping_parameters["Imaging Angle"],
                value=90.0,
                unit=OntologyAnnotation(
                    term="degree",
                    term_source=ontologies["UO"],
                    term_accession="http://purl.obolibrary.org/obo/UO_0000185")
            ),
        ])
        watering_process = Process(executes_protocol=prot_watering)
        watering_process.inputs.append(datafile)
        datafile2 = DataFile(
            filename="derived_data_files/das_{}.txt".format(i),
            label="Derived Data File",
            generated_from=[datafile])
        datafile2.comments.append(datafile_comment)
        watering_process.outputs.append(datafile2)
        watering_process.parameter_values.extend([
            ParameterValue(
                category=prot_watering_parameters["Irrigation Type"],
                value="automated (LemnaTec target weight)"),
            ParameterValue(
                category=prot_watering_parameters["Volume"],
                value=80.4,
                unit=OntologyAnnotation(
                    term="g",
                    term_source=ontologies["UO"],
                    term_accession="http://purl.obolibrary.org/obo/UO_0000021")
            ),
        ])
        # Link phenotyping -> watering so the chain serializes as one row.
        plink(phenotyping_process, watering_process)
        assay.samples.append(sample)
        assay.data_files.append(datafile)
        assay.data_files.append(datafile2)
        assay.process_sequence.append(phenotyping_process)
        assay.process_sequence.append(watering_process)

    isatab.dump(investigation, "./")
    return time.process_time_ns() - starting_time
def measure_reduced(n_rows):
    """Benchmark dumping a minimal ISA-Tab with *n_rows* source/sample
    pairs plus one extraction+sequencing assay per sample.

    Writes the ISA-Tab into the current directory and returns the elapsed
    process CPU time in nanoseconds.
    """
    t0 = time.process_time_ns()

    inv = Investigation()
    inv.identifier = "i1"
    study = Study(filename="s_study.txt")
    study.identifier = "s1"
    inv.studies.append(study)

    collection = Protocol(name="sample collection")
    study.protocols.append(collection)

    taxonomy = OntologySource(name='NCBITaxon', description="NCBI Taxonomy")
    # One shared characteristic reused by every sample.
    organism = Characteristic(
        category=OntologyAnnotation(term="Organism"),
        value=OntologyAnnotation(
            term="H**o Sapiens",
            term_source=taxonomy,
            term_accession="http://purl.bioontology.org/ontology/NCBITAXON/9606"
        ))

    # Study table: one collection process per source/sample pair.
    for idx in range(0, n_rows):
        src = Source(name='source_material-{}'.format(idx))
        smp = Sample(name="sample_material-{}".format(idx))
        smp.characteristics.append(organism)
        study.samples.append(smp)
        study.sources.append(src)
        proc = Process(executes_protocol=collection)
        proc.inputs.append(src)
        proc.outputs.append(smp)
        study.process_sequence.append(proc)

    # Next, build one Assay object and attach two protocols,
    # extraction and sequencing.
    assay = Assay(filename="a_assay.txt")
    extraction = Protocol(name='extraction')
    study.protocols.append(extraction)
    sequencing = Protocol(name='sequencing')
    study.protocols.append(sequencing)

    for idx, smp in enumerate(study.samples):
        # Extraction: sample in, extract material out.
        extract_proc = Process(executes_protocol=extraction)
        extract_proc.inputs.append(smp)
        extract = Material(name="extract-{}".format(idx))
        extract.type = "Extract Name"
        extract_proc.outputs.append(extract)

        # Sequencing: extract in, raw data file out.
        seq_proc = Process(executes_protocol=sequencing)
        seq_proc.name = "assay-name-{}".format(idx)
        seq_proc.inputs.append(extract_proc.outputs[0])
        raw = DataFile(filename="sequenced-data-{}".format(idx),
                       label="Raw Data File",
                       generated_from=[smp])
        seq_proc.outputs.append(raw)

        # plink wires the forward/backward links between the two processes.
        plink(extract_proc, seq_proc)

        # Register the extract, data file and processes with the assay.
        assay.samples.append(smp)
        assay.data_files.append(raw)
        assay.other_material.append(extract)
        assay.process_sequence.append(extract_proc)
        assay.process_sequence.append(seq_proc)

    assay.measurement_type = OntologyAnnotation(term="gene sequencing")
    assay.technology_type = OntologyAnnotation(term="nucleotide sequencing")
    # Attach the assay to the study.
    study.assays.append(assay)

    isatab.dump(inv, "./")
    return time.process_time_ns() - t0
def create_descriptor():
    """Build a GigaDB-style ISA Investigation (one study, one sample, one
    assay) seeded from a local dataset XML file, then dump it as ISA-Tab
    into the current directory.

    :return: the return value of :func:`isatools.isatab.dump`.
    """
    dom = xml.dom.minidom.parse('./dataset-100194.xml')
    root = dom.documentElement
    data = root.getElementsByTagName('dataset')
    dataset = data[0]
    print(dataset.nodeName)

    investigation = Investigation()
    investigation.studies.append(Study())
    study = investigation.studies[0]  # alias to avoid repeated indexing

    # ------------ dataset ---------------
    study.filename = "s_study.txt"
    study.identifier = "10.5524/100001"
    study.title = "test dataset"
    study.description = "this is test dataset"
    study.public_release_date = "2016/11/11"

    # submitter
    contact = Person(first_name="Alice", last_name="Robertson",
                     affiliation="University of Life",
                     email="*****@*****.**",
                     roles=[OntologyAnnotation(term='submitter')])
    study.contacts.append(contact)

    publication = Publication(doi="10.5524/manuscript10002")
    publication.status = OntologyAnnotation(term="published")
    study.publications.append(publication)

    # Dataset-level metadata carried as study comments.
    study.comments = []
    # Data Repository / Data Record Accession
    study.comments.append(
        Comment(name="Data Repository", value="ftp://climb.genomics.cn"))
    study.comments.append(
        Comment(name="Data Record Accession", value="ftp://climb.genomics.cn"))
    # funder
    study.comments.append(
        Comment(name="Funder Term Source REF",
                value="ftp://climb.genomics.cn"))  # funder url
    study.comments.append(
        Comment(name="Grant Identifier", value="National ...."))  # funder award
    study.comments.append(
        Comment(name="Awardee", value="National ...."))  # funder comment
    # publication
    study.comments.append(
        Comment(name="Data Repository", value="GigaScience database"))
    # author
    # NOTE(review): author1 is created but never attached to the study, and
    # 'roles' should be a list of OntologyAnnotation — confirm intent.
    author1 = Person(first_name="Alice", last_name="Robertson", roles="author")
    # if it contains an orcid
    study.comments.append(
        Comment(name="Study Person ORCID", value="111111-22221-00000"))
    # dataset type e.g. Genomics
    study.comments.append(Comment(name="Subject Keywords", value="Genomics"))
    # dataset keyword
    study.comments.append(Comment(name="key", value="rna sequences"))

    # ------------ sample ---------------
    source = Source(name='source_material')
    study.materials['sources'].append(source)
    # sample name
    sample = Sample(name="SAMEA3518466", derives_from=source)
    # sample attributes
    ncbitaxon = OntologySource(name='NCBITaxon', description="NCBI Taxonomy")
    sample.characteristics.append(
        Characteristic(
            category=OntologyAnnotation(term="Organism"),
            value=OntologyAnnotation(
                term="H**o Sapiens",
                term_source=ncbitaxon,
                term_accession="http://purl.bioontology.org/ontology/NCBITAXON/9606"
            )))
    sample.characteristics.append(
        Characteristic(category=OntologyAnnotation(term="geolocation"),
                       value="10.222/2.00002222"))
    study.materials['samples'].append(sample)

    # protocols
    sample_collection_protocol = Protocol(
        name="sample collection",
        protocol_type=OntologyAnnotation(term="sample collection"))
    study.protocols.append(sample_collection_protocol)
    data_collection_protocol = Protocol(
        name="data collection",
        protocol_type=OntologyAnnotation(term="data collection"))
    # BUG FIX: this previously re-appended sample_collection_protocol,
    # leaving the data-collection protocol undeclared in the study.
    study.protocols.append(data_collection_protocol)

    # A study-level process sequence is needed to declare samples.
    sample_collection_process = Process(
        executes_protocol=sample_collection_protocol)
    # Be careful here: this attaches all sources to one process instance
    # producing all samples. It works while there is a single
    # source/sample pair, but with multiple source->collection->sample
    # instances a new Process must be used each time, otherwise the 1-1
    # relationship between source and sample may be lost!
    for src in study.materials['sources']:
        sample_collection_process.inputs.append(src)
    for sam in study.materials['samples']:
        sample_collection_process.outputs.append(sam)
    study.process_sequence.append(sample_collection_process)

    # ------------ file ---------------
    assay = Assay(filename="a_assay.txt")
    datafile = DataFile(
        filename="ftp://xxxxxxxxx",
        label="Raw Data File")  # needs 'label' set as it is the column name
    datafile.comments = [Comment(name="File Description", value="test file")]
    assay.data_files.append(datafile)
    # An assay-level process sequence is needed to declare data files.
    data_collection_process = Process(
        executes_protocol=data_collection_protocol)
    data_collection_process.inputs.append(sample)
    data_collection_process.outputs.append(datafile)
    assay.process_sequence.append(data_collection_process)
    study.assays.append(assay)

    from isatools.isatab import dump
    return dump(isa_obj=investigation, output_path='.')
def convert(json_path, output_path):
    """Convert an NIH-DCC flux-study JSON document into ISA-Tab.

    Reads the JSON file at *json_path*, builds an isatools
    ``Investigation`` (one ``Study``, with assays/protocols/materials
    derived from the JSON ``project``, ``study``, ``factor``,
    ``protocol``, ``subject``, ``sample`` and ``measurement`` sections),
    writes a tab-separated MAF data file plus the ISA-Tab investigation
    files into *output_path*.

    :param json_path: path to the input DCC JSON file
    :param output_path: directory to write the MAF file and ISA-Tab to
    :raises IOError: if the JSON contains no ``project`` entry
    """
    print(json_path)
    print(output_path)
    with open(json_path, 'r') as f:
        dcc_json = json.load(f)
    # print(array['protocol'])
    # for element in array['protocol']:
    #     array['protocol'][element]['id']
    #     array['protocol'][element]['description']
    #     array['protocol'][element]['type']
    #     array['protocol'][element]['filename']
    # for element in array['measurement']:
    #     print(array['measurement'][element]['corrected_mz'])
    # for element in array['subject']:
    #     print(array['subject'][element]['species'])

    # Building the Investigation Object and its elements:
    project_set_json = dcc_json.get('project')
    if len(project_set_json) == 0:
        raise IOError('No project found in input JSON')
    # Only the first project entry is used, even if several are present.
    # print(next(iter(project_set_json)))
    project_json = next(iter(project_set_json.values()))
    investigation = Investigation(identifier=project_json['id'])
    obi = OntologySource(name='OBI',
                         description='Ontology for Biomedical Investigations')
    investigation.ontology_source_references.append(obi)
    inv_person = Person(
        first_name=project_json['PI_first_name'],
        last_name=project_json['PI_last_name'],
        email=project_json['PI_email'],
        address=project_json['address'],
        affiliation=(', '.join(
            [project_json['department'], project_json['institution']])),
        roles=[
            OntologyAnnotation(term="",
                               term_source=obi,
                               term_accession="http://purl.org/obo/OBI_1")
        ])
    investigation.contacts.append(inv_person)
    study_set_json = dcc_json.get('study')
    if len(study_set_json) > 0:
        # As for projects, only the first study entry is converted.
        study_json = next(iter(study_set_json.values()))
        study = Study(
            identifier=study_json['id'],
            title=study_json['title'],
            description=study_json['description'],
            design_descriptors=[
                OntologyAnnotation(term=study_json['type'],
                                   term_source=obi,
                                   term_accession="http://purl.org/obo/OBI_1")
            ],
            filename='s_{study_id}.txt'.format(study_id=study_json['id']))
        investigation.studies = [study]
        studyid = study_json['id']
        print(studyid)
        study_person = Person(
            first_name=study_json['PI_first_name'],
            last_name=study_json['PI_last_name'],
            email=study_json['PI_email'],
            address=study_json['address'],
            affiliation=(', '.join(
                [study_json['department'], study_json['institution']])),
            roles=[
                OntologyAnnotation(term='principal investigator',
                                   term_source=obi,
                                   term_accession="http://purl.org/obo/OBI_1")
            ])
        study.contacts.append(study_person)
        for factor_json in dcc_json['factor'].values():
            factor = StudyFactor(name=factor_json['id'])
            study.factors.append(factor)
        # One ISA Protocol per JSON protocol entry; MS/NMR protocol types
        # additionally spawn a corresponding Assay.
        for i, protocol_json in enumerate(dcc_json['protocol'].values()):
            oat_p = protocol_json['type']
            oa_protocol_type = OntologyAnnotation(
                term=oat_p,
                term_source=obi,
                term_accession="http://purl.org/obo/OBI_1")
            study.protocols.append(
                Protocol(name=protocol_json['id'],
                         protocol_type=oa_protocol_type,
                         description=protocol_json['description'],
                         uri=protocol_json['filename']))
            if 'MS' in protocol_json['type']:
                study.assays.append(
                    Assay(measurement_type=OntologyAnnotation(
                        term='mass isotopologue distribution analysis',
                        term_source=obi,
                        term_accession="http://purl.org/obo/OBI_112"),
                          technology_type=OntologyAnnotation(
                              term='mass spectrometry',
                              term_source=obi,
                              term_accession="http://purl.org/obo/OBI_1"),
                          filename='a_assay_ms_{count}.txt'.format(count=i)))
            if 'NMR' in protocol_json['type']:
                study.assays.append(
                    Assay(measurement_type=OntologyAnnotation(
                        term='isotopomer analysis',
                        term_source=obi,
                        term_accession="http://purl.org/obo/OBI_111"),
                          technology_type=OntologyAnnotation(
                              term='nmr spectroscopy',
                              term_source=obi,
                              term_accession="http://purl.org/obo/OBI_1"),
                          filename='a_assay_nmr.txt'))
        # Subjects become Sources (and, for tissue slices, also Samples
        # plus a sample-collection Process linking the two).
        for subject_json in dcc_json['subject'].values():
            # print(array['subject'][element])
            if "organism" in subject_json['type']:
                source = Source(name=subject_json['id'])
                ncbitaxon = OntologySource(name='NCBITaxon',
                                           description="NCBI Taxonomy")
                # NOTE(review): the accession is hard-coded to the human
                # taxon (9606) even though the term comes from the JSON
                # 'species' field — confirm this is intended.
                characteristic_organism = Characteristic(
                    category=OntologyAnnotation(term="Organism"),
                    value=OntologyAnnotation(
                        term=subject_json['species'],
                        term_source=ncbitaxon,
                        term_accession=
                        'http://purl.bioontology.org/ontology/NCBITAXON/9606'))
                source.characteristics.append(characteristic_organism)
                study.sources.append(source)
            elif 'tissue_slice' in subject_json['type']:
                # print(array['subject'][element]['type'])
                source = Source(name=subject_json['id'])
                study.sources.append(source)
                ncbitaxon = OntologySource(name='NCBITaxon',
                                           description="NCBI Taxonomy")
                characteristic_organism = Characteristic(
                    category=OntologyAnnotation(term="Organism"),
                    value=OntologyAnnotation(
                        term=subject_json['species'],
                        term_source=ncbitaxon,
                        term_accession=
                        'http://purl.bioontology.org/ontology/NCBITAXON/9606'))
                source.characteristics.append(characteristic_organism)
                sample = Sample(name=subject_json['id'],
                                derives_from=subject_json['parentID'])
                characteristic_organismpart = Characteristic(
                    category=OntologyAnnotation(term='organism_part'),
                    value=OntologyAnnotation(
                        term=subject_json['tissue_type'],
                        term_source=obi,
                        term_accession="http://purl.org/obo/OBI_1"))
                sample.characteristics.append(characteristic_organismpart)
                study.samples.append(sample)
                # print(study.samples[0].name)
                sample_collection_process = Process(
                    executes_protocol=study.get_prot(
                        subject_json['protocol.id']))
                sample_collection_process.inputs.append(source)
                sample_collection_process.outputs.append(sample)
                study.process_sequence.append(sample_collection_process)
            else:
                source = Source(name=subject_json['id'])
                ncbitaxon = OntologySource(name='NCBITaxon',
                                           description="NCBI Taxonomy")
                characteristic_organism = Characteristic(
                    category=OntologyAnnotation(term="Organism"),
                    value=OntologyAnnotation(
                        term=subject_json['species'],
                        term_source=ncbitaxon,
                        term_accession=
                        'http://purl.bioontology.org/ontology/NCBITAXON/9606'))
                source.characteristics.append(characteristic_organism)
                study.sources.append(source)
                print(subject_json['id'])
                print(subject_json['species'])
                print(subject_json['type'])
        # for src in investigation.studies[0].materials:
        #
        # for sam in investigation.studies[0].materials:

        # Samples are routed by their 'type' into the first assay's
        # material/process graph.  NOTE(review): several branches below
        # rely on variables (material_in, sample_collection_process,
        # protein_extraction_process) surviving from *previous* loop
        # iterations or branches — order-dependent and fragile.
        for sample_json in dcc_json['sample'].values():
            if 'cells' in sample_json['type']:
                material_separation_process = Process(
                    executes_protocol=study.get_prot(
                        sample_json['protocol.id']))
                material_separation_process.name = sample_json['id']
                # dealing with input material, check that the parent material is already among known samples or sources
                if len([
                        x for x in study.samples
                        if x.name == sample_json['parentID']
                ]) == 0:
                    material_in = Sample(name=sample_json['parentID'])
                    material_separation_process.inputs.append(material_in)
                    study.assays[0].samples.append(material_in)
                else:
                    print([
                        x for x in study.samples
                        if x.name == sample_json['parentID']
                    ])
                    material_separation_process.inputs.append([
                        x for x in study.samples
                        if x.name == sample_json['parentID']
                    ][0])
                material_out = Sample(name=sample_json['id'])
                material_type = Characteristic(
                    category=OntologyAnnotation(term='material_type'),
                    value=OntologyAnnotation(
                        term=sample_json['type'],
                        term_source=obi,
                        term_accession="http://purl.org/obo/OBI_xxxxxxx"))
                material_out.characteristics.append(material_type)
                material_separation_process.outputs.append(material_out)
                study.assays[0].samples.append(material_out)
                try:
                    sample_collection_process
                except NameError:
                    sample_collection_process = None
                if sample_collection_process is None:
                    sample_collection_process = Process(executes_protocol="")
                else:
                    # plink(protein_extraction_process, data_acq_process)
                    # plink(material_separation_process, protein_extraction_process)
                    # NOTE(review): protein_extraction_process is only
                    # assigned in the 'protein_extract' branch below, so
                    # this raises NameError if a 'cells' sample is seen
                    # first — presumably this should link
                    # sample_collection_process to
                    # material_separation_process instead.  Confirm.
                    plink(sample_collection_process, protein_extraction_process)
            if 'protein_extract' in sample_json['type']:
                protein_extraction_process = Process(
                    executes_protocol=study.get_prot(
                        sample_json['protocol.id']))
                protein_extraction_process.name = sample_json['id']
                if len([
                        x for x in study.samples
                        if x.name == sample_json['parentID']
                ]) == 0:
                    material_in = Sample(name=sample_json['parentID'])
                    protein_extraction_process.inputs.append(material_in)
                    study.assays[0].samples.append(material_in)
                else:
                    # print([x for x in study.samples if x.name == sample_json['parentID']])
                    # NOTE(review): material_in here is whatever value was
                    # left over from an earlier iteration, not the matched
                    # study sample — looks like a bug; verify.
                    protein_extraction_process.inputs.append(material_in)
                    # for material_in in study.samples:
                    #     # print("OHO:", material_in.name)
                    #     if material_in.name == sample_json['parentID']:
                    #         # print("C:",sample_json['parentID'])
                    #         #no need to create, just link to process
                    #         protein_extraction_process.inputs.append(x)
                    #     else:
                    #         # print("D:", sample_json['parentID'])
                    #         #create new material and link
                    #         material_in = Sample(name=sample_json['parentID'])
                    #         protein_extraction_process.inputs.append(material_in)
                material_out = Material(name=sample_json['id'])
                material_out.type = "Extract Name"
                material_type = Characteristic(
                    category=OntologyAnnotation(term='material_type'),
                    value=OntologyAnnotation(
                        term=sample_json['type'],
                        term_source=obi,
                        term_accession="http://purl.org/obo/OBI_1"))
                material_out.characteristics.append(material_type)
                # NOTE(review): material_out is built above but never
                # attached; these two lines append material_in instead —
                # likely a copy-paste bug, confirm intent.
                study.assays[0].samples.append(material_in)
                study.assays[0].materials['other_material'].append(material_in)
                try:
                    material_separation_process
                except NameError:
                    material_separation_process = None
                if material_separation_process is None:
                    material_separation_process = Process(executes_protocol="")
                else:
                    # plink(protein_extraction_process, data_acq_process)
                    plink(material_separation_process,
                          protein_extraction_process)
            if 'polar' in sample_json['type']:
                material_in = Material(name=sample_json['parentID'])
                material_type = Characteristic(
                    category=OntologyAnnotation(term='material_type',
                                                term_source=obi),
                    value=OntologyAnnotation(term=sample_json['type'],
                                             term_source=obi))
                material_in.characteristics.append(material_type)
                study.assays[0].materials['other_material'].append(material_in)
                data_acq_process = Process(executes_protocol=study.get_prot(
                    sample_json['protocol.id']))
                data_acq_process.name = sample_json['id']
                # NOTE(review): the format field below is never
                # substituted — the file is literally named
                # '(unknown).txt'; the template was presumably meant to
                # be '{filename}.txt'.  Confirm against the original.
                datafile = DataFile(
                    filename='(unknown).txt'.format(filename='_'.join(
                        ['mass_isotopomer-data', studyid, sample_json['id']])),
                    label='Raw Data File')
                data_acq_process.outputs.append(datafile)
                # print(study.assays[0].technology_type.term)
                study.assays[0].data_files.append(datafile)
                try:
                    protein_extraction_process
                except NameError:
                    protein_extraction_process = None
                if protein_extraction_process is None:
                    protein_extraction_process = Process(executes_protocol="")
                else:
                    plink(protein_extraction_process, data_acq_process)
            # else:
            #     material_in = Material(name=sample_json['parentID'])
            #     material_out = Material(name=sample_json['id'])
            #     material_type = Characteristic(
            #         category=OntologyAnnotation(term="material_type"),
            #         value=OntologyAnnotation(term=sample_json['type'],
            #                                  term_source=obi,
            #                                  term_accession="http://purl.org/obo/OBI_1"))
            #     material_out.characteristics.append(material_type)
            #     process = Process(executes_protocol=sample_json['protocol.id'])
            #     process.name = sample_json['id']
            #     process.inputs.append(material_in)
            #     process.outputs.append(material_out)
            #
            #     study.assays[0].materials['other_material'].append(material_in)
            #     study.assays[0].materials['other_material'].append(material_out)
            if 'bulk_tissue' in sample_json['type']:
                bulk_process = Process(executes_protocol=study.get_prot(
                    sample_json['protocol.id']))
                bulk_process.name = sample_json['id']
                if len([
                        x for x in study.samples
                        if x.name == sample_json['parentID']
                ]) == 0:
                    material_in = Sample(name=sample_json['parentID'])
                    bulk_process.inputs.append(material_in)
                    study.assays[0].samples.append(material_in)
                else:
                    # print([x for x in study.samples if x.name == sample_json['parentID']])
                    # NOTE(review): same stale-material_in pattern as in
                    # the 'protein_extract' branch above — verify.
                    bulk_process.inputs.append(material_in)
                # NOTE(review): raises NameError if no 'cells' or
                # tissue_slice subject defined sample_collection_process
                # earlier — confirm the expected input ordering.
                plink(sample_collection_process, bulk_process)

        # Flatten the 'measurement' section into a tab-separated MAF
        # (metabolite assignment file) table.
        data_rec_header = '\t'.join(
            ('metabolite name', 'assignment', 'signal intensity',
             'retention time', 'm/z', 'formula', 'adduct', 'isotopologue',
             'sample identifier'))
        records = []
        for element in dcc_json['measurement']:
            # metabolite_name: -> compound
            # array['measurement'][element]['signal_intensity']
            record = '\t'.join((dcc_json['measurement'][element]['compound'],
                                dcc_json['measurement'][element]['assignment'],
                                dcc_json['measurement'][element]['raw_intensity'],
                                dcc_json['measurement'][element]['retention_time'],
                                dcc_json['measurement'][element]['corrected_mz'],
                                dcc_json['measurement'][element]['formula'],
                                dcc_json['measurement'][element]['adduct'],
                                dcc_json['measurement'][element]['isotopologue'],
                                dcc_json['measurement'][element]['sample.id']))
            # print(record)
            records.append(record)
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        try:
            with open(
                    '{output_path}/{study_id}-maf-data-nih-dcc-json.txt'.
                    format(output_path=output_path, study_id=studyid),
                    'w') as fh:
                print(
                    "'writing 'maf file document' to file from 'generate_maf_file' method:..."
                )
                fh.writelines(data_rec_header)
                fh.writelines('\n')
                for item in records:
                    fh.writelines(item)
                    fh.writelines('\n')
            print("writing 'investigation information' to file...")
            print(isatab.dumps(investigation))
            isatab.dump(investigation, output_path=output_path)
        except IOError:
            print("Error: in main() method can't open file or write data")
def _exportISATAB(self, destinationPath, detailsDict):
    """
    Export the dataset's metadata to the directory *destinationPath* as ISATAB.

    Builds an isatools ``Investigation`` with a single ``Study`` from
    *detailsDict*, creates one Source/Sample pair per row of
    ``self.sampleMetadata`` (sample-collection + aliquoting processes),
    then attaches an NMR assay with one extraction + NMR process per
    sample.  If *destinationPath* already contains an
    ``i_Investigation.txt``, the new study is appended to the existing
    ISA archive; otherwise a fresh ISA-Tab is dumped.

    detailsDict should have the format:
    detailsDict = {
        'investigation_identifier' : "i1",
        'investigation_title' : "Give it a title",
        'investigation_description' : "Add a description",
        'investigation_submission_date' : "2016-11-03",
        'investigation_public_release_date' : "2016-11-03",
        'first_name' : "Noureddin",
        'last_name' : "Sadawi",
        'affiliation' : "University",
        'study_filename' : "my_ms_study",
        'study_material_type' : "Serum",
        'study_identifier' : "s1",
        'study_title' : "Give the study a title",
        'study_description' : "Add study description",
        'study_submission_date' : "2016-11-03",
        'study_public_release_date' : "2016-11-03",
        'assay_filename' : "my_ms_assay"
    }

    :param str destinationPath: Path to a directory in which the output will be saved
    :param dict detailsDict: Contains several key, value pairs required to for ISATAB
    :raises IOError: If writing one of the files fails
    """
    from isatools.model import Investigation, Study, Assay, OntologyAnnotation, OntologySource, Person, Publication, Protocol, Source
    from isatools.model import Comment, Sample, Characteristic, Process, Material, DataFile, ParameterValue, plink
    from isatools import isatab
    import isaExplorer as ie

    investigation = Investigation()
    investigation.identifier = detailsDict['investigation_identifier']
    investigation.title = detailsDict['investigation_title']
    investigation.description = detailsDict['investigation_description']
    investigation.submission_date = detailsDict[
        'investigation_submission_date']  #use today if not specified
    investigation.public_release_date = detailsDict[
        'investigation_public_release_date']
    study = Study(filename='s_' + detailsDict['study_filename'] + '.txt')
    study.identifier = detailsDict['study_identifier']
    study.title = detailsDict['study_title']
    study.description = detailsDict['study_description']
    study.submission_date = detailsDict['study_submission_date']
    study.public_release_date = detailsDict['study_public_release_date']
    investigation.studies.append(study)
    obi = OntologySource(
        name='OBI', description="Ontology for Biomedical Investigations")
    investigation.ontology_source_references.append(obi)
    intervention_design = OntologyAnnotation(term_source=obi)
    intervention_design.term = "intervention design"
    intervention_design.term_accession = "http://purl.obolibrary.org/obo/OBI_0000115"
    study.design_descriptors.append(intervention_design)

    # Other instance variables common to both Investigation and Study objects include 'contacts' and 'publications',
    # each with lists of corresponding Person and Publication objects.
    contact = Person(first_name=detailsDict['first_name'],
                     last_name=detailsDict['last_name'],
                     affiliation=detailsDict['affiliation'],
                     roles=[OntologyAnnotation(term='submitter')])
    study.contacts.append(contact)
    # NOTE(review): placeholder publication with hard-coded title/authors
    # (and a typo, "Auther 1") — confirm whether this stub is intentional.
    publication = Publication(title="Experiments with Data",
                              author_list="Auther 1, Author 2")
    publication.pubmed_id = "12345678"
    publication.status = OntologyAnnotation(term="published")
    study.publications.append(publication)

    # To create the study graph that corresponds to the contents of the study table file (the s_*.txt file), we need
    # to create a process sequence. To do this we use the Process class and attach it to the Study object's
    # 'process_sequence' list instance variable. Each process must be linked with a Protocol object that is attached to
    # a Study object's 'protocols' list instance variable. The sample collection Process object usually has as input
    # a Source material and as output a Sample material.
    sample_collection_protocol = Protocol(
        id_="sample collection",
        name="sample collection",
        protocol_type=OntologyAnnotation(term="sample collection"))
    aliquoting_protocol = Protocol(
        id_="aliquoting",
        name="aliquoting",
        protocol_type=OntologyAnnotation(term="aliquoting"))
    # One Source + Sample per sample-metadata row; optional columns fall
    # back to 'N/A' when absent from the DataFrame.
    for index, row in self.sampleMetadata.iterrows():
        src_name = row['Sample File Name']
        source = Source(name=src_name)
        source.comments.append(
            Comment(name='Study Name', value=row['Study']))
        study.sources.append(source)
        sample_name = src_name
        sample = Sample(name=sample_name, derives_from=[source])
        # check if field exists first
        status = row[
            'Status'] if 'Status' in self.sampleMetadata.columns else 'N/A'
        characteristic_material_type = Characteristic(
            category=OntologyAnnotation(term="material type"), value=status)
        sample.characteristics.append(characteristic_material_type)
        #characteristic_material_role = Characteristic(category=OntologyAnnotation(term="material role"), value=row['AssayRole'])
        #sample.characteristics.append(characteristic_material_role)
        # check if field exists first
        age = row['Age'] if 'Age' in self.sampleMetadata.columns else 'N/A'
        characteristic_age = Characteristic(
            category=OntologyAnnotation(term="Age"), value=age, unit='Year')
        sample.characteristics.append(characteristic_age)
        # check if field exists first
        gender = row[
            'Gender'] if 'Gender' in self.sampleMetadata.columns else 'N/A'
        characteristic_gender = Characteristic(
            category=OntologyAnnotation(term="Gender"), value=gender)
        sample.characteristics.append(characteristic_gender)
        ncbitaxon = OntologySource(name='NCBITaxon',
                                   description="NCBI Taxonomy")
        # NOTE(review): the literal "H**o Sapiens" looks corrupted
        # (presumably "Homo sapiens") — verify against the original file.
        characteristic_organism = Characteristic(
            category=OntologyAnnotation(term="Organism"),
            value=OntologyAnnotation(
                term="H**o Sapiens",
                term_source=ncbitaxon,
                term_accession=
                "http://purl.bioontology.org/ontology/NCBITAXON/9606"))
        sample.characteristics.append(characteristic_organism)
        study.samples.append(sample)
        # check if field exists first
        sampling_date = row['Sampling Date'] if not pandas.isnull(
            row['Sampling Date']) else None
        sample_collection_process = Process(
            id_='sam_coll_proc',
            executes_protocol=sample_collection_protocol,
            date_=sampling_date)
        # NOTE(review): id_ duplicates 'sam_coll_proc' from the process
        # above — likely a copy-paste slip ('aliq_proc'?); confirm.
        aliquoting_process = Process(id_='sam_coll_proc',
                                     executes_protocol=aliquoting_protocol,
                                     date_=sampling_date)
        sample_collection_process.inputs = [source]
        aliquoting_process.outputs = [sample]
        # links processes
        plink(sample_collection_process, aliquoting_process)
        study.process_sequence.append(sample_collection_process)
        study.process_sequence.append(aliquoting_process)
    study.protocols.append(sample_collection_protocol)
    study.protocols.append(aliquoting_protocol)

    ### Add NMR Assay ###
    nmr_assay = Assay(
        filename='a_' + detailsDict['assay_filename'] + '.txt',
        measurement_type=OntologyAnnotation(term="metabolite profiling"),
        technology_type=OntologyAnnotation(term="NMR spectroscopy"))
    extraction_protocol = Protocol(
        name='extraction',
        protocol_type=OntologyAnnotation(term="material extraction"))
    study.protocols.append(extraction_protocol)
    nmr_protocol = Protocol(
        name='NMR spectroscopy',
        protocol_type=OntologyAnnotation(term="NMR Assay"))
    nmr_protocol.add_param('Run Order')
    #if 'Instrument' in self.sampleMetadata.columns:
    nmr_protocol.add_param('Instrument')
    #if 'Sample Batch' in self.sampleMetadata.columns:
    nmr_protocol.add_param('Sample Batch')
    nmr_protocol.add_param('Acquisition Batch')
    study.protocols.append(nmr_protocol)
    #for index, row in sampleMetadata.iterrows():
    # One extraction->NMR process chain per study sample, matched back to
    # its metadata row by sample (file) name.
    for index, sample in enumerate(study.samples):
        row = self.sampleMetadata.loc[
            self.sampleMetadata['Sample File Name'].astype(
                str) == sample.name]
        # create an extraction process that executes the extraction protocol
        extraction_process = Process(executes_protocol=extraction_protocol)
        # extraction process takes as input a sample, and produces an extract material as output
        sample_name = sample.name
        # NOTE(review): this rebuilds the Sample and derives it from
        # `source`, which at this point is the *last* source of the
        # previous loop — every assay sample ends up derived from the
        # same source; verify this is intended.
        sample = Sample(name=sample_name, derives_from=[source])
        #print(row['Acquired Time'].values[0])
        extraction_process.inputs.append(sample)
        material = Material(name="extract-{}".format(index))
        material.type = "Extract Name"
        extraction_process.outputs.append(material)
        # create a ms process that executes the nmr protocol
        nmr_process = Process(executes_protocol=nmr_protocol,
                              date_=datetime.isoformat(
                                  datetime.strptime(
                                      str(row['Acquired Time'].values[0]),
                                      '%Y-%m-%d %H:%M:%S')))
        nmr_process.name = "assay-name-{}".format(index)
        nmr_process.inputs.append(extraction_process.outputs[0])
        # nmr process usually has an output data file
        # check if field exists first
        assay_data_name = row['Assay data name'].values[
            0] if 'Assay data name' in self.sampleMetadata.columns else 'N/A'
        datafile = DataFile(filename=assay_data_name,
                            label="NMR Assay Name",
                            generated_from=[sample])
        nmr_process.outputs.append(datafile)
        #nmr_process.parameter_values.append(ParameterValue(category='Run Order',value=str(i)))
        nmr_process.parameter_values = [
            ParameterValue(category=nmr_protocol.get_param('Run Order'),
                           value=row['Run Order'].values[0])
        ]
        # check if field exists first
        instrument = row['Instrument'].values[
            0] if 'Instrument' in self.sampleMetadata.columns else 'N/A'
        nmr_process.parameter_values.append(
            ParameterValue(category=nmr_protocol.get_param('Instrument'),
                           value=instrument))
        # check if field exists first
        # NOTE(review): column checked here is 'Sample batch' (lowercase b)
        # while the protocol parameter is 'Sample Batch' — confirm the
        # column name matches the metadata.
        sbatch = row['Sample batch'].values[
            0] if 'Sample batch' in self.sampleMetadata.columns else 'N/A'
        nmr_process.parameter_values.append(
            ParameterValue(category=nmr_protocol.get_param('Sample Batch'),
                           value=sbatch))
        nmr_process.parameter_values.append(
            ParameterValue(
                category=nmr_protocol.get_param('Acquisition Batch'),
                value=row['Batch'].values[0]))
        # ensure Processes are linked forward and backward
        plink(extraction_process, nmr_process)
        # make sure the extract, data file, and the processes are attached to the assay
        nmr_assay.samples.append(sample)
        nmr_assay.data_files.append(datafile)
        nmr_assay.other_material.append(material)
        nmr_assay.process_sequence.append(extraction_process)
        nmr_assay.process_sequence.append(nmr_process)
    nmr_assay.measurement_type = OntologyAnnotation(
        term="metabolite profiling")
    nmr_assay.technology_type = OntologyAnnotation(
        term="NMR spectroscopy")
    # attach the assay to the study
    study.assays.append(nmr_assay)
    # Append to an existing ISA archive if one is present, else write a
    # fresh ISA-Tab investigation.
    if os.path.exists(os.path.join(destinationPath, 'i_Investigation.txt')):
        ie.appendStudytoISA(study, destinationPath)
    else:
        isatab.dump(isa_obj=investigation, output_path=destinationPath)
sequencing_process.outputs.append(datafile) # Ensure Processes are linked forward and backward. plink(from_process, to_process) is a function to set # these links for you. It is found in the isatools.model package plink(extraction_process, sequencing_process) # make sure the extract, data file, and the processes are attached to the assay assay.samples.append(sample) assay.data_files.append(datafile) assay.other_material.append(material) assay.process_sequence.append(extraction_process) assay.process_sequence.append(sequencing_process) # assay.measurement_type = OntologyAnnotation(term="gene sequencing") # assay.technology_type = OntologyAnnotation(term="nucleotide sequencing") isatab.dump(investigation, ".") shutil.copyfile( "i_investigation.txt", "../../isa4J/src/test/resources/de/ipk_gatersleben/bit/bi/isa4j/components/python_originals/i_investigation.txt" ) shutil.copyfile( "s_study.txt", "../../isa4J/src/test/resources/de/ipk_gatersleben/bit/bi/isa4j/components/python_originals/s_study.txt" ) shutil.copyfile( "a_assay.txt", "../../isa4J/src/test/resources/de/ipk_gatersleben/bit/bi/isa4j/components/python_originals/a_assay.txt" )