def study_datafiles(isatab_dir, data_dirs, study_config):
    """Retrieve data files and associated metadata for a study.

    This is a generator. For every assay of the single study found in
    ``isatab_dir`` it yields one dict per listed data file. Each dict holds
    ``name`` (the value returned by ``_get_full_path``), ``type`` (either
    "Derived Data File" or "Raw Data File") and the combined study and
    sample metadata.

    :param isatab_dir: directory passed to ``isatab.parse``.
    :param data_dirs: forwarded to ``_get_full_path`` to locate each file.
    :param study_config: mapping whose ``"private"`` entry lists the
        investigation/study identifiers that must not be loaded.
    :raises AssertionError: if the record does not hold exactly one study.
    """
    ftypes = ["Derived Data File", "Raw Data File"]
    private = study_config["private"]
    rec = isatab.parse(isatab_dir)
    assert len(rec.studies) == 1
    study = rec.studies[0]
    # Do not load private libraries into Galaxy
    if (rec.metadata.get("Investigation Identifier", "") in private
            or study.metadata["Study Identifier"] in private):
        return
    # NOTE(review): hard-coded identifier; looks like a leftover debugging
    # switch for one study -- confirm it is still wanted.
    verbose = study.metadata["Study Identifier"] in ["SB-S-29"]
    for assay in study.assays:
        study_info = _get_study_matadata(study, assay)
        info = _get_assay_info(assay)
        sample_info = _get_sample_metadata(study, info["Sample Name"], verbose)
        study_info.update(sample_info)
        for ftype in ftypes:
            for fname in info.get(ftype, []):
                # Only the path lookup is guarded, so a ValueError thrown
                # into the generator at the yield is not mistaken for a
                # missing file.
                try:
                    full_path = _get_full_path(fname, ftype, data_dirs)
                except ValueError:
                    # Single pre-formatted argument: prints the same text
                    # under both Python 2 and Python 3.
                    print("Missing file %s %s" % (fname, data_dirs))
                    continue
                out = {"name": full_path, "type": ftype}
                out.update(study_info)
                yield out
def parse_isatab_assays(isatab_dir):
    """
    Map every sample name found in an ISA-Tab folder to its assay file.

    All assays of all studies are visited. When several assay nodes share a
    sample name, the one visited last wins.

    :param isatab_dir: :py:func:`str` containing the path to isatab data folder

    :return: :py:func:`dict` of files to be processed by pergola, keyed by
        sample name

    :raises ValueError: if ``isatab_dir`` is not an existing directory

    TODO: This function needs that the assays to be processed are tagged some way
    """
    if not isdir(isatab_dir):
        raise ValueError ("Argument input must be a folder containning data in isatab format")

    record = isatab.parse(isatab_dir)
    files_by_sample = {}

    # Sample names are the key shared by both study and assay
    for study in record.studies:
        for assay in study.assays:
            for node_name, node in assay.nodes.items():
                sample_name = node.metadata['Sample Name'][0]
                files_by_sample[sample_name] = node_name

    return files_by_sample
def parse_isatab_assays (isatab_dir):
    """
    Read all files contained in isatab format to be processed by pergola

    :param isatab_dir: :py:func:`str` containing the path to isatab data folder

    :return: :py:func:`dict` of files to be processed by pergola, mapping
        each sample name to the key of its assay node

    :raises ValueError: if ``isatab_dir`` is not an existing directory
    """
    if not path.isdir(isatab_dir):
        raise ValueError ("Argument input must be a folder containning data in isatab format")

    rec = isatab.parse(isatab_dir)
    dict_files = {}

    # Sample names are the key shared by both study and assay
    for study in rec.studies:
        for assay in study.assays:
            for file_name, node in assay.nodes.items():
                # 'Sample Name' metadata is a list; a list is unhashable and
                # cannot be a dict key, so index on its first entry.
                dict_files[node.metadata['Sample Name'][0]] = file_name

    return dict_files
def parse_isatab_assays(isatab_dir):
    """
    Collect the assay files of an ISA-Tab folder, indexed by sample name.

    :param isatab_dir: :py:func:`str` containing the path to isatab data folder

    :return: :py:func:`dict` of files to be processed by pergola; keys are
        sample names, values are the matching assay node keys

    :raises ValueError: if ``isatab_dir`` is not an existing directory

    TODO: This function needs that the assays to be processed are tagged some way
    """
    if not isdir(isatab_dir):
        raise ValueError(
            "Argument input must be a folder containning data in isatab format"
        )

    sample_to_file = {}
    parsed = isatab.parse(isatab_dir)

    # Sample names are the key shared by both study and assay
    for study in parsed.studies:
        for assay in study.assays:
            sample_to_file.update(
                (node.metadata['Sample Name'][0], node_key)
                for node_key, node in assay.nodes.items()
            )

    return sample_to_file
def test_nextgen_parsing(self):
    """Parse ISA-Tab file representing next gen sequencing data
    """
    work_dir = os.path.join(self._dir, "BII-S-3")
    rec = isatab.parse(work_dir)
    assay = rec.studies[0].assays[0]
    assert assay.metadata['Study Assay Technology Platform'] == '454 Genome Sequencer FLX'
    sff_uri = ("ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/"
               "SRA000266/EWOEPZA01.sff")
    # Membership test instead of dict.has_key(), which Python 3 removed.
    assert sff_uri in assay.nodes
def main():
    """Parse an ISA-Tab directory and hand its investigation to the XLSX writer.

    Command-line arguments:
        1. the study directory (e.g. ``data/ER-metab-v1_latest/``)
        2. the spreadsheet file (e.g. ``ER-metab-v1_latest.xlsx``)

    Exits with a usage message when either argument is missing, instead of
    failing with a bare IndexError.
    """
    if len(sys.argv) < 3:
        prog = sys.argv[0] if sys.argv else "prog"
        sys.exit("usage: %s <isatab_directory> <spreadsheet_file>" % prog)

    study_directory = sys.argv[1]
    spreadsheet_file = sys.argv[2]

    # parse the isatab directory with isatools parser and get the record
    record = Record(isatab.parse(study_directory))
    write_isatab_to_xlsx(record.investigation, spreadsheet_file)
def test_mage(self):
    """Parse MAGE ISATab from ArrayExpress and check one sample's FASTQ URI.
    """
    expected_uri = \
        "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR030/ERR030907/ERR030907.fastq.gz"
    record = isatab.parse(os.path.join(self._dir, "mage"))
    assert len(record.studies) == 1
    sample_node = record.studies[0].nodes["ERS025105"]
    first_fastq = sample_node.metadata["FASTQ_URI"][0]
    assert first_fastq.FASTQ_URI == expected_uri
def test_nextgen_parsing(self):
    """Parse ISA-Tab file representing next gen sequencing data
    """
    work_dir = os.path.join(self._dir, "BII-S-3")
    rec = isatab.parse(work_dir)
    assay = rec.studies[0].assays[0]
    assert assay.metadata[
        'Study Assay Technology Platform'] == '454 Genome Sequencer FLX'
    # The `in` operator works on Python 2 and 3; dict.has_key() exists only
    # on Python 2.
    assert ("ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/"
            "SRA000266/EWOEPZA01.sff") in assay.nodes
def _parse_and_dump(name):
    """Parse the ISA-Tab entry ``name`` and write its text form to ``<name>_parsed``."""
    rec = isatab.parse(pathMTBLS + name)
    # The context manager closes the file even if writing fails.
    with open(pathISAPARSER + name + '_parsed', 'w') as fp:
        # Same bytes as the former ``print >> fp, rec``: str(rec) + newline.
        fp.write("%s\n" % (rec,))


def isatab_parser(id_MTBLS=None):
    """Parse MTBLS ISA-Tab entries and dump each parsed record to a file.

    :param id_MTBLS: a single entry to parse. When falsy (the default),
        every entry returned by ``get_MTBLS_ID()`` is parsed instead.
        When given, the caller should provide an existing MTBLS entry; if
        its directory is not present under ``pathMTBLS`` then
        ``get_MTBLS_metadata`` is called first.

    Output goes to ``pathISAPARSER + <entry> + '_parsed'``.
    """
    if not id_MTBLS:
        for name in get_MTBLS_ID():
            _parse_and_dump(name)
        return

    # CASE: function called with one input parameter.
    if not os.path.exists(pathMTBLS + id_MTBLS):
        get_MTBLS_metadata(id_MTBLS)
    _parse_and_dump(id_MTBLS)
def test_get_genelists(self):
    """Derived genelists and sample attributes are reachable from an assay node.
    """
    record = isatab.parse(os.path.join(self._dir, "genelist"))
    first_study = record.studies[0]
    cel_node = first_study.assays[0].nodes["KLS1nature.CEL"]
    # The assay row points back to its study sample through the sample name.
    sample_name = cel_node.metadata["Sample Name"][0]
    sample_node = first_study.nodes[sample_name]
    derived_files = cel_node.metadata["Derived Data File"]
    assert "16862118-Figure2bSRAS.txt" in derived_files
    expected_by_attr = [
        ("Organism", "Mus musculus (Mouse)"),
        ("strain", "C57BL/6"),
        ("Organism Part", "bone marrow"),
    ]
    for attr_name, expected in expected_by_attr:
        assert sample_node.metadata[attr_name] == [expected]
def parse_isatab(filename):
    """Parses either an isa-tab zip or folder.

    A zip archive is extracted to a temporary directory, which is removed
    again before returning, including when parsing fails. Every study of
    the parsed record gains two attributes:

    * ``trait_def_map``: trait definition file name mapped to the result of
      ``_parse_trait_def_file``
    * ``derived_data_map``: derived data file name mapped to the result of
      ``_parse_derived_data_file``

    Only assays whose 'Study Assay Measurement Type Term Accession Number'
    is '23' or '0000023' contribute to these maps.

    :param filename: path to an ISA-Tab folder or zip archive.
    :return: the record returned by ``isatab.parse``.
    :raises Exception: if ``filename`` does not exist.
    """
    # check if filename is a folder or a zip
    if not os.path.exists(filename):
        # Interpolate the name; passing it as a second argument left the
        # placeholder unformatted in the message.
        raise Exception('File or folder %s does not exist' % filename)

    # Function-local import, as before, so the module loads without bcbio.
    from bcbio import isatab

    extracted = not os.path.isdir(filename)
    work_dir = tempfile.mkdtemp() if extracted else filename
    try:
        if extracted:
            # unzip to temporary folder
            with zipfile.ZipFile(filename, "r") as z:
                z.extractall(work_dir)
        # parse
        rec = isatab.parse(work_dir)
        # parse trait def file and raw data file
        for s in rec.studies:
            s.trait_def_map = {}
            s.derived_data_map = {}
            for assay in s.assays:
                accession = assay.metadata[
                    'Study Assay Measurement Type Term Accession Number']
                if accession not in ('23', '0000023'):
                    continue
                for assay_data in assay.nodes.values():
                    derived_data_file = assay_data.metadata[
                        'Derived Data File'][0]
                    trait_def_file = assay_data.metadata[
                        'Parameter Value[Trait Definition File]'][
                            0].Trait_Definition_File
                    # check if we already loaded the trait def file
                    if trait_def_file not in s.trait_def_map:
                        s.trait_def_map[trait_def_file] = \
                            _parse_trait_def_file(trait_def_file, work_dir)
                    # check if we already loaded the derived data matrix
                    if derived_data_file not in s.derived_data_map:
                        s.derived_data_map[derived_data_file] = \
                            _parse_derived_data_file(derived_data_file,
                                                     work_dir)
    finally:
        # remove temporary folder, even when extraction or parsing raised
        if extracted:
            shutil.rmtree(work_dir)
    return rec
def test_get_genelists(self):
    """Find the derived genelist and sample attributes of an ISA-Tab experiment.
    """
    parsed = isatab.parse(os.path.join(self._dir, "genelist"))
    the_study = parsed.studies[0]
    array_node = the_study.assays[0].nodes["KLS1nature.CEL"]
    # Look the study sample up by the name recorded on the assay node.
    linked_sample = the_study.nodes[array_node.metadata["Sample Name"][0]]
    assert "16862118-Figure2bSRAS.txt" in array_node.metadata[
        "Derived Data File"]
    checks = (
        ("Organism", "Mus musculus (Mouse)"),
        ("strain", "C57BL/6"),
        ("Organism Part", "bone marrow"),
    )
    for field, wanted in checks:
        first_entry = linked_sample.metadata[field][0]
        assert first_entry[0] == wanted
def test_minimal_parsing(self):
    """A minimal ISA-Tab input with unfilled field values still parses.
    """
    record = isatab.parse(os.path.join(self._dir, "minimal"))
    # Nothing is filled in at the investigation level.
    assert len(record.publications) == 0
    assert len(record.metadata) == 0
    assert len(record.studies) == 1
    only_study = record.studies[0]
    assert len(only_study.design_descriptors) == 0
    sample = "C2C12 sample1 rep3"
    cel_node = only_study.assays[0].nodes["AFFY#35C.CEL"]
    assert cel_node.metadata["Sample Name"] == [sample]
    strain_values = only_study.nodes[sample].metadata["strain"]
    assert strain_values == ["C3H"]
def test_minimal_parsing(self):
    """Sparse ISA-Tab input (some field values left empty) parses cleanly.
    """
    parsed = isatab.parse(os.path.join(self._dir, "minimal"))
    assert len(parsed.publications) == 0
    assert len(parsed.metadata) == 0
    assert len(parsed.studies) == 1
    the_study = parsed.studies[0]
    assert len(the_study.design_descriptors) == 0
    sample_name = "C2C12 sample1 rep3"
    array_node = the_study.assays[0].nodes["AFFY#35C.CEL"]
    assert array_node.metadata["Sample Name"] == [sample_name]
    # First component of the first "strain" entry carries the value.
    first_strain = the_study.nodes[sample_name].metadata["strain"][0]
    assert first_strain[0] == "C3H"
def test_basic_parsing(self):
    """Check investigation, study and assay content of the BII-I-1 example.
    """
    record = isatab.parse(os.path.join(self._dir, "BII-I-1"))

    # Investigation-level content.
    assert record.metadata["Investigation Identifier"] == "BII-I-1"
    assert len(record.ontology_refs) == 6
    assert record.ontology_refs[2]["Term Source Name"] == "UO"
    assert len(record.publications) == 1
    publication_doi = record.publications[0]["Investigation Publication DOI"]
    assert publication_doi == "doi:10.1186/jbiol54"
    assert len(record.studies) == 2

    # First study: file names and assay count.
    first_study = record.studies[0]
    assert first_study.metadata["Study File Name"] == "s_BII-S-1.txt"
    assert len(first_study.assays) == 3
    first_assay_file = first_study.assays[0].metadata["Study Assay File Name"]
    assert first_assay_file == "a_metabolome.txt"

    # Second study: node-level metadata.
    second_study = record.studies[1]
    grow_node = second_study.nodes['NZ_0hrs_Grow_1']
    assert grow_node.metadata["organism"] == \
        ["Saccharomyces cerevisiae (Baker's yeast)"]
    raw_node = second_study.assays[0].nodes['E-MAXD-4-raw-data-426648783.txt']
    assert raw_node.metadata["ArrayExpress Accession"] == ["E-MAXD-4"]
def test_basic_parsing(self):
    """General parsing of the BII-I-1 example ISA directory.
    """
    parsed = isatab.parse(os.path.join(self._dir, "BII-I-1"))

    # Investigation-level content.
    assert parsed.metadata["Investigation Identifier"] == "BII-I-1"
    assert len(parsed.ontology_refs) == 6
    assert parsed.ontology_refs[2]["Term Source Name"] == "UO"
    assert len(parsed.publications) == 1
    doi = parsed.publications[0]["Investigation Publication DOI"]
    assert doi == "doi:10.1186/jbiol54"
    assert len(parsed.studies) == 2

    # First study: file names and assay count.
    study_one = parsed.studies[0]
    assert study_one.metadata["Study File Name"] == "s_BII-S-1.txt"
    assert len(study_one.assays) == 3
    assay_file_name = study_one.assays[0].metadata["Study Assay File Name"]
    assert assay_file_name == "a_metabolome.txt"

    # Second study: attribute-style and index-style access to node metadata.
    study_two = parsed.studies[1]
    organism_entry = study_two.nodes['NZ_0hrs_Grow_1'].metadata["organism"][0]
    assert organism_entry.organism == \
        "Saccharomyces cerevisiae (Baker's yeast)"
    raw_data_node = study_two.assays[0].nodes['E-MAXD-4-raw-data-426648783.txt']
    accession_entry = raw_data_node.metadata["ArrayExpress Accession"][0]
    assert accession_entry[0] == "E-MAXD-4"
def parse_files(self):
    """Parse the ISA-Tab study directory, print the record and return it.

    NOTE(review): ``study_directory`` is looked up as a non-local name, not
    as an attribute of ``self`` -- confirm that is intended.

    :return: the record returned by ``isatab.parse``.
    """
    isatab_record = isatab.parse(study_directory)
    # Call form of print: same output on Python 2 for a single argument,
    # and valid syntax on Python 3.
    print(isatab_record)
    return isatab_record
def test_repeated_header(self):
    """ISA-Tab inputs with repeated header names parse without raising.
    """
    # No assertions: the test passes as long as parsing completes.
    isatab.parse(os.path.join(self._dir, "BII-S-6"))
# easy_install biopy-isatab from bcbio import isatab rec = isatab.parse("/Users/jespinosa/software/isaTabTools/ISAcreator-1.7.7/isatab files/test2_int2GB") rec = isatab.parse("/home/kadomu/Dropbox/isaTabTools/ISAcreator-1.7.7/isatab files/test2_int2GB") rec = isatab.parse("/users/cn/jespinosa/git/pergola_cbcrg/data/isatab_ex") study = rec.studies[0] assay = rec.studies[0].assays[0] # List of all files inside an assay for node in assay.nodes: print node node=study.nodes["CRG.Group-1.Subject-1"] assay.metadata['Study Assay Technology Platform'] rec = isatab.parse("/Users/jespinosa/software/isaTabTools/ISAcreator-1.7.7/isatab files/BII-I-1") study= rec.studies[0] print study.nodes.keys() ##Number of studies len(rec.studies) print rec st = "" for i in rec.studies: print "studies are", i