def study_datafiles(isatab_dir, data_dirs, study_config):
    """Retrieve data files and associated metadata for a study.

    The ISA-Tab directory is parsed with ``isatab.parse`` and must contain
    exactly one study.  For each assay of that study the study, assay and
    sample metadata are merged, and one dictionary is yielded per entry
    listed under "Derived Data File" and "Raw Data File".

    :param isatab_dir: directory handed to ``isatab.parse``.
    :param data_dirs: forwarded to ``_get_full_path`` to resolve file names.
    :param study_config: mapping whose ``"private"`` entry holds the
        investigation/study identifiers that must not be loaded.

    :return: generator of dictionaries with the keys ``"name"`` (value
        returned by ``_get_full_path``), ``"type"`` (the file type label)
        and the merged study/sample metadata.
    """
    ftypes = ["Derived Data File", "Raw Data File"]
    private = study_config["private"]
    rec = isatab.parse(isatab_dir)
    assert len(rec.studies) == 1
    study = rec.studies[0]
    # Do not load private libraries into Galaxy: stop before yielding
    # anything instead of emptying the assay list of the parsed record.
    if rec.metadata.get("Investigation Identifier", "") in private:
        return
    study_id = study.metadata["Study Identifier"]
    if study_id in private:
        return
    # Extra output is only requested for this hard-coded study identifier.
    verbose = study_id in ["SB-S-29"]
    for assay in study.assays:
        study_info = _get_study_matadata(study, assay)
        info = _get_assay_info(assay)
        sample_info = _get_sample_metadata(study, info["Sample Name"],
                                           verbose)
        study_info.update(sample_info)
        for ftype in ftypes:
            for fname in info.get(ftype, []):
                # Guard only the path lookup, so that a ValueError raised
                # anywhere else is not misreported as a missing file.
                try:
                    full_path = _get_full_path(fname, ftype, data_dirs)
                except ValueError:
                    print("Missing file %s %s" % (fname, data_dirs))
                    continue
                out = {"name": full_path, "type": ftype}
                out.update(study_info)
                yield out
Example #2
0
def parse_isatab_assays(isatab_dir):
    """ 
    Index the assay nodes of an isatab folder by sample name, for pergola

    :param isatab_dir: :py:func:`str` containing the path to isatab data folder

    :return: :py:func:`dict` mapping the first 'Sample Name' value of every
        assay node to the key of that node (the file to be processed)

    :raises ValueError: if isatab_dir is not an existing directory

    TODO: This function needs that the assays to be processed are tagged some way

    """
    if not isdir(isatab_dir):
        raise ValueError ("Argument input must be a folder containning data in isatab format")

    record = isatab.parse(isatab_dir)
    files_by_sample = {}

    # Sample name are the key shared by both study and assay
    for study in record.studies:
        for assay in study.assays:
            for node_key, node in assay.nodes.items():
                sample_name = node.metadata['Sample Name'][0]
                files_by_sample[sample_name] = node_key

    return files_by_sample
Example #3
0
def parse_isatab_assays (isatab_dir):
    """ 
    Read all files contained in isatab format to be processed by pergola
    
    :param isatab_dir: :py:func:`str` containing the path to isatab data folder
    
    :return: :py:func:`dict` mapping the first 'Sample Name' value of every
        assay node to the key of that node (the file to be processed)

    :raises ValueError: if isatab_dir is not an existing directory
     
    """
    dict_files = {}
    
    if not path.isdir(isatab_dir):
        raise ValueError ("Argument input must be a folder containning data in isatab format")
    
    rec = isatab.parse(isatab_dir)
    
    # Sample name are the key shared by both study and assay
    for study in rec.studies:
        for assay in study.assays:
            for file_key, node in assay.nodes.items():
                # 'Sample Name' holds a list, which cannot be used as a
                # dictionary key; index by its first entry instead.
                dict_files[node.metadata['Sample Name'][0]] = file_key
    return dict_files
Example #4
0
def parse_isatab_assays(isatab_dir):
    """ 
    Build the sample-name to assay-file lookup used by pergola

    :param isatab_dir: :py:func:`str` containing the path to isatab data folder

    :return: :py:func:`dict` whose keys are the first 'Sample Name' value of
        each assay node and whose values are the keys of those nodes

    :raises ValueError: if isatab_dir is not an existing directory

    TODO: This function needs that the assays to be processed are tagged some way

    """
    if not isdir(isatab_dir):
        raise ValueError(
            "Argument input must be a folder containning data in isatab format"
        )

    parsed = isatab.parse(isatab_dir)
    lookup = dict()

    # Sample name are the key shared by both study and assay
    for study in parsed.studies:
        for assay in study.assays:
            lookup.update(
                (node.metadata['Sample Name'][0], name)
                for name, node in assay.nodes.items()
            )

    return lookup
Example #5
0
 def test_nextgen_parsing(self):
     """Parse ISA-Tab file representing next gen sequencing data

     Checks the technology platform recorded for the first assay of the
     first study, and that the expected SFF file is among its nodes.
     """
     work_dir = os.path.join(self._dir, "BII-S-3")
     rec = isatab.parse(work_dir)
     assay = rec.studies[0].assays[0]
     assert assay.metadata['Study Assay Technology Platform'] == '454 Genome Sequencer FLX'
     # Membership test instead of has_key(), which only exists in Python 2.
     assert ("ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/"
             "SRA000266/EWOEPZA01.sff") in assay.nodes
def main():
	"""Convert an ISA-Tab directory into a spreadsheet.

	The first command-line argument is the ISA-Tab study directory
	(e.g. data/ER-metab-v1_latest/), the second one is the spreadsheet
	file to write (e.g. ER-metab-v1_latest.xlsx).
	"""
	isatab_dir = str(sys.argv[1])
	xlsx_path = str(sys.argv[2])

	# Parse the directory, wrap the result in a Record and export its
	# investigation.
	parsed = isatab.parse(isatab_dir)
	write_isatab_to_xlsx(Record(parsed).investigation, xlsx_path)
Example #7
0
 def test_mage(self):
     """Parse MAGE ISATab from ArrayExpress.

     Expects exactly one study whose "ERS025105" node exposes the FASTQ
     URI below as its first FASTQ_URI entry.
     """
     expected_uri = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR030/ERR030907/ERR030907.fastq.gz"
     rec = isatab.parse(os.path.join(self._dir, "mage"))
     assert len(rec.studies) == 1
     fastq_entries = rec.studies[0].nodes["ERS025105"].metadata["FASTQ_URI"]
     assert fastq_entries[0].FASTQ_URI == expected_uri
Example #8
0
 def test_mage(self):
     """Parse MAGE ISATab from ArrayExpress.

     The record must hold a single study, and the first FASTQ_URI entry
     of its "ERS025105" node must point at the expected FTP location.
     """
     mage_dir = os.path.join(self._dir, "mage")
     parsed = isatab.parse(mage_dir)
     assert len(parsed.studies) == 1
     sample_node = parsed.studies[0].nodes["ERS025105"]
     first_entry = sample_node.metadata["FASTQ_URI"][0]
     assert first_entry.FASTQ_URI == (
         "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR030/ERR030907/ERR030907.fastq.gz")
Example #9
0
 def test_nextgen_parsing(self):
     """Parse ISA-Tab file representing next gen sequencing data

     Checks the technology platform recorded for the first assay of the
     first study, and that the expected SFF file is among its nodes.
     """
     work_dir = os.path.join(self._dir, "BII-S-3")
     rec = isatab.parse(work_dir)
     assay = rec.studies[0].assays[0]
     assert assay.metadata[
         'Study Assay Technology Platform'] == '454 Genome Sequencer FLX'
     # Membership test instead of has_key(), which only exists in Python 2.
     sff_uri = ("ftp://ftp.ncbi.nih.gov/pub/TraceDB/ShortRead/"
                "SRA000266/EWOEPZA01.sff")
     assert sff_uri in assay.nodes
Example #10
0
def isatab_parser(id_MTBLS=None):
    """Parse MTBLS ISA-Tab entries and dump each parsed record to a file.

    For every processed entry ``name`` the directory ``pathMTBLS + name`` is
    parsed with ``isatab.parse`` and the string form of the record, followed
    by a newline, is written to ``pathISAPARSER + name + '_parsed'``.

    :param id_MTBLS: identifier of a single entry to process.  When it is
        omitted (or otherwise falsy) every name returned by
        ``get_MTBLS_ID()`` is processed instead.  The caller is expected to
        provide a valid, existing MTBLS entry; if its directory is not
        present yet, ``get_MTBLS_metadata(id_MTBLS)`` is called before
        parsing (presumably to fetch it -- confirm against that helper).
    """
    if not id_MTBLS:
        entries = get_MTBLS_ID()
    else:
        if not os.path.exists(pathMTBLS + id_MTBLS):
            get_MTBLS_metadata(id_MTBLS)
        entries = [id_MTBLS]

    for name in entries:
        rec = isatab.parse(pathMTBLS + name)
        # The context manager closes the file even when writing fails.
        with open(pathISAPARSER + name + '_parsed', 'w') as fp:
            fp.write(str(rec) + "\n")
Example #11
0
 def test_get_genelists(self):
     """Identify derived genelists available in ISA-Tab experiment

     Follows the "Sample Name" of an assay node back to the study node and
     checks the derived data file and the sample attributes found there.
     """
     rec = isatab.parse(os.path.join(self._dir, "genelist"))
     study = rec.studies[0]
     assay_node = study.assays[0].nodes["KLS1nature.CEL"]
     sample_name = assay_node.metadata["Sample Name"][0]
     study_node = study.nodes[sample_name]
     assert "16862118-Figure2bSRAS.txt" in assay_node.metadata["Derived Data File"]
     expected_attrs = [("Organism", "Mus musculus (Mouse)"),
                       ("strain", "C57BL/6"),
                       ("Organism Part", "bone marrow")]
     for attr, expect in expected_attrs:
         assert study_node.metadata[attr] == [expect]
Example #12
0
def parse_isatab(filename):
    """Parses either an isa-tab zip or folder

    A zip archive is extracted into a temporary folder, which is removed
    again before returning, also when extraction or parsing fails.  Every
    study of the parsed record receives two extra attributes:

    * ``trait_def_map``: trait definition file name -> result of
      ``_parse_trait_def_file`` for that file
    * ``derived_data_map``: derived data file name -> result of
      ``_parse_derived_data_file`` for that file

    Only assays whose 'Study Assay Measurement Type Term Accession Number'
    is '23' or '0000023' contribute to these maps.

    :param filename: path of an isa-tab folder or of a zip archive
    :return: the record returned by ``isatab.parse``, with the extra maps
    :raises Exception: if ``filename`` does not exist
    """
    # check if filename is a folder or a zip
    if not os.path.exists(filename):
        raise Exception('File or folder %s does not exist' % (filename,))

    # Imported after the cheap argument check, so it is only loaded when
    # there is something to parse.
    from bcbio import isatab

    is_archive = not os.path.isdir(filename)
    work_dir = tempfile.mkdtemp() if is_archive else filename
    try:
        if is_archive:
            # unzip to temporary folder
            with zipfile.ZipFile(filename, "r") as z:
                z.extractall(work_dir)

        # parse
        rec = isatab.parse(work_dir)
        # parse trait def file and raw data file
        for s in rec.studies:
            s.trait_def_map = {}
            s.derived_data_map = {}
            for assay in s.assays:
                if (assay.metadata[
                        'Study Assay Measurement Type Term Accession Number']
                        not in ('23', '0000023')):
                    continue
                for assay_data in assay.nodes.values():
                    derived_data_file = assay_data.metadata[
                        'Derived Data File'][0]
                    trait_def_file = assay_data.metadata[
                        'Parameter Value[Trait Definition File]'][
                            0].Trait_Definition_File

                    # check if we already loaded the trait def file
                    if trait_def_file not in s.trait_def_map:
                        s.trait_def_map[trait_def_file] = \
                            _parse_trait_def_file(trait_def_file, work_dir)

                    # check if we already loaded the derived data matrix
                    if derived_data_file not in s.derived_data_map:
                        s.derived_data_map[derived_data_file] = \
                            _parse_derived_data_file(derived_data_file,
                                                     work_dir)
    finally:
        # remove temporary folder
        if is_archive:
            shutil.rmtree(work_dir)
    return rec
Example #13
0
 def test_get_genelists(self):
     """Identify derived genelists available in ISA-Tab experiment

     Follows the "Sample Name" of an assay node back to the study node and
     checks the derived data file and the sample attributes found there.
     """
     parsed = isatab.parse(os.path.join(self._dir, "genelist"))
     study = parsed.studies[0]
     cel_node = study.assays[0].nodes["KLS1nature.CEL"]
     sample_node = study.nodes[cel_node.metadata["Sample Name"][0]]
     derived_files = cel_node.metadata["Derived Data File"]
     assert "16862118-Figure2bSRAS.txt" in derived_files
     expected_attrs = (
         ("Organism", "Mus musculus (Mouse)"),
         ("strain", "C57BL/6"),
         ("Organism Part", "bone marrow"),
     )
     for attr, expect in expected_attrs:
         assert sample_node.metadata[attr][0][0] == expect
Example #14
0
    def test_minimal_parsing(self):
        """Parse a minimal ISA-Tab file without some field values filled in.

        The record must come back without publications, metadata or design
        descriptors, while sample information stays reachable via the nodes.
        """
        rec = isatab.parse(os.path.join(self._dir, "minimal"))
        assert len(rec.publications) == 0
        assert len(rec.metadata) == 0

        assert len(rec.studies) == 1
        study = rec.studies[0]
        assert len(study.design_descriptors) == 0

        sample_name = "C2C12 sample1 rep3"
        cel_node = study.assays[0].nodes["AFFY#35C.CEL"]
        assert cel_node.metadata["Sample Name"] == [sample_name]
        sample_node = study.nodes[sample_name]
        assert sample_node.metadata["strain"] == ["C3H"]
Example #15
0
    def test_minimal_parsing(self):
        """Parse a minimal ISA-Tab file without some field values filled in.

        The record must come back without publications, metadata or design
        descriptors, while sample information stays reachable via the nodes.
        """
        parsed = isatab.parse(os.path.join(self._dir, "minimal"))
        assert len(parsed.publications) == 0
        assert len(parsed.metadata) == 0

        assert len(parsed.studies) == 1
        only_study = parsed.studies[0]
        assert len(only_study.design_descriptors) == 0

        sample_name = "C2C12 sample1 rep3"
        cel_metadata = only_study.assays[0].nodes["AFFY#35C.CEL"].metadata
        assert cel_metadata["Sample Name"] == [sample_name]
        strain_values = only_study.nodes[sample_name].metadata["strain"]
        assert strain_values[0][0] == "C3H"
Example #16
0
    def test_basic_parsing(self):
        """Test general parsing of an example ISA directory.

        Covers investigation metadata, ontology references, publications
        and a few study/assay level values of the BII-I-1 example.
        """
        rec = isatab.parse(os.path.join(self._dir, "BII-I-1"))
        assert rec.metadata["Investigation Identifier"] == "BII-I-1"
        assert len(rec.ontology_refs) == 6
        assert rec.ontology_refs[2]["Term Source Name"] == "UO"
        assert len(rec.publications) == 1
        publication = rec.publications[0]
        assert publication["Investigation Publication DOI"] == "doi:10.1186/jbiol54"

        assert len(rec.studies) == 2
        first_study = rec.studies[0]
        assert first_study.metadata["Study File Name"] == "s_BII-S-1.txt"
        assert len(first_study.assays) == 3
        first_assay = first_study.assays[0]
        assert first_assay.metadata["Study Assay File Name"] == "a_metabolome.txt"

        second_study = rec.studies[1]
        sample_node = second_study.nodes['NZ_0hrs_Grow_1']
        assert sample_node.metadata["organism"] == (
            ["Saccharomyces cerevisiae (Baker's yeast)"])
        raw_data_node = second_study.assays[0].nodes[
            'E-MAXD-4-raw-data-426648783.txt']
        assert raw_data_node.metadata["ArrayExpress Accession"] == ["E-MAXD-4"]
Example #17
0
    def test_basic_parsing(self):
        """Test general parsing of an example ISA directory.

        Covers investigation metadata, ontology references, publications
        and a few study/assay level values of the BII-I-1 example.
        """
        parsed = isatab.parse(os.path.join(self._dir, "BII-I-1"))
        assert parsed.metadata["Investigation Identifier"] == "BII-I-1"
        assert len(parsed.ontology_refs) == 6
        assert parsed.ontology_refs[2]["Term Source Name"] == "UO"
        assert len(parsed.publications) == 1
        doi = parsed.publications[0]["Investigation Publication DOI"]
        assert doi == "doi:10.1186/jbiol54"

        assert len(parsed.studies) == 2
        first_study = parsed.studies[0]
        assert first_study.metadata["Study File Name"] == "s_BII-S-1.txt"
        assert len(first_study.assays) == 3
        assay_file = first_study.assays[0].metadata["Study Assay File Name"]
        assert assay_file == "a_metabolome.txt"

        second_study = parsed.studies[1]
        organism_entries = second_study.nodes['NZ_0hrs_Grow_1'].metadata[
            "organism"]
        assert organism_entries[0].organism == (
            "Saccharomyces cerevisiae (Baker's yeast)")
        raw_data_node = second_study.assays[0].nodes[
            'E-MAXD-4-raw-data-426648783.txt']
        accession_entries = raw_data_node.metadata["ArrayExpress Accession"]
        assert accession_entries[0][0] == "E-MAXD-4"
Example #18
0
def parse_files(self):
	"""Parse the ISA-Tab study directory, print the record and return it.

	NOTE(review): ``study_directory`` is neither a parameter nor a local,
	so it is looked up as a global when the function runs -- confirm that
	``self.study_directory`` was not intended.
	"""
	parsed_record = isatab.parse(study_directory)
	print(parsed_record)
	return parsed_record
Example #19
0
 def test_repeated_header(self):
     """Handle ISA-Tab inputs with repeated header names.

     There is nothing to assert on: parsing must simply not raise.
     """
     isatab.parse(os.path.join(self._dir, "BII-S-6"))
Example #20
0
# easy_install biopy-isatab

from bcbio import isatab

# Exploratory walk through the objects returned by isatab.parse().
#
# The same name ``rec`` is bound three times in a row to records parsed from
# machine-specific absolute paths; only the result of the last call is used
# by the statements that follow.
rec = isatab.parse("/Users/jespinosa/software/isaTabTools/ISAcreator-1.7.7/isatab files/test2_int2GB")
rec = isatab.parse("/home/kadomu/Dropbox/isaTabTools/ISAcreator-1.7.7/isatab files/test2_int2GB")
rec = isatab.parse("/users/cn/jespinosa/git/pergola_cbcrg/data/isatab_ex")
# First study of the record and the first assay of that study.
study = rec.studies[0]
assay = rec.studies[0].assays[0]

# List of all files inside an assay (iterating the nodes prints their keys)
for node in assay.nodes: print node


# Look up a single study node by its name; it is not used further below.
node=study.nodes["CRG.Group-1.Subject-1"]



# Bare expression: its value is discarded when this runs as a script.
assay.metadata['Study Assay Technology Platform'] 

# Switch to the BII-I-1 example and list the node names of its first study.
rec = isatab.parse("/Users/jespinosa/software/isaTabTools/ISAcreator-1.7.7/isatab files/BII-I-1")
study= rec.studies[0]
print study.nodes.keys()

# Number of studies (bare expression, value discarded), then the full record
len(rec.studies)
print rec

# ``st`` is assigned here but not used by the loop below.
st = ""
# Print every study of the record.
for i in rec.studies: 
    print "studies are", i
Example #21
0
 def test_repeated_header(self):
     """Handle ISA-Tab inputs with repeated header names.

     The test passes as long as parsing the directory does not raise.
     """
     repeated_header_dir = os.path.join(self._dir, "BII-S-6")
     isatab.parse(repeated_header_dir)