Example #1
def convert(source_inv_fp, target_fp):
    """ Converter for ISA-Tab to SampleTab.
    :param source_inv_fp: File descriptor of input investigation file
    :param target_fp: File descriptor to write output SampleTab to (must be writeable)
    """
    ISA = isatab.load(source_inv_fp)
    sampletab.dump(ISA, target_fp)
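
A minimal usage sketch for this converter, assuming placeholder paths; both arguments are open file handles:

    # hypothetical input/output paths
    with open('/path/to/i_investigation.txt') as source_inv_fp, \
            open('/path/to/sampletab.txt', 'w') as target_fp:
        convert(source_inv_fp, target_fp)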
Example #2
def convert(work_dir,
            identifier_type=IdentifierType.name,
            validate_first=True,
            use_new_parser=False):
    i_files = glob.glob(os.path.join(work_dir, 'i_*.txt'))
    if validate_first:
        logger.info("Validating input ISA tab before conversion")
        if len(i_files) != 1:
            logger.fatal(
                "Could not resolve the input investigation file; please check the input ISA-Tab directory."
            )
            return
        with open(i_files[0], 'r', encoding='utf-8') as validate_fp:
            report = isatab.validate(fp=validate_fp, log_level=logging.ERROR)
            if len(report['errors']) > 0:
                logger.fatal(
                    "Could not proceed with conversion as there are some fatal validation errors. Check log."
                )
                return
    if use_new_parser:
        logger.info("Using new parser to load...")
        with open(i_files[0], 'r', encoding='utf-8') as fp:
            ISA = isatab.load(fp)
            from isatools.isajson import ISAJSONEncoder
            logger.info("Using new ISA JSON encoder to dump...")
            return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
    else:
        converter = ISATab2ISAjson_v1(identifier_type)
        logger.info("Converting ISA-Tab to ISA JSON...")
        return converter.convert(work_dir)
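
A usage sketch, assuming a placeholder directory holding exactly one i_*.txt investigation file; with use_new_parser=True the function returns the ISA content as a plain dict:

    isa_json = convert('/path/to/isatab-dir', use_new_parser=True)
    if isa_json is not None:
        print(type(isa_json))  # plain dict parsed from the ISA JSON dump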
Example #3
    def get_isa_study(self, study_id, api_key, skip_load_tables=True, study_location=None):
        """
        Get an ISA-API Investigation object reading directly from the ISA-Tab files
        :param study_id: MTBLS study identifier
        :param api_key: User API key for accession check
        :param skip_load_tables: speed up reading by skipping loading of assay and sample tables
        :param study_location: filesystem location of the study
        :return: a tuple consisting of the ISA-Study obj, ISA-Investigation obj
                and path to the Study in the file system
        """

        if study_location is None:
            logger.info("Study location is not set, will have load study from filesystem")
            std_path = self.wsc.get_study_location(study_id, api_key)
        else:
            logger.info("Study location is: " + study_location)
            std_path = study_location

        try:
            i_filename = glob.glob(os.path.join(std_path, "i_*.txt"))[0]
            fp = open(i_filename)
            # loading tables also load Samples and Assays
            isa_inv = load(fp, skip_load_tables)
            isa_study = isa_inv.studies[0]
        except Exception as e:
            logger.exception("Failed to find Investigation file for %s at %s",
                             study_id, std_path)
            logger.error(str(e))
            abort(400)
        else:
            return isa_study, isa_inv, std_path
Example #4
def load(mtbls_study_id):
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise IOError(
            "There was a problem retrieving the study {}".format(mtbls_study_id))
    with open(glob.glob(os.path.join(tmp_dir, 'i_*.txt'))[0], encoding='utf-8') as f:
        ISA = isatab.load(f)
        shutil.rmtree(tmp_dir)
        return ISA
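
A usage sketch; 'MTBLS1' stands in for any MetaboLights accession the get() helper above can retrieve:

    ISA = load('MTBLS1')  # hypothetical accession
    print(ISA.studies[0].title)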
Example #5
def create_isatab_archive(inv_fp, target_filename=None, 
                          filter_by_measurement=None):
    """Function to create an ISArchive; option to select by assay measurement type

    Example usage:

        >>> create_isatab_archive(open('/path/to/i_investigation.txt'), target_filename='isatab.zip')
        >>> create_isatab_archive(open('/path/to/i.txt'), filter_by_measurement='transcription profiling')
    """
    if target_filename is None:
        target_filename = os.path.join(
            os.path.dirname(inv_fp.name), 'isatab.zip')
    ISA = isatab.load(inv_fp)
    
    all_files_in_isatab = []
    found_files = []
    
    for s in ISA.studies:
        if filter_by_measurement is not None:
            log.debug('Selecting %s', filter_by_measurement)
            selected_assays = [a for a in s.assays if 
                               a.measurement_type.term == filter_by_measurement]
        else:
            selected_assays = s.assays
            
        for a in selected_assays:
            all_files_in_isatab += [d.filename for d in a.data_files]
    dirname = os.path.dirname(inv_fp.name)
    
    for fname in all_files_in_isatab:
        if os.path.isfile(os.path.join(dirname, fname)):
            found_files.append(fname)
    missing_files = [f for f in all_files_in_isatab if f not in found_files]
    
    if len(missing_files) == 0:
        log.debug('Do zip')
        with ZipFile(target_filename, mode='w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zip_file.write(inv_fp.name, arcname=os.path.basename(inv_fp.name))
            
            for s in ISA.studies:
                zip_file.write(
                    os.path.join(dirname, s.filename), arcname=s.filename)
                # re-select assays per study instead of reusing the stale loop variable
                if filter_by_measurement is not None:
                    selected_assays = [a for a in s.assays if
                                       a.measurement_type.term == filter_by_measurement]
                else:
                    selected_assays = s.assays
                for a in selected_assays:
                    zip_file.write(
                        os.path.join(dirname, a.filename), arcname=a.filename)
                    
            for file in all_files_in_isatab:
                zip_file.write(os.path.join(dirname, file), arcname=file)
                
            log.debug(zip_file.namelist())
            return zip_file.namelist()
        
    else:
        log.debug('Not zipping')
        log.debug('Missing: %s', missing_files)
        return None
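
A usage sketch, assuming a placeholder path; all data files referenced by the assays must sit next to the ISA-Tab files for the archive to be created:

    with open('/path/to/i_investigation.txt') as inv_fp:
        zipped = create_isatab_archive(inv_fp, target_filename='isatab.zip')
    if zipped is None:
        print('archive not created; some data files are missing')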
Example #6
def convert(source_inv_fp, target_fp):
    """ Converter for ISA-Tab to SampleTab.
    :param source_inv_fp: File descriptor of input investigation file
    :param target_fp: File descriptor to write output SampleTab to (must be writeable)
    """
    log.info("loading isatab %s", source_inv_fp.name)
    ISA = isatab.load(source_inv_fp)
    log.info("dumping sampletab %s", target_fp.name)
    sampletab.dump(ISA, target_fp)
Example #7
def convert(source_inv_fp, output_path):
    """ Converter for ISA-Tab to MAGE-TAB.
    :param source_inv_fp: File descriptor of input investigation file
    :param output_path: Path to directory to write output MAGE-TAB files to
    """
    log.info("loading isatab %s", source_inv_fp.name)
    ISA = isatab.load(source_inv_fp)
    log.info("dumping magetab %s", output_path)
    magetab.dump(ISA, output_path)
Example #8
def convert(idf_file_path):
    tmp = tempfile.mkdtemp()
    try:
        magetab2isatab.convert(idf_file_path, output_path=tmp)
        with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
            ISA = isatab.load(isa_inv_fp)
        # return outside the finally block so exceptions are not silently swallowed
        return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
    finally:
        shutil.rmtree(tmp)
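
A usage sketch with a placeholder IDF path; the return value is the ISA content parsed back into a plain dict:

    isa_dict = convert('/path/to/experiment.idf.txt')  # hypothetical IDF file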
Example #9
    def get_isa_study(self,
                      study_id,
                      api_key,
                      skip_load_tables=True,
                      study_location=None,
                      failing_gracefully=False):
        """
        Get an ISA-API Investigation object reading directly from the ISA-Tab files
        :param study_id: MTBLS study identifier
        :param api_key: User API key for accession check
        :param skip_load_tables: speed up reading by skipping loading of assay and sample tables
        :param study_location: filesystem location of the study
        :return: a tuple consisting of the ISA-Study obj, ISA-Investigation obj
                and path to the Study in the file system
        """

        if skip_load_tables == 'false':
            skip_load_tables = False

        if study_location is None:
            logger.info(
                "Study location is not set, will have to load study from filesystem"
            )
            std_path = self.wsc.get_study_location(study_id, api_key)
        else:
            logger.info("Study location is: " + study_location)
            std_path = study_location

        try:
            i_filename = glob.glob(os.path.join(std_path, "i_*.txt"))[0]
            fp = open(i_filename, encoding='utf-8', errors='ignore')
            # loading tables also load Samples and Assays
            isa_inv = load(fp, skip_load_tables)
            # ToDo. Add MAF to isa_study
            isa_study = isa_inv.studies[0]
        except Exception as e:
            logger.exception("Failed to find Investigation file for %s at %s",
                             study_id, std_path)
            logger.error(str(e))
            if failing_gracefully:
                return None, None, None
            else:
                abort(417)
        else:
            return isa_study, isa_inv, std_path
Example #10
 def test_unused_protocol_fixer(self):
     i_table_path = os.path.join(self._tmp_dir, 'BII-S-3', 'i_gilbert.txt')
     fixer = utils.IsaTabFixer(i_table_path)
     fixer.remove_unused_protocols()
     with open('{}.fix'.format(i_table_path)) as fixed_i_fp:
         investigation = isatab.load(fixed_i_fp)
         study = investigation.studies[-1]
         unused_protocol1 = \
             study.get_prot('reverse transcription - standard procedure 5')
         unused_protocol2 = \
             study.get_prot('sequence analysis - standard procedure 7')
         self.assertIsNone(unused_protocol1)
         self.assertIsNone(unused_protocol2)
Example #11
def convert(source_idf_fp, technology_type=None, measurement_type=None):
    tmp = tempfile.mkdtemp()
    try:
        magetab2isatab.convert(source_idf_fp=source_idf_fp,
                               output_path=tmp,
                               technology_type=technology_type,
                               measurement_type=measurement_type)
        with open(os.path.join(tmp, "i_investigation.txt")) as isa_inv_fp:
            ISA = isatab.load(isa_inv_fp)
        # return outside the finally block so exceptions are not silently swallowed
        return json.loads(json.dumps(ISA, cls=ISAJSONEncoder))
    finally:
        shutil.rmtree(tmp)
Example #12
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from data/raw into
        cleaned data ready to be measured (saved in data/interim).

        Cleaning: We iterate through our raw data MTBLS metadata and run
        ISA-Tab load on each to check for loading errors. If the study is not
        loadable, we exclude the study metadata from this analysis.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    if len(glob(join(output_filepath, 'MTBLS*'))) > 0:
        logger.info('Output directory {} already contains MTBLS studies. '
                    'Skipping writing to data/interim. If this is not '
                    'expected, do you need to "make clean" first?'.format(
                        output_filepath))
        exit(0)
    for study_dir in tqdm(glob(join(input_filepath, 'MTBLS*'))):
        study_id = basename(study_dir)
        try:
            load(study_dir)
            copytree(study_dir, '{}/{}'.format(output_filepath, study_id))
        except Exception:
            logger.info('Excluding {}'.format(study_dir))
Example #13
    def remove_unused_protocols(self):
        """Removes usused protocols

        :return: None
        """
        investigation = isatab.load(os.path.dirname(self.path))
        for study in investigation.studies:
            unused_protocol_names = set(x.name for x in study.protocols)
            for process in study.process_sequence:
                try:
                    unused_protocol_names.remove(
                        process.executes_protocol.name)
                except KeyError:
                    pass
            for assay in study.assays:
                for process in assay.process_sequence:
                    try:
                        unused_protocol_names.remove(
                            process.executes_protocol.name)
                    except KeyError:
                        pass
            print('Unused protocols: {}'.format(unused_protocol_names))
            print('Location of unused protocols: {}'.format(
                [pr.name in unused_protocol_names for pr in study.protocols]))
            # remove these protocols from study.protocols
            clean_protocols = [
                pr for pr in study.protocols
                if pr.name not in unused_protocol_names
            ]
            print('Clean protocol list: {}'.format(
                [pr.name for pr in clean_protocols]))
            study.protocols = clean_protocols
            print('Clean study.protocols: {}'.format(
                [pr.name for pr in study.protocols]))
        isatab.dump(investigation,
                    output_path=os.path.dirname(self.path),
                    i_file_name='{filename}.fix'.format(
                        filename=os.path.basename(self.path)),
                    skip_dump_tables=True)
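
A usage sketch mirroring the test in Example #10; the path is a placeholder, and the cleaned investigation file is written alongside with a .fix suffix:

    fixer = IsaTabFixer('/path/to/i_investigation.txt')  # hypothetical path
    fixer.remove_unused_protocols()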
Example #14
def main(input_filepath, output_filepath, max_files=-1):
    """ Runs data processing scripts to run measureents on the metadata found
        in data/interim to extract the loaded object sizes of the ISA objects,
        Pandas DataFrame objects, and the raw size of the files as reported on
        disk.

        We iterate through our processed data MTBLS metadata and run ISA-Tab
        load on each, and then extract the approximate size of the DAG portion
        of the metadta. This DAG metadata is analagous to each line of the
        table files that describes one path in the DAGs.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from processed data')
    if max_files > 0:
        logger.info('limiting to {} study folders'.format(max_files))
    if exists(output_filepath):
        if getsize(output_filepath) > 0:
            logger.info('Output file {} already contains data. '
                        'Skipping writing to data/processed. If this is not '
                        'expected, do you need to "make clean" first?'.format(
                            output_filepath))
            exit(0)
    with open(output_filepath, 'w') as output_file:
        output_file.write('studyid,fname,disk_size,df_size,isa_size\n')
        logger.info('studyid, fname, disk_size, df_size, isa_size')
        for study_dir in tqdm(
                glob(join(input_filepath, 'MTBLS*'))[:max_files]):
            try:
                isa = load(study_dir)
                for s in isa.studies:
                    fname = s.filename
                    df = pd.read_csv(join(study_dir, fname), sep='\t')
                    df_size = total_size(df, verbose=False)
                    disk_size = getsize(join(study_dir, fname))
                    isa_size = total_size(s.process_sequence, verbose=False)
                    output_file.write('"{}","{}",{},{},{}\n'.format(
                        s.identifier, fname, disk_size, df_size, isa_size))
                    for a in s.assays:
                        fname = a.filename
                        df = pd.read_csv(join(study_dir, fname), sep='\t')
                        df_size = total_size(df, verbose=False)
                        disk_size = getsize(join(study_dir, fname))
                        isa_size = total_size(a.process_sequence,
                                              verbose=False)
                        output_file.write('"{}","{}",{},{},{}\n'.format(
                            s.identifier, fname, disk_size, df_size, isa_size))
                output_file.flush()
            except KeyboardInterrupt:
                exit(1)
Example #15
def detect_isatab_process_pooling(fp):
    from isatools import isatab
    report = []
    ISA = isatab.load(fp)
    for study in ISA.studies:
        print("Checking {}".format(study.filename))
        pooling_list = detect_graph_process_pooling(study.graph)
        if len(pooling_list) > 0:
            report.append({study.filename: pooling_list})
        for assay in study.assays:
            print("Checking {}".format(assay.filename))
            pooling_list = detect_graph_process_pooling(assay.graph)
            if len(pooling_list) > 0:
                report.append({assay.filename: pooling_list})
    return report
Example #16
def detect_isatab_process_pooling(fp):
    report = []

    ISA = isatab.load(fp)

    for study in ISA.studies:
        log.info('Checking {}'.format(study.filename))
        pooling_list = detect_graph_process_pooling(study.graph)

        if len(pooling_list) > 0:
            report.append({study.filename: pooling_list})

        for assay in study.assays:
            log.info('Checking {}'.format(assay.filename))
            pooling_list = detect_graph_process_pooling(assay.graph)

            if len(pooling_list) > 0:
                report.append({assay.filename: pooling_list})
    return report
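
A usage sketch with a placeholder path; each report entry maps a study or assay table filename to its pooled processes:

    with open('/path/to/i_investigation.txt') as fp:
        report = detect_isatab_process_pooling(fp)
    for entry in report:
        print(entry)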
Example #17
def load(FP):  # loads IDF file
    # first cast to IDF
    inv_fp = cast_idf_to_inv(FP)
    df = pd.read_csv(inv_fp, names=range(0, 128), sep='\t',
                     engine='python').dropna(axis=1, how='all')
    df = df.T  # transpose
    df.reset_index(inplace=True)  # Reset index so it is accessible as column
    df.columns = df.iloc[
        0]  # If all was OK, promote this row to the column headers
    # second set output s_ and a_ files
    sdrf_file = df["Comment[SDRF File]"].iloc[1]
    study_df, assay_df = split_tables(
        sdrf_path=os.path.join(os.path.dirname(FP.name), sdrf_file))
    study_df.columns = study_df.isatab_header
    assay_df.columns = assay_df.isatab_header
    # write out ISA files
    tmp = "/Users/dj/PycharmProjects/isa-api/tests/data/tmp"
    inv_fp.seek(0)
    # print("Writing i_investigation.txt to {}".format(tmp))
    print("Writing s_{0} to {1}".format(os.path.basename(sdrf_file), tmp))
    with open(os.path.join(tmp, "s_" + os.path.basename(sdrf_file)),
              "w") as s_fp:
        study_df.to_csv(
            path_or_buf=s_fp,
            mode='a',
            sep='\t',
            encoding='utf-8',
            index=False,
        )
    print("Writing a_{0} to {1}".format(tmp, os.path.basename(sdrf_file)))
    with open(os.path.join(tmp, "a_" + os.path.basename(sdrf_file)),
              "w") as a_fp:
        assay_df.to_csv(
            path_or_buf=a_fp,
            mode='a',
            sep='\t',
            encoding='utf-8',
            index=False,
        )
    # load from the in-memory cast investigation buffer (rewound above)
    ISA = isatab.load(inv_fp)
    return ISA
Example #18
def getISAAssay(assayNum, studyNum, pathToISATABFile):
    """
    This function returns an Assay object given the assay and study numbers in an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the assay and study numbers you are interested in!
    :param assayNum: The Assay number (note this is not a zero-based index).
    :type assayNum: int
    :param studyNum: The Study number (note this is not a zero-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: str
    :raise FileNotFoundError: If pathToISATABFile does not contain file 'i_Investigation.txt'.
    """
    from isatools import isatab
    import copy
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        std = isa.studies[studyNum - 1]
        return copy.deepcopy(std.assays[assayNum - 1])
    except FileNotFoundError as err:
        raise err
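
A usage sketch; the indices are one-based as the docstring notes, and the directory path is a placeholder:

    assay = getISAAssay(assayNum=1, studyNum=1,
                        pathToISATABFile='/path/to/isatab-dir')
    print(assay.filename)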
Example #19
def exploreISA(pathToISATABFile, verbose=True):
    """
    This function loops through the ISATAB file and lists its Studies and their associated Assays.
    :param pathToISATABFile: The path to the ISATAB file.
    :type pathToISATABFile: str
    :param verbose: Whether (or not) to print out details of Studies and Assays (default: True)
    :type verbose: boolean
    :raise FileNotFoundError: If pathToISATABFile does not contain file 'i_Investigation.txt'.
    """
    try:
        isa_tab_record = isatab.load(pathToISATABFile, skip_load_tables=True)
        if verbose:
            print('In this ISATAB file you have:')
            for idx,st in enumerate(isa_tab_record.studies):
                print('Study: '+str(idx+1))
                print('\tStudy Identifier: '+st.identifier+', Study ID: '+st.id+', Study Filename: '+st.filename+', Study Title: '+st.title)
                print('\tThis Study has the following Assays:')
                for ix,a in enumerate(st.assays):
                    print('\tAssay: '+str(ix+1))
                    print('\t\tAssay Filename: '+a.filename+', Assay technology type: '+a.technology_type.term)
    except FileNotFoundError as err:
        raise err
Example #20
def getISAStudy(studyNum, pathToISATABFile, noAssays=True):
    """
    This function returns a Study object given the study number in an ISA file
    Typically, you should use the exploreISA function to check the contents
    of the ISA file and retrieve the study number you are interested in!
    :param studyNum: The Study number (note this is not a zero-based index).
    :type studyNum: int
    :param pathToISATABFile: The path to the ISATAB file
    :type pathToISATABFile: str
    :param noAssays: whether to remove all assays (i.e. return a copy of the study only)
    :type noAssays: boolean
    :raise FileNotFoundError: If pathToISATABFile does not contain file 'i_Investigation.txt'.
    """
    from isatools import isatab
    import copy
    try:
        isa = isatab.load(pathToISATABFile, skip_load_tables=True)
        st = copy.deepcopy(isa.studies[studyNum - 1])
        if noAssays:
            st.assays = []
        return st
    except FileNotFoundError as err:
        raise err
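
A usage sketch combining Examples #19 and #20: explore first to find the right index, then fetch the study; the directory path is a placeholder:

    exploreISA('/path/to/isatab-dir')
    study = getISAStudy(studyNum=1, pathToISATABFile='/path/to/isatab-dir')
    print(study.title)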
Example #21
def get_metabolights_metadata(studyID, study_directory, investigation_file):
    from isatools import isatab
    from os.path import join

    with open(investigation_file, encoding='utf-8') as f:
        ISA_object = isatab.load(f)

    met_vars = get_factor_names_m(study_directory)  #Metadata classes

    # Get full metadata values:
    metadata_full = get_factors_summary_m(ISA_object)
    header = "sample," + ",".join(met_vars)
    out_path = join(study_directory, 'metadata.csv')
    with open(out_path, 'w') as out:
        out.write(header)
        out.write('\n')
        for metadata_sample in metadata_full:
            line = "\"" + metadata_sample["sample"] + "\""
            for met_var in met_vars:
                line = line + "," + "\"" + metadata_sample[met_var] + "\""
            out.write(line)
            out.write('\n')
    return out_path
Example #22
def main():
    """
    The simplest check for ISA-Tab correctness is to attempt to load ISA-Tab
    files, and then count certain objects to get an idea of whether the
    parser created the correct number of each entity.
    """

    expected_values = {
        'num_study_sources': [3, 1],
        'num_study_samples': [12, 60]
    }
    study_number = 0
    for d in sorted(glob.glob("cmso*/isa")):
        print("attempting to load {}".format(d))
        i_files = glob.glob(os.path.join(d, 'i_*.txt'))
        if len(i_files) != 1:
            raise FileNotFoundError(
                "Could not find an investigation file in {}".format(d))
        with open(i_files[0]) as i_fp:
            isa_objects = isatab.load(i_fp)
            assert isa_objects is not None  # if not created, error

            # Some simple checks to sanity-check that we loaded what we expected
            num_study_sources = len(
                isa_objects.studies[-1].materials['sources'])
            num_study_samples = len(
                isa_objects.studies[-1].materials['samples'])
            print("loaded {} study sources".format(num_study_sources))
            print("loaded {} study samples".format(num_study_samples))
            assert num_study_sources == \
                expected_values["num_study_sources"][study_number]
            assert num_study_samples == \
                expected_values["num_study_samples"][study_number]
            print("{} load OK".format(d))
            study_number += 1
    """
import json
import requests

from isatools import isatab
from isatools.convert import isatab2json

directory_1 = '/Users/Philippe/Documents/Dropbox-Backup/Eurovis 2015 - Chronoglyph/ISATAB-datasets/BII-S-8_FP001RO-isatab-TEST'
inv_name_1 = 'i_fp001ro-investigation.txt'
isa_config_1 = '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-default_v2014-01-16/'

directory_2 = '/Users/Philippe/Documents/git/ISAdatasets/tab/MTBLS404/'
inv_name_2 = 'i_sacurine.txt'
# isa_config_2 = '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-seq_v2016-11-17-SRA1.5-august2014mod/'

try:
    # my_isa_read = isatab.load(open(os.path.join('/Users/Philippe/Downloads/ISAcreator-1.7.11-all/isatab files/SRA_assembly_test', 'i_investigation.txt')))
    my_isa_read = isatab.load(open(os.path.join(directory_1, inv_name_1)))
    print("reading in:", my_isa_read.studies)

    # my_json_report = isatab.validate(open(os.path.join('/Users/Philippe/Downloads/ISAcreator-1.7.11-all/isatab files/SRA_assembly_test', 'i_investigation.txt')), '/Users/Philippe/Documents/git/Configuration-Files/isaconfig-seq_v2016-11-17-SRA1.5-august2014mod/')

    # my_json_report = isatab.validate(open(os.path.join(directory_1,inv_name_1)), isa_config_1)
    # print(my_json_report)

    try:
        isa_json = isatab2json.convert(directory_2)
    except Exception as excep:
        print(excep)

except IOError as e:
    print(e)
Example #24
from isatools import isatab
from isatools.model import *
import sys
import json
import os

input_filepath = sys.argv[1]  # input path to ISA-Tab
output_filepath = sys.argv[2]  # output path to write ISA-Tab
mapping_filepath = sys.argv[3]  # path to mapping json file

ISA = isatab.load(input_filepath)

# get the first study obj only; we iterate over all of its assays below
study = ISA.studies[0]

mapping = {}
with open(mapping_filepath) as fp:
    mapping = json.load(fp)

for assay in study.assays:
    # get mass spectrometry processes only
    ms_processes = [
        x for x in assay.process_sequence
        if x.executes_protocol.protocol_type.term == 'mass spectrometry'
    ]
    # insert the new parameter values
    for k, v in mapping.items():
        with open(os.path.join('MTBLS265-no-binary', 'json_meta',
                               v + '.json')) as fp2:
            mzml_meta = json.load(fp2)
            data_trans_meta = {
Example #25
    def replace_factor_with_protocol_parameter_value(self, factor_name,
                                                     protocol_ref):
        """Fixes a factor if it's supposed to be a Parameter Value

        :param factor_name: The factor that's incorrect
        :param protocol_ref: Protocol REF for the new Parameter Value
        :return: None
        """
        table_file_df = isatab.read_tfile(self.path)

        field_names = list(table_file_df.columns)
        clean_field_names = self.clean_isatab_field_names(field_names)

        factor_index = clean_field_names.index(
            'Factor Value[{factor_name}]'.format(factor_name=factor_name))

        with open(self.path) as tfile_fp:
            next(tfile_fp)
            line1 = next(tfile_fp)
            try:
                protocol_ref_index = list(
                    map(lambda x: x[1:-1] if x[0] == '"' and x[-1] == '"' else x,
                        line1.split('\t'))).index(protocol_ref)
            except ValueError:
                # list.index raises ValueError rather than returning -1
                raise IOError(
                    'Could not find protocol ref matching {protocol_ref}'.format(
                        protocol_ref=protocol_ref))

        if factor_index < len(field_names) and \
            'Term Source REF' in field_names[factor_index + 1] and \
                'Term Accession' in field_names[factor_index + 2]:
            log.debug('Moving Factor Value[{}] with term columns'.format(
                factor_name))
            # move Factor Value and Term Source REF and Term Accession columns
            field_names.insert(protocol_ref_index + 1,
                               field_names[factor_index])
            field_names.insert(protocol_ref_index + 2,
                               field_names[factor_index + 1 + 1])
            field_names.insert(protocol_ref_index + 3,
                               field_names[factor_index + 2 + 2])
            del field_names[factor_index + 3]  # del Factor Value[{}]
            del field_names[factor_index + 1 + 2]  # del Term Source REF
            del field_names[factor_index + 2 + 1]  # del Term Accession
        elif factor_index < len(field_names) and \
            'Unit' in field_names[factor_index + 1] and \
                'Term Source REF' in field_names[factor_index + 2] and \
                'Term Accession' in field_names[factor_index + 3]:
            log.debug(
                'Moving Factor Value[{factor_name}] with unit term columns'.
                format(factor_name=factor_name))
            # move Factor Value and Unit as ontology annotation
            field_names.insert(protocol_ref_index + 1,
                               field_names[factor_index])
            field_names.insert(protocol_ref_index + 2,
                               field_names[factor_index + 1 + 1])
            field_names.insert(protocol_ref_index + 3,
                               field_names[factor_index + 2 + 2])
            field_names.insert(protocol_ref_index + 4,
                               field_names[factor_index + 3 + 3])
            del field_names[factor_index + 4]  # del Factor Value[{}]
            del field_names[factor_index + 1 + 3]  # del Unit
            del field_names[factor_index + 2 + 2]  # del Term Source REF
            del field_names[factor_index + 3 + 1]  # del Term Accession
        elif factor_index < len(field_names) and \
                'Unit' in field_names[factor_index + 1]:
            log.debug(
                'Moving Factor Value[{factor_name}] with unit column'.format(
                    factor_name=factor_name))
            # move Factor Value and Unit columns
            field_names.insert(protocol_ref_index + 1,
                               field_names[factor_index])
            field_names.insert(protocol_ref_index + 2,
                               field_names[factor_index + 1 + 1])
            del field_names[factor_index + 2]  # del Factor Value[{}]
            del field_names[factor_index + 1 + 1]  # del Unit
        else:  # move only the Factor Value column
            log.debug('Moving Factor Value[{factor_name}]'.format(
                factor_name=factor_name))
            field_names.insert(protocol_ref_index + 1,
                               field_names[factor_index])
            del field_names[factor_index]  # del Factor Value[{}]

        table_file_df.columns = self.clean_isatab_field_names(field_names)

        # Rename Factor Value column to Parameter Value column
        field_names_modified = list(table_file_df.columns)
        field_names_modified[protocol_ref_index + 1] = \
            field_names_modified[protocol_ref_index + 1].replace(
                'Factor Value', 'Parameter Value')
        table_file_df.columns = self.clean_isatab_field_names(
            field_names_modified)

        investigation = isatab.load(os.path.dirname(self.path),
                                    skip_load_tables=True)
        study = investigation.studies[-1]
        protocol = study.get_prot(protocol_ref)
        if protocol is None:
            raise ISAModelAttributeError(
                'No protocol with name {protocol_ref} was found'.format(
                    protocol_ref=protocol_ref))
        protocol.add_param(factor_name)
        factor = study.get_factor(factor_name)
        if factor is None:
            raise ISAModelAttributeError(
                'No factor with name {factor_name} was found'.format(
                    factor_name=factor_name))
        else:
            study.del_factor(name=factor_name, are_you_sure=True)

        study.filename = '{study_filename}.fix'.format(
            study_filename=study.filename)

        isatab.dump(investigation,
                    output_path=os.path.dirname(self.path),
                    i_file_name='i_Investigation.txt.fix',
                    skip_dump_tables=True)

        with open(
                os.path.join(
                    os.path.dirname(self.path), '{s_filename}.fix'.format(
                        s_filename=os.path.basename(self.path))),
                'w') as out_fp:
            table_file_df.to_csv(path_or_buf=out_fp,
                                 index=False,
                                 sep='\t',
                                 encoding='utf-8')
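
A usage sketch mirroring the batch fixer settings in Example #29; the factor and protocol names are illustrative:

    fixer = IsaTabFixer('/path/to/s_study.txt')  # hypothetical study table path
    fixer.replace_factor_with_protocol_parameter_value(
        'dose', 'environmental material collection - standard procedure 1')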
Example #26
 def generate_study_design_report(self,
                                  get_num_study_groups=True,
                                  get_factors=True,
                                  get_num_levels=True,
                                  get_levels=True,
                                  get_study_groups=True):
     """Generates a study design report
     :return: JSON report
     """
     isa = isatab.load(self.path, skip_load_tables=False)
     study_design_report = []
     raw_data_file_prefix = ('Raw', 'Array', 'Free Induction Decay')
     for study in isa.studies:
         study_key = study.identifier if study.identifier != '' \
             else study.filename
         study_design_report.append({
             'study_key': study_key,
             'total_sources': len(study.sources),
             'total_samples': len(study.samples),
             'assays': []
         })
         with open(os.path.join(self.path, study.filename)) as s_fp:
             s_df = isatab.load_table(s_fp)
             for assay in study.assays:
                 assay_key = '/'.join([
                     assay.filename, assay.measurement_type.term,
                     assay.technology_type.term, assay.technology_platform
                 ])
                 assay_report = {
                     'assay_key':
                     assay_key,
                     'num_sources':
                     len(assay.samples),
                     'num_samples':
                     len([
                         x for x in assay.data_files
                         if x.label.startswith(raw_data_file_prefix)
                     ])
                 }
                 with open(os.path.join(self.path, assay.filename)) as a_fp:
                     a_df = isatab.load_table(a_fp)
                     merged_df = pd.merge(s_df, a_df, on='Sample Name')
                     factor_cols = [
                         x for x in merged_df.columns
                         if x.startswith("Factor Value")
                     ]
                     if len(factor_cols) > 0:
                         # add branch to get all if no FVs
                         study_group_factors_df = \
                             merged_df[factor_cols].drop_duplicates()
                         factors_list = [
                             x[13:-1]
                             for x in study_group_factors_df.columns
                         ]
                         queries = []
                         factors_and_levels = {}
                         for i, row in study_group_factors_df.iterrows():
                             fvs = []
                             for x, y in zip(factors_list, row):
                                 fvs.append(' == '.join([x, str(y)]))
                                 try:
                                     factor_and_levels = \
                                         factors_and_levels[x]
                                 except KeyError:
                                     factors_and_levels[x] = set()
                                     factor_and_levels = \
                                         factors_and_levels[x]
                                 factor_and_levels.add(str(y))
                             queries.append(' and '.join(fvs))
                         assay_report['total_study_groups'] = len(queries)
                         assay_report['factors_and_levels'] = []
                         assay_report['group_summary'] = []
                         for k, v in factors_and_levels.items():
                             assay_report['factors_and_levels'].append({
                                 'factor':
                                 k,
                                 'num_levels':
                                 len(v),
                             })
                         for query in queries:
                             try:
                                 columns = merged_df.columns
                                 columns = recast_columns(columns=columns)
                                 for i, column in enumerate(columns):
                                     columns[i] = pyvar(column) if \
                                         column.startswith(
                                             'Factor Value[') else column
                                 merged_df.columns = columns
                                 qlist = query.split(' and ')
                                 fmt_query = []
                                 for factor_query in qlist:
                                     factor_value = \
                                         factor_query.split(' == ')
                                     fmt_query_part = \
                                         "Factor_Value_{0}_ == '{1}'"\
                                         .format(pyvar(factor_value[0]),
                                                 factor_value[1])
                                     fmt_query.append(fmt_query_part)
                                 fmt_query = ' and '.join(fmt_query)
                                 log.debug(
                                     'running query: {}'.format(fmt_query))
                                 df2 = merged_df.query(fmt_query)
                                 data_column = [
                                     x for x in merged_df.columns
                                     if x.startswith(raw_data_file_prefix)
                                     and x.endswith('Data File')
                                 ][0]
                                 assay_report['group_summary'].append(
                                     dict(study_group=query,
                                          sources=len(
                                              list(df2['Source Name'].
                                                   drop_duplicates())),
                                          samples=len(
                                              list(df2['Sample Name'].
                                                   drop_duplicates())),
                                          raw_files=len(
                                              list(df2[data_column].
                                                   drop_duplicates()))))
                             except Exception as e:
                                 print("error in query, {}".format(e))
                 study_design_report[-1]['assays'].append(assay_report)
     return study_design_report
Example #27
def create_isatab_archive(inv_fp,
                          target_filename=None,
                          filter_by_measurement=None):
    """Function to create an ISArchive; option to select by assay
    measurement type

    :param inv_fp: A file-like buffer object pointing to an investigation file
    :param target_filename: Target ZIP file name
    :param filter_by_measurement: Select by measurement type
    :return: List of files zipped if successful, None if not successful
    """
    if target_filename is None:
        target_filename = os.path.join(os.path.dirname(inv_fp.name),
                                       'isatab.zip')
    ISA = isatab.load(inv_fp)

    all_files_in_isatab = []
    found_files = []

    for s in ISA.studies:
        if filter_by_measurement is not None:
            log.debug('Selecting %s', filter_by_measurement)
            selected_assays = [
                a for a in s.assays
                if a.measurement_type.term == filter_by_measurement
            ]
        else:
            selected_assays = s.assays

        for a in selected_assays:
            all_files_in_isatab += [d.filename for d in a.data_files]
    dirname = os.path.dirname(inv_fp.name)

    for fname in all_files_in_isatab:
        if os.path.isfile(os.path.join(dirname, fname)):
            found_files.append(fname)
    missing_files = [f for f in all_files_in_isatab if f not in found_files]

    if len(missing_files) == 0:
        log.debug('Do zip')
        with ZipFile(target_filename, mode='w') as zip_file:
            # use relative dir_name to avoid absolute path on file names
            zip_file.write(inv_fp.name, arcname=os.path.basename(inv_fp.name))

            for s in ISA.studies:
                zip_file.write(os.path.join(dirname, s.filename),
                               arcname=s.filename)
                # re-select assays per study instead of reusing the stale loop variable
                if filter_by_measurement is not None:
                    selected_assays = [
                        a for a in s.assays
                        if a.measurement_type.term == filter_by_measurement
                    ]
                else:
                    selected_assays = s.assays
                for a in selected_assays:
                    zip_file.write(os.path.join(dirname, a.filename),
                                   arcname=a.filename)

            for file in all_files_in_isatab:
                zip_file.write(os.path.join(dirname, file), arcname=file)

            log.debug(zip_file.namelist())
            return zip_file.namelist()

    else:
        log.debug('Not zipping')
        log.debug('Missing: %s', missing_files)
        return None
Example #28
def load_investigation(investigation_file):
    with open(investigation_file, 'r') as f:
        return ISATAB.load(f)
Example #29
    def test_batch_fixer(self):
        s_table_path = os.path.join(self._tmp_dir, 'BII-S-3', 's_BII-S-3.txt')
        settings = {
            s_table_path: {
                "factor": "dose",
                "protocol_ref": "environmental material collection - "
                                "standard procedure 1"
            }
        }
        utils.batch_fix_isatabs(settings)

        expected_field_names = [
            'Source Name',
            'Characteristics[organism]',
            'Term Source REF', 'Term Accession Number',
            'Characteristics[geographic location (country and/or sea,region)]',
            'Term Source REF', 'Term Accession Number',
            'Characteristics[geographic location (longitude)]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[geographic location (latitude)]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[chlorophyll a concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[fucoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[peridinin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[butfucoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[hexfucoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[alloxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[zeaxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[lutein concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[chl-c3 concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[chl-c2 concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[prasinoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[neoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[violaxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[diadinoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[diatoxanthin concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[divinyl-chl-b concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[chl-b concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[divinyl-chl-a concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[chl-a concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[BB carotene concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[bacteria count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[synechococcus count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[small picoeukaryotes count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[large picoeukaryotes count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[nanoflagellates count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[cryptophytes count]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[phosphate concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[nitrate concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[particulate organic nitrogen concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[particulate organic carbon concentration]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[primary production depth integrated production to '
            '3 m expressed_in mgC m-2 d-1]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[water salinity]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Characteristics[fluorescence]',
            'Term Source REF', 'Term Accession Number',
            'Characteristics[water temperature at 3 meter depth]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Protocol REF',
            'Parameter Value[dose]',
            'Term Source REF', 'Term Accession Number',
            'Parameter Value[filter pore size]',
            'Unit', 'Term Source REF', 'Term Accession Number',
            'Sample Name',
            'Factor Value[compound]',
            'Term Source REF', 'Term Accession Number',
            'Factor Value[collection time]',
            'Term Source REF', 'Term Accession Number']

        # check the columns got moved in the study file
        with open(s_table_path + '.fix') as fixed_tab_fp:
            actual_field_names = list(
                map(lambda field_name: field_name.strip(),
                    next(fixed_tab_fp).split('\t')))
            self.assertListEqual(actual_field_names, expected_field_names)

        # check the parameter got added to the protocol
        with open(os.path.dirname(
                s_table_path) + '/i_Investigation.txt.fix') as fixed_i_fp:
            investigation = isatab.load(fixed_i_fp)
            study = investigation.studies[-1]
            protocol = study.get_prot(
                'environmental material collection - standard procedure 1')
            param = protocol.get_param('dose')
            self.assertIsNotNone(param)
Example #30
def load_investigation(investigation_file):
    with utf8_text_file_open(investigation_file) as f:
        return ISATAB.load(f)
Example #31
def modify_investigation(fp):
    """Load, edit, and dump an ISA-Tab 1.0 descriptor."""

    # Load an existing ISA-Tab investigation file. In this example, we load an unpopulated i_investigation.txt file
    investigation = load(fp, skip_load_tables=True)
    investigation.identifier = "i1"
    investigation.title = "My Simple ISA Investigation"
    investigation.description = "We could alternatively use the class constructor's parameters to set some default " \
                                "values at the time of creation, however we want to demonstrate how to use the " \
                                "object's instance variables to set values."
    investigation.submission_date = "2016-11-03"
    investigation.public_release_date = "2016-11-03"

    study = Study(filename="s_study.txt")
    study.identifier = "s1"
    study.title = "My ISA Study"
    study.description = "Like with the Investigation, we could use the class constructor to set some default values, " \
                        "but have chosen to demonstrate in this example the use of instance variables to set initial " \
                        "values."
    study.submission_date = "2016-11-03"
    study.public_release_date = "2016-11-03"
    investigation.studies[0] = study

    obi = OntologySource(name='OBI',
                         description="Ontology for Biomedical Investigations")
    investigation.ontology_source_references.append(obi)
    intervention_design = OntologyAnnotation(term_source=obi)
    intervention_design.term = "intervention design"
    intervention_design.term_accession = "http://purl.obolibrary.org/obo/OBI_0000115"
    study.design_descriptors.append(intervention_design)

    # Other instance variables common to both Investigation and Study objects include 'contacts' and 'publications',
    # each with lists of corresponding Person and Publication objects.

    contact = Person(first_name="Alice",
                     last_name="Robertson",
                     affiliation="University of Life",
                     roles=[OntologyAnnotation(term='submitter')])
    study.contacts.append(contact)
    publication = Publication(title="Experiments with Elephants",
                              author_list="A. Robertson, B. Robertson")
    publication.pubmed_id = "12345678"
    publication.status = OntologyAnnotation(term="published")
    study.publications.append(publication)

    source = Source(name='source_material')
    study.sources.append(source)

    prototype_sample = Sample(name='sample_material', derives_from=[source])
    ncbitaxon = OntologySource(name='NCBITaxon', description="NCBI Taxonomy")
    characteristic_organism = Characteristic(
        category=OntologyAnnotation(term="Organism"),
        value=OntologyAnnotation(
            term="H**o Sapiens",
            term_source=ncbitaxon,
            term_accession="http://purl.bioontology.org/ontology/NCBITAXON/9606"
        ))
    prototype_sample.characteristics.append(characteristic_organism)

    study.samples = batch_create_materials(prototype_sample,
                                           n=3)  # creates a batch of 3 samples

    sample_collection_protocol = Protocol(
        name="sample collection",
        protocol_type=OntologyAnnotation(term="sample collection"))
    study.protocols.append(sample_collection_protocol)
    sample_collection_process = Process(
        executes_protocol=sample_collection_protocol)

    for src in study.sources:
        sample_collection_process.inputs.append(src)
    for sam in study.samples:
        sample_collection_process.outputs.append(sam)

    study.process_sequence.append(sample_collection_process)

    assay = Assay(filename="a_assay.txt")
    extraction_protocol = Protocol(
        name='extraction',
        protocol_type=OntologyAnnotation(term="material extraction"))
    study.protocols.append(extraction_protocol)
    sequencing_protocol = Protocol(
        name='sequencing',
        protocol_type=OntologyAnnotation(term="material sequencing"))
    study.protocols.append(sequencing_protocol)

    for i, sample in enumerate(study.samples):
        extraction_process = Process(executes_protocol=extraction_protocol)

        extraction_process.inputs.append(sample)
        material = Material(name="extract-{}".format(i))
        material.type = "Extract Name"
        extraction_process.outputs.append(material)

        sequencing_process = Process(executes_protocol=sequencing_protocol)
        sequencing_process.name = "assay-name-{}".format(i)
        sequencing_process.inputs.append(extraction_process.outputs[0])

        datafile = DataFile(filename="sequenced-data-{}".format(i),
                            label="Raw Data File")
        sequencing_process.outputs.append(datafile)

        extraction_process.next_process = sequencing_process
        sequencing_process.prev_process = extraction_process

        assay.samples.append(sample)
        assay.data_files.append(datafile)
        assay.other_material.append(material)
        assay.process_sequence.append(extraction_process)
        assay.process_sequence.append(sequencing_process)

    # measurement and technology type describe the whole assay, so set them once
    assay.measurement_type = OntologyAnnotation(term="gene sequencing")
    assay.technology_type = OntologyAnnotation(term="nucleotide sequencing")

    study.assays.append(assay)

    # dumps() writes out the ISA as a string representation of the ISA-Tab, but we are skipping writing tables
    return dumps(investigation, skip_dump_tables=True)
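
A usage sketch, assuming a placeholder investigation file; since dumps() returns the edited ISA-Tab as a string, the result can simply be printed:

    with open('/path/to/i_investigation.txt') as fp:
        print(modify_investigation(fp))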