def slice_data_files(dir, factor_selection=None):
    """Collect sample names and their data files from ISA-Tab tables.

    Study (``s_*``) and assay (``a_*``) tables found directly in ``dir`` are
    scanned for sample names. When ``factor_selection`` is given, only samples
    whose ``Factor Value[<name>]`` cell equals the requested value are kept;
    with several factors a sample matching any one of them is kept. Data
    files are then gathered from every assay table, reading the
    ``Raw Spectral Data File`` column when present and otherwise the
    ``Free Induction Decay Data File`` column.

    :param dir: Directory containing the ISA-Tab table files
    :param factor_selection: Optional dict mapping a factor name to the
        factor value to match; ``None`` selects every sample
    :return: A list of dicts, one per distinct sample name in first-seen
        order, with keys ``sample`` and ``data_files`` (plus ``query_used``
        when ``factor_selection`` is given)
    """
    results = []
    seen_samples = set()  # constant-time duplicate check

    # first collect matching samples
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        log.info("Loading {}".format(table_file))
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)

        if factor_selection is None:
            candidates = list(df['Sample Name'])
        else:
            candidates = []
            for factor_name, factor_value in factor_selection.items():
                factor_column = 'Factor Value[{}]'.format(factor_name)
                if factor_column in df.columns:
                    candidates.extend(
                        df.loc[df[factor_column] == factor_value,
                               'Sample Name'])

        for sample_name in candidates:
            if sample_name in seen_samples:
                continue
            seen_samples.add(sample_name)
            result = {"sample": sample_name, "data_files": []}
            if factor_selection is not None:
                result["query_used"] = factor_selection
            results.append(result)

    if not results:
        return results

    # now collect the data files relating to the samples; every assay table
    # is parsed once instead of once per sample
    assay_tables = []
    for table_file in glob.iglob(os.path.join(dir, 'a_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        if 'Raw Spectral Data File' in df.columns:
            assay_tables.append((df, 'Raw Spectral Data File'))
        elif 'Free Induction Decay Data File' in df.columns:
            assay_tables.append((df, 'Free Induction Decay Data File'))

    for result in results:
        # accumulate across assay tables: assigning per table would let the
        # last table visited wipe out files found in earlier ones
        data_files = []
        for df, file_column in assay_tables:
            sample_files = df.loc[df['Sample Name'] == result['sample'],
                                  file_column]
            data_files.extend(f for f in sample_files if str(f) != 'nan')
        result['data_files'] = data_files
    return results
Example #2
0
def insert_distinct_parameter(table_fp, protocol_ref_to_unpool):
    """Interactively overwrite a ``* Name`` column with short random IDs.

    The column whose first-row value equals ``protocol_ref_to_unpool`` is
    located, then the first header at or after that position ending in
    `` Name`` is chosen. After the user confirms with ``Y`` on stdin, every
    row of that column is replaced by the first 8 characters of a fresh
    UUID4 and the table is written back to ``table_fp``.

    :param table_fp: A seekable, writable file-like object holding the table
    :param protocol_ref_to_unpool: Protocol REF value identifying the column
        after which the name column is searched
    :return: None
    :raises IndexError: if not exactly one column has
        ``protocol_ref_to_unpool`` as its first-row value
    """
    reader = csv.reader(table_fp, dialect="excel-tab")
    headers = next(reader)  # get column headings
    table_fp.seek(0)
    df = load_table(table_fp)

    # find protocol ref column by index
    protocol_ref_indices = [
        x for x, y in enumerate(df.columns)
        if df[y][0] == protocol_ref_to_unpool
    ]
    if len(protocol_ref_indices) != 1:
        raise IndexError(
            "Could not find Protocol REF with provided value {}".format(
                protocol_ref_to_unpool))

    protocol_ref_index = protocol_ref_indices[0]
    name_header = next(
        (h for h in headers[protocol_ref_index:] if h.endswith(" Name")),
        None)
    if name_header is None:
        print("Could not find appropriate column to fill with hashes")
        return

    print(
        "Are you sure you want to add a column of hash values in {}? Y/(N)"
        .format(name_header))
    confirm = input()
    if confirm == "Y":
        df[name_header] = [str(uuid.uuid4())[:8] for _ in range(len(df.index))]
        table_fp.seek(0)
        df.to_csv(table_fp, index=None, header=headers, sep="\t")
        # drop leftover bytes of the old content when the rewritten table is
        # shorter than the original
        table_fp.truncate()
Example #3
0
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    The study is fetched with ``get`` into a temporary directory, which is
    removed again before returning, even when reading a table fails.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study;
        ``nan`` cells and values that are not str/int/float are skipped

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    from isatools import isatab
    factor_column = 'Factor Value[{}]'.format(factor_name)
    pattern = os.path.join(tmp_dir, '[as]_*')
    fvs = set()
    try:
        # glob already yields paths that include tmp_dir
        for table_file in glob.iglob(pattern):
            with open(table_file, encoding='utf-8') as fp:
                df = isatab.load_table(fp)
            if factor_column not in df.columns:
                continue
            # Series.iteritems() was removed in pandas 2.0; items() is the
            # equivalent available in every pandas version
            for _, match in df[factor_column].items():
                try:
                    match = match.item()  # unwrap numpy scalars
                except AttributeError:
                    pass
                if isinstance(match, (str, int, float)) \
                        and str(match) != 'nan':
                    fvs.add(match)
    finally:
        shutil.rmtree(tmp_dir)
    return fvs
Example #4
0
def get_factor_names(mtbls_study_id):
    """
    This function gets the factor names used in a MetaboLights study

    Factor names are taken from the ``Factor Value[<name>]`` headers of the
    study's ``a_*`` and ``s_*`` tables. The directory returned by ``get`` is
    not removed by this function.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    import re
    from isatools import isatab

    tmp_dir = get(mtbls_study_id)
    table_files = [f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))]

    # raw string avoids the invalid "\[" escape; compiled once, not per header
    factor_header = re.compile(r"Factor Value\[(.*?)\]")

    factors = set()
    for table_file in table_files:
        df = isatab.load_table(os.path.join(tmp_dir, table_file))
        for header in df.columns:
            if factor_header.match(header):
                # strip the 13-character "Factor Value[" prefix and final "]"
                factors.add(header[13:-1])
    return factors
Example #5
0
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    The study is fetched with ``get`` into a temporary directory, which is
    removed again before returning, even when loading a table fails.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study;
        ``nan`` cells and values that are not str/int/float are skipped

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    table_files = [f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))]
    from isatools import isatab

    factor_column = "Factor Value[{}]".format(factor_name)
    fvs = set()
    try:
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if factor_column not in df.columns:
                continue
            fvs.update(
                value for value in df[factor_column]
                if isinstance(value, (str, int, float)) and str(value) != "nan"
            )
    finally:
        # always clean up the downloaded study, even if a table is unreadable
        shutil.rmtree(tmp_dir)
    return fvs
Example #6
0
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    Values are read from the ``Factor Value[<factor_name>]`` column of every
    ``a_*`` and ``s_*`` table of the study. The directory returned by ``get``
    is deleted before returning, including when a table fails to load.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(('a_', 's_'))
    ]
    from isatools import isatab
    column_name = 'Factor Value[{}]'.format(factor_name)
    fvs = set()
    try:
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if column_name not in df.columns:
                continue
            for _, match in df[column_name].items():
                # keep plain scalar values only and drop missing cells
                if isinstance(match, (str, int, float)) \
                        and str(match) != 'nan':
                    fvs.add(match)
    finally:
        # without the finally a failing table would leak the study directory
        shutil.rmtree(tmp_dir)
    return fvs
Example #7
0
def get_factor_names(mtbls_study_id):
    """
    This function gets the factor names used in a MetaboLights study

    Every ``a_*`` and ``s_*`` table of the study is loaded and the names
    inside its ``Factor Value[<name>]`` headers are collected. The directory
    returned by ``get`` is left in place.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(('a_', 's_'))
    ]
    from isatools import isatab
    import re
    # compiled once; the raw string keeps "\[" from being an invalid escape
    rx_factor_value = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    for table_file in table_files:
        df = isatab.load_table(os.path.join(tmp_dir, table_file))
        # header[13:-1] drops the "Factor Value[" prefix and the closing "]"
        factors.update(
            header[13:-1] for header in df.columns
            if rx_factor_value.match(header)
        )
    return factors
Example #8
0
def get_factor_names(mtbls_study_id):
    """
    This function gets the factor names used in a MetaboLights study

    Headers matching ``_RX_FACTOR_VALUE`` are collected from every study and
    assay table in the directory returned by ``get``; that directory is not
    removed here.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    tmp_dir = get(mtbls_study_id)

    factors = set()

    # glob yields paths that already include tmp_dir, so they are opened
    # as-is; joining them onto tmp_dir again would break for a relative path
    for table_file in glob.iglob(os.path.join(tmp_dir, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)

        # header[13:-1] drops the "Factor Value[" prefix and the closing "]"
        factors.update(
            header[13:-1] for header in df.columns
            if _RX_FACTOR_VALUE.match(header)
        )
    return factors
Example #9
0
def get_filtered_df_on_factors_list(mtbls_study_id):
    """Print the samples, sources and raw files of each study group.

    One pandas query string is built per study group returned by
    ``get_study_group_factors``. Every study/assay table of the study is then
    filtered with each query and the matching ``Sample_Name``,
    ``Source_Name`` and ``Raw_Spectral_Data_File`` values are printed. A
    table that lacks a column named in a query is skipped for that query.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: The list of query strings, one per study group
    """
    try:
        from pandas.errors import UndefinedVariableError
    except ImportError:  # pandas < 1.5 only exposes it on the ops module
        from pandas.core.computation.ops import UndefinedVariableError

    # spaces and square brackets are not valid in query identifiers
    to_underscore = str.maketrans(' []', '___')

    factors_list = get_study_group_factors(mtbls_study_id=mtbls_study_id)
    queries = []

    for item in factors_list:
        query_str = []

        for k, v in item.items():
            k = k.translate(to_underscore)
            if isinstance(v, str):
                # NOTE(review): the value is rewritten too, while table cells
                # are left untouched below, so values containing spaces or
                # brackets presumably never match - confirm intent
                v = v.translate(to_underscore)
                query_str.append("{k} == '{v}' and ".format(k=k, v=v))

        # strips the final "and " but keeps one trailing space; left as-is
        # because the returned strings and the [13:-2] slice below rely on it
        query_str = ''.join(query_str)[:-4]
        queries.append(query_str)

    tmp_dir = get(mtbls_study_id)
    # glob already yields paths that include tmp_dir
    for table_file in glob.iglob(os.path.join(tmp_dir, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)

        df.columns = df.columns.map(
            lambda x: x.translate(to_underscore) if isinstance(x, str) else x)

        for query in queries:
            try:
                # query uses pandas.eval, which evaluates queries like pure
                # Python notation
                df2 = df.query(query)
            except UndefinedVariableError:
                # this table has no column for one of the queried factors
                continue

            if 'Sample_Name' in df.columns:
                print('Group: {query} / Sample_Name: {sample_name}'.format(
                    query=query, sample_name=list(df2['Sample_Name'])))

            if 'Source_Name' in df.columns:
                print('Group: {} / Sources_Name: {}'.format(
                    query, list(df2['Source_Name'])))

            if 'Raw_Spectral_Data_File' in df.columns:
                print('Group: {query} / Raw_Spectral_Data_File: {filename}'
                      .format(query=query[13:-2],
                              filename=list(df2['Raw_Spectral_Data_File'])))
    return queries
def get_factor_names_m(directory):
    """Get the factor names used in the ISA-Tab tables of a local directory.

    :param directory: Directory containing the study (``s_*``) and assay
        (``a_*``) table files
    :return: A set of the names found inside ``Factor Value[<name>]`` headers
    """
    factors = set()
    # glob yields paths that already include ``directory``; joining them onto
    # it again doubled the prefix and failed for relative directories
    for table_file in glob.iglob(os.path.join(directory, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        # header[13:-1] drops the "Factor Value[" prefix and the closing "]"
        factors.update(
            header[13:-1] for header in df.columns
            if _RX_FACTOR_VALUE.match(header)
        )
    return factors
Example #11
0
def get_study_group_factors(mtbls_study_id):
    """Get the distinct factor value combinations of a MetaboLights study.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A list of dicts mapping each ``Factor Value[...]`` column name
        to its value, one dict per distinct combination; empty when no table
        has factor columns
    :raises FileNotFoundError: if ``get`` returns ``None`` for the study
    """
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise FileNotFoundError("Could not download {}".format(mtbls_study_id))

    factors_list = []
    # glob yields paths that already include tmp_dir, so open them directly
    for table_file in glob.iglob(os.path.join(tmp_dir, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        factor_columns = [
            x for x in df.columns if x.startswith("Factor Value")
        ]
        if factor_columns:
            # each table with factor columns replaces the previous result, so
            # the combinations of the last such table are the ones returned
            factors_list = df[factor_columns].drop_duplicates()\
                .to_dict(orient='records')
    return factors_list
Example #12
0
def insert_distinct_parameter(table_fp, protocol_ref_to_unpool):
    """A curation function to fix pooling problems

    The first ``* Name`` header at or after the given Protocol REF column is
    located and, after the user confirms with ``Y`` on stdin, every row of
    that column is overwritten with the first 8 characters of a fresh UUID4.
    The table is then written back to ``table_fp`` from its start.

    :param table_fp: A seekable, writable file-like buffer object pointing to
        a table file
    :param protocol_ref_to_unpool: Reference to which protocol REF column to
    'unpool'
    :return: None
    :raises IndexError: if not exactly one column has
        ``protocol_ref_to_unpool`` as its first-row value
    """
    reader = csv.reader(table_fp, dialect='excel-tab')
    headers = next(reader)  # get column headings
    table_fp.seek(0)

    df = isatab.load_table(table_fp)

    # find protocol ref column by index
    protocol_ref_indices = [
        x for x, y in enumerate(df.columns)
        if df[y][0] == protocol_ref_to_unpool
    ]

    if len(protocol_ref_indices) != 1:
        raise IndexError(
            'Could not find Protocol REF with provided value {}'.format(
                protocol_ref_to_unpool))

    protocol_ref_index = protocol_ref_indices[0]
    name_header = None
    for candidate in headers[protocol_ref_index:]:
        if candidate.endswith(' Name'):
            name_header = candidate
            break

    if name_header is None:
        print('Could not find appropriate column to fill with hashes')
        return

    print('Are you sure you want to add a column of hash values in {}? '
          'Y/(N)'.format(name_header))
    confirm = input()
    if confirm == 'Y':
        df[name_header] = [
            str(uuid.uuid4())[:8] for _ in range(len(df.index))
        ]
        table_fp.seek(0)
        df.to_csv(table_fp, index=None, header=headers, sep='\t')
        # the rewritten table can be shorter than the old content; cut off
        # whatever is left after the current position
        table_fp.truncate()
Example #13
0
def get_filtered_df_on_factors_list(mtbls_study_id):
    """Print the samples, sources and raw files of each study group.

    One pandas query string is built per study group returned by
    ``get_study_group_factors``. Every study/assay table of the study is then
    filtered with each query and the matching ``Sample_Name``,
    ``Source_Name`` and ``Raw_Spectral_Data_File`` values are printed. A
    table that lacks a column named in a query is skipped for that query.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: The list of query strings, one per study group
    """
    # the old ``pandas.computation`` package no longer exists; resolve the
    # exception from its public home and fall back for pandas < 1.5
    try:
        from pandas.errors import UndefinedVariableError
    except ImportError:
        from pandas.core.computation.ops import UndefinedVariableError

    def _as_identifier(text):
        # spaces and square brackets are not valid in query identifiers
        return text.replace(' ', '_').replace('[', '_').replace(']', '_')

    factors_list = get_study_group_factors(mtbls_study_id=mtbls_study_id)
    queries = []
    for item in factors_list:
        query_str = []
        for k, v in item.items():
            k = _as_identifier(k)
            if isinstance(v, str):
                # NOTE(review): the value is rewritten too, while table cells
                # are left untouched below, so values containing spaces or
                # brackets presumably never match - confirm intent
                v = _as_identifier(v)
                query_str.append("{0} == '{1}' and ".format(k, v))
        # strips the final "and " but keeps one trailing space; left as-is
        # because the returned strings and the [13:-2] slice below rely on it
        query_str = ''.join(query_str)[:-4]
        queries.append(query_str)

    tmp_dir = get(mtbls_study_id)
    # glob already yields paths that include tmp_dir
    for table_file in glob.iglob(os.path.join(tmp_dir, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        df.columns = df.columns.map(
            lambda x: _as_identifier(x) if isinstance(x, str) else x)

        for query in queries:
            try:
                # query uses pandas.eval, which evaluates queries like pure
                # Python notation
                df2 = df.query(query)
            except UndefinedVariableError:
                # this table has no column for one of the queried factors
                continue
            if "Sample_Name" in df.columns:
                print("Group: {} / Sample_Name: {}".format(
                    query, list(df2["Sample_Name"])))
            if "Source_Name" in df.columns:
                print("Group: {} / Sources_Name: {}".format(
                    query, list(df2["Source_Name"])))
            if "Raw_Spectral_Data_File" in df.columns:
                print("Group: {} / Raw_Spectral_Data_File: {}".format(
                    query[13:-2], list(df2["Raw_Spectral_Data_File"])))
    return queries
Example #14
0
def slice_data_files(dir, factor_selection=None):
    """
    This function gets a list of samples and related data file names from the ISA-Tab tables in a directory,
    optionally filtered by factor value (a sample is kept when it matches any one of the selected factor values)

    :param dir: Directory containing the study (s_*) and assay (a_*) table files
    :param factor_selection: Selected factor values to filter on samples, as a dict of factor name to value
    :return: A list of dicts {sample, data_files} containing sample names with associated data filenames, plus
        query_used when factor_selection is given

    Example usage:
        samples_and_data = slice_data_files('/path/to/MTBLS1', {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    results = []
    known_samples = set()  # sample names already present in results

    # first collect matching samples
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        logger.info("Loading {}".format(table_file))
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)

        if factor_selection is None:
            matches = list(df['Sample Name'])
        else:
            matches = []
            for factor_name, factor_value in factor_selection.items():
                column = 'Factor Value[{}]'.format(factor_name)
                if column in df.columns:
                    matches.extend(df.loc[df[column] == factor_value, 'Sample Name'])

        for sample_name in matches:
            if sample_name in known_samples:
                continue
            known_samples.add(sample_name)
            entry = {"sample": sample_name, "data_files": []}
            if factor_selection is not None:
                entry["query_used"] = factor_selection
            results.append(entry)

    if not results:
        return results

    # now collect the data files relating to the samples; each assay table is loaded once, not once per sample
    assay_tables = []
    for table_file in glob.iglob(os.path.join(dir, 'a_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        if 'Raw Spectral Data File' in df.columns:
            assay_tables.append((df, 'Raw Spectral Data File'))
        elif 'Free Induction Decay Data File' in df.columns:
            assay_tables.append((df, 'Free Induction Decay Data File'))

    for result in results:
        # extend rather than assign, so a later assay table cannot discard files found in an earlier one
        data_files = []
        for df, file_column in assay_tables:
            sample_files = df.loc[df['Sample Name'] == result['sample'], file_column]
            data_files.extend(i for i in sample_files if str(i) != 'nan')
        result['data_files'] = data_files
    return results
Example #15
0
 def generate_study_design_report(self,
                                  get_num_study_groups=True,
                                  get_factors=True,
                                  get_num_levels=True,
                                  get_levels=True,
                                  get_study_groups=True):
     """Generates a study design report

     The ISA-Tab at ``self.path`` is loaded and, for every study, one dict
     is produced with the keys ``study_key``, ``total_sources``,
     ``total_samples`` and ``assays``. Each entry of ``assays`` describes
     one assay; when the merged study/assay table has ``Factor Value``
     columns it also carries ``total_study_groups``,
     ``factors_and_levels`` and ``group_summary``.

     The five ``get_*`` keyword arguments are accepted but are not
     consulted anywhere in the body.

     :return: A list of dicts, one per study (JSON-serialisable report)
     """
     isa = isatab.load(self.path, skip_load_tables=False)
     study_design_report = []
     # label/column prefixes treated as raw data files (used with startswith)
     raw_data_file_prefix = ('Raw', 'Array', 'Free Induction Decay')
     for study in isa.studies:
         # fall back to the file name when the study has no identifier
         study_key = study.identifier if study.identifier != '' \
             else study.filename
         study_design_report.append({
             'study_key': study_key,
             'total_sources': len(study.sources),
             'total_samples': len(study.samples),
             'assays': []
         })
         # the study table stays open while each of its assays is processed
         with open(os.path.join(self.path, study.filename)) as s_fp:
             s_df = isatab.load_table(s_fp)
             for assay in study.assays:
                 assay_key = '/'.join([
                     assay.filename, assay.measurement_type.term,
                     assay.technology_type.term, assay.technology_platform
                 ])
                 # NOTE(review): 'num_sources' is filled from
                 # len(assay.samples) and 'num_samples' from the number of
                 # raw data files - the key names look mismatched with
                 # their values; confirm intent
                 assay_report = {
                     'assay_key':
                     assay_key,
                     'num_sources':
                     len(assay.samples),
                     'num_samples':
                     len([
                         x for x in assay.data_files
                         if x.label.startswith(raw_data_file_prefix)
                     ])
                 }
                 with open(os.path.join(self.path, assay.filename)) as a_fp:
                     a_df = isatab.load_table(a_fp)
                     # join study and assay rows that share a sample name
                     merged_df = pd.merge(s_df, a_df, on='Sample Name')
                     factor_cols = [
                         x for x in merged_df.columns
                         if x.startswith("Factor Value")
                     ]
                     if len(factor_cols) > 0:
                         # add branch to get all if no FVs
                         # each distinct factor value combination is one
                         # study group
                         study_group_factors_df = \
                             merged_df[factor_cols].drop_duplicates()
                         # x[13:-1] drops the 13-character "Factor Value["
                         # prefix and the closing "]"
                         factors_list = [
                             x[13:-1]
                             for x in study_group_factors_df.columns
                         ]
                         queries = []
                         factors_and_levels = {}
                         # build one "factor == value and ..." string per
                         # study group and record the levels of each factor
                         for i, row in study_group_factors_df.iterrows():
                             fvs = []
                             for x, y in zip(factors_list, row):
                                 fvs.append(' == '.join([x, str(y)]))
                                 try:
                                     factor_and_levels = \
                                         factors_and_levels[x]
                                 except KeyError:
                                     factors_and_levels[x] = set()
                                     factor_and_levels = \
                                         factors_and_levels[x]
                                 factor_and_levels.add(str(y))
                             queries.append(' and '.join(fvs))
                         assay_report['total_study_groups'] = len(queries)
                         assay_report['factors_and_levels'] = []
                         assay_report['group_summary'] = []
                         for k, v in factors_and_levels.items():
                             assay_report['factors_and_levels'].append({
                                 'factor':
                                 k,
                                 'num_levels':
                                 len(v),
                             })
                         for query in queries:
                             try:
                                 # the columns of merged_df are reassigned on
                                 # every iteration; recast_columns and pyvar
                                 # are defined outside this method -
                                 # presumably pyvar yields a name usable in
                                 # DataFrame.query; TODO confirm
                                 columns = merged_df.columns
                                 columns = recast_columns(columns=columns)
                                 for i, column in enumerate(columns):
                                     columns[i] = pyvar(column) if \
                                         column.startswith(
                                             'Factor Value[') else column
                                 merged_df.columns = columns
                                 # rewrite "factor == value" parts so they
                                 # refer to the renamed columns, with the
                                 # value quoted as a string
                                 qlist = query.split(' and ')
                                 fmt_query = []
                                 for factor_query in qlist:
                                     factor_value = \
                                         factor_query.split(' == ')
                                     fmt_query_part = \
                                         "Factor_Value_{0}_ == '{1}'"\
                                         .format(pyvar(factor_value[0]),
                                                 factor_value[1])
                                     fmt_query.append(fmt_query_part)
                                 fmt_query = ' and '.join(fmt_query)
                                 log.debug(
                                     'running query: {}'.format(fmt_query))
                                 df2 = merged_df.query(fmt_query)
                                 # first column that starts with a raw
                                 # prefix and ends with 'Data File';
                                 # IndexError here (no such column) is
                                 # caught by the handler below
                                 data_column = [
                                     x for x in merged_df.columns
                                     if x.startswith(raw_data_file_prefix)
                                     and x.endswith('Data File')
                                 ][0]
                                 assay_report['group_summary'].append(
                                     dict(study_group=query,
                                          sources=len(
                                              list(df2['Source Name'].
                                                   drop_duplicates())),
                                          samples=len(
                                              list(df2['Sample Name'].
                                                   drop_duplicates())),
                                          raw_files=len(
                                              list(df2[data_column].
                                                   drop_duplicates()))))
                             except Exception as e:
                                 # best effort: a failing group is reported
                                 # on stdout and left out of group_summary
                                 print("error in query, {}".format(e))
                 study_design_report[-1]['assays'].append(assay_report)
     return study_design_report
Example #16
0
def get_data_files(mtbls_study_id, factor_selection=None):
    """
    This function gets a list of samples and related data file names for a given MetaboLights study, optionally
    filtered by factor value (a sample is kept when it matches any one of the selected factor values)

    The study is fetched with ``get`` into a temporary directory, which is removed again before returning, even
    when loading a table fails.

    :param mtbls_study_id: Study identifier for MetaboLights study to get, as a str (e.g. MTBLS1)
    :param factor_selection: Selected factor values to filter on samples, as a dict of factor name to value
    :return: A list of dicts {sample, data_files} containing sample names with associated data filenames, plus
        query_used when factor_selection is given
    :raises IOError: if the study could not be retrieved

    Example usage:
        samples_and_data = mtbls.get_data_files('MTBLS1', {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise IOError("There was a problem retrieving study {}. Does it exist?".format(mtbls_study_id))
    from isatools import isatab

    results = []
    seen_samples = set()  # constant-time duplicate check
    assay_tables = []  # assay tables kept from the first pass for the file lookup
    try:
        table_files = [f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))]

        # first collect matching samples
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if table_file.startswith("a_"):
                assay_tables.append(df)

            if factor_selection is None:
                matches = list(df["Sample Name"])
            else:
                matches = []
                for factor_name, factor_value in factor_selection.items():
                    column = "Factor Value[{}]".format(factor_name)
                    if column in df.columns:
                        matches.extend(df.loc[df[column] == factor_value, "Sample Name"])

            for sample_name in matches:
                if sample_name in seen_samples:
                    continue
                seen_samples.add(sample_name)
                entry = {"sample": sample_name, "data_files": []}
                if factor_selection is not None:
                    entry["query_used"] = factor_selection
                results.append(entry)

        # now collect the data files relating to the samples
        for result in results:
            # extend rather than assign, so a later assay table cannot discard files found in an earlier one
            data_files = []
            for df in assay_tables:
                if "Raw Spectral Data File" in df.columns:
                    file_column = "Raw Spectral Data File"
                elif "Free Induction Decay Data File" in df.columns:
                    file_column = "Free Induction Decay Data File"
                else:
                    continue
                sample_files = df.loc[df["Sample Name"] == result["sample"], file_column]
                data_files.extend(i for i in sample_files if str(i) != "nan")
            result["data_files"] = data_files
    finally:
        shutil.rmtree(tmp_dir)
    return results
Example #17
0
def slice_data_files(dir, factor_selection=None):
    """
    This function gets a list of samples and related data file names from
    the ISA-Tab tables in a directory, optionally filtered by factor values
    (a sample must match every selected factor value within one table)

    Samples are accumulated over all study and assay tables; a table that
    lacks a selected factor column contributes no samples. Data files are
    accumulated over every column whose header contains ``File`` in every
    ``a_*.txt`` table.

    :param dir: Directory containing the study (s_*) and assay (a_*) table
    files
    :param factor_selection: A dict of factor name to factor value to filter
    samples on, or None to select every sample
    :return: A list of dicts {sample, data_files, query_used}, one per
    distinct sample name in first-seen order

    Example usage:
        samples_and_data = slice_data_files('/path/to/MTBLS1',
                                            {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or
        "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    results = []
    seen_samples = set()

    # first collect matching samples
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        log.info('Loading {table_file}'.format(table_file=table_file))

        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        df = df[[
            x for x in df.columns
            if 'Factor Value' in x or 'Sample Name' in x
        ]]

        # rename columns so they can be used as identifiers in df.query():
        # "Sample Name" -> "sample", "Factor Value[x y]" -> "x_y"
        query_columns = []
        for x in df.columns:
            if 'Sample Name' in x:
                x = 'sample'
            elif 'Factor Value' in x:
                x = x[13:-1]
            query_columns.append(x.replace(' ', '_'))
        df.columns = query_columns

        sample_names = df['sample'].drop_duplicates()
        if factor_selection is None:
            query_used = ''
        else:
            # build query
            # NOTE(review): values are interpolated into an expression that
            # pandas evaluates; a value containing a double quote breaks or
            # alters the query - do not pass untrusted input
            factor_query = ' and '.join(
                '{factor_name}=="{factor_value}"'.format(
                    factor_name=factor_name.replace(' ', '_'),
                    factor_value=factor_value)
                for factor_name, factor_value in factor_selection.items())
            # public location since pandas 1.5, private module before that
            undefined_variable_error = getattr(
                pd.errors, 'UndefinedVariableError', None) \
                or pd.core.computation.ops.UndefinedVariableError
            try:
                sample_names = df.query(
                    factor_query)['sample'].drop_duplicates()
            except undefined_variable_error:
                # this table has no column for one of the selected factors
                continue
            query_used = factor_selection

        # accumulate instead of replacing, otherwise the outcome depends on
        # which table glob happens to yield last
        for sample_name in sample_names:
            if sample_name in seen_samples:
                continue
            seen_samples.add(sample_name)
            results.append({
                'sample': sample_name,
                'data_files': [],
                'query_used': query_used
            })

    # now collect the data files relating to the samples
    for table_file in glob.iglob(os.path.join(dir, 'a_*.txt')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        df = df[[
            x for x in df.columns if 'File' in x or 'Sample Name' in x
        ]]
        df.columns = [
            'sample' if 'Sample Name' in x else x for x in df.columns
        ]
        file_columns = [x for x in df.columns if 'File' in x]

        for result in results:
            sample_rows = df.loc[df['sample'] == result['sample']]
            for data_col in file_columns:
                # extend: assigning here kept only the last file column of
                # the last assay table
                result['data_files'].extend(
                    i for i in sample_rows[data_col] if str(i) != 'nan')
    return results