def import_study_data(jvm_args, meta_filename, data_filename):
    """Run the Java importer matching a meta file on a single data file.

    `jvm_args` is split on single spaces and forms the start of the argument
    list handed to run_java. The meta file is parsed to obtain its fields and
    its type; the type selects the importer class name through
    IMPORTER_CLASSNAME_BY_META_TYPE.

    Nothing is run (and None is returned) when the parsed meta file type is
    None, or when `data_filename` does not end with the 'data_filename'
    value found in the meta file.
    """
    command = jvm_args.split(' ')
    meta_fields, meta_kind = cbioportal_common.parse_metadata_file(
        meta_filename, logger=LOGGER)

    # no meta file type came back from the parser: skip this file silently
    if meta_kind is None:
        return

    expected_suffix = meta_fields['data_filename']
    if not data_filename.endswith(expected_suffix):
        print >> ERROR_FILE, ("'data_filename' in meta file contradicts "
                              "data filename in command, skipping file")
        return

    importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_kind]
    command.append(importer)

    if IMPORTER_REQUIRES_METADATA[importer]:
        command += ["--meta", meta_filename, "--loadMode", "bulkload"]

    # every importer gets the data file; MutSig and Gistic additionally get
    # the study identifier taken from the meta file
    command += ["--data", data_filename]
    if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData",
                    "org.mskcc.cbio.portal.scripts.ImportGisticData"):
        command += ["--study", meta_fields['cancer_study_identifier']]

    # don't report memory usage and % progress
    command.append("--noprogress")
    run_java(*command)
Esempio n. 2
0
def import_study_data(jvm_args, meta_filename, data_filename):
    """Import one data file with the Java importer chosen by its meta file.

    The argument list passed to run_java starts with `jvm_args` split on
    single spaces, followed by the importer class name looked up in
    IMPORTER_CLASSNAME_BY_META_TYPE and the importer's options.

    Returns None without running anything when the meta file type is None
    or when `data_filename` does not end with the meta file's
    'data_filename' value (the latter case writes a message to ERROR_FILE).
    """
    java_args = jvm_args.split(' ')
    meta_values, detected_type = cbioportal_common.parse_metadata_file(
        meta_filename, logger=LOGGER)
    if detected_type is None:
        # invalid file, skip
        return

    if not data_filename.endswith(meta_values['data_filename']):
        print >> ERROR_FILE, ("'data_filename' in meta file contradicts "
                              "data filename in command, skipping file")
        return

    importer = IMPORTER_CLASSNAME_BY_META_TYPE[detected_type]
    java_args.append(importer)

    if IMPORTER_REQUIRES_METADATA[importer]:
        java_args.extend(("--meta", meta_filename, "--loadMode", "bulkload"))

    # only these two importers take an explicit --study option
    wants_study_option = importer in (
        "org.mskcc.cbio.portal.scripts.ImportMutSigData",
        "org.mskcc.cbio.portal.scripts.ImportGisticData")

    java_args.extend(("--data", data_filename))
    if wants_study_option:
        java_args.extend(
            ("--study", meta_values['cancer_study_identifier']))

    # don't report memory usage and % progress
    java_args.append("--noprogress")
    run_java(*java_args)
Esempio n. 3
0
def remove_study(jvm_args, meta_filename):
    """Run REMOVE_STUDY_CLASS for the study identified by a study meta file.

    The meta file is parsed first; if its type is not MetaFileTypes.STUDY a
    message is written to ERROR_FILE and nothing is run. Otherwise run_java
    is called with `jvm_args` (split on single spaces), the removal class,
    the meta file's 'cancer_study_identifier' and '--noprogress'.
    """
    command = jvm_args.split(" ") + [REMOVE_STUDY_CLASS]
    fields, kind = cbioportal_common.parse_metadata_file(meta_filename, logger=LOGGER)

    # invalid file, skip
    if kind != MetaFileTypes.STUDY:
        print >> ERROR_FILE, "Not a study meta file: " + meta_filename
        return

    command.extend([
        fields["cancer_study_identifier"],
        "--noprogress",  # don't report memory usage and % progress
    ])
    run_java(*command)
Esempio n. 4
0
def remove_study(jvm_args, meta_filename):
    """Run REMOVE_STUDY_CLASS for the study identified by a study meta file.

    The parsed meta dictionary must carry 'meta_file_type' equal to
    MetaFileTypes.STUDY; otherwise a message is written to ERROR_FILE and
    nothing is run. On success run_java receives `jvm_args` (split on single
    spaces), the removal class, the 'cancer_study_identifier' from the meta
    dictionary and '--noprogress'.
    """
    command = jvm_args.split(' ') + [REMOVE_STUDY_CLASS]
    parsed = cbioportal_common.parse_metadata_file(meta_filename,
                                                   logger=LOGGER)

    # invalid file, skip
    if parsed['meta_file_type'] != MetaFileTypes.STUDY:
        print >> ERROR_FILE, 'Not a study meta file: ' + meta_filename
        return

    command.extend([
        parsed['cancer_study_identifier'],
        "--noprogress",  # don't report memory usage and % progress
    ])
    run_java(*command)
Esempio n. 5
0
def import_study_data(jvm_args,
                      meta_filename,
                      data_filename,
                      meta_file_dictionary=None):
    """Import one data file with the Java importer chosen by its meta file.

    Args:
        jvm_args: string of JVM arguments; split on single spaces to form
            the start of the argument list passed to run_java.
        meta_filename: path of the meta file describing the data file.
        data_filename: path of the data file to import.
        meta_file_dictionary: already-parsed meta file contents; when None
            the meta file is parsed here.

    Returns None in all cases. Nothing is run, and a message is written to
    ERROR_FILE, when the meta dictionary's 'meta_file_type' is None or when
    `data_filename` does not end with the meta file's 'data_filename' value.
    """
    args = jvm_args.split(' ')

    # In case the meta file is already parsed in a previous function, it is
    # not necessary to parse it again
    if meta_file_dictionary is None:
        meta_file_dictionary = cbioportal_common.parse_metadata_file(
            meta_filename, logger=LOGGER)

    meta_file_type = meta_file_dictionary['meta_file_type']

    # invalid file, skip
    if meta_file_type is None:
        # The type is always None in this branch, so printing it carries no
        # information; name the offending meta file instead.
        print >> ERROR_FILE, (
            "Unrecognized meta file type in '%s', skipping file" %
            (meta_filename,))
        return

    if not data_filename.endswith(meta_file_dictionary['data_filename']):
        print >> ERROR_FILE, ("'data_filename' in meta file contradicts "
                              "data filename in command, skipping file")
        return

    importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type]

    args.append(importer)
    if IMPORTER_REQUIRES_METADATA[importer]:
        args.extend(["--meta", meta_filename, "--loadMode", "bulkload"])

    # The gene panel profile map importer takes the meta file ahead of the
    # data file, independently of IMPORTER_REQUIRES_METADATA.
    if importer == "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap":
        args.extend(["--meta", meta_filename])

    # Every importer receives the data file.
    args.extend(["--data", data_filename])

    # MutSig and Gistic additionally need the study identifier.
    if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData",
                    "org.mskcc.cbio.portal.scripts.ImportGisticData"):
        args.extend(
            ["--study", meta_file_dictionary['cancer_study_identifier']])

    args.append("--noprogress")  # don't report memory usage and % progress
    run_java(*args)
Esempio n. 6
0
def import_study_data(jvm_args, meta_filename, data_filename, meta_file_dictionary=None):
    """Import one data file with the Java importer chosen by its meta file.

    Args:
        jvm_args: string of JVM arguments; split on single spaces to form
            the start of the argument list passed to run_java.
        meta_filename: path of the meta file describing the data file.
        data_filename: path of the data file to import.
        meta_file_dictionary: already-parsed meta file contents; when None
            the meta file is parsed here.

    Returns None in all cases. Nothing is run, and a message is written to
    ERROR_FILE, when the meta dictionary's 'meta_file_type' is None or when
    `data_filename` does not end with the meta file's 'data_filename' value.
    """
    args = jvm_args.split(' ')

    # In case the meta file is already parsed in a previous function, it is
    # not necessary to parse it again
    if meta_file_dictionary is None:
        meta_file_dictionary = cbioportal_common.parse_metadata_file(
            meta_filename, logger=LOGGER)

    meta_file_type = meta_file_dictionary['meta_file_type']

    # invalid file, skip
    if meta_file_type is None:
        # The type is always None in this branch, so printing it carries no
        # information; name the offending meta file instead.
        print >> ERROR_FILE, ("Unrecognized meta file type in '%s', skipping file"
                              % (meta_filename,))
        return

    if not data_filename.endswith(meta_file_dictionary['data_filename']):
        print >> ERROR_FILE, ("'data_filename' in meta file contradicts "
                              "data filename in command, skipping file")
        return

    importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type]

    args.append(importer)
    if IMPORTER_REQUIRES_METADATA[importer]:
        args.extend(["--meta", meta_filename, "--loadMode", "bulkload"])

    # The gene panel profile map importer takes the meta file ahead of the
    # data file, independently of IMPORTER_REQUIRES_METADATA.
    if importer == "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap":
        args.extend(["--meta", meta_filename])

    # Every importer receives the data file.
    args.extend(["--data", data_filename])

    # MutSig and Gistic additionally need the study identifier.
    if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData",
                    "org.mskcc.cbio.portal.scripts.ImportGisticData"):
        args.extend(
            ["--study", meta_file_dictionary['cancer_study_identifier']])

    args.append("--noprogress")  # don't report memory usage and % progress
    run_java(*args)
def process_directory(jvm_args, study_directory):
    """Import a whole study directory, driven by the meta files it contains.

    Directory entries whose name contains 'meta' (delimited as the regular
    expression below requires) and that neither start with '.' nor end with
    '~' are parsed as meta files. The imports then run in this order: cancer
    types, study (removed first, then imported), sample attributes, all
    remaining data files, case lists (if a 'case_lists' sub-directory
    exists), the optional global case list, and finally the study status
    update.

    Raises RuntimeError when a meta file has no type, when more than one
    study or sample attribute meta file is found, or when either is missing.
    """

    def with_data_file(meta_path, fields):
        # pair a meta file with the data file it names, inside the study dir
        return (meta_path,
                os.path.join(study_directory, fields['data_filename']))

    study_id = None
    study_metafile = None
    study_metadata = None
    cancer_type_filepairs = []
    sample_attr_filepair = None
    regular_filepairs = []

    # read all meta files (excluding case lists) to determine what to import
    for entry in os.listdir(study_directory):
        if not re.search(r'(\b|_)meta(\b|[_0-9])', entry,
                         flags=re.IGNORECASE):
            continue
        if entry.startswith('.') or entry.endswith('~'):
            continue
        meta_path = os.path.join(study_directory, entry)

        metadata, meta_file_type = cbioportal_common.parse_metadata_file(
            meta_path, study_id=study_id, logger=LOGGER)
        if meta_file_type is None:
            # invalid meta file, let's die
            raise RuntimeError('Invalid meta file: ' + meta_path)

        # remember study id to give an error in case any other file is
        # referencing a different one
        if study_id is None and 'cancer_study_identifier' in metadata:
            study_id = metadata['cancer_study_identifier']

        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append(with_data_file(meta_path, metadata))
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_metafile is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_metafile, meta_path))
            study_metafile, study_metadata = meta_path, metadata
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], meta_path))
            sample_attr_filepair = with_data_file(meta_path, metadata)
        else:
            regular_filepairs.append(with_data_file(meta_path, metadata))

    # First, import cancer types
    for _, cancer_type_data in cancer_type_filepairs:
        import_cancer_type(jvm_args, cancer_type_data)

    # Then define the study: remove it if it exists, then import it
    if study_metafile is None:
        raise RuntimeError('No meta_study file found')
    remove_study(jvm_args, study_metafile)
    import_study(jvm_args, study_metafile)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    import_study_data(jvm_args, *sample_attr_filepair)

    # Now, import everything else
    for filepair in regular_filepairs:
        import_study_data(jvm_args, *filepair)

    # do the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    global_list_flag = study_metadata.get('add_global_case_list', 'false')
    if global_list_flag.lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)
Esempio n. 8
0
def process_directory(jvm_args, study_directory):
    """
    Import an entire study directory based on meta files found.
    1. Determine meta files in study directory.
    2. Read all meta files and determine file types.
    3. Import data files in specific order by file type.

    Import order: cancer types, study (removed first, then imported), sample
    attributes, regular data files, fusion data, expression z-scores, GSVA
    scores followed by GSVA p-values, the gene panel matrix, case lists,
    the optional global case list, and finally the study status update.

    Raises RuntimeError when a meta file has no type, when more than one
    study or sample attribute meta file is found, when either is missing,
    or when GSVA scores are present without GSVA p-values.
    """

    study_id = None
    study_meta_filename = None
    study_meta_dictionary = {}
    cancer_type_filepairs = []
    sample_attr_filepair = None
    regular_filepairs = []
    gene_panel_matrix_filepair = None
    zscore_filepairs = []
    gsva_score_filepair = None
    gsva_pvalue_filepair = None
    fusion_filepair = None

    def data_filepair(meta_path, fields):
        # Pair a meta file with the data file it names in the study dir.
        return (meta_path,
                os.path.join(study_directory, fields['data_filename']))

    def import_filepair(filepair):
        # Import one (meta, data) pair, reusing the already parsed meta file.
        meta_path, data_path = filepair
        import_study_data(jvm_args, meta_path, data_path,
                          study_meta_dictionary[meta_path])

    # Determine meta filenames in study directory
    meta_filenames = (
        os.path.join(study_directory, meta_filename)
        for meta_filename in os.listdir(study_directory) if
        re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, flags=re.IGNORECASE)
        and not (meta_filename.startswith('.') or meta_filename.endswith('~')))

    # Read all meta files (excluding case lists) to determine what to import
    for meta_filename in meta_filenames:

        # Parse meta file
        meta_dictionary = cbioportal_common.parse_metadata_file(
            meta_filename, study_id=study_id, logger=LOGGER)

        # Save meta dictionary in study meta dictionary
        study_meta_dictionary[meta_filename] = meta_dictionary

        # Retrieve meta file type
        meta_file_type = meta_dictionary['meta_file_type']
        if meta_file_type is None:
            # invalid meta file, let's die
            raise RuntimeError('Invalid meta file: ' + meta_filename)

        # remember study id to give an error in case any other file is
        # referencing a different one
        if study_id is None and 'cancer_study_identifier' in meta_dictionary:
            study_id = meta_dictionary['cancer_study_identifier']

        # Check the type of metafile. It is to know which metafile types the
        # study contains because at a later stage we want to import in a
        # specific order.

        # Check for cancer type file
        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append(
                data_filepair(meta_filename, meta_dictionary))
        # Check for meta study file
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_meta_filename is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_meta_filename, meta_filename))
            # Its dictionary is already stored in study_meta_dictionary
            study_meta_filename = meta_filename
        # Check for sample attributes
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], meta_filename))  # pylint: disable=unsubscriptable-object
            sample_attr_filepair = data_filepair(meta_filename,
                                                 meta_dictionary)
        # Check for gene panel matrix
        elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX:
            gene_panel_matrix_filepair = data_filepair(meta_filename,
                                                       meta_dictionary)
        # Check for z-score expression files
        elif meta_file_type == MetaFileTypes.EXPRESSION and meta_dictionary[
                'datatype'] == "Z-SCORE":
            zscore_filepairs.append(
                data_filepair(meta_filename, meta_dictionary))
        # Check for GSVA scores
        elif meta_file_type == MetaFileTypes.GSVA_SCORES:
            gsva_score_filepair = data_filepair(meta_filename,
                                                meta_dictionary)
        # Check for GSVA p-values
        elif meta_file_type == MetaFileTypes.GSVA_PVALUES:
            gsva_pvalue_filepair = data_filepair(meta_filename,
                                                 meta_dictionary)
        # Check for fusion data
        elif meta_file_type == MetaFileTypes.FUSION:
            fusion_filepair = data_filepair(meta_filename, meta_dictionary)
        # Add all other types of data
        else:
            regular_filepairs.append(
                data_filepair(meta_filename, meta_dictionary))

    # First, import cancer types
    for _, data_filename in cancer_type_filepairs:
        import_cancer_type(jvm_args, data_filename)

    # Then define the study
    if study_meta_filename is None:
        raise RuntimeError('No meta_study file found')
    # First remove study if exists
    remove_study(jvm_args, study_meta_filename)
    import_study(jvm_args, study_meta_filename)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    import_filepair(sample_attr_filepair)

    # Next, import everything else except gene panel, fusion data, GSVA and
    # z-score expression. If in the future more types refer to each other,
    # (like in a tree structure) this could be programmed in a recursive
    # fashion.
    for filepair in regular_filepairs:
        import_filepair(filepair)

    # Import fusion data (after mutation)
    if fusion_filepair is not None:
        import_filepair(fusion_filepair)

    # Import expression z-score (after expression)
    for filepair in zscore_filepairs:
        import_filepair(filepair)

    # Import GSVA genetic profiles (after expression and z-scores).
    # P-values are only imported together with scores.
    if gsva_score_filepair is not None:
        # Fail with a clear message instead of a TypeError from unpacking
        # None when the p-value meta file is missing.
        if gsva_pvalue_filepair is None:
            raise RuntimeError(
                'GSVA scores meta file found without a GSVA p-values '
                'meta file')
        # First the GSVA score data, second the GSVA p-value data
        import_filepair(gsva_score_filepair)
        import_filepair(gsva_pvalue_filepair)

    if gene_panel_matrix_filepair is not None:
        import_filepair(gene_panel_matrix_filepair)

    # Import the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    if study_meta_dictionary[study_meta_filename].get(
            'add_global_case_list', 'false').lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)
Esempio n. 9
0
def process_directory(jvm_args, study_directory):
    """Import an entire study directory based on meta files found.

    Directory entries whose name matches the 'meta' regular expression below
    and that neither start with '.' nor end with '~' are parsed as meta
    files. Import order: cancer types, study (removed first, then imported),
    sample attributes, regular data files, expression z-scores, GSVA scores
    followed by GSVA p-values, the gene panel matrix, case lists, the
    optional global case list, and finally the study status update.

    Raises RuntimeError when a meta file has no type, when more than one
    study or sample attribute meta file is found, when either is missing,
    or when GSVA scores are present without GSVA p-values.
    """

    def data_filepair(meta_path, fields):
        # Pair a meta file with the data file it names in the study dir.
        return (meta_path,
                os.path.join(study_directory, fields['data_filename']))

    meta_filenames = (
        os.path.join(study_directory, f) for f in os.listdir(study_directory)
        if re.search(r'(\b|_)meta(\b|[_0-9])', f, flags=re.IGNORECASE)
        and not (f.startswith('.') or f.endswith('~')))
    study_id = None
    study_metafile = None
    study_metadata = None
    cancer_type_filepairs = []
    sample_attr_filepair = None
    regular_filepairs = []
    gene_panel_matrix_filepair = None
    zscore_filepairs = []
    gsva_score_filepair = None
    gsva_pvalue_filepair = None

    # read all meta files (excluding case lists) to determine what to import
    for f in meta_filenames:
        # parse meta file
        metadata, meta_file_type = cbioportal_common.parse_metadata_file(
            f, study_id=study_id, logger=LOGGER)
        if meta_file_type is None:
            # invalid meta file, let's die
            raise RuntimeError('Invalid meta file: ' + f)
        # remember study id to give an error in case any other file is
        # referencing a different one
        if study_id is None and 'cancer_study_identifier' in metadata:
            study_id = metadata['cancer_study_identifier']

        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append(data_filepair(f, metadata))
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_metafile is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_metafile, f))
            study_metafile = f
            study_metadata = metadata
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], f))  # pylint: disable=unsubscriptable-object
            sample_attr_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX:
            gene_panel_matrix_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.EXPRESSION and metadata[
                'datatype'] == "Z-SCORE":
            zscore_filepairs.append(data_filepair(f, metadata))
        elif meta_file_type == MetaFileTypes.GSVA_SCORES:
            gsva_score_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.GSVA_PVALUES:
            gsva_pvalue_filepair = data_filepair(f, metadata)
        else:
            regular_filepairs.append(data_filepair(f, metadata))

    # First, import cancer types
    for _, data_filename in cancer_type_filepairs:
        import_cancer_type(jvm_args, data_filename)

    # Then define the study
    if study_metafile is None:
        raise RuntimeError('No meta_study file found')
    # First remove study if exists
    remove_study(jvm_args, study_metafile)
    import_study(jvm_args, study_metafile)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    meta_filename, data_filename = sample_attr_filepair
    import_study_data(jvm_args, meta_filename, data_filename)

    # Now, import everything else except gene panel, gsva & z-score
    # expression. These data types have to be imported last, because they
    # both refer to expression by source_stable_id. If in the future more
    # types refer to each other, (like in a tree structure) this could be
    # programmed in a recursive fashion.
    for meta_filename, data_filename in regular_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename)

    # Now import expression z-score
    for meta_filename, data_filename in zscore_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename)

    # Now import gsva genetic profiles; p-values are only imported together
    # with scores.
    if gsva_score_filepair is not None:
        # Fail with a clear message instead of a TypeError from unpacking
        # None when the p-value meta file is missing.
        if gsva_pvalue_filepair is None:
            raise RuntimeError(
                'GSVA scores meta file found without a GSVA p-values '
                'meta file')
        # First the score data, second the p-value data
        for meta_filename, data_filename in (gsva_score_filepair,
                                             gsva_pvalue_filepair):
            import_study_data(jvm_args, meta_filename, data_filename)

    if gene_panel_matrix_filepair is not None:
        meta_filename, data_filename = gene_panel_matrix_filepair
        import_study_data(jvm_args, meta_filename, data_filename)

    # do the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    if study_metadata.get('add_global_case_list', 'false').lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)
Esempio n. 10
0
def process_directory(jvm_args, study_directory):
    """Import a whole study directory, driven by the meta files it contains.

    Meta files are the directory entries whose name matches the 'meta'
    regular expression below and that neither start with '.' nor end with
    '~'. After all of them are parsed, the imports run in this order: cancer
    types, study (removed first, then imported), sample attributes, all
    remaining data files, case lists (if a 'case_lists' sub-directory
    exists), the optional global case list, and the study status update.

    Raises RuntimeError when a meta file has no type, when more than one
    study or sample attribute meta file is found, or when either is missing.
    """

    def looks_like_meta_file(name):
        # name contains 'meta' as required by the pattern, and is neither a
        # hidden file nor an editor backup
        if not re.search(r'(\b|_)meta(\b|[_0-9])', name, flags=re.IGNORECASE):
            return False
        return not (name.startswith('.') or name.endswith('~'))

    def data_path_for(fields):
        return os.path.join(study_directory, fields['data_filename'])

    meta_paths = [os.path.join(study_directory, name)
                  for name in os.listdir(study_directory)
                  if looks_like_meta_file(name)]

    study_id = None
    study_metafile = None
    study_metadata = None
    sample_attr_filepair = None
    cancer_type_filepairs = []
    regular_filepairs = []

    # read all meta files (excluding case lists) to determine what to import
    for meta_path in meta_paths:
        metadata, meta_file_type = cbioportal_common.parse_metadata_file(
            meta_path, study_id=study_id, logger=LOGGER)

        # invalid meta file, let's die
        if meta_file_type is None:
            raise RuntimeError('Invalid meta file: ' + meta_path)

        # remember study id to give an error in case any other file is
        # referencing a different one
        if study_id is None and 'cancer_study_identifier' in metadata:
            study_id = metadata['cancer_study_identifier']

        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append((meta_path, data_path_for(metadata)))
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_metafile is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_metafile, meta_path))
            study_metafile, study_metadata = meta_path, metadata
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], meta_path))
            sample_attr_filepair = (meta_path, data_path_for(metadata))
        else:
            regular_filepairs.append((meta_path, data_path_for(metadata)))

    # First, import cancer types
    for _, cancer_type_data in cancer_type_filepairs:
        import_cancer_type(jvm_args, cancer_type_data)

    # Then define the study: remove it if it exists, then import it
    if study_metafile is None:
        raise RuntimeError('No meta_study file found')
    remove_study(jvm_args, study_metafile)
    import_study(jvm_args, study_metafile)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    sample_meta, sample_data = sample_attr_filepair
    import_study_data(jvm_args, sample_meta, sample_data)

    # Now, import everything else
    for other_meta, other_data in regular_filepairs:
        import_study_data(jvm_args, other_meta, other_data)

    # do the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    global_list_flag = study_metadata.get('add_global_case_list', 'false')
    if global_list_flag.lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)
def process_directory(jvm_args, study_directory):
    """Import an entire study directory based on meta files found.

    Directory entries whose name matches the 'meta' regular expression below
    and that neither start with '.' nor end with '~' are parsed as meta
    files. Import order: cancer types, study (removed first, then imported),
    sample attributes, regular data files, expression z-scores, GSVA scores
    followed by GSVA p-values, the gene panel matrix, case lists, the
    optional global case list, and finally the study status update.

    Raises RuntimeError when a meta file has no type, when more than one
    study or sample attribute meta file is found, when either is missing,
    or when GSVA scores are present without GSVA p-values.
    """

    def data_filepair(meta_path, fields):
        # Pair a meta file with the data file it names in the study dir.
        return (meta_path,
                os.path.join(study_directory, fields['data_filename']))

    meta_filenames = (
        os.path.join(study_directory, f) for
        f in os.listdir(study_directory) if
        re.search(r'(\b|_)meta(\b|[_0-9])', f,
                  flags=re.IGNORECASE) and
        not (f.startswith('.') or f.endswith('~')))
    study_id = None
    study_metafile = None
    study_metadata = None
    cancer_type_filepairs = []
    sample_attr_filepair = None
    regular_filepairs = []
    gene_panel_matrix_filepair = None
    zscore_filepairs = []
    gsva_score_filepair = None
    gsva_pvalue_filepair = None

    # read all meta files (excluding case lists) to determine what to import
    for f in meta_filenames:
        # parse meta file
        metadata, meta_file_type = cbioportal_common.parse_metadata_file(
            f, study_id=study_id, logger=LOGGER)
        if meta_file_type is None:
            # invalid meta file, let's die
            raise RuntimeError('Invalid meta file: ' + f)
        # remember study id to give an error in case any other file is
        # referencing a different one
        if study_id is None and 'cancer_study_identifier' in metadata:
            study_id = metadata['cancer_study_identifier']

        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append(data_filepair(f, metadata))
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_metafile is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_metafile, f))
            study_metafile = f
            study_metadata = metadata
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], f))   # pylint: disable=unsubscriptable-object
            sample_attr_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX:
            gene_panel_matrix_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.EXPRESSION and metadata['datatype'] == "Z-SCORE":
            zscore_filepairs.append(data_filepair(f, metadata))
        elif meta_file_type == MetaFileTypes.GSVA_SCORES:
            gsva_score_filepair = data_filepair(f, metadata)
        elif meta_file_type == MetaFileTypes.GSVA_PVALUES:
            gsva_pvalue_filepair = data_filepair(f, metadata)
        else:
            regular_filepairs.append(data_filepair(f, metadata))

    # First, import cancer types
    for _, data_filename in cancer_type_filepairs:
        import_cancer_type(jvm_args, data_filename)

    # Then define the study
    if study_metafile is None:
        raise RuntimeError('No meta_study file found')
    # First remove study if exists
    remove_study(jvm_args, study_metafile)
    import_study(jvm_args, study_metafile)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    meta_filename, data_filename = sample_attr_filepair
    import_study_data(jvm_args, meta_filename, data_filename)

    # Now, import everything else except gene panel, gsva & z-score
    # expression. These data types have to be imported last, because they
    # both refer to expression by source_stable_id. If in the future more
    # types refer to each other, (like in a tree structure) this could be
    # programmed in a recursive fashion.
    for meta_filename, data_filename in regular_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename)

    # Now import expression z-score
    for meta_filename, data_filename in zscore_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename)

    # Now import gsva genetic profiles; p-values are only imported together
    # with scores.
    if gsva_score_filepair is not None:
        # Fail with a clear message instead of a TypeError from unpacking
        # None when the p-value meta file is missing.
        if gsva_pvalue_filepair is None:
            raise RuntimeError(
                'GSVA scores meta file found without a GSVA p-values '
                'meta file')
        # First the score data, second the p-value data
        for meta_filename, data_filename in (gsva_score_filepair,
                                             gsva_pvalue_filepair):
            import_study_data(jvm_args, meta_filename, data_filename)

    if gene_panel_matrix_filepair is not None:
        meta_filename, data_filename = gene_panel_matrix_filepair
        import_study_data(jvm_args, meta_filename, data_filename)

    # do the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    if study_metadata.get('add_global_case_list', 'false').lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)
Esempio n. 12
0
def process_directory(jvm_args, study_directory):
    """
    Import an entire study directory based on meta files found.

    1. Determine meta files in study directory.
    2. Read all meta files and determine file types.
    3. Import data files in specific order by file type.

    Arguments:
    jvm_args -- handed unchanged to every import helper that is called
    study_directory -- directory containing the meta files; a 'case_lists'
        subdirectory is processed as well when it exists

    Raises RuntimeError when a meta file is invalid, when more than one
    meta_study or sample attribute file is found, when the meta_study or
    sample attribute file is missing, or when a GSVA score file is present
    without a GSVA p-value file.
    """

    study_id = None
    study_meta_filename = None
    study_meta_dictionary = {}
    cancer_type_filepairs = []
    sample_attr_filepair = None
    regular_filepairs = []
    gene_panel_matrix_filepair = None
    zscore_filepairs = []
    gsva_score_filepair = None
    gsva_pvalue_filepair = None
    fusion_filepair = None

    # Determine meta filenames in study directory; hidden files (leading
    # '.') and editor backups (trailing '~') are skipped.
    meta_filenames = (
        os.path.join(study_directory, meta_filename) for
        meta_filename in os.listdir(study_directory) if
        re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename,
                  flags=re.IGNORECASE) and
        not (meta_filename.startswith('.') or meta_filename.endswith('~')))

    # Read all meta files (excluding case lists) to determine what to import
    for meta_filename in meta_filenames:

        # Parse meta file
        meta_dictionary = cbioportal_common.parse_metadata_file(
            meta_filename, study_id=study_id, logger=LOGGER)

        # Save meta dictionary in study meta dictionary, keyed by the path of
        # the meta file, so it can be handed to import_study_data() later
        study_meta_dictionary[meta_filename] = meta_dictionary

        # Retrieve meta file type
        meta_file_type = meta_dictionary['meta_file_type']
        if meta_file_type is None:
            # invalid meta file, let's die
            raise RuntimeError('Invalid meta file: ' + meta_filename)

        # remember study id to give an error in case any other file is referencing a different one
        if study_id is None and 'cancer_study_identifier' in meta_dictionary:
            study_id = meta_dictionary['cancer_study_identifier']

        # Check the type of metafile. It is to know which metafile types the
        # study contains because at a later stage we want to import in a
        # specific order.

        # Check for cancer type file
        if meta_file_type == MetaFileTypes.CANCER_TYPE:
            cancer_type_filepairs.append(
                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
        # Check for meta study file
        elif meta_file_type == MetaFileTypes.STUDY:
            if study_meta_filename is not None:
                raise RuntimeError(
                    'Multiple meta_study files found: {} and {}'.format(
                        study_meta_filename, meta_filename))
            # Determine the study meta filename (its dictionary has already
            # been stored in study_meta_dictionary above)
            study_meta_filename = meta_filename
        # Check for sample attributes
        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
            if sample_attr_filepair is not None:
                raise RuntimeError(
                    'Multiple sample attribute files found: {} and {}'.format(
                        sample_attr_filepair[0], meta_filename))   # pylint: disable=unsubscriptable-object
            sample_attr_filepair = (
                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
        # Check for gene panel matrix
        elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX:
            gene_panel_matrix_filepair = (
                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
        # Check for z-score expression files
        elif meta_file_type == MetaFileTypes.EXPRESSION and meta_dictionary['datatype'] == "Z-SCORE":
            zscore_filepairs.append(
                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
        # Check for GSVA scores
        elif meta_file_type == MetaFileTypes.GSVA_SCORES:
            gsva_score_filepair = (
                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
        # Check for GSVA p-values
        elif meta_file_type == MetaFileTypes.GSVA_PVALUES:
            gsva_pvalue_filepair = (
                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
        # Check for fusion data
        elif meta_file_type == MetaFileTypes.FUSION:
            fusion_filepair = (
                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
        # Add all other types of data
        else:
            regular_filepairs.append(
                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))

    # First, import cancer types
    for meta_filename, data_filename in cancer_type_filepairs:
        import_cancer_type(jvm_args, data_filename)

    # Then define the study
    if study_meta_filename is None:
        raise RuntimeError('No meta_study file found')
    # First remove study if exists
    remove_study(jvm_args, study_meta_filename)
    import_study(jvm_args, study_meta_filename)

    # Next, we need to import sample definitions
    if sample_attr_filepair is None:
        raise RuntimeError('No sample attribute file found')
    meta_filename, data_filename = sample_attr_filepair
    import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    # Next, import everything else except gene panel, fusion data, GSVA and
    # z-score expression. If in the future more types refer to each other, (like
    # in a tree structure) this could be programmed in a recursive fashion.
    for meta_filename, data_filename in regular_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    # Import fusion data (after mutation)
    if fusion_filepair is not None:
        meta_filename, data_filename = fusion_filepair
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    # Import expression z-score (after expression)
    for meta_filename, data_filename in zscore_filepairs:
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    # Import GSVA genetic profiles (after expression and z-scores)
    # NOTE(review): a GSVA p-value file without a score file is not imported
    # by this function -- confirm that this is intended.
    if gsva_score_filepair is not None:

        # Both halves are imported together; fail with a clear message
        # instead of a TypeError from unpacking None further down.
        if gsva_pvalue_filepair is None:
            raise RuntimeError(
                'GSVA score file found but no GSVA p-value file: {}'.format(
                    gsva_score_filepair[0]))

        # First import the GSVA score data
        meta_filename, data_filename = gsva_score_filepair
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

        # Second import the GSVA p-value data
        meta_filename, data_filename = gsva_pvalue_filepair
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    if gene_panel_matrix_filepair is not None:
        meta_filename, data_filename = gene_panel_matrix_filepair
        import_study_data(jvm_args, meta_filename, data_filename, study_meta_dictionary[meta_filename])

    # Import the case lists
    case_list_dirname = os.path.join(study_directory, 'case_lists')
    if os.path.isdir(case_list_dirname):
        process_case_lists(jvm_args, case_list_dirname)

    # The meta_study file may request an extra global case list
    if study_meta_dictionary[study_meta_filename].get('add_global_case_list', 'false').lower() == 'true':
        add_global_case_list(jvm_args, study_id)

    # enable study
    update_study_status(jvm_args, study_id)