Example #1
0
def add_design(session,
               design_file,
               dataset_name,
               name=None,
               description=None,
               sample_groups_file=None,
               sample_group_levels_file=None,
               sample_group_pheno_column=None):
    """Register a new design for an existing data set and persist its
    sample subset.

    The design matrix is read from ``design_file`` (tab-separated, first
    column used as index). Raises ROGERUsageError if a design with the
    same name already exists for the data set. Returns the (possibly
    guessed) design name.
    """
    dataset = get_ds(session, dataset_name)
    name = get_or_guess_name(name, design_file)

    # Guard against duplicate design names within the same data set.
    if query_design(session, name, dataset_name).one_or_none() is not None:
        raise ROGERUsageError(
            "Design of data set '%s' with name '%s' already exist" %
            (dataset_name, name))

    matrix = read_table(design_file, sep='\t', index_col=0)
    groups = read_array(sample_groups_file, nullable=True)
    group_levels = read_array(sample_group_levels_file, nullable=True)
    data = create_design_data(matrix, dataset.pheno_data, name, description,
                              groups, group_levels,
                              sample_group_pheno_column)

    # Persist the design first so its generated ID is available for the
    # sample subset rows below.
    data.design.DataSetID = dataset.ID
    session.add(data.design)
    session.flush()

    data.sample_subset["DesignID"] = data.design.ID
    insert_data_frame(session, data.sample_subset, SampleSubset.__table__)

    session.commit()
    return name
Example #2
0
def add_contrast(session,
                 contrast_file,
                 design_name,
                 dataset_name,
                 name=None,
                 description=None):
    """Attach a contrast, read from a tab-separated file, to an existing
    design and persist one ContrastColumn row per contrast column.

    Raises ROGERUsageError when a contrast with the same name already
    exists for the design. Returns the (possibly guessed) contrast name.
    """
    design = get_design(session, design_name, dataset_name)
    name = get_or_guess_name(name, contrast_file)

    duplicate = query_contrast(session, name, design_name,
                               dataset_name).one_or_none()
    if duplicate is not None:
        raise ROGERUsageError("Contrast '%s' already exist in '%s'" %
                              (name, design_name))

    contrast = Contrast(DesignID=design.ID,
                        Name=name,
                        Description=description,
                        CreatedBy=get_current_user_name(),
                        CreationTime=get_current_datetime())
    session.add(contrast)
    # Flush so the generated contrast ID can be referenced below.
    session.flush()

    matrix = read_table(contrast_file, sep='\t', index_col=0)
    check_contrast_matrix(design.design_matrix.columns, matrix)

    column_names = matrix.columns
    # One row per contrast column; raw column values go into ColumnData
    # as plain lists.
    column_values = [
        matrix[column].values.tolist() for column in column_names
    ]
    column_table = DataFrame({
        "ContrastID": contrast.ID,
        "DesignID": design.ID,
        "Name": column_names,
        "Description": column_names,
        "ColumnData": column_values
    })

    insert_data_frame(session, column_table, ContrastColumn.__table__)

    session.commit()
    return name
Example #3
0
def perform_gse(session: Session,
                roger_wd_dir: str,
                dge_model: DGEmodel,
                algorithm: GSEAlgorithm,
                gene_set_category_filter: List[str] = None):
    """Run gene set enrichment analysis for a fitted DGE model and
    persist the per-gene-set results.

    :param session: SQLAlchemy session used for all queries and inserts.
    :param roger_wd_dir: ROGER working directory; the raw GSE table is
        written under its GSE result sub-folder.
    :param dge_model: the differential-gene-expression model whose
        contrasts are enriched.
    :param algorithm: GSE algorithm wrapper; ``exec_gse`` performs the
        actual enrichment computation.
    :param gene_set_category_filter: optional list of gene set category
        names to restrict the analysis to (None means all categories).
    :raises ROGERUsageError: if a result for this combination already
        exists, or if no gene sets have been imported yet.
    """
    # Refuse to recompute: at most one result per
    # contrast/design/dataset/DGE-method/algorithm combination.
    existing_results = get_gse_result(session, dge_model.Contrast.Name,
                                      dge_model.Contrast.Design.Name,
                                      dge_model.Contrast.Design.DataSet.Name,
                                      dge_model.Method.Name, algorithm.name)
    if existing_results:
        raise ROGERUsageError(
            "Result for %s:%s:%s:%s:%s already exists" %
            (dge_model.Contrast.Name, dge_model.Contrast.Design.Name,
             dge_model.Contrast.Design.DataSet.Name, dge_model.Method.Name,
             algorithm.name))

    # Map category name -> working-copy GMT file path for every imported
    # gene set collection (optionally filtered by category).
    gene_sets = get_gmt_locations(session, gene_set_category_filter)
    gscs_list = {
        gene_set.Category: gene_set.FileWC
        for index, gene_set in gene_sets.iterrows()
    }

    if len(gscs_list) == 0:
        raise ROGERUsageError(
            "Cannot perform GSE without preexisting gene sets (did you import GMT files?)"
        )

    # Parse the GMT files on the R side; ListVector suggests an rpy2
    # bridge to the ribios_gsea R package — TODO confirm.
    gscs = ribios_gsea.readGmt(ListVector(gscs_list))

    contrast_columns = dge_model.Contrast.contrast_columns

    # Resolve the GSE method row that belongs to both this DGE method and
    # the chosen algorithm name.
    gse_method_id = session.query(GSEmethod.ID) \
        .filter(GSEmethod.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmethod.ID == dge_model.Method.ID) \
        .filter(GSEmethod.Name == algorithm.name).scalar()

    gse_algo_result = algorithm.exec_gse(dge_model, gscs)
    enrich_tbl = gse_algo_result.raw_gse_table

    # Lower-case category/name columns on both sides so the join below
    # is effectively case-insensitive.
    gene_sets.Category = gene_sets.Category.str.lower()
    gene_sets.Name = gene_sets.Name.str.lower()
    enrich_tbl.Category = enrich_tbl.Category.str.lower()
    enrich_tbl.GeneSet = enrich_tbl.GeneSet.str.lower()
    # First join attaches gene set metadata (its ID ends up as
    # ID_GENE_SET via lsuffix); second join attaches the contrast column
    # row matching the result's "Contrast" label.
    merged_enrich_tbl = enrich_tbl.join(
        gene_sets.set_index(['Category', 'Name']),
        on=['Category', "GeneSet"]).join(contrast_columns.set_index("Name"),
                                         on="Contrast",
                                         lsuffix="_GENE_SET")

    # Persist the raw (pre-join) enrichment table as a flat file under
    # <roger_wd_dir>/<GSE_RESULT_SUB_FOLDER>/<contrastID>_<algorithm>/.
    gse_method_sub_dir = "%d_%s" % (dge_model.Contrast.ID, algorithm.name)
    gse_models_path = os.path.join(roger_wd_dir, GSE_RESULT_SUB_FOLDER)
    gse_model_path = os.path.join(gse_models_path, gse_method_sub_dir)
    if not os.path.exists(gse_model_path):
        os.makedirs(gse_model_path)

    gse_result_file = os.path.join(gse_model_path, "gse_table.txt")
    write_df(enrich_tbl, gse_result_file)

    gse_result = GSEresult(ContrastID=dge_model.ContrastID,
                           DGEmethodID=dge_model.DGEmethodID,
                           GSEmethodID=gse_method_id,
                           OutputFile=gse_result_file,
                           MethodDescription=gse_algo_result.method_desc)
    session.add(gse_result)
    # Flush to obtain gse_result.ID for the rows below.
    session.flush()

    # Build the relational GSE table; Direction is encoded as +1/-1 and
    # the enrichment score is the signed |log10(p-value)|.
    gse_tbl = DataFrame({
        "GSEresultID":
        gse_result.ID,
        "ContrastColumnID":
        merged_enrich_tbl.ID,
        "GeneSetID":
        merged_enrich_tbl.ID_GENE_SET,
        "Correlation":
        merged_enrich_tbl.Correlation,
        "Direction":
        merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }),
        "PValue":
        merged_enrich_tbl.PValue,
        "FDR":
        merged_enrich_tbl.FDR,
        "EnrichmentScore":
        merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }) * abs(log10(merged_enrich_tbl.PValue)),
        "EffGeneCount":
        merged_enrich_tbl.NGenes
    })
    # Rows whose gene set could not be joined get a null GeneSetID; they
    # are reported and dropped rather than inserted.
    unmapped = gse_tbl[gse_tbl.GeneSetID.isnull()]
    mapped = gse_tbl[~gse_tbl.GeneSetID.isnull()]
    if unmapped.shape[0] > 0:
        print("Warning: unable to map %d of %d entries to gene sets " %
              (unmapped.shape[0], merged_enrich_tbl.shape[0]))

    # Drop duplicated (contrast column, gene set) pairs before insert.
    mapped_duplications = mapped.drop_duplicates(
        subset=['ContrastColumnID', 'GeneSetID'])

    if mapped_duplications.shape[0] < mapped.shape[0]:
        print(
            "Warning: %d of %d entries of mapped result entries are duplicated"
            %
            (mapped.shape[0] - mapped_duplications.shape[0], mapped.shape[0]))

    insert_data_frame(session, mapped_duplications, GSEtable.__table__)
    session.commit()
Example #4
0
def add_gmt(session,
            roger_wd_dir,
            category_name,
            file,
            tax_id,
            description=None):
    """Import a GMT file as a new gene set category.

    Copies the GMT file into the ROGER working directory, creates the
    category, one GeneSet per GMT entry, and bulk-inserts the member
    genes that can be matched against the stored gene annotation for
    ``tax_id``.

    Returns the fraction of gene rows that could NOT be matched to a
    known gene symbol.
    """
    annotation = as_data_frame(
        session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == tax_id))
    # TODO Make min_size configurable?
    gmt = gsea_gmt_parser(file, min_size=1, max_size=sys.maxsize)

    gene_sets_dir = os.path.join(roger_wd_dir, GENE_SET_SUB_FOLDER)
    working_copy = os.path.join(gene_sets_dir, os.path.basename(file))

    category = GeneSetCategory(Name=category_name,
                               FileWC=working_copy,
                               FileSrc=os.path.abspath(file))
    session.add(category)

    # Keep a working copy of the raw GMT file alongside the DB entry.
    if not os.path.exists(gene_sets_dir):
        os.makedirs(gene_sets_dir)
    shutil.copy(file, working_copy)

    session.flush()

    gene_sets = [
        GeneSet(Category=category,
                Name=set_name,
                TaxID=tax_id,
                Description=description,
                GeneCount=len(members),
                IsPrivate=False) for set_name, members in gmt.items()
    ]
    session.add_all(gene_sets)
    # Flush so every GeneSet has its generated ID before we reference it.
    session.flush()
    sets_by_name = {gene_set.Name: gene_set for gene_set in gene_sets}

    # Flatten the GMT mapping into parallel GeneSetID / GeneSymbol columns.
    set_ids = []
    symbols = []
    for set_name, members in gmt.items():
        set_ids += [sets_by_name[set_name].ID] * len(members)
        symbols += members

    genes_table = pd.DataFrame.from_dict({
        'GeneSetID': set_ids,
        'GeneSymbol': symbols
    })
    annotated_genes = genes_table.join(annotation.set_index('GeneSymbol'),
                                       on='GeneSymbol')
    # Filter out non-matching genes
    matched_genes = annotated_genes[annotated_genes.RogerGeneIndex.notna()] \
        .drop_duplicates(subset=['RogerGeneIndex', 'GeneSetID'], keep=False)

    # Bulk insert all gene set genes
    insert_data_frame(session,
                      matched_genes,
                      GeneSetGene.__table__,
                      chunk_size=100000)
    session.commit()

    # Report number of gene symbols that could not be matched with gene annotation
    unmatched_count = annotated_genes.shape[0] - matched_genes.shape[0]
    return unmatched_count / float(annotated_genes.shape[0])
Example #5
0
def add_species(session, dataset_name, tax_id):
    """Import gene annotation (and human orthologs) for one species.

    ``dataset_name`` looks like an Ensembl biomart dataset id (e.g.
    ``<prefix>_gene_ensembl``, given the regexes below). Human
    annotation must be imported first because ortholog mapping joins
    against it. ``human_tax_id`` and ``human_dataset`` are presumably
    module-level constants — not visible in this chunk, confirm.
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()
    # Check if dataset is already preset in the database
    species_table = list_species(session)

    # Human annotation is the ortholog reference, so it must exist
    # before any non-human species can be added.
    if species_table[species_table.TaxID ==
                     human_tax_id].empty and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    if not species_table[species_table.TaxID == tax_id].empty:
        raise ROGERUsageError('Species already exists in database: %s' %
                              dataset_name)

    # Derive the biomart homolog attribute/filter names from the dataset
    # name, e.g. hsapiens_gene_ensembl -> hsapiens_homolog_ensembl_gene
    # and with_hsapiens_homolog.
    homolog_attr = re.sub(r'(\w+)_gene_ensembl', r'\1_homolog_ensembl_gene',
                          dataset_name)
    homolog_filter = re.sub(r'(\w+)_gene_ensembl', r'with_\1_homolog',
                            dataset_name)

    # Insert Gene annotation
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    # Version string combines dataset name with the parenthesized part of
    # its display name (assumed to hold the release/version — confirm).
    version = "%s %s" % (dataset_name,
                         re.search(r'[^(]+\(([^)]+)\)',
                                   dataset.display_name).group(1))

    gene_anno = dataset.get_bulk_query(
        params={
            'attributes': [
                "ensembl_gene_id", "entrezgene", "gene_biotype",
                "external_gene_name"
            ]
        })

    # Allocate a contiguous block of RogerGeneIndex ids for the new rows.
    next_id = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)

    genes = DataFrame({
        'RogerGeneIndex':
        range(next_id, next_id + gene_anno.shape[0]),
        'Version':
        version,
        'TaxID':
        tax_id,
        'EnsemblGeneID':
        gene_anno["ensembl_gene_id"],
        'EntrezGeneID':
        gene_anno["entrezgene"],
        'GeneType':
        gene_anno["gene_biotype"],
        'GeneSymbol':
        gene_anno["external_gene_name"],
        'IsObsolete':
        False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)

    # Insert orthologs
    # Human genes are their own orthologs; no biomart query needed.
    if tax_id == human_tax_id:
        orthologs = DataFrame({
            'RogerGeneIndex': genes["RogerGeneIndex"],
            'HumanRogerGeneIndex': genes["RogerGeneIndex"]
        })
        insert_data_frame(session, orthologs, Ortholog.__table__)
        session.commit()
        return

    # Non-human path: fetch this species' human homologs from biomart and
    # map both sides onto RogerGeneIndex values.
    huma_anno_query = as_data_frame(
        session.query(GeneAnnotation).filter(
            GeneAnnotation.TaxID == human_tax_id))
    ortho = annotation_service.get_bulk_query(
        human_dataset,
        params={
            'attributes': ["ensembl_gene_id", homolog_attr],
            'filters': {
                homolog_filter: True
            }
        })
    # Join human annotation on the human ensembl id and the freshly
    # inserted species genes on the homolog attribute column.
    merged_ortho = ortho.join(huma_anno_query.set_index('EnsemblGeneID'), on='ensembl_gene_id') \
        .join(genes.set_index('EnsemblGeneID'), on=homolog_attr, lsuffix='Human', rsuffix='Other')

    orthologs = DataFrame({
        'RogerGeneIndex':
        merged_ortho["RogerGeneIndexOther"],
        'HumanRogerGeneIndex':
        merged_ortho["RogerGeneIndexHuman"]
    })
    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
Example #6
0
def run_dge(session, roger_wd_dir, contrast, design, dataset,
            algorithm: DGEAlgorithm):
    """Run differential gene expression analysis for one contrast and
    persist the fitted model, feature subset, and result table.

    :param session: SQLAlchemy session used for all queries and inserts.
    :param roger_wd_dir: ROGER working directory; model R objects are
        saved under its DGE result sub-folder.
    :param contrast: contrast name.
    :param design: design name.
    :param dataset: data set name.
    :param algorithm: DGE algorithm wrapper; ``exec_dge`` performs the
        actual fit.
    :raises ROGERUsageError: if a model for this combination was already
        generated with the same method.
    """
    model = query_dge_models(session, contrast, design, dataset,
                             algorithm.name, DGEmodel).one_or_none()
    if model is not None:
        raise ROGERUsageError(
            "A model for %s:%s:%s has already been generated by the method '%s'"
            % (dataset, design, contrast, algorithm.name))

    print("Retrieving data from database")
    contrast_data = get_contrast(session, contrast, design, dataset)
    design_data = contrast_data.Design
    ds_data = design_data.DataSet

    feature_data = ds_data.feature_data

    print("Performing differential gene expression analysis using %s" %
          algorithm.name)
    contrast_matrix = contrast_data.contrast_matrix
    dge_result = algorithm.exec_dge(ds_data.ExprsWC, feature_data, design_data,
                                    contrast_matrix)

    print("Persisting model information")
    method = session.query(DGEmethod).filter(
        DGEmethod.Name == algorithm.name).one()

    # Model artifacts live in <roger_wd_dir>/<DGE_RESULT_SUB_FOLDER>/
    # <contrastID>_<methodID>/.
    dge_method_sub_dir = "%d_%d" % (contrast_data.ID, method.ID)
    dge_models_path = os.path.join(roger_wd_dir, DGE_RESULT_SUB_FOLDER)
    dge_model_path = os.path.join(dge_models_path, dge_method_sub_dir)
    if not os.path.exists(dge_model_path):
        os.makedirs(dge_model_path)

    # Serialize the R input/fit objects via base::saveRDS (rpy2 bridge).
    input_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_input_obj.rds"))
    base.saveRDS(dge_result.input_obj, file=input_obj_file)

    # NOTE(review): unlike the input file, this path has no ".rds"
    # extension — looks like an oversight, but readers of FitObjFile may
    # depend on the exact stored path, so it is left unchanged; confirm.
    fit_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_fit_obj"))
    base.saveRDS(dge_result.fit_obj, file=fit_obj_file)

    dge_model = DGEmodel(ContrastID=contrast_data.ID,
                         DGEmethodID=method.ID,
                         InputObjFile=input_obj_file,
                         FitObjFile=fit_obj_file,
                         MethodDescription=dge_result.method_description)

    session.add(dge_model)
    session.flush()

    print("Persisting feature subsets")
    # Records which features the algorithm actually used for the fit.
    feature_subset = pd.DataFrame({
        "FeatureIndex":
        feature_data["FeatureIndex"],
        "DataSetID":
        ds_data.ID,
        "ContrastID":
        contrast_data.ID,
        "DGEmethodID":
        method.ID,
        "IsUsed":
        dge_result.used_feature_list,
        "Description":
        "Default filtering by '%s'" % algorithm.name
    })
    insert_data_frame(session, feature_subset, FeatureSubset.__table__)

    # Attach contrast-column IDs (via "Contrast") and feature metadata
    # (via "FeatureIndex") to the per-gene result table.
    dge_tbl = dge_result.dge_table \
        .join(contrast_data.contrast_columns.set_index("Name"), on="Contrast", rsuffix="_C") \
        .join(feature_data.set_index("FeatureIndex"), on="FeatureIndex", rsuffix="_F")

    dgetable = pd.DataFrame({
        'ContrastColumnID': dge_tbl["ID"],
        'FeatureIndex': dge_tbl["FeatureIndex"],
        "ContrastID": contrast_data.ID,
        "DGEmethodID": method.ID,
        'DataSetID': dge_tbl["DataSetID"],
        'AveExprs': dge_tbl["AveExpr"],
        'Statistic': dge_tbl["t"],
        'LogFC': dge_tbl["logFC"],
        'PValue': dge_tbl["PValue"],
        'FDR': dge_tbl["FDR"]
    })
    insert_data_frame(session, dgetable, DGEtable.__table__)
    session.commit()