Example #1
import pandas as pd
import data_reader
from os import getcwd
# extract_mirnas_r, extract_mirnas_r_guihuasun and create_gmt_file are helpers
# defined elsewhere in this module.

def main():
    # Reads in a data set
    df, tar, grp, _, _ = data_reader.read_main(raw=True)

    # Code for DS 4 (GuihuaSun)
    """
    missing = ['110608_TGCTCG_s_5', '110608_TGCTCG_s_1', '110608_TCGTCG_s_6', '110608_TCGTCG_s_5', '110602_TGCTCG_s_4', '110602_TGCTCG_s_3', '110602_TCGTCG_s_4', '110602_TCGTCG_s_3']
    df = df.drop(missing)
    tar = tar.drop(missing)
    grp = grp.drop(missing)
    #lst = extract_mirnas_r_guihuasun(df, ss, gs_name="_4")
    """

    # Running extract_mirnas_r
    ss = pd.DataFrame([tar, grp])
    ss = ss.rename(
        {ss.axes[0][0]: 'groups', ss.axes[0][1]: 'block'},
        axis='index'
    ).transpose()
    df = df.transpose()
    lst = extract_mirnas_r(df, ss, gs_name="_3")

    # Creating gmt file
    path = getcwd().replace('\\', '/') + "/Out/"
    create_gmt_file(lst, path, new=True)
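
    # For reference, each line of a .gmt gene-set file is tab-separated:
    #   <set name> <description> <gene 1> <gene 2> ...
    # What create_gmt_file actually writes depends on its implementation,
    # which is not shown here.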
Example #2
import pandas as pd
import gseapy
import data_reader
import df_utils
import scaler as MiRNAScaler
from os import getcwd
from utils import latexify
# make_density_plot and make_full_density_plot are plotting helpers defined
# elsewhere in this module.

def test_make_density_plot():
    df, _, _, length, _ = data_reader.read_main(raw=False)
    features = df.axes[1]
    samples = df.axes[0]
    df = MiRNAScaler.choose_scaling(df, length)
    df = pd.DataFrame(df, index=samples, columns=features)
    df = df.transpose()
    #make_density_plot(df)
    latexify(columns=2)
    make_full_density_plot(df, 'Density Plot of Hepmark Tissue')
def main():
    # Import data
    df, tar, grp, lengths, _ = data_reader.read_main(raw=True)

    # Log-transform, keeping all columns as they will be used in the GSEA.
    df = df_utils.transform_sequence_to_microarray(df.T, all=True)

    # Handling for microarray set 0 as this does not require log transformation
    """
    df1, _, _ = data_reader.read_number(1)
    df2, _, _ = data_reader.read_number(2)
    df_len = len(df)
    df = df_utils.merge_frames([df,df1,df2], drop=False)
    df = df.head(df_len)
    """


    sample = df.T

    ss = gseapy.ssgsea(data=sample,
                       gene_sets='Out/gmt_hepmark.gmt',
                       no_plot=True,
                       outdir='Out/gsea_hepmark',
                       min_size=10)
    # "When you run the gene set enrichment analysis, the GSEA software automatically normalizes
    # the enrichment scores for variation in gene set size, as described in GSEA Statistics.
    # Nevertheless, the normalization is not very accurate for extremely small or extremely
    # large gene sets. For example, for gene sets with fewer than 10 genes, just 2 or 3 genes
    # can generate significant results. Therefore, by default, GSEA ignores gene sets that
    # contain fewer than 25 genes or more than 500 genes; defaults that are appropriate for
    # datasets with 10,000 to 20,000 features. To change these default values, use the Max Size
    # and Min Size parameters on the Run GSEA Page; however, keep in mind the possibility of
    # inflated scorings for very small gene sets and inaccurate normalization for large ones."
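
    # Per the note above, gseapy exposes these thresholds as keyword arguments; a
    # commented-out sketch of tightening them (the outdir name is illustrative only):
    # ss_strict = gseapy.ssgsea(data=sample,
    #                           gene_sets='Out/gmt_hepmark.gmt',
    #                           no_plot=True,
    #                           outdir='Out/gsea_hepmark_strict',
    #                           min_size=25,
    #                           max_size=500)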

    # Build one row of enrichment scores per sample
    rows = []
    for s in ss.resultsOnSamples:
        row = [s]
        for val in ss.resultsOnSamples[s]:
            row.append(val)
        rows.append(row)
    columns = ['index']
    # Column names are taken from the last sample's result Series; this assumes
    # every sample was scored against the same gene sets.
    columns.extend(ss.resultsOnSamples[s].axes[0])
    # NB: do not use res2d here, as it holds the normalized enrichment scores
    # rather than the raw per-sample ES.

    # Create es file
    df_out = pd.DataFrame(rows, columns=columns)
    df_out.set_index('index', inplace=True)
    path = getcwd().replace('\\', '/') + "/Out/enrichment_scores/"
    df_out.to_csv(path + "es_test.csv")
Example #4
from sklearn import svm
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from scipy import interp
import scaler as MiRNAScaler
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import numpy as np
import pandas as pd
import data_reader

# Import data
df, target, group, lengths, es = data_reader.read_main()

features = df.axes[1]
samples = df.axes[0]

# Scale data
print(df.shape)
X = MiRNAScaler.choose_scaling(df, lengths)
df = pd.DataFrame(X, index=samples, columns=features)
print("DF shape", X.shape)

# Set seed for reproducibility
np.random.seed(0)

n_samples, n_features = X.shape
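
# The imports above set up a cross-validated ROC analysis (StratifiedKFold, SVM,
# roc_curve/auc). A generic, illustrative sketch of that pattern, not the original
# script; it assumes binary integer labels y derived from target:
#
#   y = np.array([0 if l == 'Normal' else 1 for l in target])
#   cv = StratifiedKFold(n_splits=5)
#   clf = svm.SVC(kernel='linear', probability=True)
#   for train, test in cv.split(X, y):
#       probas = clf.fit(X[train], y[train]).predict_proba(X[test])
#       fpr, tpr, _ = roc_curve(y[test], probas[:, 1])
#       print("Fold AUC: %.3f" % auc(fpr, tpr))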
Example #5
import df_utils
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from scipy import interp
import scaler as MiRNAScaler
from sklearn.neighbors import KNeighborsClassifier
from utils import latexify
from sklearn import svm
import numpy as np
import pandas as pd
import data_reader

# Import data
df, target, group, lengths, _ = data_reader.read_main(raw=False)

features = df.axes[1]
samples = df.axes[0]

# Scale data
print(df.shape)
X = MiRNAScaler.choose_scaling(df, lengths)
df = pd.DataFrame(X, index=samples, columns=features)
print("DF shape", X.shape)

# Transform labels to real values
y = np.array([0 if l == 'Normal' else 1 if l == 'Tumor' else 2 for l in target])
df["target"] = y

# Set seed for reproducibility
Example #6
"""
Vegard Bjørgan 2019

Analyze one or more data sets and the effects of scaling miRNAs in a box plot
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import data_reader
import scaler as MiRNAScaler

# Import data
df, tar, grp, _, _ = data_reader.read_main()

# Scale data
features = df.axes[1].values
#df[features] = MiRNAScaler.standard_scaler(df)
#df[features] = MiRNAScaler.robust_scaler(df)
df[features] = MiRNAScaler.minmax_scaler(df)
#df[features] = MiRNAScaler.quantile_scaler(df)
#df[features] = MiRNAScaler.individual_scaler(df.values)
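
# MiRNAScaler is a project-specific wrapper; assuming it is backed by sklearn,
# the active min-max scaling above amounts to something like this sketch:
#   from sklearn.preprocessing import MinMaxScaler
#   df[features] = MinMaxScaler().fit_transform(df[features])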

data = []
row = []
for sample in df.axes[0]:
    data.append(df.loc[sample])
    row.append(sample)

# Extract 10 miRNAs and show them in a box plot
last = 0