def main():
    """Build a gene-set (gmt) file from the main data set.

    Reads the raw expression data, runs the miRNA extraction (R-backed
    `extract_mirnas_r`) on it and writes the resulting gene sets to a gmt
    file under Out/.
    """
    # Reads in a data set: expression frame, target labels, group labels.
    df, tar, grp, _, _ = data_reader.read_main(raw=True)

    # Code for DS 4 (GuihuaSun) — disabled, kept for reference.
    """
    missing = ['110608_TGCTCG_s_5', '110608_TGCTCG_s_1', '110608_TCGTCG_s_6',
               '110608_TCGTCG_s_5', '110602_TGCTCG_s_4', '110602_TGCTCG_s_3',
               '110602_TCGTCG_s_4', '110602_TCGTCG_s_3']
    df = df.drop(missing)
    tar = tar.drop(missing)
    grp = grp.drop(missing)
    #lst = extract_mirnas_r_guihuasun(df, ss, gs_name="_4")
    """

    # Build the sample sheet: first row becomes 'groups', second 'block',
    # then transpose so each sample is a row.
    ss = pd.DataFrame([tar, grp])
    ss = ss.rename({ss.axes[0][0]: 'groups',
                    ss.axes[0][1]: 'block'}, axis='index').transpose()

    # Running extract_mirnas_r — it expects samples as columns.
    df = df.transpose()
    lst = extract_mirnas_r(df, ss, gs_name="_3")

    # Creating gmt file in <cwd>/Out/ (backslashes normalized for R/gmt use).
    path = r'%s' % getcwd().replace('\\', '/') + "/Out/"
    create_gmt_file(lst, path, new=True)
def test_make_density_plot():
    """Render a full density plot of the scaled Hepmark tissue data."""
    # Load the preprocessed (non-raw) data set; only the frame and the
    # per-set lengths are needed here.
    df, _, _, length, _ = data_reader.read_main(raw=False)
    features = df.axes[1]
    samples = df.axes[0]

    # Scale, then restore the index/columns that the scaler strips off.
    df = MiRNAScaler.choose_scaling(df, length)
    df = pd.DataFrame(df, index=samples, columns=features)
    df = df.transpose()

    #make_density_plot(df)
    latexify(columns=2)
    make_full_density_plot(df, 'Density Plot of Hepmark Tissue')
def main():
    """Run single-sample GSEA (ssGSEA) on the main data set and write the
    raw (un-normalized) enrichment scores to a csv file under
    Out/enrichment_scores/.
    """
    # Import data
    df, tar, grp, lengths, _ = data_reader.read_main(raw=True)

    # Log transform keeping all columns as they will be used in the gsea.
    df = df_utils.transform_sequence_to_microarray(df.T, all=True)

    # Handling for microarray set 0 as this does not require log transformation
    """
    df1, _, _ = data_reader.read_number(1)
    df2, _, _ = data_reader.read_number(2)
    df_len = len(df)
    df = df_utils.merge_frames([df,df1,df2], drop=False)
    df = df.head(df_len)
    """

    sample = df.T
    ss = gseapy.ssgsea(data=sample,
                       gene_sets='Out/gmt_hepmark.gmt',
                       no_plot=True,
                       outdir='Out/gsea_hepmark',
                       min_size=10)
    # "When you run the gene set enrichment analysis, the GSEA software
    # automatically normalizes the enrichment scores for variation in gene set
    # size, as described in GSEA Statistics. Nevertheless, the normalization is
    # not very accurate for extremely small or extremely large gene sets. For
    # example, for gene sets with fewer than 10 genes, just 2 or 3 genes can
    # generate significant results. Therefore, by default, GSEA ignores gene
    # sets that contain fewer than 25 genes or more than 500 genes; defaults
    # that are appropriate for datasets with 10,000 to 20,000 features. To
    # change these default values, use the Max Size and Min Size parameters on
    # the Run GSEA Page; however, keep in mind the possibility of inflated
    # scorings for very small gene sets and inaccurate normalization for large
    # ones."

    # Setup df file: one row per sample, [sample_name, score_1, ..., score_k].
    # NB : Do not use the res2d as this is the normalized score.
    rows = []
    columns = ['index']
    for s in ss.resultsOnSamples:
        scores = ss.resultsOnSamples[s]
        rows.append([s] + list(scores))
        if len(columns) == 1:
            # Derive the score column names once, from the first sample.
            # (Previously these were read from the loop variable *after* the
            # loop, which raised NameError when there were no samples.)
            columns.extend(scores.axes[0])

    # Create es file
    df_out = pd.DataFrame(rows, columns=columns)
    df_out.set_index('index', inplace=True)
    path = r'%s' % getcwd().replace('\\', '/') + "/Out/enrichment_scores/"
    df_out.to_csv(path + "es_test.csv")
# Third-party imports
import matplotlib.pyplot as plt
from scipy import interp
from sklearn import svm
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import RFECV
from sklearn.metrics import auc, classification_report, roc_curve
from sklearn.model_selection import (GridSearchCV, LeaveOneOut,
                                     StratifiedKFold)
from tqdm import tqdm

# Local imports
import scaler as MiRNAScaler

# Import data (expression frame, targets, groups, per-set lengths, es).
df, target, group, lengths, es = data_reader.read_main()
features = df.axes[1]
samples = df.axes[0]

# Scale data, then rebuild the frame with its original index/columns.
print(df.shape)
X = MiRNAScaler.choose_scaling(df, lengths)
df = pd.DataFrame(X, index=samples, columns=features)
print("DF shape", X.shape)

# Set seed for reproducibility
np.random.seed(0)

n_samples, n_features = X.shape
# Third-party imports
import matplotlib.pyplot as plt
from scipy import interp
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Local imports
import df_utils
import scaler as MiRNAScaler
from utils import latexify

# Import data (preprocessed, non-raw).
df, target, group, lengths, _ = data_reader.read_main(raw=False)
features = df.axes[1]
samples = df.axes[0]

# Scale data, then rebuild the frame with its original index/columns.
print(df.shape)
X = MiRNAScaler.choose_scaling(df, lengths)
df = pd.DataFrame(X, index=samples, columns=features)
print("DF shape", X.shape)

# Transform labels to real values: Normal -> 0, Tumor -> 1, anything else -> 2.
y = np.array([0 if l == 'Normal' else 1 if l == 'Tumor' else 2 for l in target])
df["target"] = y

# Set seed for reproducibility
""" Vegard Bjørgan 2019 Analyze one or more data sets and the effects of scaling miRNAs in a box plot """ import numpy as np import matplotlib.pyplot as plt import pandas as pd import data_reader import scaler as MiRNAScaler # Import data df, tar, grp, _, _ = data_reader.read_main() # Scale data features = df.axes[1].values #df[features] = MiRNAScaler.standard_scaler(df) #df[features] = MiRNAScaler.robust_scaler(df) df[features] = MiRNAScaler.minmax_scaler(df) #df[features] = MiRNAScaler.quantile_scaler(df) #df[features] = MiRNAScaler.individual_scaler(df.values) data = [] row = [] for sample in df.axes[0]: data.append(df.loc[sample]) row.append(sample) # Extract 10 miRNA og show them in a box plot last = 0