def correlations(df: pd.DataFrame,
                 include_categorical: bool = False,
                 plot_type: str = "dendogram",
                 figsize=[10, 5],
                 categorical_cols: List[str] = []):
    corr = None
    cols: List = []
    if include_categorical:
        corr = sr(df).correlation
        cols = df.columns
    else:
        if not len(categorical_cols):
            # object-dtype columns are treated as categorical (np.object is deprecated; the builtin is equivalent)
            categorical_cols = df.select_dtypes(include=[object]).columns

        cols = [c for c in df.columns if c not in categorical_cols]

        corr = df[cols].corr()
        cols = corr.columns

    if plot_type == "dendogram":
        plot_dendogram(corr, cols, figsize=figsize)
    elif plot_type == "matrix":
        plot_matrix(corr, cols, figsize=figsize)
    else:
        raise ValueError(f"Variable plot_type not valid. Provided: {plot_type}")

    return corr
def show_corre(y_true, y_test):
    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title("after: " + format(sr(y_true, y_test.reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_true, y_test.reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
    y0, y1 = ax1.get_ylim()
    ax1.set_aspect((x1 - x0) / (y1 - y0))
    plt.ylabel("predicted QoE")
    plt.xlabel("real QoE")
    plt.show()
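# Illustrative usage sketch for show_corre(): assumes sr = scipy.stats.spearmanr and
# plt = matplotlib.pyplot, as in the surrounding snippets; the QoE arrays are made-up toy values.
import numpy as np

y_true = np.array([3.2, 4.1, 2.5, 4.8, 3.9])  # ground-truth QoE scores
y_pred = np.array([3.0, 4.3, 2.7, 4.5, 4.0])  # model predictions
show_corre(y_true, y_pred)  # scatter plot titled with the Spearman correlation (4 decimals)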
def show_results(X_test, X_test_before_scaling, y_test, regressor_name,
                 feature_labels, answer):
    if cfg.QUALITY_MODEL + "_" + cfg.POOLING_TYPE in feature_labels:
        position_vqa = feature_labels.index(cfg.QUALITY_MODEL + "_" + cfg.POOLING_TYPE)

        plt.figure()
        ax1 = plt.subplot(1, 1, 1)
        plt.title("before: " +
                  format(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0], '.4f'))
        plt.scatter(y_test, X_test_before_scaling[:, position_vqa].reshape(-1, 1))
        plt.grid()
        x0, x1 = ax1.get_xlim()
        y0, y1 = ax1.get_ylim()
        ax1.set_aspect((x1 - x0) / (y1 - y0))
        plt.ylabel("predicted QoE")
        plt.xlabel("MOS")
        plt.show()

        plt.figure()
        ax1 = plt.subplot(1, 1, 1)
        plt.title("after: " + format(sr(y_test, answer.reshape(-1, 1))[0], '.4f'))
        plt.scatter(y_test, answer.reshape(-1, 1))
        plt.grid()
        x0, x1 = ax1.get_xlim()
        y0, y1 = ax1.get_ylim()
        ax1.set_aspect((x1 - x0) / (y1 - y0))
        plt.ylabel("predicted QoE")
        plt.xlabel("MOS")
        plt.show()

        print("SROCC before (" + str(cfg.QUALITY_MODEL) + "): " +
              str(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0]))
        print("SROCC using DeepQoE (" + str(cfg.QUALITY_MODEL) + " + " + regressor_name + "): " +
              str(sr(y_test, answer.reshape(-1, 1))[0]))
def correlations(df: pd.DataFrame,
                 include_categorical: bool = False,
                 plot_type: str = "dendogram",
                 plt_kwargs={},
                 categorical_cols: List[str] = []):
    """
    Computes the correlations for the columns provided and plots the relevant
    image as requested by the parameters.

    :Example:

    corr_df = xai.correlations(
        df,
        include_categorical=False,
        plot_type="dendogram")

    :param df: Pandas Dataframe containing data (inputs and target)
    :type df: pandas.DataFrame
    :param include_categorical: [Default: False] If True, the Spearman
        correlation is computed across all columns of the dataframe, including
        categorical ones; otherwise only numeric columns are correlated.
    :type include_categorical: bool
    :param plot_type: [Default: "dendogram"] Type of plot to display, either
        "dendogram" or "matrix".
    :type plot_type: str
    :param plt_kwargs: [Default: {}] Keyword arguments passed through to the
        underlying plotting function.
    :type plt_kwargs: dict
    :param categorical_cols: [Default: []] Columns within dataframe that are
        categorical. Columns that are not object/bool dtype and are not
        explicitly provided here will be treated as numeric.
    :type categorical_cols: List[str]

    :returns: Returns a dataframe containing the correlation values for the features
    :rtype: pandas.DataFrame
    """
    corr = None
    cols: List = []
    if include_categorical:
        corr = sr(df).correlation
        cols = df.columns
    else:
        if not len(categorical_cols):
            # np.object / np.bool are deprecated NumPy aliases; the builtins are equivalent here
            categorical_cols = df.select_dtypes(include=[object, bool]).columns

        cols = [c for c in df.columns if c not in categorical_cols]

        corr = df[cols].corr()
        cols = corr.columns

    if plot_type == "dendogram":
        _plot_correlation_dendogram(corr, cols, plt_kwargs=plt_kwargs)
    elif plot_type == "matrix":
        _plot_correlation_matrix(corr, cols, plt_kwargs=plt_kwargs)
    else:
        raise ValueError(f"Variable plot_type not valid. Provided: {plot_type}")

    return corr
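# Illustrative usage sketch for correlations(): assumes the function is exposed on the
# xai module (as its docstring example suggests) and uses a made-up toy DataFrame.
import pandas as pd
import xai

df = pd.DataFrame({
    "age": [23, 45, 31, 52, 29],
    "income": [30000, 80000, 52000, 95000, 41000],
    "loan": [1000, 5000, 2500, 7000, 1500],
    "gender": ["F", "M", "F", "M", "F"],  # object dtype, auto-detected as categorical
})

corr_df = xai.correlations(df, plot_type="dendogram")  # numeric columns only, dendrogram plot
corr_mat = xai.correlations(df, plot_type="matrix")    # same correlations drawn as a matrix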
def calc_spearman(pred, true):
    try:
        r, p_value = sr(np.asarray(pred), np.asarray(true))
    except ValueError:
        r = -1.0
    return r
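# Illustrative usage sketch for calc_spearman(): assumes sr = scipy.stats.spearmanr,
# which returns a (correlation, p-value) pair; the lists are made-up toy values.
import numpy as np
from scipy.stats import spearmanr as sr

pred = [0.10, 0.40, 0.35, 0.80]
true = [0.05, 0.50, 0.30, 0.90]
rho = calc_spearman(pred, true)  # close to 1.0 when the two rankings agree
print(rho)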
def train_shuffle(min_mse=200, max_corr=0):
    round_max_spea = 0
    round_min_mse = 200
    # random.shuffle(f)
    # train = samples[:100]
    # test = samples[100:]
    trainset = videoDataset(root=args.root,
                            label="./data/train_dataset.txt", suffix=".npy",
                            transform=transform, data=None, pcs=args.pcs)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128, shuffle=True, num_workers=0)
    testset = videoDataset(root=args.root,
                           label="./data/test_dataset.txt", suffix='.npy',
                           transform=transform, data=None, pcs=args.pcs)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64, shuffle=False, num_workers=0)

    # build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  # move the model to the GPU
    # scoring.load_state_dict(torch.load("./models/merge/pcs.pt"))
    total_params = sum(p.numel() for p in scoring.parameters() if p.requires_grad)
    loss_log.write("Total Params: " + str(total_params) + '\n')
    optimizer = optim.Adam(params=scoring.parameters(),
                           lr=0.0005)  # use the Adam optimizer to optimize the loss function
    scheduler = lr_scheduler.StepLR(optimizer, step_size=70, gamma=0.7)
    for epoch in range(500):  # 500 epochs in total
        # scheduler.step()
        print("Epoch: " + str(epoch) + "; Total Params: %d" % total_params)
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            # print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            # regression, logits = scoring(features)
            logits, penal = scoring(features)
            if penal is None:
                regr_loss = scoring.loss(logits, scores)
            else:
                regr_loss = scoring.loss(logits, scores) + penal
            # back propagation
            optimizer.zero_grad()
            regr_loss.backward()
            # nn.utils.clip_grad_norm(scoring.parameters(), 1.5)
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]
        loss_log.write(str(total_regr_loss / total_sample) + '\n')
        print("Regression Loss: " + str(total_regr_loss / total_sample) + '\n')

        # the rest is used to evaluate the model with the test dataset
        torch.save(scoring.state_dict(), './models/epoch{}.pt'.format(epoch))
        scoring.eval()
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        if val_loss / val_sample < min_mse:
            torch.save(scoring.state_dict(), './models/merge/tes_40attn.pt')
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        round_min_mse = min(round_min_mse, val_loss / val_sample)
        round_max_spea = max(val_sr, round_max_spea)
        print("Val Loss: %.2f Correlation: %.2f Min Val Loss: %.2f Max Correlation: %.2f" %
              (val_loss / val_sample, val_sr, min_mse, max_corr))
        scoring.train()
    w.write('MSE: %.2f spearman: %.2f' % (round_min_mse, round_max_spea))
    return min_mse, max_corr
default="movies_accuracy.csv") parser.add_argument("--out_file", help="name of file to write correlation to, if desired", default="out.txt") args = parser.parse_args() # scores: stores modularity scores - outer key = k, inner key = level scores = {} # reading in modularity data with open(args.modularity_file, newline="") as csvfile: reader = csv.reader(csvfile) rows = [row for row in reader] for i in range(len(rows[0])): scores[rows[0][i]] = [float(score[i]) for score in rows[1:]] metric = [] # reading in performance data, assuming 1-column list w/ no header with open(args.metrics_file, "r") as csvfile: reader = csv.reader(csvfile) rows = [row for row in reader] for i in range(len(rows)): metric.append(float(rows[i][0])) # writes results to a file, separates columns by commas with open(args.out_file, "w") as o: o.write("category,correlation\n") # column headers for cat in scores.keys(): o.write(cat + "," + str(sr(scores[cat], metric).correlation) + "\n")
def train_shuffle(min_mse=200, max_corr=0):
    trainset = videoDataset(root="figure_skating/c3d_feat",
                            label="data/train_dataset.txt", suffix=".npy",
                            transform=transform, data=None)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128, shuffle=True, num_workers=0)
    testset = videoDataset(root="figure_skating/c3d_feat",
                           label="data/test_dataset.txt", suffix='.npy',
                           transform=transform, data=None)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64, shuffle=False, num_workers=0)

    # build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  # move the model to the GPU
    total_params = sum(p.numel() for p in scoring.parameters() if p.requires_grad)
    optimizer = optim.Adam(params=scoring.parameters(), lr=0.0005)  # Adam
    for epoch in range(epoch_num):
        print("Epoch: " + str(epoch) + "; Total Params: %d" % total_params)
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            logits, penal = scoring(features)  # features.shape = (128, 300, 4096)
            if penal is None:
                regr_loss = scoring.loss(logits, scores)
            else:
                regr_loss = scoring.loss(logits, scores) + penal
            # back propagation
            optimizer.zero_grad()  # PyTorch accumulates gradients, so clear them before every backward step
            regr_loss.backward()
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]
        loss = total_regr_loss / total_sample
        train_loss.append(loss)
        print("Regression Loss: " + str(loss) + '\n')

        ### the rest is used to evaluate the model with the test dataset ###
        scoring.eval()  # switch to evaluation mode (for batch-normalization / dropout layers)
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        if val_loss / val_sample < min_mse:
            torch.save(scoring.state_dict(), 'S_LSTM+M_LSTM+PCS.pt')
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        loss = val_loss / val_sample
        test_loss.append(loss)
        print("Val Loss: {:.2f} Correlation: {:.2f} Min Val Loss: {:.2f} Max Correlation: {:.2f}"
              .format(loss, val_sr, min_mse, max_corr))
        scoring.train()  # switch back to train mode
modularities against wordsim task results. Results printed to console.
'''

# imports
import csv
from scipy.stats import spearmanr as sr
import argparse

# processing command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--modularity_file",
                    help="name of file with modularity scores, 1-column .csv",
                    default="data/3_2.csv")
parser.add_argument("--downstream_file",
                    help="name of file with downstream performance metrics, 1-column .csv",
                    default="data/loss.csv")
args = parser.parse_args()

mods = []
with open(args.modularity_file, newline="") as csvfile:
    reader = csv.reader(csvfile)
    mods = [float(row[0]) for row in reader]

metrics = []
with open(args.downstream_file, newline="") as csvfile:
    reader = csv.reader(csvfile)
    metrics = [float(row[0]) for row in reader]

print(sr(mods, metrics))
regressor.fit(X_train, np.ravel(y_train))
if hasattr(regressor, 'best_estimator_'):
    answer = regressor.best_estimator_.predict(X_test)
else:
    answer = regressor.predict(X_test)

# locate column of quality model for SROCC without regression
if quality_model + "_" + pooling_type in feature_labels:
    position_vqa = feature_labels.index(quality_model + "_" + pooling_type)

    # display results
    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title("before: " +
              format(sr(y_test, X_test[:, position_vqa].reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_test, X_test_before_scaling[:, position_vqa].reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
    y0, y1 = ax1.get_ylim()
    ax1.set_aspect((x1 - x0) / (y1 - y0))
    plt.ylabel("predicted QoE")
    plt.xlabel("MOS")
    plt.show()

    plt.figure()
    ax1 = plt.subplot(1, 1, 1)
    plt.title("after: " + format(sr(y_test, answer.reshape(-1, 1))[0], '.4f'))
    plt.scatter(y_test, answer.reshape(-1, 1))
    plt.grid()
    x0, x1 = ax1.get_xlim()
def train_shuffle():
    #random.shuffle(f)
    #train = samples[:100]
    #test = samples[100:]
    trainset = videoDataset(root="/home/xuchengming/MM18/figure-skating/c3d_feat",
                            label="./data/train_dataset.txt", suffix=".npy",
                            transform=transform, data=None)
    trainLoader = torch.utils.data.DataLoader(trainset,
                                              batch_size=128, shuffle=True, num_workers=0)
    testset = videoDataset(root="/home/xuchengming/MM18/figure-skating/c3d_feat",
                           label="./data/test_dataset.txt", suffix='.npy',
                           transform=transform, data=None)
    testLoader = torch.utils.data.DataLoader(testset,
                                             batch_size=64, shuffle=False, num_workers=0)

    #build the model
    scoring = Scoring(feature_size=4096)
    if torch.cuda.is_available():
        scoring.cuda()  #move the model to the GPU
    optimizer = optim.Adam(params=scoring.parameters(),
                           lr=0.0005)  # use the Adam optimizer to optimize the loss function
    scheduler = lr_scheduler.StepLR(optimizer, step_size=70, gamma=0.7)
    min_mse = 200
    max_corr = 0
    for epoch in range(500):  # 500 epochs in total
        #scheduler.step()
        print("Epoch: " + str(epoch))
        total_regr_loss = 0
        total_sample = 0
        for i, (features, scores) in enumerate(trainLoader):  # get mini-batch
            #print("%d batches have done" % i)
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            #regression, logits = scoring(features)
            logits, penal = scoring(features)
            regr_loss = scoring.loss(logits, scores) + penal * 1.0
            # back propagation
            optimizer.zero_grad()
            regr_loss.backward()
            #nn.utils.clip_grad_norm(scoring.parameters(), 1.5)
            optimizer.step()
            total_regr_loss += regr_loss.data.item() * scores.shape[0]
            total_sample += scores.shape[0]
        print("Regression Loss: " + str(total_regr_loss / total_sample))

        # the rest is used to evaluate the model with the test dataset
        torch.save(scoring.state_dict(), './models/epoch{}.pt'.format(epoch))
        scoring.eval()
        val_pred = []
        val_sample = 0
        val_loss = 0
        val_truth = []
        for j, (features, scores) in enumerate(testLoader):
            val_truth.append(scores.numpy())
            if torch.cuda.is_available():
                features = Variable(features).cuda()
                scores = Variable(scores).cuda()
            regression, _ = scoring(features)
            val_pred.append(regression.data.cpu().numpy())
            regr_loss = scoring.loss(regression, scores)
            val_loss += (regr_loss.data.item()) * scores.shape[0]
            val_sample += scores.shape[0]
        val_truth = np.concatenate(val_truth)
        val_pred = np.concatenate(val_pred)
        val_sr, _ = sr(val_truth, val_pred)
        min_mse = min(min_mse, val_loss / val_sample)
        max_corr = max(max_corr, val_sr)
        print("Val Loss: %.2f Correlation: %.2f Min Val Loss: %.2f Max Correlation: %.2f" %
              (val_loss / val_sample, val_sr, min_mse, max_corr))
        scoring.train()
    w.write(str(max_corr) + '\n')