# Trim each Big Five label vector to its first `subsetSize` entries.
yO, yC, yE, yA, yN = (
    labels[0:subsetSize] for labels in (yO, yC, yE, yA, yN)
)

# Keep the trimmed label vectors reachable under old_* names: yO..yN are
# rebound by the first transformTextForTraining() call below, and every
# later call has to be fed the same label inputs.
# NOTE(review): these are aliases, not copies. If transformTextForTraining()
# mutates its arguments in place, the old_* names see that mutation as well
# -- confirm against the helper's implementation.
old_yO, old_yC, old_yE, old_yA, old_yN = yO, yC, yE, yA, yN

# Positional arguments common to every embedding call; only the aggregation
# method changes from one call to the next.
_shared_args = (
    wordDictionary,
    post_threshold,
    posts,
    old_yO,
    old_yC,
    old_yE,
    old_yA,
    old_yN,
)

# The "sum" call is the one whose returned label vectors are kept.
sumE, yO, yC, yE, yA, yN = embeddings.transformTextForTraining(
    *_shared_args, "sum", transform
)

# For the remaining methods only the first element of the result (the
# embeddings) is kept; the returned labels are discarded.
maxE, minE, avgE, conE = (
    embeddings.transformTextForTraining(*_shared_args, method, transform)[0]
    for method in ("max", "min", "avg", "conc")
)
# Cap the experiment at the first `subsetSize` posts and their label vectors.
subsetSize = 1000
posts = posts[0:subsetSize]
yO = yO[0:subsetSize]
yC = yC[0:subsetSize]
yE = yE[0:subsetSize]
yA = yA[0:subsetSize]
yN = yN[0:subsetSize]

# Keep the sliced label vectors reachable under old_* names; yO..yN are
# rebound to the values returned by transformTextForTraining() just below.
old_yO = yO
old_yC = yC
old_yE = yE
old_yA = yA
old_yN = yN

# `method` and `transform` are defined outside this excerpt.
[conE, yO, yC, yE, yA, yN] = embeddings.transformTextForTraining(
    wordDictionary, post_threshold, posts,
    old_yO, old_yC, old_yE, old_yA, old_yN,
    method, transform)
print("Embeddings computed.")

# Split the embedded samples at the 85% mark: the leading part becomes
# data_train, the remainder data_test.
split_index = round(len(conE) * 0.85)
data_train = conE[:split_index]
data_test = conE[split_index:]

# `l` is a 1-based counter selecting the trait that matches the current
# label vector (1 -> "O", 2 -> "C", ...).
# NOTE(review): presumably incremented at the end of the loop body, which is
# outside this excerpt -- confirm.
l = 1
for labels in [yO, yC, yE, yA, yN]:
    if l == 1:
        big5trait = "O"
        print("[SVM] computing results for Openness...")
    elif l == 2:
        big5trait = "C"
        # NOTE(review): the excerpt ends here; the rest of this branch and the
        # remaining trait branches are not visible.
# Draw 3000 rows from `df`; no seed argument is passed to sample() here.
# NOTE(review): presumably `df` is a pandas DataFrame -- confirm.
dfs = df.sample(3000)
print("Training set shuffled.")

# Load the word -> vector dictionary from the file at `dataset_path`.
print("Loading embeddings dataset...")
wordDictionary = dsu.parseFastText(dataset_path)
# NOTE(review): "laoded" is a typo in this runtime message; left unchanged
# here because it is program output.
print("Dataset correctly laoded.")

# Text column plus one label array per Big Five trait column.
posts = dfs["message"]
yO = np.array(dfs["ope"], dtype=pd.Series)
yC = np.array(dfs["con"], dtype=pd.Series)
yE = np.array(dfs["ext"], dtype=pd.Series)
yA = np.array(dfs["agr"], dtype=pd.Series)
yN = np.array(dfs["neu"], dtype=pd.Series)

# Compute "conc" embeddings; the label vectors are replaced by the ones the
# helper returns.
[conE, yO, yC, yE, yA, yN] = embeddings.transformTextForTraining(
    wordDictionary, post_threshold, posts,
    yO, yC, yE, yA, yN,
    "conc", True)
print("\tEmbeddings computed.")

# `trait` is a 1-based counter selecting the trait code and the gamma / C
# values for the current label vector.
# NOTE(review): presumably incremented at the end of the loop body, which is
# outside this excerpt -- confirm.
trait = 1
for labels in [yO, yC, yE, yA, yN]:
    if trait == 1:
        big5trait = "O"
        gamma = 1
        C = 1
        print(" Training model for Openness...")
    elif trait == 2:
        big5trait = "C"
        gamma = 1
        C = 1
        # NOTE(review): the excerpt ends here; the rest of this branch and the
        # remaining trait branches are not visible.
# NOTE(review): this excerpt starts mid-sequence; any matching conversions of
# yO, yC and yE are not visible here.
yA = np.array(yA)
yN = np.array(yN)

# Build one shuffled index vector and apply it to the tweets and to every
# label array, so rows and labels stay aligned after shuffling.
s = np.arange(filteredTweets.shape[0])
np.random.shuffle(s)
filteredTweets = filteredTweets[s]
yO = yO[s]
yC = yC[s]
yE = yE[s]
yA = yA[s]
yN = yN[s]
print("Data shuffled.")

# Compute "conc" embeddings; the label vectors are replaced by the ones the
# helper returns.
[conE, yO, yC, yE, yA, yN] = embeddings.transformTextForTraining(
    wordDictionary, tweet_threshold, filteredTweets,
    yO, yC, yE, yA, yN,
    "conc", True)
print("Embeddings computed.")

# `l` is a 1-based counter selecting the trait code and the gamma / C values
# for the current label vector.
# NOTE(review): presumably incremented at the end of the loop body, which is
# outside this excerpt -- confirm.
l = 1
# 4-fold splitter.
# NOTE(review): presumably sklearn.model_selection.KFold -- confirm import.
k_fold = KFold(n_splits=4)
for labels in [yO, yC, yE, yA, yN]:
    if l == 1:
        big5trait = "O"
        gamma = 1
        C = 1
        print("Training model for Openness...")
    elif l == 2:
        big5trait = "C"
        gamma = 1
        # NOTE(review): the excerpt ends here; the rest of this branch and the
        # remaining trait branches are not visible.