def train(self):
    """ Execute the training process for the Wikipedia Extractor. """
    raw_train_data = WikipediaParser.parse(self.__train_seasons)
    train_output = np.array([1.0 if get_is_mol(player) else 0.0 for player in raw_train_data])

    # Cluster and discretize the job features, then reduce their dimensionality with PCA.
    job_input = np.array([data.job_features for data in raw_train_data.values()])
    self.__train_job_clusters(job_input)
    job_input = self.__discretize_jobs(job_input)
    self.__pca = PCA(n_components=self.__pca_components)
    job_input = self.__pca.fit_transform(job_input)

    # Expand the word-count feature into polynomial terms and fit the LDA classifier.
    word_input = np.array([[data.word_feature ** i for i in range(1, self.WORD_COUNT_POLY_DEGREE + 1)]
                           for data in raw_train_data.values()])
    train_input = np.concatenate((job_input, word_input), axis=1)
    self.__lda = LinearDiscriminantAnalysis()
    self.__lda.fit(train_input, train_output)
def get_predict_data(self) -> Dict[Player, bool]:
    """
    Get all formatted predict data usable for the machine learning algorithms to do a prediction.

    Returns:
        A dictionary with as key the players of that season and as value whether that player
        is unlikely to be the 'mol'.
    """
    raw_predict_data = WikipediaParser.parse({self.__predict_season})
    predict_data = dict()
    for player, data in raw_predict_data.items():
        # Apply the same job discretization, PCA and LDA transformations as used during training.
        job_input = self.__discretize_jobs(np.array([data.job_features]))
        job_input = self.__pca.transform(job_input)
        word_input = np.array([[data.word_feature ** i for i in range(1, self.WORD_COUNT_POLY_DEGREE + 1)]])
        predict_input = np.concatenate((job_input, word_input), axis=1)
        predict_input = self.__lda.transform(predict_input)
        predict_data[player] = predict_input[0]

    # A player whose z-score falls below the threshold is flagged as unlikely to be the 'mol'.
    z_scores = sc.stats.zscore(list(predict_data.values()))
    return {player: z_score < self.__unlikely_z_score
            for player, z_score in zip(predict_data.keys(), z_scores)}
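# A minimal usage sketch of the two methods above. The class name
# WikipediaExtractor and its constructor arguments are assumptions inferred
# from the private fields (__train_seasons, __predict_season,
# __pca_components, __unlikely_z_score) referenced in train() and
# get_predict_data(); the actual signature may differ, and the parameter
# values below are placeholders.
extractor = WikipediaExtractor(train_seasons={13, 14, 15, 16, 17, 18, 19},
                               predict_season=20,
                               pca_components=5,
                               unlikely_z_score=-1.0)
extractor.train()
for player, is_unlikely_mol in extractor.get_predict_data().items():
    print(str(player) + ": " + str(is_unlikely_mol))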
# Count the occurrences of each job for a given player.
from Data.Player import Player
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser

PLAYER = Player.NADJA_7

dictionary = WikipediaParser.get_standard_dictionary()
features = WikipediaParser.extract_player_features(PLAYER, dictionary)
print("Number of words: " + str(features.number_words))
for job in Job:
    print(str(job) + ": " + str(features.job_counts[job]))
# Count all words for these seasons.
from collections import Counter
from Data.Player import Player
from Data.PlayerData import get_season
from Layers.Wikipedia.WikipediaParser import WikipediaParser
from nltk.corpus import stopwords

SEASONS = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

# Sum the raw word counts over all players that took part in the selected seasons.
pure_counter = Counter()
for player in Player:
    if get_season(player) in SEASONS:
        pure_counter += WikipediaParser.wiki_file_parse(player)

# Split compound words into their sub-words, excluding Dutch stop words.
compound_counter = Counter()
dictionary = WikipediaParser.get_standard_dictionary()
stop_words = set(stopwords.words('dutch'))
for word, count in pure_counter.items():
    sub_words = WikipediaParser.get_all_sub_words(word, dictionary, WikipediaParser.MIN_LENGTH_COMPOUND_WORD)
    sub_words.difference_update(stop_words)
    for sub_word in sub_words:
        compound_counter[sub_word] += count
print(compound_counter)
# Plot a single job feature against whether the player was the 'mol'.
from Data.PlayerData import get_is_mol
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser
import matplotlib.pyplot as plt

SEASONS = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
JOB = Job.TV_PRODUCER

data = WikipediaParser.parse(SEASONS)
job_mapping = {job: i for i, job in enumerate(Job)}
job_index = job_mapping[JOB]
train_input = [d.job_features[job_index] for d in data.values()]
train_output = [1.0 if get_is_mol(p) else 0.0 for p in data]

plt.figure(figsize=(12, 3))
plt.xlabel("Score")
plt.ylabel("Is 'mol'")
plt.yticks([0.0, 1.0])
plt.gcf().subplots_adjust(bottom=0.15)
plt.scatter(train_input, train_output, s=4)
plt.show()
# List the job-count distributions over all players in these seasons.
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser

SEASONS = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

dictionary = WikipediaParser.get_standard_dictionary()
job_occurrences = dict()
all_data = dict()
for season in SEASONS:
    all_data.update(WikipediaParser.parse_raw(season, dictionary))
for data in all_data.values():
    for job in Job:
        job_occurrences.setdefault(job, []).append(data.job_counts[job])

for job, occurrences in job_occurrences.items():
    print(job)
    print(sorted(occurrences))
print("Number of words:")
print(sorted([data.number_words for data in all_data.values()]))
from Layers.Wikipedia.WikipediaParser import WikipediaParser

LARGER_WORD = "nieuwsprogramma"
MINIMUM_LENGTH = 4

dictionary = WikipediaParser.get_standard_dictionary()
sub_words = list(WikipediaParser.get_all_sub_words(LARGER_WORD, dictionary, MINIMUM_LENGTH))
sub_words = sorted(sub_words, key=lambda w: (-len(w), w))
print(sub_words)
# Test per job whether the 'mol' and non-'mol' feature distributions differ.
from Data.PlayerData import get_is_mol
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser
from scipy.stats import kruskal, levene
import numpy as np

TRAIN_SEASONS = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

data = WikipediaParser.parse(TRAIN_SEASONS)
predict_input = np.array([d.job_features for d in data.values()])
predict_output = np.array([1.0 if get_is_mol(p) else 0.0 for p in data])

for job, column in zip(Job, predict_input.T):
    mol_features = [value for value, is_mol in zip(column, predict_output) if is_mol == 1.0]
    non_mol_features = [value for value, is_mol in zip(column, predict_output) if is_mol == 0.0]
    # Kruskal-Wallis tests whether the two groups differ in location;
    # Levene tests whether they differ in variance.
    _, mean_p_value = kruskal(mol_features, non_mol_features)
    _, std_p_value = levene(mol_features, non_mol_features)
    print(str(job) + " - Mean: " + str(mean_p_value) + ", Std: " + str(std_p_value))