Example #1
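The train method of the Wikipedia Extractor. It parses the Wikipedia data for the training seasons, clusters and discretizes the job features, compresses them with PCA, appends polynomial word-count features, and fits a Linear Discriminant Analysis model on the combined input. Being a class excerpt, the snippet assumes numpy imported as np, PCA from sklearn.decomposition, LinearDiscriminantAnalysis from sklearn.discriminant_analysis, and the WikipediaParser and get_is_mol imports shown in the later examples.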
    def train(self):
        """ Execute the training process for the Wikipedia Extractor. """
        raw_train_data = WikipediaParser.parse(self.__train_seasons)
        train_output = np.array([1.0 if get_is_mol(player) else 0.0 for player in raw_train_data])
        job_input = np.array([data.job_features for data in raw_train_data.values()])
        self.__train_job_clusters(job_input)
        job_input = self.__discretize_jobs(job_input)
        # Compress the discretized job features into a smaller number of components.
        self.__pca = PCA(n_components=self.__pca_components)
        job_input = self.__pca.fit_transform(job_input)
        # Expand the single word-count feature into polynomial terms up to WORD_COUNT_POLY_DEGREE.
        word_input = np.array([[data.word_feature ** i for i in range(1, self.WORD_COUNT_POLY_DEGREE + 1)]
                               for data in raw_train_data.values()])
        train_input = np.concatenate((job_input, word_input), axis=1)
        self.__lda = LinearDiscriminantAnalysis()
        self.__lda.fit(train_input, train_output)
Example #2
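The prediction counterpart of the method above: the fitted discretizer, PCA, and LDA are applied to every player of the predict season, and the resulting scores are standardized so that players below the unlikely z-score threshold can be flagged. The snippet additionally assumes Dict from typing, Player from Data.Player, and scipy imported as sc (with scipy.stats available).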
    def get_predict_data(self) -> Dict[Player, bool]:
        """ Format the predict data and determine which players score below the unlikely z-score threshold.

        Returns:
            A dictionary mapping each player of the predict season to whether that player's
            standardized score falls below the unlikely z-score threshold.
        """
        raw_predict_data = WikipediaParser.parse({self.__predict_season})
        predict_data = dict()
        for player, data in raw_predict_data.items():
            # Apply the same feature pipeline that was fitted during training.
            job_input = self.__discretize_jobs(np.array([data.job_features]))
            job_input = self.__pca.transform(job_input)
            word_input = np.array([[data.word_feature ** i for i in range(1, self.WORD_COUNT_POLY_DEGREE + 1)]])
            predict_input = np.concatenate((job_input, word_input), axis=1)
            predict_input = self.__lda.transform(predict_input)
            predict_data[player] = predict_input[0]

        # Standardize the LDA scores within the season before thresholding.
        z_scores = sc.stats.zscore(list(predict_data.values()))
        return {player: z_score < self.__unlikely_z_score for player, z_score in zip(predict_data.keys(), z_scores)}
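
A minimal sketch of how these two methods might be wired together. The class name WikipediaExtractor and its constructor signature are illustrative assumptions, not the project's confirmed API:

# Hypothetical usage; class name and constructor arguments are assumed.
extractor = WikipediaExtractor(train_seasons={5, 6, 7, 8}, predict_season=9,
                               pca_components=3, unlikely_z_score=-1.0)
extractor.train()
for player, below_threshold in extractor.get_predict_data().items():
    print(f"{player}: {'below the unlikely threshold' if below_threshold else 'above it'}")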
Example #3
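Counting how often each job occurs in a single player's Wikipedia text, via WikipediaParser.extract_player_features and the standard dictionary. Player.NADJA_7 is merely the example player.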
# Count the occurrences of each job for a given player
from Data.Player import Player
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser

PLAYER = Player.NADJA_7

dictionary = WikipediaParser.get_standard_dictionary()
features = WikipediaParser.extract_player_features(PLAYER, dictionary)
print("Number of words: " + str(features.number_words))
for job in Job:
    print(str(job) + ": " + str(features.job_counts[job]))
Example #4
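Counting all words over every player of the given seasons. Raw per-player counts come from WikipediaParser.wiki_file_parse; compound words are then split into their dictionary sub-words, with Dutch stop words filtered out.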
# Count all words for these seasons
from collections import Counter
from Data.Player import Player
from Data.PlayerData import get_season
from Layers.Wikipedia.WikipediaParser import WikipediaParser
from nltk.corpus import stopwords

SEASONS = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

pure_counter = Counter()
for player in Player:
    if get_season(player) in SEASONS:
        pure_counter += WikipediaParser.wiki_file_parse(player)

compound_counter = Counter()
dictionary = WikipediaParser.get_standard_dictionary()
stop_words = set(stopwords.words('dutch'))
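# Split every counted word into its dictionary sub-words (Dutch compound-word
# handling) and accumulate the counts per sub-word, skipping stop words.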
for word, count in pure_counter.items():
    sub_words = WikipediaParser.get_all_sub_words(word, dictionary, WikipediaParser.MIN_LENGTH_COMPOUND_WORD)
    sub_words.difference_update(stop_words)
    for sub_word in sub_words:
        compound_counter[sub_word] += count

print(compound_counter)
Example #5
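A scatter plot of one job's feature score against whether the player turned out to be the mol, over the given seasons.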
from Data.PlayerData import get_is_mol
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser
import matplotlib.pyplot as plt

SEASONS = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
JOB = Job.TV_PRODUCER

data = WikipediaParser.parse(SEASONS)
job_mapping = {job: i for i, job in enumerate(Job)}
job_index = job_mapping[JOB]
train_input = [d.job_features[job_index] for d in data.values()]
train_output = [1.0 if get_is_mol(p) else 0.0 for p in data]

plt.figure(figsize=(12, 3))
plt.xlabel("Score")
plt.ylabel("Is 'mol'")
plt.yticks([0.0, 1.0])
plt.gcf().subplots_adjust(bottom=0.15)
plt.scatter(train_input, train_output, s=4)
plt.show()
Example #6
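Printing, per job, the sorted distribution of raw job counts over all players of the given seasons, followed by the sorted word counts, based on WikipediaParser.parse_raw.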
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser

SEASONS = {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

dictionary = WikipediaParser.get_standard_dictionary()
job_occurrences = dict()
all_data = dict()
for season in SEASONS:
    all_data.update(WikipediaParser.parse_raw(season, dictionary))

for data in all_data.values():
    for job in Job:
        job_occurrences.setdefault(job, []).append(data.job_counts[job])

for job, occurrences in job_occurrences.items():
    print(job)
    print(sorted(occurrences))

print("Number of words:")
print(sorted([data.number_words for data in all_data.values()]))
Example #7
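Listing every dictionary sub-word of at least MINIMUM_LENGTH characters inside a larger compound word, longest first. The example word 'nieuwsprogramma' is Dutch for 'news programme'.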
from Layers.Wikipedia.WikipediaParser import WikipediaParser

LARGER_WORD = "nieuwsprogramma"
MINIMUM_LENGTH = 4

dictionary = WikipediaParser.get_standard_dictionary()
sub_words = sorted(WikipediaParser.get_all_sub_words(LARGER_WORD, dictionary, MINIMUM_LENGTH),
                   key=lambda w: (-len(w), w))
print(sub_words)
Example #8
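For every job feature, a comparison of the mol and non-mol groups over the training seasons: a Kruskal-Wallis test for a difference in location and a Levene test for a difference in spread, printing both p-values.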
from Data.PlayerData import get_is_mol
from Data.Wikipedia.Job import Job
from Layers.Wikipedia.WikipediaParser import WikipediaParser
from scipy.stats import kruskal, levene
import numpy as np

TRAIN_SEASONS = {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}

data = WikipediaParser.parse(TRAIN_SEASONS)
train_input = np.array([d.job_features for d in data.values()])
train_output = np.array([1.0 if get_is_mol(p) else 0.0 for p in data])
for job, column in zip(Job, train_input.T):
    mol_features = [value for value, is_mol in zip(column, train_output) if is_mol == 1.0]
    non_mol_features = [value for value, is_mol in zip(column, train_output) if is_mol == 0.0]
    # Kruskal-Wallis tests whether the two groups differ in location,
    # Levene whether they differ in spread.
    _, mean_p_value = kruskal(mol_features, non_mol_features)
    _, std_p_value = levene(mol_features, non_mol_features)
    print(f"{job} - Mean: {mean_p_value}, Std: {std_p_value}")