Example #1


""":type: pd.DataFrame"""
# testData = pd.read_csv(fileNameTestData)
"""
Test model on unseen data. After each prediction step, you may update you model. This is not mandatory though.
"""

# Load the trained model in case we do not want to retrain on new data.
model = DataHandling.load_data(Constants.model_path + 'Model_NB.pickle')
os_list = DataHandling.load_data(Constants.model_path + 'os_list.pickle')
publisher_list = DataHandling.load_data(Constants.model_path + 'publisher_list.pickle')
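
# DataHandling.load_data is project code and not shown; a minimal stand-in
# (an assumption, not the project's actual implementation) would be a thin
# pickle wrapper:
#
#   import pickle
#
#   def load_data(path):
#       with open(path, 'rb') as f:
#           return pickle.load(f)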

# In case there is a new termVectors file, we use it to re-extract features;
# presumably this is because the articles get updated.
with open("data/termVectorsPerPublisher.json") as f:
    termVectors = json.load(f)
feature_extraction = FeatureExtraction()
[article_word_count, word_tfidf, publishers, article_numbers] = feature_extraction.prepare_dictionary_article(termVectors)
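
# prepare_dictionary_article is project code and not shown. Conceptually, the
# word_tfidf part could be produced with scikit-learn (a toy illustration, not
# the project's implementation):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   toy_docs = ["politics election vote", "football match goal"]
#   toy_tfidf = TfidfVectorizer().fit_transform(toy_docs)  # sparse docs-x-terms matrix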
article_popularity = DataHandling.load_data(Constants.model_path + 'article_popularity.pickle')
train_data = DataHandling.load_data(Constants.model_path + 'train_data_with_article_distances.pickle')

# For testing, we use the train data as the test data (the real test file is
# not handed out).
# testData = pd.read_csv(fileNameTestData)
testData = train_data
""":type: pd.DataFrame"""

for (rowNum, row) in testData.iterrows():
    inputFeatures = row[["Publisher", "Osfamily", "ItemSrc", "UserID", "UserClicksAd"]]

    # Check if the user has a history.
    number_times_clicked = row["UserClicksAd"]

    # For users with no history, pick the most popular item.
    if number_times_clicked == 0:
        # Pick the top 20 articles.
        top_article_popularity = heapq.nlargest(20, enumerate(article_popularity), key=lambda x: x[1])
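        # heapq.nlargest scans article_popularity once and returns the 20
        # highest-scoring (article_index, popularity) pairs, highest first.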
Example #2

"""
Prepare data for trianing and testing
# """
fileNameTrainData = "data/train.csv"
fileNameTermVectors = "data/termVectorsPerPublisher.json"
# fileNameTestData = "data/test.csv"  # well, obviously we don't hand this one out
topN = 5
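# topN presumably sets how many articles are recommended per user; its use is
# not shown in this snippet.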

with open(fileNameTermVectors) as f:
    termVectors = json.load(f)
train_data = pd.read_csv(fileNameTrainData)
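
# termVectors is the parsed JSON (per the file name, term vectors grouped by
# publisher); train_data is a pandas DataFrame of the raw training rows.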

# Feature Extraction part
# Running feature extraction to create word dictionary, word counts, article grouping (distances)
feature_extraction = FeatureExtraction()
[train_data, article_popularity, os_list, publisher_list] = feature_extraction.prepare_features(train_data, termVectors)
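
# prepare_features is not shown; judging by how its outputs are used, it is
# expected to return the enriched training frame, per-article popularity
# scores, and the OS/publisher vocabularies used to encode those features.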

# Keep only users that have clicked at least once for training.
train_data = train_data.loc[train_data['UserClicksAd'] > 0]

# Balancing the training data: since the dataset is imbalanced, we balance
# negative and positive samples for training.
# positive_samples = train_data.loc[train_data['Output'] == 1]
# positive_samples_count = positive_samples.shape[0]
# negative_samples = train_data.loc[train_data['Output'] == 0]
# rows = random.sample(list(negative_samples.index), positive_samples_count)
# negative_samples = negative_samples.loc[rows]
# train_data = pd.concat([positive_samples, negative_samples])
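
# With current pandas, the same down-sampling can be written without the
# random module (a sketch; like the block above, it assumes an 'Output'
# label column):
#
#   positive_samples = train_data[train_data['Output'] == 1]
#   negative_samples = (train_data[train_data['Output'] == 0]
#                       .sample(n=len(positive_samples), random_state=0))
#   train_data = pd.concat([positive_samples, negative_samples])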

"""
Exemple #3
0


""":type: pd.DataFrame"""
# testData = pd.read_csv(fileNameTestData)
"""
Test model on unseen data. After each prediction step, you may update you model. This is not mandatory though.
"""

# Loading the trained model in case we do not want to train on new data
model = DataHandling.load_data(Constants.model_path + 'Model_NB.pickle')
os_list = DataHandling.load_data(Constants.model_path + 'os_list.pickle')
publisher_list = DataHandling.load_data(Constants.model_path + 'publisher_list.pickle')

#In case there is a new termVectors file we use this to extract features, I assume it is because articles will be updated.
feature_extraction = FeatureExtraction()
[article_word_count, word_tfidf, publishers, article_numbers] = feature_extraction.prepare_dictionary_article(termVectors)

article_popularity = DataHandling.load_data(Constants.model_path + 'article_popularity.pickle')
train_data = DataHandling.load_data(Constants.model_path + 'train_data_with_article_distances.pickle')

for (rowNum, row) in testData.iterrows():
    inputFeatures = row[["Publisher", "Osfamily", "ItemSrc", "UserID", "UserClicksAd"]]

    #Check if user has a history
    number_times_clicked = row["UserClicksAd"]

    #For users with no history pick the most popular item
    if number_times_clicked == 0:
        #Pick top 20 articles
        top_article_popularity = heapq.nlargest(20, enumerate(article_popularity), key=lambda x: x[1])