def __init__(self, y, p=5):
    RegressionModel.__init__(self, y)
    self.p = p
def __init__(self, data, p):
    RegressionModel.__init__(self, data)
    self.p = p
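# The snippets in this section never include RegressionModel itself, only calls
# on it. The sketch below reconstructs the interface those calls assume (method
# names taken verbatim from the call sites in the virality code); the
# sklearn-based bodies and the dump location are illustrative assumptions, not
# the original implementation.
import pickle

import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression


class RegressionModel(object):
    MODEL_FILE = "regression_model.pkl"  # hypothetical dump location

    def __init__(self, y=None):
        self.y = y
        self.estimator = None

    def load(self):
        # Return True if a previously dumped estimator could be loaded
        try:
            with open(RegressionModel.MODEL_FILE, "rb") as f:
                self.estimator = pickle.load(f)
            return True
        except (IOError, OSError):
            return False

    def dump(self):
        with open(RegressionModel.MODEL_FILE, "wb") as f:
            pickle.dump(self.estimator, f)

    @staticmethod
    def load_datasets(balance=False, viral_threshold=0):
        # The original loads prepared feature matrices whose last column is
        # the retweet count; that loading code is not shown in this section.
        raise NotImplementedError

    def normaliseFeats(self, dataset):
        # Scale each feature column to [0, 1], leaving the target untouched
        dataset = np.asarray(dataset, dtype=float)
        feats = dataset[:, :-1]
        dataset[:, :-1] = feats / np.maximum(feats.max(axis=0), 1.0)
        return dataset

    def trainClassifier(self, training_set, normalize=False):
        # `normalize` is accepted for signature compatibility; its original
        # use is not shown in this section
        X, y = training_set[:, :-1], training_set[:, -1]
        self.estimator = LogisticRegression().fit(X, y)

    def scoreClassifier(self, testing_set):
        X, y = testing_set[:, :-1], testing_set[:, -1]
        print(self.estimator.score(X, y))

    def predictClassifier(self, features):
        return self.estimator.predict(np.asarray(features))

    def trainRegression(self, training_set, normalize=False):
        X, y = training_set[:, :-1], training_set[:, -1]
        self.estimator = LinearRegression().fit(X, y)

    def scoreRegression(self, testing_set):
        X, y = testing_set[:, :-1], testing_set[:, -1]
        print(self.estimator.score(X, y))

    def predictRegression(self, features):
        return self.estimator.predict(np.asarray(features))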
def loadData(self):
    ''' Initialise the data: load it from HDFS and prepare it. '''
    self.training_data, self.test_data = RegressionModel.load_datasets(
        balance=True, viral_threshold=50000)
    self.viral_threshold = 50000
    # Pick a subset of the training data, as training on the full set takes
    # a long time. The last column is the target (retweet count).
    self.X_train = self.training_data[:30000, :-1]
    self.Y_train = self.training_data[:30000, -1]
    self.X_test = self.test_data[:, :-1]
    self.Y_test = self.test_data[:, -1]
    # The true medians are computed but then deliberately overridden with
    # the viral threshold, so both "medians" end up fixed at 50000
    self.test_median = np.median(self.Y_test)
    self.train_median = np.median(self.Y_train)
    self.train_median = self.viral_threshold
    self.test_median = self.viral_threshold
    print "Test Median"
    print self.test_median
    print "Training Median"
    print self.train_median
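# Because the medians above are overridden with the fixed viral threshold,
# downstream comparisons effectively binarise retweet counts at 50000. A
# minimal sketch of that labelling step (an assumption; the original labelling
# code is not shown in this section):
import numpy as np

def label_viral(y, threshold=50000):
    # 1 = viral (at or above the threshold), 0 = not viral
    return (np.asarray(y) >= threshold).astype(int)

# label_viral([120, 80000, 50000]) -> array([0, 1, 1])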
import math

import numpy as np
import matplotlib.pyplot as plt

from regression import RegressionModel
from data_analyser import DataAnalyser  # assumed module path for DataAnalyser


class ViralityPrediction:
    SCORE_PLOT_FILENAME = "hashtags_score.png"
    # If CLASSIFICATION is True, classification is used; otherwise regression
    CLASSIFICATION = True
    # K defines the number of results in the Top-K virality predictions
    K = 10

    def __init__(self, normalize=False, balance=False, tweet_threshold=0,
                 score=False, dump_model=True):
        """ Import or train the regression model """
        self.model = RegressionModel()
        if not self.model.load():
            training_set, testing_set = RegressionModel.load_datasets(
                balance=balance, viral_threshold=tweet_threshold)
            if ViralityPrediction.CLASSIFICATION:
                training_set = self.model.normaliseFeats(training_set)
                testing_set = self.model.normaliseFeats(testing_set)
                self.model.trainClassifier(training_set, normalize=normalize)
                if score:
                    self.model.scoreClassifier(testing_set)
            else:
                self.model.trainRegression(training_set, normalize=normalize)
                if score:
                    self.model.scoreRegression(testing_set)
            if dump_model:
                self.model.dump()

    def predict(self, hashtags, hashtag_threshold=None):
        """ Return a dictionary mapping each hashtag to its virality
        prediction.
        - hashtags: dictionary hashtag -> array of tweet features
        - hashtag_threshold: if defined, the dictionary value is 1 if the
          hashtag goes viral and 0 otherwise. If not defined, the value is
          the predicted number of retweets.
        Features are [followers_count, friends_count, listed_count,
        statuses_count, hashtags_count, media_count, user_mention_count,
        url_count, verified_account, is_a_retweet, tweet_length]
        """
        values = {}
        for key, value in hashtags.iteritems():
            if ViralityPrediction.CLASSIFICATION:
                tweets_values = self.model.predictClassifier(value)
            else:
                tweets_values = self.model.predictRegression(value)
            hashtag_value = sum(tweets_values)
            if hashtag_threshold is not None:
                values[key] = 1 if hashtag_value >= hashtag_threshold else 0
            else:
                # Round down to the nearest multiple of 10
                values[key] = max(0, int(math.floor(hashtag_value / 10.0)) * 10)
        return values

    def score(self, expected, predicted, labels=None, showPlot=True,
              savePlot=False):
        if showPlot or savePlot:
            x = np.arange(len(expected))
            width = 0.8
            ticks = x + x * width
            fig = plt.figure()
            ax = fig.add_subplot(111)
            bar1 = ax.bar(ticks, expected, color='green')
            bar2 = ax.bar(ticks + width, predicted, color='blue')
            ax.set_xlim(-width, (ticks + width)[-1] + 2 * width)
            ax.set_ylim(0, max(max(expected), max(predicted)) * 1.05)
            ax.set_xticks(ticks + width)
            if labels is not None:
                xtickNames = ax.set_xticklabels(labels)
                plt.setp(xtickNames, rotation=45, fontsize=10)
            ax.set_title('Expected and predicted retweet count per hashtag')
            ax.legend((bar1[0], bar2[0]), ('Expected', 'Predicted'))
            if savePlot:
                plt.savefig(DataAnalyser.PLOT_DIR +
                            ViralityPrediction.SCORE_PLOT_FILENAME,
                            format='png')
            if showPlot:
                plt.show()
        # Mean squared error between predicted and expected counts
        return np.mean((np.asarray(predicted) - np.asarray(expected)) ** 2)
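# Usage sketch for ViralityPrediction above. The hashtag and the 11-element
# feature rows are invented for illustration, following the feature order in
# predict()'s docstring; the threshold value is likewise an assumption.
if __name__ == "__main__":
    vp = ViralityPrediction(score=True)
    hashtags = {
        '#example': [
            # followers, friends, listed, statuses, hashtags, media,
            # mentions, urls, verified, is_retweet, length
            [1500, 300, 10, 5000, 2, 1, 0, 1, 0, 0, 140],
            [90000, 800, 250, 20000, 1, 0, 2, 0, 1, 0, 95],
        ],
    }
    # With a threshold: 1 if the hashtag is predicted to go viral, else 0
    print(vp.predict(hashtags, hashtag_threshold=1))
    # Without a threshold: predicted retweet count, rounded down to tens
    print(vp.predict(hashtags))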
import pandas as pd

from regression import RegressionModel

model = RegressionModel()
print(model.predict())
print(model.predict(use_macro=False))

# year_prior = row['year'] - years
# home_match = self.history_df.loc[self.history_df['home_id'] == row['home_id']]
# row_prior = home_match.loc[home_match['year'] == year_prior]
# # Break if not found
# if len(home_match) < 2 or len(row_prior) < 1:
#     break
# # Create new row with home data, prev_year, prev_value, prev_value
# row_prior = row_prior.drop("home_id", axis=1)
# row_prior = row_prior.drop("year", axis=1)
# row_prior = row_prior.values.tolist()[0]
# row_prior = [row['year'], row['value']] + row_prior
# # Add row to DataFrame
# row_series = pd.Series(row_prior, index=pred_df.columns)
# pred_df = pred_df.append(row_series, ignore_index=True)
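# A runnable sketch of what the commented-out block above appears to do: look
# up each home's record from `years` earlier and append a lagged row to
# pred_df. The column names (home_id, year, value) come from the comments
# above; the sample data, the reduced column set, and the use of pd.concat
# (DataFrame.append was removed in pandas 2.0) are assumptions, not the
# original implementation.
import pandas as pd

history_df = pd.DataFrame({
    "home_id": [1, 1, 2],
    "year": [2014, 2015, 2015],
    "value": [200.0, 210.0, 330.0],
})
pred_df = pd.DataFrame(columns=["year", "value", "prev_value"])

years = 1
for _, row in history_df.iterrows():
    home_match = history_df[history_df["home_id"] == row["home_id"]]
    row_prior = home_match[home_match["year"] == row["year"] - years]
    # Skip homes with no usable prior-year record
    if len(home_match) < 2 or len(row_prior) < 1:
        continue
    lagged = pd.DataFrame([{
        "year": row["year"],
        "value": row["value"],
        "prev_value": row_prior["value"].iloc[0],
    }])
    pred_df = pd.concat([pred_df, lagged], ignore_index=True)

print(pred_df)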
import pandas as pd
from sklearn import preprocessing

from regression import RegressionModel


class Election2016:

    def __init__(self):
        self.cfUntouched = pd.DataFrame()

    def init(self):
        self.loadDataSets()
        self.preprocessDataSets()
        self.candidateDataMapping = self.setupCandidates(self.data)

    def runRegression(self):
        self.regressor = RegressionModel(self.data, self.candidateDataMapping,
                                         self.testDataFrame)
        # self.regressor.performRegression()
        self.regressor.run()

    def loadDataSets(self):
        # County-level demographic facts
        self.cf = pd.read_csv('../2016_presidential_election_v5/county_facts.csv')
        # 2016 primary results
        self.pr = pd.read_csv('../2016_presidential_election_v5/primary_results.csv')

    def getStateForFips(self, fips):
        # Look the state up in the untouched county facts, since the merged
        # data no longer carries the state abbreviation
        if self.cfUntouched.empty:
            self.cfUntouched = pd.read_csv(
                '../2016_presidential_election_v5/county_facts.csv')
        state = self.cfUntouched[self.cfUntouched['fips'] == fips]['state_abbreviation']
        if state.values.size:
            return state.values[0]
        else:
            print "Invalid fips"

    def encodeStrings(self, data):
        # Encode the state abbreviation as a number for the regression
        le = preprocessing.LabelEncoder()
        le.fit(data['state_abbreviation'])
        states = le.transform(data['state_abbreviation'])
        data['state_number'] = states
        data.drop('state_abbreviation', axis=1, inplace=True)
        return data

    def setupTestSet(self, data):
        # Counties without a candidate have not voted yet: they form the test set
        testDataFrame = data[data['candidate'].isnull()].copy()
        # testDataFrame.dropna(axis=1, inplace=True)
        testDataFrame.drop('candidate', axis=1, inplace=True)
        testDataFrame.fillna(value=0, inplace=True)
        return testDataFrame

    def preprocessDataSets(self):
        # Only consider the Republican party
        self.pr = self.pr[self.pr['party'] == 'Republican']
        # Identify states that have already voted
        # statesAlreadyVoted = pr['state_abbreviation'].unique()
        self.pr.drop('state_abbreviation', axis=1, inplace=True)
        # Merge county facts with the 2016 primary results
        self.data = pd.merge(self.cf, self.pr, on='fips', how='left')
        # data = pd.merge(cf, pr, on='fips', how='inner')
        self.data = self.encodeStrings(self.data)
        # Remove information aggregated on the state level
        self.data = self.data[self.data.state_number != 0]
        self.data = self.removeUnnecessaryFeatures(self.data)
        # TODO check voted
        # Flag counties that have already voted
        self.data['voted'] = 1
        self.data.loc[self.data['candidate'].isnull(), 'voted'] = 0
        self.testDataFrame = self.setupTestSet(self.data)

    def removeUnnecessaryFeatures(self, data):
        # Drop columns that are not usable as regression features
        data.drop('area_name', axis=1, inplace=True)
        data.drop('county', axis=1, inplace=True)
        data.drop('state', axis=1, inplace=True)
        data.drop('party', axis=1, inplace=True)
        data.drop('votes', axis=1, inplace=True)
        return data

    def setupCandidates(self, data):
        # GOP candidates
        candidates = ['Donald Trump', 'Ted Cruz', 'John Kasich']
        # Split the data up per candidate
        trumpData = data[data['candidate'] == 'Donald Trump'].copy()
        trumpData.drop('candidate', axis=1, inplace=True)  # 1881 rows
        cruzData = data[data['candidate'] == 'Ted Cruz'].copy()
        cruzData.drop('candidate', axis=1, inplace=True)
        kasichData = data[data['candidate'] == 'John Kasich'].copy()
        kasichData.drop('candidate', axis=1, inplace=True)
        candidateDataMapping = {'Donald Trump': trumpData,
                                'Ted Cruz': cruzData,
                                'John Kasich': kasichData}
        return candidateDataMapping
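# Usage sketch for Election2016 above, assuming the relative CSV paths exist
# and that RegressionModel(data, candidateDataMapping, testDataFrame) exposes
# the run() method called in runRegression(); neither is shown in this section.
election = Election2016()
election.init()           # load CSVs, preprocess, split per candidate
election.runRegression()  # fit and run the per-candidate regression
print(election.getStateForFips(1001))  # e.g. Autauga County -> 'AL'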