Example no. 1
    def __init__(self,
                 y,
                 p=5):

        RegressionModel.__init__(self, y)
        self.p = p
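
The snippet above omits its class header. A minimal completion for context; the class name ARModel is a hypothetical placeholder, not from the source:

class ARModel(RegressionModel):
    # Hypothetical wrapper: the class name and surrounding context are
    # assumptions; only the constructor body comes from the example.
    def __init__(self, y, p=5):
        # Initialise the shared RegressionModel state with the series y,
        # then store the extra parameter p used by the subclass.
        RegressionModel.__init__(self, y)
        self.p = p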
Example no. 2
    def __init__(self,
                 normalize=False,
                 balance=False,
                 tweet_threshold=0,
                 score=False,
                 dump_model=True):
        """
        Import or train the regression model
        """
        self.model = RegressionModel()
        if not self.model.load():
            training_set, testing_set = RegressionModel.load_datasets(
                balance=balance, viral_threshold=tweet_threshold)

            if ViralityPrediction.CLASSIFICATION:
                training_set = self.model.normaliseFeats(training_set)
                testing_set = self.model.normaliseFeats(testing_set)
                self.model.trainClassifier(training_set, normalize=normalize)
                if score:
                    self.model.scoreClassifier(testing_set)

            else:
                self.model.trainRegression(training_set, normalize=normalize)
                if score:
                    self.model.scoreRegression(testing_set)

            if dump_model:
                self.model.dump()
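
A usage sketch for this constructor, assuming it belongs to the ViralityPrediction class shown in full later on this page; all flag values are illustrative:

# Illustrative only: load the model from a previous dump if one exists,
# otherwise train it on a balanced dataset (viral threshold 1000),
# score it on the test split and dump it to disk.
predictor = ViralityPrediction(normalize=True,
                               balance=True,
                               tweet_threshold=1000,
                               score=True,
                               dump_model=True)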
Example no. 3
    def __init__(self,
                 data,
                 p):

        RegressionModel.__init__(self,
                                 data)

        self.p = p
Example no. 4
    def loadData(self):
        '''
        Initialises the data: loads it from HDFS and prepares the
        training and test splits.
        '''

        self.training_data, self.test_data = RegressionModel.load_datasets(
            balance=True, viral_threshold=50000)

        self.viral_threshold = 50000

        # Pick a subset of the training data; training on the full set takes a long time.

        self.X_train = self.training_data[:30000, :-1]
        self.Y_train = self.training_data[:30000, -1]
        self.X_test = self.test_data[:, :-1]
        self.Y_test = self.test_data[:, -1]
        self.test_median = np.median(self.Y_test)
        self.train_median = np.median(self.Y_train)

        # Note: the medians computed above are immediately overridden with
        # the fixed virality threshold
        self.train_median = self.viral_threshold
        self.test_median = self.viral_threshold
        print "Test Median"
        print self.test_median
        print "Training Median"
        print self.train_median
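
The slicing above assumes RegressionModel.load_datasets returns NumPy arrays whose last column is the label; a minimal sketch of that layout (the values are made up, and the snippet itself additionally needs import numpy as np):

import numpy as np

# Hypothetical 3-row dataset: each row is [feature_1, ..., feature_n, label]
data = np.array([[10., 2., 150.],
                 [40., 7., 80000.],
                 [25., 3., 60000.]])

X = data[:, :-1]  # every column but the last -> feature matrix
y = data[:, -1]   # the last column           -> retweet-count label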
Example no. 7
import math

import numpy as np
import matplotlib.pyplot as plt

from regression import RegressionModel  # import path as in Example no. 8
# DataAnalyser (used below for PLOT_DIR) is a project class whose import
# path is not shown in the original snippet.


class ViralityPrediction:
    SCORE_PLOT_FILENAME = "hashtags_score.png"
    # If CLASSIFICATION is set to True, classification is used, otherwise regression
    CLASSIFICATION = True
    # K defines the number of results in the Top-K virality predictions
    K = 10

    def __init__(self,
                 normalize=False,
                 balance=False,
                 tweet_threshold=0,
                 score=False,
                 dump_model=True):
        """
        Import or train the regression model
        """
        self.model = RegressionModel()
        if not self.model.load():
            training_set, testing_set = RegressionModel.load_datasets(
                balance=balance, viral_threshold=tweet_threshold)

            if ViralityPrediction.CLASSIFICATION:
                training_set = self.model.normaliseFeats(training_set)
                testing_set = self.model.normaliseFeats(testing_set)
                self.model.trainClassifier(training_set, normalize=normalize)
                if score:
                    self.model.scoreClassifier(testing_set)

            else:
                self.model.trainRegression(training_set, normalize=normalize)
                if score:
                    self.model.scoreRegression(testing_set)

            if dump_model:
                self.model.dump()

    def predict(self, hashtags, hashtag_threshold=None):
        """
        Return a dictionary containing for each hashtag its virality prediction
        - hashtags: dictionary hashtag -> array of tweets features
        - hashtag_threshold: if defined, the dictionary value is 1 if the
            hashtag is predicted to go viral and 0 otherwise.
            If not defined, the value is the predicted number of retweets
        Features are [followers_count, friends_count, listed_count,
            statuses_count, hashtags_count, media_count, user_mention_count,
            url_count, verified_account, is_a_retweet, tweet_length]
        """
        values = {}
        for key, value in hashtags.iteritems():
            if ViralityPrediction.CLASSIFICATION:
                tweets_values = self.model.predictClassifier(value)
            else:
                tweets_values = self.model.predictRegression(value)
            hashtag_value = sum(tweets_values)
            if hashtag_threshold is not None:
                if hashtag_value >= hashtag_threshold:
                    values[key] = 1
                else:
                    values[key] = 0
            else:
                # Round down to the nearest multiple of 10, clamping at 0
                values[key] = max(0,
                                  int(math.floor(hashtag_value / 10.0)) * 10)

        return values

    def score(self,
              expected,
              predicted,
              labels=None,
              showPlot=True,
              savePlot=False):
        if showPlot or savePlot:
            x = np.arange(len(expected))
            width = 0.8
            ticks = x + x * width

            fig = plt.figure()
            ax = fig.add_subplot(111)
            bar1 = ax.bar(ticks, expected, color='green')
            bar2 = ax.bar(ticks + width, predicted, color='blue')
            ax.set_xlim(-width, (ticks + width)[-1] + 2 * width)
            ax.set_ylim(0, max(max(expected), max(predicted)) * 1.05)
            ax.set_xticks(ticks + width)
            if labels is not None:
                xtickNames = ax.set_xticklabels(labels)
                plt.setp(xtickNames, rotation=45, fontsize=10)
            ax.set_title('Expected and predicted retweet count per hashtag')
            ax.legend((bar1[0], bar2[0]), ('Expected', 'Predicted'))

            if savePlot:
                plt.savefig(DataAnalyser.PLOT_DIR +
                            ViralityPrediction.SCORE_PLOT_FILENAME,
                            format='png')
            if showPlot:
                plt.show()

        # Mean squared error between predicted and expected counts
        return np.mean((np.asarray(predicted) - np.asarray(expected)) ** 2)
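
A usage sketch for the class, in the same Python 2 style as the snippet (it relies on dict.iteritems); the hashtags, feature values and threshold are invented:

# Illustrative only: one tweet feature vector per hashtag, following the
# 11-feature layout documented in predict()'s docstring.
vp = ViralityPrediction(score=True)
hashtags = {
    '#example': [[1200, 300, 5, 4000, 1, 0, 2, 1, 0, 0, 140]],
    '#quiet':   [[15, 40, 0, 200, 1, 0, 0, 0, 0, 0, 80]],
}
# With a threshold, each hashtag maps to 1 (predicted viral) or 0
predictions = vp.predict(hashtags, hashtag_threshold=500)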
Example no. 8
import pandas as pd
from regression import RegressionModel

model = RegressionModel()
print(model.predict())
print(model.predict(use_macro=False))

# year_prior = row['year']-years
#             home_match = self.history_df.loc[self.history_df['home_id'] == row['home_id']]
#             row_prior = home_match.loc[home_match['year'] == year_prior]

#             # Break if not found
#             if len(home_match)<2 or len(row_prior) < 1:
#                 break

#             # Create new row with home data, prev_year, prev_value, prev_value
#             row_prior = row_prior.drop("home_id", axis=1)
#             row_prior = row_prior.drop("year", axis=1)
#             row_prior = row_prior.values.tolist()[0]
#             row_prior = [row['year'], row['value']] + row_prior

#             # Add row to DataFrame
#             row_series =  pd.Series(row_prior, index=pred_df.columns)
#             pred_df = pred_df.append(row_series, ignore_index=True)
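
The commented-out block above looks up, for each row, the matching home_id entry from the previous year and appends it as a new training row. A sketch of the same idea wrapped in a hypothetical helper, using pd.concat instead of the deprecated DataFrame.append; history_df, pred_df and the column names come from the comments, everything else is assumed:

def add_prior_year_row(history_df, pred_df, row, years=1):
    # Find the same home one year earlier
    year_prior = row['year'] - years
    home_match = history_df.loc[history_df['home_id'] == row['home_id']]
    row_prior = home_match.loc[home_match['year'] == year_prior]

    # Skip if no usable match is found, as in the original break
    if len(home_match) < 2 or len(row_prior) < 1:
        return pred_df

    # Build [year, value, <prior-year columns...>] as in the original
    row_prior = row_prior.drop(['home_id', 'year'], axis=1)
    values = [row['year'], row['value']] + row_prior.values.tolist()[0]
    row_df = pd.DataFrame([values], columns=pred_df.columns)
    return pd.concat([pred_df, row_df], ignore_index=True)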
Example no. 9
import pandas as pd
from sklearn import preprocessing

from regression import RegressionModel  # import path as in Example no. 8


class Election2016:

    def __init__(self):
        self.cfUntouched = pd.DataFrame()

    def init(self):
        self.loadDataSets()
        self.preprocessDataSets()
        self.candidateDataMapping = self.setupCandidates(self.data)


    def runRegression(self):
        self.regressor = RegressionModel(self.data, self.candidateDataMapping, self.testDataFrame)
        # self.regressor.performRegression()
        self.regressor.run()

    def loadDataSets(self):
        # self.cf = pd.read_csv(r'/Users/Jonas/OneDrive/Google Drive/Uni/UCB16/Machine Learning and Analytics/final_project/presidential-election-2016/2016_presidential_election_v5/county_facts.csv')
        self.cf = pd.read_csv('../2016_presidential_election_v5/county_facts.csv')

        # 2016 primary results
        self.pr = pd.read_csv('../2016_presidential_election_v5/primary_results.csv')

        # self.pr = pd.read_csv(r'/Users/Jonas/OneDrive/Google Drive/Uni/UCB16/Machine Learning and Analytics/final_project/presidential-election-2016/2016_presidential_election_v5/primary_results.csv')

        # self.clusters = pd.read_csv(r'/Users/Jonas/OneDrive/Google Drive/Uni/UCB16/Machine Learning and Analytics/final_project/presidential-election-2016/visualization/clustersout.csv')

    def getStateForFips(self, fips):
        if self.cfUntouched.empty:
            self.cfUntouched = pd.read_csv('../2016_presidential_election_v5/county_facts.csv')
        state = self.cfUntouched[self.cfUntouched['fips'] == fips]['state_abbreviation']
        # state.values is a numpy array; check its length rather than its
        # truth value, which is ambiguous for arrays
        if len(state.values) > 0:
            return state.values[0]
        else:
            print "Invalid fips"


    def encodeStrings(self, data):
        # encode string as number for regression
        le = preprocessing.LabelEncoder()
        le.fit(data['state_abbreviation'])
        states = le.transform(data['state_abbreviation'])
        data['state_number'] = states
        data.drop('state_abbreviation', axis=1, inplace=True)
        return data

    def setupTestSet(self, data):
        # Copy the slice so the in-place drop/fillna below do not trigger
        # pandas' SettingWithCopyWarning
        testDataFrame = data[data['candidate'].isnull()].copy()
        # testDataFrame.dropna(axis=1, inplace=True)
        testDataFrame.drop('candidate', axis=1, inplace=True)
        testDataFrame.fillna(value=0, inplace=True)
        return testDataFrame

    def preprocessDataSets(self):
        # only consider the republican party
        self.pr = self.pr[self.pr['party'] == 'Republican']

        # identify states that have already voted
        # statesAlreadyVoted = pr['state_abbreviation'].unique()

        self.pr.drop('state_abbreviation', axis=1, inplace=True)

        # merge county facts with 2016 primary results
        self.data = pd.merge(self.cf, self.pr, on='fips', how='left')
        # data = pd.merge(cf, pr, on='fips', how='inner')

        self.data = self.encodeStrings(self.data)

        # removes information aggregated on state level
        self.data = self.data[self.data.state_number != 0]

        self.data = self.removeUnnecessaryFeatures(self.data)

        # TODO check voted
        # self.data['voted'] = 0
        self.data['voted'] = 1

        self.data.loc[self.data['candidate'].isnull(), 'voted'] = 0

        self.testDataFrame = self.setupTestSet(self.data)



    def removeUnnecessaryFeatures(self, data):
        # remove unnecessary data
        data.drop('area_name', axis=1, inplace=True)
        data.drop('county', axis=1, inplace=True)
        data.drop('state', axis=1, inplace=True)
        data.drop('party', axis=1, inplace=True)
        data.drop('votes', axis=1, inplace=True)
        return data

    def setupCandidates(self, data):
        # GOP candidates
        candidates = ['Donald Trump', 'Ted Cruz', 'John Kasich']

        # split up the data for every candidate; copy each slice so the
        # in-place drops do not trigger pandas' SettingWithCopyWarning
        trumpData = data[data['candidate'] == 'Donald Trump'].copy()
        trumpData.drop('candidate', axis=1, inplace=True)
        ### 1881 rows

        cruzData = data[data['candidate'] == 'Ted Cruz'].copy()
        cruzData.drop('candidate', axis=1, inplace=True)

        kasichData = data[data['candidate'] == 'John Kasich'].copy()
        kasichData.drop('candidate', axis=1, inplace=True)

        candidateDataMapping = {'Donald Trump': trumpData,
                                'Ted Cruz': cruzData,
                                'John Kasich': kasichData}
        return candidateDataMapping
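
A usage sketch for the class; the fips code is illustrative, and the relative CSV paths must exist under the working directory:

# Illustrative driver: load and preprocess the data, fit one regression
# model per GOP candidate, then look up a county's state by fips code.
election = Election2016()
election.init()
election.runRegression()
print election.getStateForFips(1001)  # 1001 is Autauga County, AL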