Example #1
0
    def _plotPredictionError(self, true_num_pickups, predicted_num_pickups):
        '''
        Generate a scatter plot of the predicted number of pickups against the
        true number of pickups, one bubble per data point, and save it as a PNG
        whose path starts with Const.OUTFILE_DIRECTORY.

        A dashed "perfect prediction" line (y = x) is drawn for reference; the
        vertical distance between a bubble and that line is the prediction
        error of that data point.

        :param true_num_pickups: sequence of true pickup counts (x axis).
        :param predicted_num_pickups: sequence of predicted pickup counts
            (y axis), aligned index-by-index with true_num_pickups.
        '''
        util.verbosePrint('Plotting predicted versus true pickup scatter plot.')
        util.verbosePrint('')

        # Uniform bubble area and low opacity for every data point.
        area = [70] * len(true_num_pickups)
        plt.scatter(true_num_pickups, predicted_num_pickups, s=area, alpha=0.2,
                    edgecolors='none', label='actual predictions')

        # Reference line y = x from the origin up to the largest true value.
        # Two end points are enough for a straight line, and unlike
        # range(max(...)) this reaches the maximum itself and also works for
        # non-integer values. Skipped for empty input, where max() would raise.
        if len(true_num_pickups) > 0:
            upper = max(true_num_pickups)
            # Only the line style goes in the format string; the colour is
            # given once, via the keyword.
            plt.plot([0, upper], [0, upper], '--', color='0.5',
                     label='perfect prediction line')

        # Decorate plot.
        plt.grid(True)
        plt.ylabel('Predicted Number of Pickups')
        plt.xlabel('True Number of Pickups')
        plt.title('Predicted vs. True Number of Pickups  \nModel: %s' % self.model)

        plt.legend(loc='best')

        # Hard-code xmin, ymin to be -10, and constrain xmax, ymax to be the greater of the two.
        _, xmax, _, ymax = plt.axis()
        upper_limit = max(xmax, ymax)
        plt.axis((-10, upper_limit, -10, upper_limit))
        plt.savefig((Const.OUTFILE_DIRECTORY + 'true_vs_predicted_scatter_%s.png') %
                    (util.currentTimeString()), bbox_inches='tight')
        plt.close()
Example #2
0
    def writePythonFile(self):
        """Write the generated Python source to "<self.pyFilename>.py".

        The text comes from self.generatePythonFile(); the target file is
        opened in "w" mode, so an existing file of that name is overwritten.
        Progress messages are passed to verbosePrint together with
        self.verbose.
        """
        generatedSource = self.generatePythonFile()

        verbosePrint(self.verbose, "Started generating file...")
        targetName = "{filename}.py".format(filename=self.pyFilename)
        with open(targetName, "w") as outFile:
            outFile.write(generatedSource)
        verbosePrint(self.verbose, "Ended generating file.")
Example #3
0
    def extractPathsFromSVG(self):
        """Parse the SVG markup and return every <path> element found in it.

        The markup is obtained from self.readSVGFile() and parsed by
        BeautifulSoup with the "lxml" parser.

        Returns:
            The result of soup.find_all("path").
        """
        svg = self.readSVGFile()
        soup = BeautifulSoup(svg, "lxml")

        verbosePrint(self.verbose, "Finding all paths...")
        # find_all is the supported spelling; findAll is only kept by
        # Beautiful Soup 4 as a deprecated alias of the same method.
        paths = soup.find_all("path")
        verbosePrint(self.verbose, "Ended finding all paths.")

        return paths
Example #4
0
    def parseDAttrsToComplexNumbers(self):
        """Run parse_path over every collected "d" attribute.

        Returns:
            A list holding parse_path(d) for each value returned by
            self.extractDAttrsFromPaths(), in the same order.
        """
        pathData = self.extractDAttrsFromPaths()

        verbosePrint(self.verbose, "Started parsing d attrs...")
        parsed = [parse_path(d) for d in pathData]
        verbosePrint(self.verbose, "Ended parsing d attrs.")

        return parsed
Example #5
0
    def extractDAttrsFromPaths(self):
        """Collect the "d" attribute of every path element.

        Returns:
            A list with element.get("d") for each element returned by
            self.extractPathsFromSVG(), in the same order.
        """
        elements = self.extractPathsFromSVG()

        verbosePrint(self.verbose, "Started collecting d attrs...")
        collected = [element.get("d") for element in elements]
        verbosePrint(self.verbose, "Ended collecting d attrs.")

        return collected
Example #6
0
    def __init__(self, database, dataset, regressor_model, sparse=True):
        '''
        Store the collaborators and build the feature extractor.

        :param database: kept as self.db.
        :param dataset: kept as self.dataset.
        :param regressor_model: kept as self.regressor and reported through
            util.verbosePrint once construction is done.
        :param sparse: whether data should be represented as sparse scipy
            matrices rather than dense ones; forwarded to FeatureExtractor.
        '''
        self.db, self.dataset = database, dataset
        self.table_name = Const.AGGREGATED_PICKUPS
        self.regressor = regressor_model

        # Some models (such as the decision tree regression model) require a
        # dense representation, which is why sparseness is configurable here.
        self.feature_extractor = FeatureExtractor(sparse)

        util.verbosePrint(self.regressor)
Example #7
0
    def extractXAndYValuesFromParsedPaths(self):
        """Sample every parsed path into a list of (x, y) tuples.

        Each path is evaluated with path.point(i / self.numOfPoints) for
        i = 0 .. self.numOfPoints inclusive, i.e. numOfPoints + 1 samples per
        path. The real part of each sample becomes x and the imaginary part
        becomes y.

        Returns:
            A list with one list of (x, y) tuples per parsed path, in the
            order returned by self.parseDAttrsToComplexNumbers().
        """
        curves = self.parseDAttrsToComplexNumbers()
        sampleCount = self.numOfPoints
        sampledCurves = []

        verbosePrint(
            self.verbose, "Started extracting x and y values from paths...")
        for curve in curves:
            coords = []
            for step in range(sampleCount + 1):
                sample = curve.point(step / sampleCount)
                coords.append((sample.real, sample.imag))
            sampledCurves.append(coords)
        verbosePrint(
            self.verbose, "Ended extracting x and y values from paths.")

        return sampledCurves
Example #8
0
    def _plotPredictionError(self, true_num_pickups, predicted_num_pickups):
        '''
        Generate a scatter plot of the predicted number of pickups against the
        true number of pickups, one bubble per data point, and save it as a PNG
        whose path starts with Const.OUTFILE_DIRECTORY.

        A dashed "perfect prediction" line (y = x) is drawn for reference; the
        vertical distance between a bubble and that line is the prediction
        error of that data point.

        :param true_num_pickups: sequence of true pickup counts (x axis).
        :param predicted_num_pickups: sequence of predicted pickup counts
            (y axis), aligned index-by-index with true_num_pickups.
        '''
        util.verbosePrint(
            'Plotting predicted versus true pickup scatter plot.')
        util.verbosePrint('')

        # Uniform bubble area and low opacity for every data point.
        area = [70] * len(true_num_pickups)
        plt.scatter(true_num_pickups,
                    predicted_num_pickups,
                    s=area,
                    alpha=0.2,
                    edgecolors='none',
                    label='actual predictions')

        # Reference line y = x from the origin up to the largest true value.
        # Two end points are enough for a straight line, and unlike
        # range(max(...)) this reaches the maximum itself and also works for
        # non-integer values. Skipped for empty input, where max() would raise.
        if len(true_num_pickups) > 0:
            upper = max(true_num_pickups)
            # Only the line style goes in the format string; the colour is
            # given once, via the keyword.
            plt.plot([0, upper],
                     [0, upper],
                     '--',
                     color='0.5',
                     label='perfect prediction line')

        # Decorate plot.
        plt.grid(True)
        plt.ylabel('Predicted Number of Pickups')
        plt.xlabel('True Number of Pickups')
        plt.title('Predicted vs. True Number of Pickups  \nModel: %s' %
                  self.model)

        plt.legend(loc='best')

        # Hard-code xmin, ymin to be -10, and constrain xmax, ymax to be the greater of the two.
        _, xmax, _, ymax = plt.axis()
        upper_limit = max(xmax, ymax)
        plt.axis((-10, upper_limit, -10, upper_limit))
        plt.savefig(
            (Const.OUTFILE_DIRECTORY + 'true_vs_predicted_scatter_%s.png') %
            (util.currentTimeString()),
            bbox_inches='tight')
        plt.close()
Example #9
0
def extractDataset(dataset, model_name):
    '''
    Extract and return feature vectors and labels from data set as X and y.

    All train examples are read first (in batches of Const.TRAIN_BATCH_SIZE),
    then the data set is switched to test mode and all test examples are
    read, so X and y cover train and test examples together.

    :param dataset: data set to read train and test examples from.
    :param model_name: name of the model; a sparse representation is used
        unless it is 'dtr' (presumably the decision tree regressor -- confirm).
    :return: X, y
    '''
    extractor = FeatureExtractor(use_sparse=(model_name != 'dtr'))

    examples = []
    while dataset.hasMoreTrainExamples():
        examples += dataset.getTrainExamples(Const.TRAIN_BATCH_SIZE)

    dataset.switchToTestMode()
    while dataset.hasMoreTestExamples():
        examples += dataset.getTestExamples()

    util.verbosePrint(
        'Number of examples being considered for train and test:',
        len(examples))

    X = extractor.getFeatureVectors(examples)
    labels = [row['num_pickups'] for row in examples]
    y = np.array(labels)
    return X, y
Example #10
0
    def _plotFeatureWeights(self, zone_id, start_datetime, num_hours=7 * 24):
        '''
        Generate a stacked bar chart of the feature weights used to predict
        the number of pickups in zone zone_id, one bar per hour starting at
        start_datetime, and save it as a PNG whose path starts with
        Const.OUTFILE_DIRECTORY.

        Nothing is plotted when the model has no `feature_extractor` attribute
        or when its getFeatureWeights() returns None.

        :param zone_id: only use features relevant to this zone.
        :param start_datetime: datetime at which to start extracting features.
        :param num_hours: number of hours to plot (defaults to one week).
        '''
        # A single parenthesised argument prints the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        if not hasattr(self.model, 'feature_extractor'):
            print('\tCannot plot features for the model.')
            return

        util.verbosePrint('Plotting features and their weights for each hour.')
        util.verbosePrint('\tStart time: %s' % str(start_datetime))
        util.verbosePrint('\tDuration : %s hours' % str(num_hours))
        util.verbosePrint('')

        # Mapping from all features to their corresponding weights.
        #   e.g. feature_weights['Zone_HourOfDay=15402_14'] = 324.4565
        feature_weights = self.model.getFeatureWeights()
        if feature_weights is None:
            print('\tAborting feature weight plot.')
            return

        # For each data point in the time range, get the weight for each of its features.
        # plot_values is a mapping from feature templates to a list of all their values at each time step.
        #   e.g. plot_values['Zone_HourOfDay'] = [324.4565, 221.498, ... ]
        plot_values = {}
        for time_step in range(num_hours):
            curr_datetime = start_datetime + datetime.timedelta(hours=time_step)
            test_example = {'zone_id': zone_id, 'start_datetime': curr_datetime}

            # test_example_features is a mapping from feature templates to their identifiers
            #   e.g. test_example_features['Zone_HourOfDay'] = 15402_14
            test_example_features = self.model.feature_extractor.getFeatureDict(
                test_example)

            for feature_template, identifier in test_example_features.items():
                # A template without a known weight at this hour stays at 0.
                series = plot_values.setdefault(feature_template, [0] * num_hours)
                feature_name = '%s=%s' % (feature_template, identifier)
                if feature_name in feature_weights:
                    series[time_step] = feature_weights[feature_name]

        # Generate stacked bar chart, whose series are the feature templates.

        # Order these feature templates first, then all the remaining feature
        # templates in plot_values in any order.
        preferred_order = [
            'Zone', 'DayOfWeek', 'HourOfDay', 'Zone_DayOfWeek',
            'Zone_HourOfDay'
        ]
        feature_templates = [t for t in preferred_order if t in plot_values]
        feature_templates += [t for t in plot_values if t not in preferred_order]

        colors = ['b', 'g', 'r', 'c', 'y', 'm', '0.2', '0.8']
        indices = list(range(num_hours))
        width = 1
        bars = []

        # Two passes over the series: max(0, w) keeps the positive weights,
        # which are stacked upwards from zero; min(0, w) keeps the negative
        # weights, which are stacked downwards from zero. Legend handles are
        # collected during the positive pass only, one per series.
        for clip, collect_handles in ((max, True), (min, False)):
            bottom_values = [0] * num_hours
            for series_index, feature_template in enumerate(feature_templates):
                values = [clip(0, weight)
                          for weight in plot_values[feature_template]]
                bar = plt.bar(indices,
                              values,
                              color=colors[series_index % len(colors)],
                              width=width,
                              alpha=0.8,
                              bottom=bottom_values)
                if collect_handles:
                    bars.append(bar[0])
                # The next series starts where this one ended.
                bottom_values = [
                    bottom + value
                    for bottom, value in zip(bottom_values, values)
                ]

        # Decorate plot.
        plt.grid(True)
        plt.title('Predicted Number of Pickups in Zone %d' % zone_id)
        # Label the axis with the actual start of the plotted range rather
        # than a fixed date.
        plt.xlabel('Time (hours since %s)' % str(start_datetime))
        plt.ylabel('Number of Pickups')
        plt.xlim(0, num_hours)
        plt.ylim(-1000, 2000)
        plt.xticks(np.arange(0, num_hours + 1, 12))
        plt.legend(bars, feature_templates)

        plt.savefig((Const.OUTFILE_DIRECTORY + 'feature_weights_zone_%d_%s.png') %
                    (zone_id, util.currentTimeString()), bbox_inches='tight')
        plt.close()
Example #11
0
    def _plotFeatureWeights(self, zone_id, start_datetime, num_hours=7*24):
        '''
        Generate a stacked bar chart of the feature weights used to predict
        the number of pickups in zone zone_id, one bar per hour starting at
        start_datetime, and save it as a PNG whose path starts with
        Const.OUTFILE_DIRECTORY.

        Nothing is plotted when the model has no `feature_extractor` attribute
        or when its getFeatureWeights() returns None.

        :param zone_id: only use features relevant to this zone.
        :param start_datetime: datetime at which to start extracting features.
        :param num_hours: number of hours to plot (defaults to one week).
        '''
        # A single parenthesised argument prints the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        if not hasattr(self.model, 'feature_extractor'):
            print('\tCannot plot features for the model.')
            return

        util.verbosePrint('Plotting features and their weights for each hour.')
        util.verbosePrint('\tStart time: %s' % str(start_datetime))
        util.verbosePrint('\tDuration : %s hours' % str(num_hours))
        util.verbosePrint('')

        # Mapping from all features to their corresponding weights.
        #   e.g. feature_weights['Zone_HourOfDay=15402_14'] = 324.4565
        feature_weights = self.model.getFeatureWeights()
        if feature_weights is None:
            print('\tAborting feature weight plot.')
            return

        # For each data point in the time range, get the weight for each of its features.
        # plot_values is a mapping from feature templates to a list of all their values at each time step.
        #   e.g. plot_values['Zone_HourOfDay'] = [324.4565, 221.498, ... ]
        plot_values = {}
        for time_step in range(num_hours):
            curr_datetime = start_datetime + datetime.timedelta(hours=time_step)
            test_example = {'zone_id': zone_id, 'start_datetime': curr_datetime}

            # test_example_features is a mapping from feature templates to their identifiers
            #   e.g. test_example_features['Zone_HourOfDay'] = 15402_14
            test_example_features = self.model.feature_extractor.getFeatureDict(test_example)

            for feature_template, identifier in test_example_features.items():
                # A template without a known weight at this hour stays at 0.
                series = plot_values.setdefault(feature_template, [0] * num_hours)
                feature_name = '%s=%s' % (feature_template, identifier)
                if feature_name in feature_weights:
                    series[time_step] = feature_weights[feature_name]

        # Generate stacked bar chart, whose series are the feature templates.

        # Order these feature templates first, then all the remaining feature
        # templates in plot_values in any order.
        preferred_order = ['Zone', 'DayOfWeek', 'HourOfDay', 'Zone_DayOfWeek', 'Zone_HourOfDay']
        feature_templates = [t for t in preferred_order if t in plot_values]
        feature_templates += [t for t in plot_values if t not in preferred_order]

        colors = ['b', 'g', 'r', 'c', 'y', 'm', '0.2', '0.8']
        indices = list(range(num_hours))
        width = 1
        bars = []

        # Two passes over the series: max(0, w) keeps the positive weights,
        # which are stacked upwards from zero; min(0, w) keeps the negative
        # weights, which are stacked downwards from zero. Legend handles are
        # collected during the positive pass only, one per series.
        for clip, collect_handles in ((max, True), (min, False)):
            bottom_values = [0] * num_hours
            for series_index, feature_template in enumerate(feature_templates):
                values = [clip(0, weight) for weight in plot_values[feature_template]]
                bar = plt.bar(indices, values,
                              color=colors[series_index % len(colors)],
                              width=width,
                              alpha=0.8,
                              bottom=bottom_values)
                if collect_handles:
                    bars.append(bar[0])
                # The next series starts where this one ended.
                bottom_values = [bottom + value for bottom, value in zip(bottom_values, values)]

        # Decorate plot.
        plt.grid(True)
        plt.title('Predicted Number of Pickups in Zone %d' % zone_id)
        # Label the axis with the actual start of the plotted range rather
        # than a fixed date.
        plt.xlabel('Time (hours since %s)' % str(start_datetime))
        plt.ylabel('Number of Pickups')
        plt.xlim(0, num_hours)
        plt.ylim(-1000, 2000)
        plt.xticks(np.arange(0, num_hours + 1, 12))
        plt.legend(bars, feature_templates)

        plt.savefig((Const.OUTFILE_DIRECTORY + 'feature_weights_zone_%d_%s.png') %
                    (zone_id, util.currentTimeString()), bbox_inches='tight')
        plt.close()