def _plotPredictionError(self, true_num_pickups, predicted_num_pickups):
    '''
    Generate a scatter plot of predicted versus true pickup counts.

    Every data point is drawn at (true value, predicted value). A dashed
    reference line y = x marks where a perfect prediction would fall. The
    figure is saved as a PNG under Const.OUTFILE_DIRECTORY and then closed.

    :param true_num_pickups: sequence of true pickup counts.
    :param predicted_num_pickups: sequence of predicted pickup counts,
        aligned element-by-element with true_num_pickups.
    '''
    util.verbosePrint('Plotting predicted versus true pickup scatter plot.')
    util.verbosePrint('')

    # Set bubble area and opacity, and create scatter plot. A scalar size is
    # broadcast to every point, so no per-point area list is needed.
    plt.scatter(true_num_pickups, predicted_num_pickups, s=70, alpha=0.2,
                edgecolors='none', label='actual predictions')

    # Reference diagonal from the origin to the largest true value. Two end
    # points are enough for a straight line and, unlike range(), also work
    # for non-integer values. Skipped when there is no data, because max()
    # raises on an empty sequence.
    if len(true_num_pickups) > 0:
        line_end = max(true_num_pickups)
        # Only the line style goes in the format string; the colour comes
        # from the keyword, so the two no longer contradict each other.
        plt.plot([0, line_end], [0, line_end], '--', color='0.5',
                 label='perfect prediction line')

    # Decorate plot.
    plt.grid(True)
    plt.ylabel('Predicted Number of Pickups')
    plt.xlabel('True Number of Pickups')
    plt.title('Predicted vs. True Number of Pickups \nModel: %s' % self.model)
    plt.legend(loc='best')

    # Hard-code xmin, ymin to be -10, and constrain xmax, ymax to be the
    # greater of the two so that both axes share the same scale.
    xmin, xmax, ymin, ymax = plt.axis()
    upper = max(xmax, ymax)
    plt.axis((-10, upper, -10, upper))

    plt.savefig((Const.OUTFILE_DIRECTORY + 'true_vs_predicted_scatter_%s.png') %
                (util.currentTimeString()), bbox_inches='tight')
    plt.close()
def writePythonFile(self):
    """Write the generated Python source to ``<self.pyFilename>.py``.

    The text comes from ``self.generatePythonFile()`` and is produced before
    the output file is opened. Progress messages are passed to
    ``verbosePrint`` together with ``self.verbose``.
    """
    generatedSource = self.generatePythonFile()

    verbosePrint(self.verbose, "Started generating file...")
    targetName = "{filename}.py".format(filename=self.pyFilename)
    with open(targetName, "w") as outputFile:
        outputFile.write(generatedSource)
    verbosePrint(self.verbose, "Ended generating file.")
def extractPathsFromSVG(self):
    """Read the SVG file and return every ``<path>`` element in it.

    The markup is obtained from ``self.readSVGFile()`` and parsed with
    BeautifulSoup using the "lxml" parser.

    :return: the result of ``soup.find_all("path")``.
    """
    svg = self.readSVGFile()
    soup = BeautifulSoup(svg, "lxml")

    verbosePrint(self.verbose, "Finding all paths...")
    # find_all is the supported spelling; findAll is only kept by
    # BeautifulSoup 4 as a deprecated backwards-compatibility alias.
    paths = soup.find_all("path")
    verbosePrint(self.verbose, "Ended finding all paths.")

    return paths
def parseDAttrsToComplexNumbers(self):
    """Run ``parse_path`` over every collected ``d`` attribute.

    :return: list holding one ``parse_path`` result per attribute, in the
        order returned by ``self.extractDAttrsFromPaths()``.
    """
    pathDefinitions = self.extractDAttrsFromPaths()

    verbosePrint(self.verbose, "Started parsing d attrs...")
    parsed = [parse_path(definition) for definition in pathDefinitions]
    verbosePrint(self.verbose, "Ended parsing d attrs.")

    return parsed
def extractDAttrsFromPaths(self):
    """Collect the ``d`` attribute of every extracted path element.

    :return: list with ``element.get("d")`` for each element returned by
        ``self.extractPathsFromSVG()``, in the same order.
    """
    pathElements = self.extractPathsFromSVG()

    verbosePrint(self.verbose, "Started collecting d attrs...")
    collected = [element.get("d") for element in pathElements]
    verbosePrint(self.verbose, "Ended collecting d attrs.")

    return collected
def __init__(self, database, dataset, regressor_model, sparse=True):
    '''
    Store the data sources and the regressor, and build the feature extractor.

    :param database: stored as self.db.
    :param dataset: stored as self.dataset.
    :param regressor_model: stored as self.regressor and echoed through
        util.verbosePrint.
    :param sparse: passed to FeatureExtractor; selects sparse scipy matrices
        over dense ones for the data representation.
    '''
    self.db, self.dataset = database, dataset
    self.table_name = Const.AGGREGATED_PICKUPS
    self.regressor = regressor_model

    # Dense data is needed by some models (e.g. the decision tree regression
    # model), which is why the representation is configurable.
    self.feature_extractor = FeatureExtractor(sparse)

    util.verbosePrint(self.regressor)
def extractXAndYValuesFromParsedPaths(self):
    """Sample every parsed path and return the samples as (x, y) tuples.

    Each path from ``self.parseDAttrsToComplexNumbers()`` is evaluated with
    ``path.point(t)`` at ``self.numOfPoints + 1`` evenly spaced positions
    ``t = i / numOfPoints`` for ``i = 0 .. numOfPoints``. The real part of
    each sample becomes x and the imaginary part becomes y.

    :return: list with one entry per path; each entry is a list of
        ``(x, y)`` tuples.
    :raises ZeroDivisionError: if ``self.numOfPoints`` is 0 and there is at
        least one path.
    """
    parsedPaths = self.parseDAttrsToComplexNumbers()
    numOfPts = self.numOfPoints
    # Force true division. With two ints, "/" truncates on Python 2, which
    # would place every sample except the last one at position 0.
    divisor = float(numOfPts)

    verbosePrint(
        self.verbose, "Started extracting x and y values from paths...")
    points = []
    for parsedPath in parsedPaths:
        samples = (parsedPath.point(i / divisor)
                   for i in range(numOfPts + 1))
        points.append([(p.real, p.imag) for p in samples])
    verbosePrint(
        self.verbose, "Ended extracting x and y values from paths.")

    return points
def _plotPredictionError(self, true_num_pickups, predicted_num_pickups):
    '''
    Generate a scatter plot of predicted versus true pickup counts.

    Every data point is drawn at (true value, predicted value). A dashed
    reference line y = x marks where a perfect prediction would fall. The
    figure is saved as a PNG under Const.OUTFILE_DIRECTORY and then closed.

    :param true_num_pickups: sequence of true pickup counts.
    :param predicted_num_pickups: sequence of predicted pickup counts,
        aligned element-by-element with true_num_pickups.
    '''
    util.verbosePrint('Plotting predicted versus true pickup scatter plot.')
    util.verbosePrint('')

    # Set bubble area and opacity, and create scatter plot. A scalar size is
    # broadcast to every point, so no per-point area list is needed.
    plt.scatter(true_num_pickups, predicted_num_pickups, s=70, alpha=0.2,
                edgecolors='none', label='actual predictions')

    # Reference diagonal from the origin to the largest true value. Two end
    # points are enough for a straight line and, unlike range(), also work
    # for non-integer values. Skipped when there is no data, because max()
    # raises on an empty sequence.
    if len(true_num_pickups) > 0:
        line_end = max(true_num_pickups)
        # Only the line style goes in the format string; the colour comes
        # from the keyword, so the two no longer contradict each other.
        plt.plot([0, line_end], [0, line_end], '--', color='0.5',
                 label='perfect prediction line')

    # Decorate plot.
    plt.grid(True)
    plt.ylabel('Predicted Number of Pickups')
    plt.xlabel('True Number of Pickups')
    plt.title('Predicted vs. True Number of Pickups \nModel: %s' % self.model)
    plt.legend(loc='best')

    # Hard-code xmin, ymin to be -10, and constrain xmax, ymax to be the
    # greater of the two so that both axes share the same scale.
    xmin, xmax, ymin, ymax = plt.axis()
    upper = max(xmax, ymax)
    plt.axis((-10, upper, -10, upper))

    plt.savefig((Const.OUTFILE_DIRECTORY + 'true_vs_predicted_scatter_%s.png') %
                (util.currentTimeString()), bbox_inches='tight')
    plt.close()
def extractDataset(dataset, model_name):
    '''
    Extract and return feature vectors and labels from data set as X and y.

    All train examples are drained first, in batches of
    Const.TRAIN_BATCH_SIZE. The data set is then switched to test mode and
    all test examples are drained as well, so X and y cover both.

    :param dataset: data set object providing the train/test example getters.
    :param model_name: the feature extractor uses a sparse representation
        unless this equals 'dtr'.
    :return: X, y
    '''
    use_sparse = model_name != 'dtr'
    feature_extractor = FeatureExtractor(use_sparse=use_sparse)

    examples = []
    while dataset.hasMoreTrainExamples():
        examples += dataset.getTrainExamples(Const.TRAIN_BATCH_SIZE)

    dataset.switchToTestMode()
    while dataset.hasMoreTestExamples():
        examples += dataset.getTestExamples()

    util.verbosePrint('Number of examples being considered for train and test:',
                      len(examples))

    X = feature_extractor.getFeatureVectors(examples)
    labels = [row['num_pickups'] for row in examples]
    y = np.array(labels)
    return X, y
def _plotFeatureWeights(self, zone_id, start_datetime, num_hours=7 * 24):
    '''
    Generate a stacked bar chart of all the feature weights used to predict
    the number of pickups in zone zone_id, one bar per hour for num_hours
    consecutive hours beginning at start_datetime.

    Positive weights are stacked upwards from zero and negative weights
    downwards. The chart is saved as a PNG under Const.OUTFILE_DIRECTORY.
    Nothing is plotted when the model has no feature_extractor attribute or
    when its getFeatureWeights() returns None.

    :param zone_id: only use features relevant to this zone.
    :param start_datetime: datetime at which to start extracting features.
    :param num_hours: number of hours to plot
    '''
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if not hasattr(self.model, 'feature_extractor'):
        print('\tCannot plot features for the model.')
        return

    util.verbosePrint('Plotting features and their weights for each hour.')
    util.verbosePrint('\tStart time: %s' % str(start_datetime))
    util.verbosePrint('\tDuration  : %s hours' % str(num_hours))
    util.verbosePrint('')

    # Mapping from all features to their corresponding weights.
    # e.g. feature_weights['Zone_HourOfDay=15402_14'] = 324.4565
    feature_weights = self.model.getFeatureWeights()
    if feature_weights is None:
        print('\tAborting feature weight plot.')
        return

    # For each data point in the time range, get the weight for each of its
    # features. plot_values is a mapping from feature templates to a list of
    # all their values at each time step.
    # e.g. plot_values['Zone_HourOfDay'] = [324.4565, 221.498, ... ]
    plot_values = {}
    for time_step in range(num_hours):
        curr_datetime = start_datetime + datetime.timedelta(hours=time_step)
        test_example = {'zone_id': zone_id, 'start_datetime': curr_datetime}

        # test_example_features is a mapping from feature templates to their
        # identifiers, e.g. test_example_features['Zone_HourOfDay'] = 15402_14
        test_example_features = \
            self.model.feature_extractor.getFeatureDict(test_example)
        for feature_template, identifier in test_example_features.items():
            if feature_template not in plot_values:
                plot_values[feature_template] = [0] * num_hours
            feature_name = '%s=%s' % (feature_template, identifier)
            # Features without a learned weight keep the default of 0.
            if feature_name in feature_weights:
                plot_values[feature_template][time_step] = \
                    feature_weights[feature_name]

    # Generate stacked bar chart, whose series are the feature templates.
    # Order these feature templates first, then all the remaining feature
    # templates in plot_values in any order.
    preferred_order = ['Zone', 'DayOfWeek', 'HourOfDay',
                       'Zone_DayOfWeek', 'Zone_HourOfDay']
    feature_templates = [t for t in preferred_order if t in plot_values]
    feature_templates += [t for t in plot_values if t not in preferred_order]

    colors = ['b', 'g', 'r', 'c', 'y', 'm', '0.2', '0.8']
    indices = list(range(num_hours))
    width = 1
    bars = []

    # Plot positive values for all series, stacking upwards from zero.
    bottom_values = [0] * num_hours
    for series_index, feature_template in enumerate(feature_templates):
        pos_values = [max(0, weight)
                      for weight in plot_values[feature_template]]
        bar = plt.bar(indices, pos_values,
                      color=colors[series_index % len(colors)],
                      width=width, alpha=0.8, bottom=bottom_values)
        # One rectangle per series is enough to act as its legend handle.
        bars.append(bar[0])
        bottom_values = [base + value
                         for base, value in zip(bottom_values, pos_values)]

    # Plot negative values for all series, stacking downwards from zero and
    # reusing the colour assigned to each series above.
    bottom_values = [0] * num_hours
    for series_index, feature_template in enumerate(feature_templates):
        neg_values = [min(0, weight)
                      for weight in plot_values[feature_template]]
        plt.bar(indices, neg_values,
                color=colors[series_index % len(colors)],
                width=width, alpha=0.8, bottom=bottom_values)
        bottom_values = [base + value
                         for base, value in zip(bottom_values, neg_values)]

    # Decorate plot.
    plt.grid(True)
    plt.title('Predicted Number of Pickups in Zone %d' % zone_id)
    # NOTE(review): this label hard-codes a start date although
    # start_datetime is a parameter; confirm whether it should follow it.
    plt.xlabel('Time (hours since 2013 April 7, 12am)')
    plt.ylabel('Number of Pickups')
    plt.xlim(0, num_hours)
    plt.ylim(-1000, 2000)
    plt.xticks(np.arange(0, num_hours + 1, 12))
    plt.legend(bars, feature_templates)

    plt.savefig((Const.OUTFILE_DIRECTORY + 'feature_weights_zone_%d_%s.png') %
                (zone_id, util.currentTimeString()), bbox_inches='tight')
    plt.close()
def _plotFeatureWeights(self, zone_id, start_datetime, num_hours=7*24):
    '''
    Generate a stacked bar chart of all the feature weights used to predict
    the number of pickups in zone zone_id, one bar per hour for num_hours
    consecutive hours beginning at start_datetime.

    Positive weights are stacked upwards from zero and negative weights
    downwards. The chart is saved as a PNG under Const.OUTFILE_DIRECTORY.
    Nothing is plotted when the model has no feature_extractor attribute or
    when its getFeatureWeights() returns None.

    :param zone_id: only use features relevant to this zone.
    :param start_datetime: datetime at which to start extracting features.
    :param num_hours: number of hours to plot
    '''
    # print(...) with a single argument behaves the same on Python 2 and 3.
    if not hasattr(self.model, 'feature_extractor'):
        print('\tCannot plot features for the model.')
        return

    util.verbosePrint('Plotting features and their weights for each hour.')
    util.verbosePrint('\tStart time: %s' % str(start_datetime))
    util.verbosePrint('\tDuration  : %s hours' % str(num_hours))
    util.verbosePrint('')

    # Mapping from all features to their corresponding weights.
    # e.g. feature_weights['Zone_HourOfDay=15402_14'] = 324.4565
    feature_weights = self.model.getFeatureWeights()
    if feature_weights is None:
        print('\tAborting feature weight plot.')
        return

    # For each data point in the time range, get the weight for each of its
    # features. plot_values is a mapping from feature templates to a list of
    # all their values at each time step.
    # e.g. plot_values['Zone_HourOfDay'] = [324.4565, 221.498, ... ]
    plot_values = {}
    for time_step in range(num_hours):
        curr_datetime = start_datetime + datetime.timedelta(hours=time_step)
        test_example = {'zone_id': zone_id, 'start_datetime': curr_datetime}

        # test_example_features is a mapping from feature templates to their
        # identifiers, e.g. test_example_features['Zone_HourOfDay'] = 15402_14
        test_example_features = \
            self.model.feature_extractor.getFeatureDict(test_example)
        for feature_template, identifier in test_example_features.items():
            if feature_template not in plot_values:
                plot_values[feature_template] = [0] * num_hours
            feature_name = '%s=%s' % (feature_template, identifier)
            # Features without a learned weight keep the default of 0.
            if feature_name in feature_weights:
                plot_values[feature_template][time_step] = \
                    feature_weights[feature_name]

    # Generate stacked bar chart, whose series are the feature templates.
    # Order these feature templates first, then all the remaining feature
    # templates in plot_values in any order.
    preferred_order = ['Zone', 'DayOfWeek', 'HourOfDay',
                       'Zone_DayOfWeek', 'Zone_HourOfDay']
    feature_templates = [t for t in preferred_order if t in plot_values]
    feature_templates += [t for t in plot_values if t not in preferred_order]

    colors = ['b', 'g', 'r', 'c', 'y', 'm', '0.2', '0.8']
    indices = list(range(num_hours))
    width = 1
    bars = []

    # Plot positive values for all series, stacking upwards from zero.
    bottom_values = [0] * num_hours
    for series_index, feature_template in enumerate(feature_templates):
        pos_values = [max(0, weight)
                      for weight in plot_values[feature_template]]
        bar = plt.bar(indices, pos_values,
                      color=colors[series_index % len(colors)],
                      width=width, alpha=0.8, bottom=bottom_values)
        # One rectangle per series is enough to act as its legend handle.
        bars.append(bar[0])
        bottom_values = [base + value
                         for base, value in zip(bottom_values, pos_values)]

    # Plot negative values for all series, stacking downwards from zero and
    # reusing the colour assigned to each series above.
    bottom_values = [0] * num_hours
    for series_index, feature_template in enumerate(feature_templates):
        neg_values = [min(0, weight)
                      for weight in plot_values[feature_template]]
        plt.bar(indices, neg_values,
                color=colors[series_index % len(colors)],
                width=width, alpha=0.8, bottom=bottom_values)
        bottom_values = [base + value
                         for base, value in zip(bottom_values, neg_values)]

    # Decorate plot.
    plt.grid(True)
    plt.title('Predicted Number of Pickups in Zone %d' % zone_id)
    # NOTE(review): this label hard-codes a start date although
    # start_datetime is a parameter; confirm whether it should follow it.
    plt.xlabel('Time (hours since 2013 April 7, 12am)')
    plt.ylabel('Number of Pickups')
    plt.xlim(0, num_hours)
    plt.ylim(-1000, 2000)
    plt.xticks(np.arange(0, num_hours + 1, 12))
    plt.legend(bars, feature_templates)

    plt.savefig((Const.OUTFILE_DIRECTORY + 'feature_weights_zone_%d_%s.png') %
                (zone_id, util.currentTimeString()), bbox_inches='tight')
    plt.close()