def traverse_movies_OLS(warmup=100, tail_start=3200):
    """Evaluate a LinearRegression revenue model one movie at a time.

    Walks ``data`` in order. Every movie is vectorized against the current
    ``DMAP`` state. Once the movie index exceeds ``warmup``, a fresh
    ``LinearRegression`` is fitted on all previously seen movies and used to
    predict the current movie's revenue *before* that movie is added to the
    training set, so each prediction is out-of-sample.

    Three summary lines are printed, each labelled ``all``: the
    ``avg_float_list`` of the relative errors, of the absolute errors, and of
    the absolute errors from position ``tail_start`` onwards.

    Args:
        warmup: Predictions are made only for movie indices ``i > warmup``.
        tail_start: Slice offset into the list of absolute errors (not a
            movie index) used for the third summary line.
    """
    # NOTE(review): this file defines traverse_movies_OLS more than once; the
    # definition that appears last is the one callers actually get.
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []

    for i, movie in enumerate(data):
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > warmup:
            model = LinearRegression()
            model.fit(training_data, training_response)
            # predict() works on a 2-D batch of samples (assuming the
            # scikit-learn estimator API), so wrap the single feature vector
            # and unwrap the single prediction.
            predicted = model.predict([myvector])[0]
            raw = math.fabs(predicted - m_rev)
            ERRORS.append(raw)
            # A relative error is undefined for zero revenue; skip it rather
            # than abort the whole run with ZeroDivisionError.
            if m_rev:
                P_ERRORS.append(round(raw / m_rev, 4))

        # Only now does the current movie become training data / history.
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)

    # Written so the same text is emitted under Python 2 and Python 3; the
    # label is kept as-is so the output format is unchanged.
    print('all %s' % (avg_float_list(P_ERRORS),))
    print('all %s' % (avg_float_list(ERRORS),))
    print('all %s' % (avg_float_list(ERRORS[tail_start:]),))
def traverse_movies_OLS(warmup=100, tail_start=3200):
    """Evaluate a LinearRegression revenue model one movie at a time.

    Walks ``data`` in order. Every movie is vectorized against the current
    ``DMAP`` state. Once the movie index exceeds ``warmup``, a fresh
    ``LinearRegression`` is fitted on all previously seen movies and used to
    predict the current movie's revenue *before* that movie is added to the
    training set, so each prediction is out-of-sample.

    Three summary lines are printed, each labelled ``all``: the
    ``avg_float_list`` of the relative errors, of the absolute errors, and of
    the absolute errors from position ``tail_start`` onwards.

    Args:
        warmup: Predictions are made only for movie indices ``i > warmup``.
        tail_start: Slice offset into the list of absolute errors (not a
            movie index) used for the third summary line.
    """
    # NOTE(review): this file defines traverse_movies_OLS more than once; the
    # definition that appears last is the one callers actually get.
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []

    for i, movie in enumerate(data):
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > warmup:
            model = LinearRegression()
            model.fit(training_data, training_response)
            # predict() works on a 2-D batch of samples (assuming the
            # scikit-learn estimator API), so wrap the single feature vector
            # and unwrap the single prediction.
            predicted = model.predict([myvector])[0]
            raw = math.fabs(predicted - m_rev)
            ERRORS.append(raw)
            # A relative error is undefined for zero revenue; skip it rather
            # than abort the whole run with ZeroDivisionError.
            if m_rev:
                P_ERRORS.append(round(raw / m_rev, 4))

        # Only now does the current movie become training data / history.
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)

    # Written so the same text is emitted under Python 2 and Python 3; the
    # label is kept as-is so the output format is unchanged.
    print('all %s' % (avg_float_list(P_ERRORS),))
    print('all %s' % (avg_float_list(ERRORS),))
    print('all %s' % (avg_float_list(ERRORS[tail_start:]),))
def vectorizeMovie(movie, LBMap, DMAP, Discrete=False, Sentiment=False):
    """Build the flat feature list for a single movie.

    Features are emitted in this order:

    1. ``getVector`` output for ``month``, ``mpaa`` and ``genres``.
    2. ``discretize`` of ``runtime`` (threshold 90) and ``budget``
       (threshold ``M50``).
    3. For every entry in ``movie['stars']``: ``discretize`` of that star's
       ``high`` and ``avg`` values from ``DMAP['stars']`` (threshold
       ``M100``).
    4. For ``directors``, ``writers`` and ``production``: one ``discretize``
       of ``avg_float_list`` over the ``avg`` values of all listed entities
       (threshold ``M100``).
    5. Only when ``Sentiment`` is true: ``1`` if the tagline is empty or its
       TextBlob polarity is >= 0.0, otherwise ``0``.

    Args:
        movie: Mapping with the fields read above; entity lists hold mappings
            with a ``name`` key.
        LBMap: Passed through to ``getVector`` unchanged.
        DMAP: Nested mapping ``DMAP[field][name][stat]`` of per-entity stats.
        Discrete: Passed through to every ``discretize`` call.
        Sentiment: Whether to append the tagline sentiment flag.

    Returns:
        A list of feature values.
    """
    features = []

    for category in ('month', 'mpaa', 'genres'):
        features.extend(getVector(category, movie[category], LBMap))

    features.append(discretize(movie['runtime'], 90, Discrete))
    features.append(discretize(movie['budget'], M50, Discrete))

    # Two features per star, so the length depends on the cast size.
    for star in movie['stars']:
        star_name = star['name']
        history = DMAP['stars'][star_name]
        for stat in ('high', 'avg'):
            features.append(discretize(history[stat], M100, Discrete))

    # One aggregated feature per crew/company role.
    for role in ('directors', 'writers', 'production'):
        role_averages = []
        for member in movie[role]:
            member_name = member['name']
            role_averages.append(DMAP[role][member_name]['avg'])
        features.append(
            discretize(avg_float_list(role_averages), M100, Discrete))

    if not Sentiment:
        return features

    tagline = movie['tagline']
    if not tagline:
        # A missing tagline is treated the same as a non-negative one.
        features.append(1)
        return features

    blob = TextBlob(tagline.strip().encode('ascii', 'ignore'))
    is_non_negative = float(blob.sentiment.polarity) >= 0.0
    features.append(1 if is_non_negative else 0)
    return features
def graph_regression():
    """Plot the running mean regression error per classification outcome.

    Reads ``combined.csv``, where each row starts with the true class label,
    the predicted class label and a numeric regression error. Four running
    means of that error are tracked row by row:

    * ``Overall``        - every row,
    * ``All Correct``    - rows where true label == predicted label,
    * ``100M+ Correct``  - correct rows whose true label is ``'1'``,
    * ``<100M Correct``  - correct rows with any other true label.

    For every series the value after each row is recorded (a row that does
    not belong to a series repeats that series' current mean, which is 0
    until its first contributing row). Each series is summarised on stdout
    via ``avg_float_list`` and drawn with matplotlib; the figure is shown
    with a fixed y-range of 10M-60M.
    """
    # NOTE(review): this file defines graph_regression more than once; the
    # definition that appears last is the one callers actually get.
    series_order = ['Overall', 'All Correct', '<100M Correct', '100M+ Correct']
    # Per series: [contributing rows so far, running mean, mean after each row]
    results = dict((key, [0, 0, []]) for key in series_order)

    with open('combined.csv', 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            vals = line.split(',')
            true, pred, err = vals[0], vals[1], float(vals[2])

            correct = true == pred
            belongs = {
                'Overall': True,
                'All Correct': correct,
                '<100M Correct': correct and true != '1',
                '100M+ Correct': correct and true == '1',
            }

            for key in series_order:
                entry = results[key]
                if belongs[key]:
                    n, ravg = entry[0], entry[1]
                    entry[1] = ((n * ravg) + err) / float(n + 1)
                    entry[0] = n + 1
                # Only contributing rows advance the sample count above.
                # Counting skipped rows as well would make later updates
                # divide by the number of rows read instead of the number of
                # errors actually averaged.
                entry[2].append(entry[1])

    for key in series_order:
        scores = results[key][2]
        # Same text under Python 2 and Python 3.
        print('%s %s' % (key, avg_float_list(scores)))
        plt.plot(range(len(scores)), scores, label=key, linewidth=2)

    plt.legend()
    plt.suptitle('Regression Performance on Correct Classifications')
    plt.xlabel('Time Step')
    plt.ylabel('Mean Absolute Error')
    plt.ylim(10000000, 60000000)
    plt.show()
def graph_regression():
    """Plot the running mean regression error per classification outcome.

    Reads ``combined.csv``, where each row starts with the true class label,
    the predicted class label and a numeric regression error. Four running
    means of that error are tracked row by row:

    * ``Overall``        - every row,
    * ``All Correct``    - rows where true label == predicted label,
    * ``100M+ Correct``  - correct rows whose true label is ``'1'``,
    * ``<100M Correct``  - correct rows with any other true label.

    For every series the value after each row is recorded (a row that does
    not belong to a series repeats that series' current mean, which is 0
    until its first contributing row). Each series is summarised on stdout
    via ``avg_float_list`` and drawn with matplotlib; the figure is shown
    with a fixed y-range of 10M-60M.
    """
    # NOTE(review): this file defines graph_regression more than once; the
    # definition that appears last is the one callers actually get.
    series_order = ['Overall', 'All Correct', '<100M Correct', '100M+ Correct']
    # Per series: [contributing rows so far, running mean, mean after each row]
    results = dict((key, [0, 0, []]) for key in series_order)

    with open('combined.csv', 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            vals = line.split(',')
            true, pred, err = vals[0], vals[1], float(vals[2])

            correct = true == pred
            belongs = {
                'Overall': True,
                'All Correct': correct,
                '<100M Correct': correct and true != '1',
                '100M+ Correct': correct and true == '1',
            }

            for key in series_order:
                entry = results[key]
                if belongs[key]:
                    n, ravg = entry[0], entry[1]
                    entry[1] = ((n * ravg) + err) / float(n + 1)
                    entry[0] = n + 1
                # Only contributing rows advance the sample count above.
                # Counting skipped rows as well would make later updates
                # divide by the number of rows read instead of the number of
                # errors actually averaged.
                entry[2].append(entry[1])

    for key in series_order:
        scores = results[key][2]
        # Same text under Python 2 and Python 3.
        print('%s %s' % (key, avg_float_list(scores)))
        plt.plot(range(len(scores)), scores, label=key, linewidth=2)

    plt.legend()
    plt.suptitle('Regression Performance on Correct Classifications')
    plt.xlabel('Time Step')
    plt.ylabel('Mean Absolute Error')
    plt.ylim(10000000, 60000000)
    plt.show()