Exemple #1
def traverse_movies_OLS():
	"""Evaluate a linear-regression revenue model one movie at a time.

	Walks the module-level ``data`` sequence in order. Each movie is
	vectorized with the ``DMAP`` state built from the movies that came before
	it. Once more than 100 movies have been seen, a fresh ``LinearRegression``
	is fitted on all earlier movies and used to predict the current movie's
	revenue. The absolute error is stored in ``ERRORS`` and the error relative
	to the true revenue (rounded to 4 decimals) in ``P_ERRORS``.

	Prints the average of the relative errors, of the absolute errors, and of
	the absolute errors from index 3200 onwards. Returns nothing.
	"""
	LBMAP = getLBMap()
	DMAP = createEmpty()

	P_ERRORS, ERRORS = [], []

	training_data, training_response = [], []

	for i, movie in enumerate(data):
		m_rev = movie['revenue']

		myvector = vectorizeMovie(movie, LBMAP, DMAP)

		if i > 100:
			model = LinearRegression()
			model.fit(training_data, training_response)
			# predict() works on a batch of samples: wrap the single vector
			# in a list and unwrap the single prediction.
			prediction = model.predict([myvector])[0]
			raw = math.fabs(prediction - m_rev)
			ERRORS.append(raw)
			# NOTE(review): a movie with revenue 0 raises ZeroDivisionError
			# here; confirm such rows are filtered out upstream.
			P_ERRORS.append(round(raw / m_rev, 4))

		# The current movie only becomes training material after it has been
		# predicted, so the model never sees the sample it is scored on.
		training_data.append(myvector)
		training_response.append(m_rev)

		DMAP = update(movie, DMAP)

	# Single-argument print(...) produces the same output on Python 2 and 3.
	print('all %s' % (avg_float_list(P_ERRORS),))
	print('all %s' % (avg_float_list(ERRORS),))
	print('all %s' % (avg_float_list(ERRORS[3200:]),))
Exemple #3
def vectorizeMovie(movie, LBMap, DMAP, Discrete=False, Sentiment=False):
	"""Turn one movie record into a flat feature list.

	The features are appended in this order:

	* the ``getVector`` expansion of ``month``, ``mpaa`` and ``genres``;
	* ``runtime`` (discretized against 90) and ``budget`` (against ``M50``);
	* for every star, the ``high`` and ``avg`` values stored for that star in
	  ``DMAP['stars']``, each discretized against ``M100``;
	* for ``directors``, ``writers`` and ``production``, the
	  ``avg_float_list`` of the members' ``avg`` values from ``DMAP``,
	  discretized against ``M100``;
	* when ``Sentiment`` is true, one flag derived from the tagline: 1 if the
	  TextBlob polarity is >= 0 or the tagline is empty, otherwise 0.

	``Discrete`` is forwarded unchanged to every ``discretize`` call.
	Returns the feature list.
	"""
	features = []

	for name in ('month', 'mpaa', 'genres'):
		features.extend(getVector(name, movie[name], LBMap))

	features.append(discretize(movie['runtime'], 90, Discrete))
	features.append(discretize(movie['budget'], M50, Discrete))

	# Two features per star, so the vector length depends on the star count.
	for star in movie['stars']:
		star_name = star['name']
		stats = DMAP['stars'][star_name]
		features.extend(
			discretize(stats[key], M100, Discrete) for key in ('high', 'avg')
		)

	# One averaged feature per crew group.
	for group in ('directors', 'writers', 'production'):
		averages = [DMAP[group][member['name']]['avg'] for member in movie[group]]
		features.append(discretize(avg_float_list(averages), M100, Discrete))

	if not Sentiment:
		return features

	tagline = movie['tagline']
	if not tagline:
		# A missing tagline is encoded the same as a non-negative one.
		features.append(1)
		return features

	blob = TextBlob(tagline.strip().encode('ascii', 'ignore'))
	features.append(1 if float(blob.sentiment.polarity) >= 0.0 else 0)
	return features
def graph_regression():
    """Plot the running mean regression error for four groups of predictions.

    Reads ``combined.csv`` from the working directory. Every non-blank line
    must hold at least three comma-separated fields: the true class label, the
    predicted class label and a numeric error. Each line updates the running
    mean error of every group it belongs to:

    * ``Overall``        - every line;
    * ``All Correct``    - lines whose predicted label equals the true label;
    * ``100M+ Correct``  - correct lines whose true label is ``'1'``;
    * ``<100M Correct``  - correct lines with any other true label.

    For each group the function prints its name with the ``avg_float_list`` of
    its running-mean history, and draws that history as a line on the current
    matplotlib figure. It then sets the title, axis labels and y-limits.
    The figure is neither shown nor saved here.
    """
    # key -> [number of samples, running mean error, history of running mean]
    results = {
        'Overall': [0, 0, []],
        'All Correct': [0, 0, []],
        '<100M Correct': [0, 0, []],
        '100M+ Correct': [0, 0, []],
    }

    with open('combined.csv', 'r') as f:

        for line in f:

            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            vals = line.split(',')
            true, pred, err = vals[0], vals[1], float(vals[2])

            all_c, c0_c, c1_c = False, False, False

            if true == pred:
                all_c = True

                # A correct prediction belongs to exactly one class bucket.
                if true == '1':
                    c1_c = True
                else:
                    c0_c = True

            for key in results:

                if ((key == 'Overall') or (key == 'All Correct' and all_c)
                        or (key == '<100M Correct' and c0_c)
                        or (key == '100M+ Correct' and c1_c)):

                    n, ravg = results[key][0], results[key][1]

                    # Incremental mean: fold the new error into the old mean.
                    new_avg = ((n * ravg) + err) / float(n + 1)

                    # Count each sample once and record one point per sample;
                    # a second increment would break the formula above.
                    results[key][0], results[key][1] = (n + 1), new_avg
                    results[key][2].append(new_avg)

    for key in ['Overall', 'All Correct', '<100M Correct', '100M+ Correct']:
        model, scores = key, results[key][2]
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print('%s %s' % (model, avg_float_list(scores)))
        plt.plot(range(len(scores)), scores, label=model, linewidth=2)

    plt.suptitle('Regression Performance on Correct Classifications')
    plt.xlabel('Time Step')
    plt.ylabel('Mean Absolute Error')
    plt.ylim(10000000, 60000000)
Exemple #6
