def screens_budget_summer_lglglg():
    """Fit four log-space models split on budget/summer and write predictions.

    Training rows are handed to `format_arr` four times, each with a
    different predicate on x[1] (budget) and x[2] (summer flag):
    budget>0 & summer>0, budget>0 & summer==0, budget==0 & summer>0 and
    budget==0 & summer==0.  The two "budget" groups use a basis of
    1, x0, x0^2, x1, x1^2; the two "no budget" groups use 1, x0, x0^2.

    Each test row is routed to the first group whose predicate matches its
    raw (un-logged) features; a row matching none keeps a log-prediction of
    0.  Negative log-predictions are clamped to 0 before exponentiating.
    Results go to 'screens_budget_summer_lglglg-2.csv'.

    NOTE(review): how `format_arr` applies the predicate/transforms and what
    `freg.coeffs` fits are not visible here -- presumably a least-squares
    fit over the filtered, transformed rows; confirm against those helpers.
    """
    train_mat, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_col = feat_index['number_of_screens']
    budget_col = feat_index['production_budget']
    summer_col = feat_index['summer_release']
    screens_dense = train_mat.getcol(screen_col).todense()
    budget_dense = train_mat.getcol(budget_col).todense()
    summer_dense = train_mat.getcol(summer_col).todense()

    def safelog(value):
        # Non-positive inputs map to 0. instead of raising in math.log.
        return 0. if value <= 0. else math.log(value)

    def has_budget_and_summer(x):
        return x[1] > 0. and x[2] > 0.

    def has_budget_only(x):
        return x[1] > 0. and x[2] == 0.

    def has_summer_only(x):
        return x[1] == 0. and x[2] > 0.

    def has_neither(x):
        return x[1] == 0. and x[2] == 0.

    # One transform per entry handed to format_arr (three features + one more).
    transforms = [safelog] * 4
    checks = (has_budget_and_summer, has_budget_only,
              has_summer_only, has_neither)

    # All four training arrays are built before any coefficients are fitted.
    train_arrs = [
        format_arr([screens_dense, budget_dense, summer_dense],
                   targets, transforms, check)
        for check in checks
    ]

    with_budget_basis = [lambda x: 1,
                         lambda x: x[0], lambda x: x[0]**2,
                         lambda x: x[1], lambda x: x[1]**2]
    without_budget_basis = [lambda x: 1,
                            lambda x: x[0], lambda x: x[0]**2]
    bases = (with_budget_basis, with_budget_basis,
             without_budget_basis, without_budget_basis)

    coeff_sets = [freg.coeffs(basis, arr)
                  for basis, arr in zip(bases, train_arrs)]
    models = list(zip(checks, coeff_sets, bases))

    test_mat, _, _, ids = rs.extract_feats([rs.metadata_feats],
                                           'testcases.xml',
                                           global_feat_dict=feat_index)
    preds = []
    for row in range(test_mat.shape[0]):
        raw = [test_mat[row, screen_col],
               test_mat[row, budget_col],
               test_mat[row, summer_col]]
        logged = tuple([safelog(value) for value in raw])

        log_pred = 0
        # First matching group wins, in the same order as the training split.
        for check, coeffs, basis in models:
            if check(raw):
                log_pred = freg.product(logged, coeffs, basis)
                break

        if log_pred < 0:
            log_pred = 0
        preds.append(math.e**log_pred)

    util.write_predictions(preds, ids, 'screens_budget_summer_lglglg-2.csv')
def proc_unigram_feats():
    """Print the unigrams that best separate the lowest and highest targets.

    Takes the MOVIE_TARGET rows with the smallest and the largest target
    value, rescales the low group's word counts by the ratio of the two
    groups' total counts, and forms per-word differences (high minus scaled
    low).  Words listed in the file "english.stop" are dropped.  The
    WORD_TARGET smallest differences are printed first, then a separator
    line, then the WORD_TARGET largest; both lists are printed in ascending
    order as "word<TAB>score".
    """
    mat, key, regy, _ = rs.extract_feats([rs.unigram_feats])
    word_of = {col: word for word, col in key.items()}
    num_movies, num_words = mat.get_shape()

    ranked = [(regy[row], row) for row in range(num_movies)]
    bottom = heap.nsmallest(MOVIE_TARGET, ranked)
    top = heap.nlargest(MOVIE_TARGET, ranked)

    def total_count(selection):
        # Sum of every entry in the selected rows of the feature matrix.
        running = 0.
        for _target, row in selection:
            running += mat[row].sum()
        return running

    bottom_total = total_count(bottom)
    top_total = total_count(top)
    scale = top_total/bottom_total

    diffs = np.zeros((num_words))
    # The augmented adds are kept as-is: `diffs` is indexed two-dimensionally
    # below, so the result type of adding a matrix row matters.
    for _target, row in bottom:
        diffs += -1.*scale*mat[row]
    for _target, row in top:
        diffs += mat[row]

    with open("english.stop") as stop_file:
        stop_words = {line.strip() for line in stop_file}

    scored = []
    for col in range(num_words):
        word = word_of[col]
        if word not in stop_words:
            scored.append((diffs[0, col], word))

    worst_words = sorted(heap.nsmallest(WORD_TARGET, scored))
    best_words = sorted(heap.nlargest(WORD_TARGET, scored))

    def show(entries):
        for score, word in entries:
            print(word + '\t' + str(score))

    show(worst_words)
    print('---------------------------------')
    show(best_words)
def screens(basis_fns, fns, inv_fn, outfile):
    """Fit a model on the 'number_of_screens' feature and write predictions.

    arguments:
        basis_fns : basis functions passed to `freg.coeffs` / `freg.product`
        fns       : transforms passed to `format_arr`; `fns[0]` is also
                    applied to each test row's screen count
        inv_fn    : applied to each (clamped) prediction before it is stored
        outfile   : path handed to `util.write_predictions`

    Predictions below zero are replaced by 0 before `inv_fn` is applied.

    NOTE(review): presumably `fns` holds one transform for the feature and
    one for the target, with `inv_fn` undoing the latter -- confirm against
    `format_arr`.
    """
    train_mat, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_col = feat_index['number_of_screens']
    screen_values = train_mat.getcol(screen_col).todense()
    coeffs = freg.coeffs(basis_fns, format_arr([screen_values], targets, fns))

    test_mat, _, _, ids = rs.extract_feats([rs.metadata_feats],
                                           'testcases.xml',
                                           global_feat_dict=feat_index)

    def predict(row):
        # Single-feature input: the transformed screen count for this row.
        features = (fns[0](test_mat[row, screen_col]),)
        estimate = freg.product(features, coeffs, basis_fns)
        return inv_fn(0 if estimate < 0 else estimate)

    preds = [predict(row) for row in range(test_mat.shape[0])]
    util.write_predictions(preds, ids, outfile)
def loadData(params, withhold, ffs, trainfile="train.xml", testfile="testcases.xml"):
    """
    loads the movie data

    arguments:
        params : dict with several keys:
            load : loading mode; either:
                'extract' to load from `params['extractFile']`,
                'split' to load from `params['splitFile']`, or
                None to extract features and save to
                `params['extractFile']` and/or `params['splitFile']`.
            extractFile : file to load/save extracted features to/from,
                depending on loading mode (None disables saving)
            splitFile : file to load/save split data to/from, depending on
                loading mode (None disables saving)
            splitMethod : forwarded to `splitData`; only read when
                `withhold` is non-zero
        withhold : number of data points to withhold for cross-validation;
            0 means the test set is read from `testfile` instead
        ffs : list of feature functions
        trainfile : path to training file (train.xml)
        testfile : path to test cases file

    returns:
        the tuple (X_train, y_train, train_ids, X_test, y_test, test_ids).
        When `withhold` is 0 (and the data is not loaded from the split
        file) `train_ids` is returned as an empty list.

    raises:
        ValueError : if `params['load']` is not 'split', 'extract' or None
    """
    mode = params['load']
    # Reject unknown modes up front; otherwise the code below would fail
    # later with an unbound `fds`.
    if mode not in ('split', 'extract', None):
        raise ValueError(
            "params['load'] must be 'split', 'extract' or None, got %r" % (mode,))

    # NOTE(review): `unpickle` presumably wraps pickle loading -- only point
    # extractFile/splitFile at trusted cache files.

    # load data from `params['splitFile']`
    if mode == 'split':
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(params['splitFile'])
        print("loaded %d fds" % len(train_ids))
        print("withholding %d of %d fds" % (len(test_ids), len(train_ids)))
        return X_train, y_train, train_ids, X_test, y_test, test_ids

    if mode is None:
        # load data from scratch
        fds, targets, train_ids = regress.extract_feats_helper(ffs, trainfile)
        print("loaded %d fds" % len(fds))
        if params['extractFile'] is not None:
            pickle((fds, targets, train_ids), params['extractFile'])
    else:
        # load data from `params['extractFile']`, but split it anew
        fds, targets, train_ids = unpickle(params['extractFile'])

    if withhold == 0:
        # load the test data from the testcases file
        X_train, feat_dict = regress.make_design_mat(fds)
        y_train = np.array(targets)
        X_test, _, y_test, test_ids = regress.extract_feats(
            ffs, testfile, global_feat_dict=feat_dict)
        train_ids = []
    else:
        # withhold some of the training data into test data
        fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
            fds, targets, train_ids, withhold, params['splitMethod'])
        X_train, feat_dict = regress.make_design_mat(fds)
        X_test, _ = regress.make_design_mat(fdsTest, feat_dict)
        y_train = np.array(targets)
        y_test = np.array(targetsTest)

    # NOTE(review): assumed to run for both the withhold==0 and withhold>0
    # paths (but not when loading from the split file) -- confirm nesting.
    if params['splitFile'] is not None:
        pickle((X_train, y_train, train_ids, X_test, y_test, test_ids),
               params['splitFile'])

    return X_train, y_train, train_ids, X_test, y_test, test_ids
def corr_words(word='camera'):
    """Print the correlation between one unigram's per-row count and the target.

    arguments:
        word : unigram whose feature column is correlated with the target;
            defaults to 'camera'.  Must be a key of the feature dictionary
            returned by `rs.extract_feats`, otherwise a KeyError is raised.

    The printed value is the off-diagonal entry of `np.corrcoef` for the
    count vector and the target vector.
    """
    mat, key, regy, _ = rs.extract_feats([rs.unigram_feats])
    word_col = key[word]  # looked up once rather than per row

    # Raw counts are used; a length-normalised variant
    # (avg_count / word_count * count) was tried and left disabled.
    counts = [mat[row, word_col] for row in range(mat.shape[0])]
    print(np.corrcoef(np.array(counts), regy)[0][1])
def screens_budget_lglglg():
    """Fit log-space models on screens (and budget, when known); write predictions.

    Two coefficient sets are fitted:
      * a "budget" model over screens and budget, built with a predicate
        x[1] > 0. and the basis 1, x0, x0^2, x1, x1^2;
      * a "no budget" model over screens only, built without a predicate,
        with the basis 1, x0, x0^2.

    Test rows with a positive budget use the first model, all others the
    second.  Negative log-predictions are clamped to 0 before being
    exponentiated, and results go to 'screens_budget_lglglg-2.csv'.

    Every transform is a plain `math.log`, so a non-positive screen count
    (or budget, on the budget path) raises ValueError.
    """
    def natural_log(value):
        return math.log(value)

    train_mat, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_col = feat_index['number_of_screens']
    budget_col = feat_index['production_budget']
    screens_dense = train_mat.getcol(screen_col).todense()
    budget_dense = train_mat.getcol(budget_col).todense()

    budget_fns = [natural_log, natural_log, natural_log]

    def budget_check(x):
        return x[1] > 0.

    budget_arr = format_arr([screens_dense, budget_dense], targets,
                            budget_fns, budget_check)
    no_budget_arr = format_arr([screens_dense], targets,
                               [natural_log, natural_log])

    budget_basis_fns = [lambda x: 1,
                        lambda x: x[0], lambda x: x[0]**2,
                        lambda x: x[1], lambda x: x[1]**2]
    no_budget_basis_fns = [lambda x: 1,
                           lambda x: x[0], lambda x: x[0]**2]
    budget_coeffs = freg.coeffs(budget_basis_fns, budget_arr)
    no_budget_coeffs = freg.coeffs(no_budget_basis_fns, no_budget_arr)

    test_mat, _, _, ids = rs.extract_feats([rs.metadata_feats],
                                           'testcases.xml',
                                           global_feat_dict=feat_index)
    preds = []
    for row in range(test_mat.shape[0]):
        if test_mat[row, budget_col] > 0.:
            features = (budget_fns[0](test_mat[row, screen_col]),
                        budget_fns[1](test_mat[row, budget_col]))
            log_pred = freg.product(features, budget_coeffs, budget_basis_fns)
        else:
            features = (math.log(test_mat[row, screen_col]),)
            log_pred = freg.product(features, no_budget_coeffs,
                                    no_budget_basis_fns)

        if log_pred < 0:
            log_pred = 0
        preds.append(math.e**log_pred)

    util.write_predictions(preds, ids, 'screens_budget_lglglg-2.csv')
def proc_metadata_feats():
    """Print simple association statistics between metadata features and the target.

    For every feature in CONT_FEATS: the feature name and the off-diagonal
    entry of `np.corrcoef` between the feature column and the target.

    For every feature in BOOL_FEATS the target values are split into rows
    where the feature equals 1. and rows where it equals 0. (any other value
    is ignored), and the following are printed:
      * "<feature> <n_true>/<n_false>"
      * the difference of the group means (true minus false)
      * the difference of the group means of log(target)
      * element [0] of `stats.ks_2samp` on the two groups

    A boolean feature for which either group is empty is reported and
    skipped instead of aborting the whole report with a division by zero.

    raises:
        ValueError : from `math.log` if a target value in either group is
            not positive
    """
    mat, key, regy, _ = rs.extract_feats([rs.metadata_feats])

    for feat in CONT_FEATS:
        column = mat.getcol(key[feat]).todense().transpose()
        print(feat)
        print(np.corrcoef(column, regy)[0,1])
        print('-------------')

    for feat in BOOL_FEATS:
        feat_arr = mat.getcol(key[feat]).todense().transpose()
        tvec = [regy[i] for i in range(len(regy)) if feat_arr[0,i] == 1.]
        fvec = [regy[i] for i in range(len(regy)) if feat_arr[0,i] == 0.]
        tlen = len(tvec)
        flen = len(fvec)
        print(feat + ' ' + str(tlen) + '/' + str(flen))

        # Means and the two-sample test are undefined for an empty group.
        if tlen == 0 or flen == 0:
            print('skipped (empty group)')
            print('-------------')
            continue

        mean_diff = (sum(tvec)/tlen) - (sum(fvec)/flen)
        log_mean_diff = ((sum([math.log(r) for r in tvec])/tlen)
                         - (sum([math.log(r) for r in fvec])/flen))
        print('mean diff ' + str(mean_diff))
        print('log mean diff ' + str(log_mean_diff))
        print(stats.ks_2samp(np.array(tvec),np.array(fvec))[0])
        print('-------------')