Example #1
0
def screens_budget_summer_lglglg():
    """
    Predict from log(screens), log(budget) and log(summer), with one set of
    coefficients per (budget, summer) segment, and write the predictions to
    'screens_budget_summer_lglglg-2.csv'.

    Rows are segmented on whether the raw budget and summer values are
    > 0 or == 0.  Segments with a budget use a basis that is quadratic in
    both log-screens and log-budget; segments without one use a basis that
    is quadratic in log-screens only.  The value returned by
    `freg.product` is floored at 0 and then exponentiated.
    """
    train, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = feat_index['number_of_screens']
    budget_ind = feat_index['production_budget']
    summer_ind = feat_index['summer_release']

    columns = [train.getcol(ind).todense()
               for ind in (screen_ind, budget_ind, summer_ind)]

    def safelog(value):
        # Non-positive inputs map to 0. instead of raising a domain error.
        return 0. if value <= 0. else math.log(value)

    with_budget_basis = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2,
                         lambda x: x[1], lambda x: x[1]**2]
    screens_only_basis = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2]

    # (segment predicate on [screens, budget, summer], basis for the segment),
    # listed in the order the segments are tried at prediction time.
    cases = [
        (lambda x: x[1] > 0. and x[2] > 0., with_budget_basis),
        (lambda x: x[1] > 0. and x[2] == 0., with_budget_basis),
        (lambda x: x[1] == 0. and x[2] > 0., screens_only_basis),
        (lambda x: x[1] == 0. and x[2] == 0., screens_only_basis),
    ]

    log_fns = [safelog] * 4
    # Build every segment's training array first, then fit the coefficients.
    arrays = [format_arr(list(columns), targets, log_fns, check)
              for check, _basis in cases]
    coeffs = [freg.coeffs(basis, arr)
              for (_check, basis), arr in zip(cases, arrays)]

    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=feat_index)
    preds = []
    for row in range(test.shape[0]):
        raw = [test[row, screen_ind], test[row, budget_ind],
               test[row, summer_ind]]
        logged = tuple(safelog(value) for value in raw)

        # Rows matching no segment keep an exponent of 0.
        exponent = 0
        for (check, basis), weights in zip(cases, coeffs):
            if check(raw):
                exponent = freg.product(logged, weights, basis)
                break
        if exponent < 0:
            exponent = 0
        preds.append(math.e ** exponent)

    util.write_predictions(preds, ids, 'screens_budget_summer_lglglg-2.csv')
Example #2
0
def proc_unigram_feats():
    """
    Print the words whose counts differ most between the MOVIE_TARGET
    lowest-target and MOVIE_TARGET highest-target movies.

    The low group's counts are scaled by (total high count / total low
    count) before being subtracted from the high group's counts.  Words
    listed in the file "english.stop" are excluded.  The WORD_TARGET most
    negative words are printed first, then a separator line, then the
    WORD_TARGET most positive words; both lists are in ascending order of
    score, one "word<TAB>score" per line.
    """
    mat, key, regy, _ = rs.extract_feats([rs.unigram_feats])
    word_of = dict((idx, word) for word, idx in key.items())
    num_movies, num_words = mat.get_shape()

    ranked = [(regy[row], row) for row in range(num_movies)]
    low_rows = [entry[1] for entry in heap.nsmallest(MOVIE_TARGET, ranked)]
    high_rows = [entry[1] for entry in heap.nlargest(MOVIE_TARGET, ranked)]

    # Float start value keeps the ratio below a true division.
    tot_min = sum((mat[row].sum() for row in low_rows), 0.)
    tot_max = sum((mat[row].sum() for row in high_rows), 0.)
    fix = tot_max/tot_min

    diffs = np.zeros((num_words))
    for row in low_rows:
        diffs += -1.*fix*mat[row]
    for row in high_rows:
        diffs += mat[row]

    with open("english.stop") as f:
        stop_words = {line.strip() for line in f}

    # `diffs` is indexed as a 2-D (1 x num_words) object from here on.
    scored = [(diffs[0, col], word_of[col]) for col in range(num_words)
              if word_of[col] not in stop_words]
    worst_words = sorted(heap.nsmallest(WORD_TARGET, scored))
    best_words = sorted(heap.nlargest(WORD_TARGET, scored))

    def show(pairs):
        for score, word in pairs:
            print(word + '\t' + str(score))

    show(worst_words)
    print('---------------------------------')
    show(best_words)
Example #3
0
def screens(basis_fns, fns, inv_fn, outfile):
    """
    Predict from the 'number_of_screens' feature alone and write the
    predictions to `outfile`.

    arguments:
        basis_fns   : basis functions handed to `freg.coeffs` and
                    `freg.product`
        fns         : transforms handed to `format_arr`; `fns[0]` is also
                    applied to each test row's screen count
        inv_fn      : applied to each (floored at 0) `freg.product` result
                    to produce the final prediction
        outfile     : path passed to `util.write_predictions`
    """
    train, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = feat_index['number_of_screens']
    screen_col = train.getcol(screen_ind).todense()
    coeffs = freg.coeffs(basis_fns, format_arr([screen_col], targets, fns))

    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=feat_index)

    def predict(row):
        value = freg.product((fns[0](test[row, screen_ind]),), coeffs,
                             basis_fns)
        # Negative results are clamped to 0 before inverting the transform.
        return inv_fn(0 if value < 0 else value)

    preds = [predict(row) for row in range(test.shape[0])]

    util.write_predictions(preds, ids, outfile)
Example #4
0
def loadData(params, withhold, ffs, trainfile="train.xml", testfile="testcases.xml"):
    """
    loads the movie data

    arguments:
        params      : dict with several keys:
            load        : loading mode; either: 'extract' to load from
                        `params['extractFile']`, 'split' to load from
                        `params['splitFile']`, or None to extract features and
                        save to `params['extractFile']` and/or
                        `params['splitFile']`.
            extractFile : file to load/save extracted features to/from,
                        depending on loading mode (None skips saving)
            splitFile   : file to load/save split data to/from,
                        depending on loading mode (None skips saving)
            splitMethod : passed through to `splitData`; only read when
                        `withhold` is non-zero and the mode is not 'split'
        withhold    : number of data points to withhold for cross-validation;
                    0 means the test set comes from `testfile` instead
        ffs         : list of feature functions
        trainfile   : path to training file (train.xml)
        testfile    : path to test cases file

    returns:
        tuple (X_train, y_train, train_ids, X_test, y_test, test_ids).
        When `withhold` is 0 (and the mode is not 'split'), `train_ids` is
        returned as an empty list.

    raises:
        ValueError  : if `params['load']` is not 'split', 'extract' or None
    """
    load_mode = params['load']

    # load data from `params['splitFile']`
    if load_mode == 'split':
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(params['splitFile'])
        print("loaded %d fds" % len(train_ids))
        print("withholding %d of %d fds" % (len(test_ids), len(train_ids)))
    else:
        # load data from scratch
        if load_mode is None:
            fds, targets, train_ids = regress.extract_feats_helper(ffs, trainfile)
            print("loaded %d fds" % len(fds))
            if params['extractFile'] is not None:
                pickle((fds, targets, train_ids), params['extractFile'])
        # load data from `params['extractFile']`, but split it anew
        elif load_mode == 'extract':
            fds, targets, train_ids = unpickle(params['extractFile'])
        else:
            # Fail here with a clear message rather than later on an
            # unbound `fds`.
            raise ValueError(
                "unknown params['load'] mode: %r "
                "(expected 'split', 'extract' or None)" % (load_mode,))

        # load the test data from the testcases file
        if withhold == 0:
            X_train, feat_dict = regress.make_design_mat(fds)
            y_train = np.array(targets)
            X_test, _, y_test, test_ids = regress.extract_feats(
                ffs, testfile, global_feat_dict=feat_dict)
            # NOTE(review): the training ids are discarded in this mode;
            # confirm callers do not need them.
            train_ids = []
        # withhold some of the training data into test data
        else:
            fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
                fds, targets, train_ids, withhold, params['splitMethod'])
            X_train, feat_dict = regress.make_design_mat(fds)
            X_test, _ = regress.make_design_mat(fdsTest, feat_dict)
            y_train = np.array(targets)
            y_test = np.array(targetsTest)

        # the freshly built split is saved for both None and 'extract' modes
        if params['splitFile'] is not None:
            pickle((X_train, y_train, train_ids, X_test, y_test, test_ids),
                   params['splitFile'])

    return X_train, y_train, train_ids, X_test, y_test, test_ids
Example #5
0
def corr_words(word='camera'):
    """
    Print the correlation coefficient between a word's per-movie unigram
    count and the regression targets.

    arguments:
        word    : unigram to look up in the feature key (default 'camera')

    The value printed is entry [0][1] of `np.corrcoef` applied to the
    word's count in every row and the targets returned by
    `rs.extract_feats`.
    """
    mat, key, regy, _ = rs.extract_feats([rs.unigram_feats])

    word_col = key[word]
    # Raw (un-normalised) count of `word` in each movie.
    movie_count = [mat[row, word_col] for row in range(mat.shape[0])]

    print(np.corrcoef(np.array(movie_count), regy)[0][1])
Example #6
0
def screens_budget_lglglg():
    """
    Predict from log(screens) and, where a budget is present, log(budget),
    and write the predictions to 'screens_budget_lglglg-2.csv'.

    Two coefficient sets are fitted: one on the array `format_arr` builds
    with the budget > 0 predicate (basis quadratic in both log-screens and
    log-budget), and one on screens alone with no predicate (basis
    quadratic in log-screens).  Test rows pick a set by whether their
    budget is > 0.  The value returned by `freg.product` is floored at 0
    and then exponentiated.
    """
    train, feat_index, targets, _ = rs.extract_feats([rs.metadata_feats])
    screen_ind = feat_index['number_of_screens']
    budget_ind = feat_index['production_budget']
    screen_col = train.getcol(screen_ind).todense()
    budget_col = train.getcol(budget_ind).todense()

    def log(value):
        return math.log(value)

    def has_budget(x):
        return x[1] > 0.

    log_fns = [log] * 3
    with_budget_arr = format_arr([screen_col, budget_col], targets, log_fns,
                                 has_budget)
    screens_only_arr = format_arr([screen_col], targets, [log] * 2)

    with_budget_basis = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2,
                         lambda x: x[1], lambda x: x[1]**2]
    screens_only_basis = [lambda x: 1, lambda x: x[0], lambda x: x[0]**2]

    with_budget_coeffs = freg.coeffs(with_budget_basis, with_budget_arr)
    screens_only_coeffs = freg.coeffs(screens_only_basis, screens_only_arr)

    test, _, _, ids = rs.extract_feats([rs.metadata_feats], 'testcases.xml',
                                       global_feat_dict=feat_index)
    preds = []
    for row in range(test.shape[0]):
        if test[row, budget_ind] > 0.:
            feats = (log_fns[0](test[row, screen_ind]),
                     log_fns[1](test[row, budget_ind]))
            weights, basis = with_budget_coeffs, with_budget_basis
        else:
            feats = (math.log(test[row, screen_ind]),)
            weights, basis = screens_only_coeffs, screens_only_basis

        exponent = freg.product(feats, weights, basis)
        # Negative results are clamped to 0 before exponentiating.
        preds.append(math.e ** (0 if exponent < 0 else exponent))

    util.write_predictions(preds, ids, 'screens_budget_lglglg-2.csv')
Example #7
0
def proc_metadata_feats():
    """
    Print simple diagnostics relating each metadata feature to the targets.

    For every feature in CONT_FEATS: the feature name and entry [0,1] of
    `np.corrcoef` between the feature column and the targets.

    For every feature in BOOL_FEATS: the targets are split into rows where
    the feature equals 1. and rows where it equals 0.; the group sizes,
    the difference of group means, the difference of group means of
    log(target), and element [0] of `stats.ks_2samp` on the two groups
    are printed.
    """
    mat, key, regy, _ = rs.extract_feats([rs.metadata_feats])
    num_movies = len(regy)
    divider = '-------------'

    for feat in CONT_FEATS:
        print(feat)
        column = mat.getcol(key[feat]).todense().transpose()
        print(np.corrcoef(column, regy)[0,1])
        print(divider)

    for feat in BOOL_FEATS:
        flags = mat.getcol(key[feat]).todense().transpose()

        # Rows whose flag is neither 1. nor 0. land in neither group.
        tvec, fvec = [], []
        for row in range(num_movies):
            flag = flags[0, row]
            if flag == 1.:
                tvec.append(regy[row])
            elif flag == 0.:
                fvec.append(regy[row])
        tlen, flen = len(tvec), len(fvec)

        print(feat + ' ' + str(tlen) + '/' + str(flen))

        mean_gap = (sum(tvec)/tlen) - (sum(fvec)/flen)
        print('mean diff ' + str(mean_gap))

        log_gap = ((sum([math.log(r) for r in tvec])/tlen) -
                   (sum([math.log(r) for r in fvec])/flen))
        print('log mean diff ' + str(log_gap))

        print(stats.ks_2samp(np.array(tvec), np.array(fvec))[0])
        print(divider)