def loadData(params, withhold, ffs, trainfile="train.xml", testfile="testcases.xml"):
    """
    loads the movie data as train/test design matrices, targets and ids

    arguments:
        params : dict with several keys:
            load : loading mode; either:
                'extract' to load from `params['extractFile']`,
                'split' to load from `params['splitFile']`, or
                None to extract features and save to `params['extractFile']`
                and/or `params['splitFile']`.
            extractFile : file to load/save extracted features to/from,
                depending on loading mode (None skips the save)
            splitFile : file to load/save split data to/from, depending on
                loading mode (None skips the save)
            splitMethod : forwarded to `splitData`; only read when
                `withhold` is non-zero and the mode is not 'split'
        withhold : number of data points to withhold for cross-validation;
            0 means the test set is read from `testfile` instead
        ffs : list of feature functions
        trainfile : path to training file (train.xml)
        testfile : path to test cases file

    returns:
        the tuple (X_train, y_train, train_ids, X_test, y_test, test_ids)

    raises:
        ValueError : if `params['load']` is not 'split', 'extract' or None
    """
    mode = params['load']

    # Reject unknown modes up front; otherwise `fds` would never be bound and
    # the failure would surface later as an unrelated NameError.
    if mode not in ('split', 'extract', None):
        raise ValueError(
            "unknown params['load'] mode %r; expected 'split', 'extract' or None"
            % (mode,))

    # load data from `params['splitFile']`
    if mode == 'split':
        X_train, y_train, train_ids, X_test, y_test, test_ids = unpickle(params['splitFile'])
        print("loaded %d fds" % len(train_ids))
        print("withholding %d of %d fds" % (len(test_ids), len(train_ids)))
        # The split was just read from splitFile, so it is not written back.
        return X_train, y_train, train_ids, X_test, y_test, test_ids

    # load data from scratch
    if mode is None:
        fds, targets, train_ids = regress.extract_feats_helper(ffs, trainfile)
        print("loaded %d fds" % len(fds))
        if params['extractFile'] is not None:
            pickle((fds, targets, train_ids), params['extractFile'])
    # load data from `params['extractFile']`, but split it anew
    else:
        fds, targets, train_ids = unpickle(params['extractFile'])

    # load the test data from the testcases file
    if withhold == 0:
        X_train, feat_dict = regress.make_design_mat(fds)
        y_train = np.array(targets)
        # Reuse the training feature dictionary for the test features.
        X_test, _, y_test, test_ids = regress.extract_feats(
            ffs, testfile, global_feat_dict=feat_dict)
        # NOTE(review): the training ids are discarded in this branch, so
        # callers receive an empty list -- confirm this is intentional.
        train_ids = []
    # withhold some of the training data into test data
    else:
        fds, targets, train_ids, fdsTest, targetsTest, test_ids = splitData(
            fds, targets, train_ids, withhold, params['splitMethod'])
        X_train, feat_dict = regress.make_design_mat(fds)
        X_test, _ = regress.make_design_mat(fdsTest, feat_dict)
        y_train = np.array(targets)
        y_test = np.array(targetsTest)

    # Save the freshly built split so a later run can use the 'split' mode.
    if params['splitFile'] is not None:
        pickle((X_train, y_train, train_ids, X_test, y_test, test_ids),
               params['splitFile'])

    return X_train, y_train, train_ids, X_test, y_test, test_ids
def postproc(feat_dict):
    """Pass `feat_dict` to `rs.make_design_mat` and return its result as-is."""
    design = rs.make_design_mat(feat_dict)
    return design
assert False else: curr_inst = [line] in_instance = True elif end_tag in line: curr_inst.append(line) movies.append(util.MovieData(ET.fromstring("".join(curr_inst)))) curr_inst = [] in_instance = False elif in_instance: curr_inst.append(line) return movies '''movies = get_movies('train.xml') print regression_starter.metadata_feats(movies[0]) print '-------' print regression_starter.unigram_feats(movies[0])''' fds = [{'hi':1,'bye':0,'foo':3},{'hi':1,'hello':1,'foo':2,'bar':0}] '''movies = get_movies('train.xml') fd1 = regression_starter.metadata_feats(movies[0]) fd2 = regression_starter.metadata_feats(movies[1]) print fd1 print fd2 print '------' fds = [fd1,fd2]''' X, dict = regression_starter.make_design_mat(fds) print X print '------' print dict