def load_and_preprocess_data(debug=False):
    """Load the dataset and build vectorized train/dev/test splits.

    Each example is a ``(description, points)`` pair, with ``points``
    converted to ``float``. The examples are shuffled with a fixed NumPy
    seed (2017) and split 70% / 20% / 10% into train / dev / test.

    Args:
        debug: When true, only the first 200 rows (or fewer, if the dataset
            is smaller) are used.

    Returns:
        A 7-tuple ``(helper, train_final_data, dev_final_data,
        test_final_data, train, dev, test)`` where ``helper`` is the result
        of ``ModelHelper.build`` on all examples, each ``*_final_data`` is a
        list pairing the output of ``helper.vectorize`` with the matching
        target, and ``train`` / ``dev`` / ``test`` are the raw
        ``(description, points)`` splits.
    """
    description, points = load_dataset()

    # Clamp the debug subset so a dataset with fewer than 200 rows does not
    # fail on an out-of-range index.
    row_count = min(200, len(description)) if debug else len(description)
    allData = [(description[i], float(points[i])) for i in range(row_count)]

    # Fixed seed so the shuffle, and therefore the split, is reproducible.
    # NOTE: this reseeds NumPy's global random state.
    np.random.seed(2017)
    np.random.shuffle(allData)

    train_end = len(allData) * 7 // 10
    dev_end = len(allData) * 9 // 10
    train = allData[:train_end]
    dev = allData[train_end:dev_end]
    test = allData[dev_end:]

    # The helper is built from every example, not just the training split.
    # allData looks like [("the drink smells like this", 85.0), ...].
    helper = ModelHelper.build(allData)

    def vectorize_split(split):
        # Unzip with comprehensions instead of ``zip(*split)[0]``: that form
        # needs ``zip`` to return a list and raises IndexError when the
        # split is empty.
        texts = tuple(text for text, _ in split)
        targets = [target for _, target in split]
        return list(zip(helper.vectorize(texts), targets))

    train_final_data = vectorize_split(train)
    dev_final_data = vectorize_split(dev)
    test_final_data = vectorize_split(test)

    return (helper, train_final_data, dev_final_data, test_final_data,
            train, dev, test)
def load_and_preprocess_data(debug=False):
    """Load the dataset and build vectorized train/dev/test splits.

    Each example is a ``(description, country_id)`` pair, where
    ``country_id`` is an integer assigned to each distinct country value.
    The examples are shuffled with a fixed NumPy seed (2017) and split
    70% / 20% / 10% into train / dev / test.

    Args:
        debug: When true, only the first 200 rows (or fewer, if the dataset
            is smaller) are used.

    Returns:
        An 8-tuple ``(helper, train_final_data, dev_final_data,
        test_final_data, train, dev, test, country_dict_key_int)`` where
        ``helper`` is the result of ``ModelHelper.build`` on all examples,
        each ``*_final_data`` is a list pairing the output of
        ``helper.vectorize`` with the matching country id, ``train`` /
        ``dev`` / ``test`` are the raw ``(description, country_id)`` splits,
        and ``country_dict_key_int`` maps each country id back to its
        original country value.
    """
    description, country = load_dataset()

    # Two-way mapping between country values and integer class ids.
    # NOTE(review): ids follow set iteration order, which is not guaranteed
    # to be the same across interpreter runs -- confirm whether ids need to
    # be stable (e.g. for saved models) before relying on them.
    country_list = list(set(country))
    country_dict_key_int = {}
    country_dict_key_category = {}
    for index, name in enumerate(country_list):
        country_dict_key_int[index] = name
        country_dict_key_category[name] = index

    # Clamp the debug subset so a dataset with fewer than 200 rows does not
    # fail on an out-of-range index.
    row_count = min(200, len(description)) if debug else len(description)
    allData = [(description[i], country_dict_key_category[country[i]])
               for i in range(row_count)]

    # Fixed seed so the shuffle, and therefore the split, is reproducible.
    # NOTE: this reseeds NumPy's global random state.
    np.random.seed(2017)
    np.random.shuffle(allData)

    train_end = len(allData) * 7 // 10
    dev_end = len(allData) * 9 // 10
    train = allData[:train_end]
    dev = allData[train_end:dev_end]
    test = allData[dev_end:]

    # The helper is built from every example, not just the training split.
    helper = ModelHelper.build(allData)

    def vectorize_split(split):
        # Unzip with comprehensions instead of ``zip(*split)[0]``: that form
        # needs ``zip`` to return a list and raises IndexError when the
        # split is empty.
        texts = tuple(text for text, _ in split)
        labels = [label for _, label in split]
        return list(zip(helper.vectorize(texts), labels))

    # Each split is vectorized from its own examples. Previously the dev and
    # test outputs were built from ``train``, so evaluation data was just a
    # copy of the training data.
    train_final_data = vectorize_split(train)
    dev_final_data = vectorize_split(dev)
    test_final_data = vectorize_split(test)

    return (helper, train_final_data, dev_final_data, test_final_data,
            train, dev, test, country_dict_key_int)