Beispiel #1
0
def load_and_preprocess_data(debug=False):
    """Load the dataset, shuffle it, and build vectorized train/dev/test splits.

    Each example is a ``(description, points)`` pair with ``points`` cast to
    ``float``. The examples are shuffled with a fixed NumPy seed (2017) and
    split 70% / 20% / 10% into train / dev / test.

    Args:
        debug: When true, only the first 200 examples (or fewer, if the
            dataset is smaller) are used.

    Returns:
        A 7-tuple ``(helper, train_final_data, dev_final_data,
        test_final_data, train, dev, test)`` where ``helper`` is the object
        returned by ``ModelHelper.build``, each ``*_final_data`` is a list of
        ``(vectorized_description, points)`` pairs, and ``train`` / ``dev`` /
        ``test`` are the raw ``(description, points)`` splits.
    """
    description, points = load_dataset()

    # Cap the debug sample at the dataset size so a small dataset does not
    # raise IndexError.
    total = len(description)
    n_examples = min(200, total) if debug else total
    allData = [(description[i], float(points[i])) for i in range(n_examples)]

    # Fixed seed keeps the shuffle (and therefore the splits) reproducible.
    np.random.seed(2017)
    np.random.shuffle(allData)

    train_end = len(allData) * 7 // 10
    dev_end = len(allData) * 9 // 10
    train = allData[:train_end]
    dev = allData[train_end:dev_end]
    test = allData[dev_end:]

    # NOTE(review): the helper is built from every split, not only the
    # training data -- confirm this is intended.
    helper = ModelHelper.build(allData)

    # Separate texts and targets with comprehensions rather than
    # ``zip(*split)[i]``: that form fails on an empty split and on Python 3,
    # where ``zip`` returns an iterator.
    train_texts = [text for text, _ in train]
    dev_texts = [text for text, _ in dev]
    test_texts = [text for text, _ in test]

    train_targets = [target for _, target in train]
    dev_targets = [target for _, target in dev]
    test_targets = [target for _, target in test]

    # Vectorize the descriptions of each split.
    train_data = helper.vectorize(train_texts)
    dev_data = helper.vectorize(dev_texts)
    test_data = helper.vectorize(test_texts)

    # Pair every vectorized description with its target; ``list`` keeps the
    # result a concrete list on Python 3 as well.
    train_final_data = list(zip(train_data, train_targets))
    dev_final_data = list(zip(dev_data, dev_targets))
    test_final_data = list(zip(test_data, test_targets))

    return helper, train_final_data, dev_final_data, test_final_data, train, dev, test
Beispiel #2
0
def load_and_preprocess_data(debug=False):
    """Load the dataset, shuffle it, and build vectorized train/dev/test splits.

    Each example is a ``(description, country_id)`` pair, where ``country_id``
    is an integer assigned to each distinct country value. The examples are
    shuffled with a fixed NumPy seed (2017) and split 70% / 20% / 10% into
    train / dev / test.

    Args:
        debug: When true, only the first 200 examples (or fewer, if the
            dataset is smaller) are used.

    Returns:
        An 8-tuple ``(helper, train_final_data, dev_final_data,
        test_final_data, train, dev, test, country_dict_key_int)`` where
        ``helper`` is the object returned by ``ModelHelper.build``, each
        ``*_final_data`` is a list of ``(vectorized_description, country_id)``
        pairs, ``train`` / ``dev`` / ``test`` are the raw
        ``(description, country_id)`` splits, and ``country_dict_key_int``
        maps each integer id back to its country value.
    """
    description, country = load_dataset()

    # Build the id <-> country lookup tables.
    # NOTE(review): ids follow set iteration order, which is not guaranteed
    # to be stable across interpreter runs -- confirm callers do not persist
    # these ids between runs.
    country_list = list(set(country))
    country_dict_key_int = {}
    country_dict_key_category = {}
    for idx, name in enumerate(country_list):
        country_dict_key_int[idx] = name
        country_dict_key_category[name] = idx

    # Cap the debug sample at the dataset size so a small dataset does not
    # raise IndexError.
    total = len(description)
    n_examples = min(200, total) if debug else total
    allData = [(description[i], country_dict_key_category[country[i]])
               for i in range(n_examples)]

    # Fixed seed keeps the shuffle (and therefore the splits) reproducible.
    np.random.seed(2017)
    np.random.shuffle(allData)

    train_end = len(allData) * 7 // 10
    dev_end = len(allData) * 9 // 10
    train = allData[:train_end]
    dev = allData[train_end:dev_end]
    test = allData[dev_end:]

    # NOTE(review): the helper is built from every split, not only the
    # training data -- confirm this is intended.
    helper = ModelHelper.build(allData)

    # Separate texts and labels with comprehensions rather than
    # ``zip(*split)[i]``: that form fails on an empty split and on Python 3,
    # where ``zip`` returns an iterator.
    train_texts = [text for text, _ in train]
    dev_texts = [text for text, _ in dev]
    test_texts = [text for text, _ in test]

    train_labels = [label for _, label in train]
    dev_labels = [label for _, label in dev]
    test_labels = [label for _, label in test]

    # Vectorize each split from its OWN examples. Previously the dev and test
    # sets were built from ``train``, so evaluation ran on training data.
    train_data = helper.vectorize(train_texts)
    dev_data = helper.vectorize(dev_texts)
    test_data = helper.vectorize(test_texts)

    # Pair every vectorized description with its label; ``list`` keeps the
    # result a concrete list on Python 3 as well.
    train_final_data = list(zip(train_data, train_labels))
    dev_final_data = list(zip(dev_data, dev_labels))
    test_final_data = list(zip(test_data, test_labels))

    return helper, train_final_data, dev_final_data, test_final_data, train, dev, test, country_dict_key_int