Example no. 1
0
def train_model(item_type):
    """Train a DNN price-classification model for one item type.

    Reads ``data/<slug>.csv`` (slug = lower-cased, space->underscore item
    type), buckets prices into ranges, splits the rows 80/20 into
    train/test, fits a ``DNNClassifier`` under ``models/<slug>``, and
    prints the top-5 predicted price ranges for 10 sample test rows.
    No-op if the model directory already exists.

    Args:
        item_type: Human-readable item type name, e.g. "Body Armour".

    Side effects: reads a CSV, writes model checkpoints to disk, prints
    progress to stdout.
    """
    slug = item_type.lower().replace(" ", "_")
    model_dir = "models/" + slug
    if os.path.exists(model_dir):
        # Model already trained for this item type -- nothing to do.
        return
    print("==> Training model for '%s'" % item_type)

    csv_filename = "data/" + slug + ".csv"
    df_all = pd.read_csv(csv_filename, skipinitialspace=True, encoding='utf-8')
    df_all.fillna(0.0, inplace=True)

    # Convert the price to a bucket representing a range
    df_all['price_chaos'] = (df_all['price_chaos'].apply(util.price_bucket)).astype(int)

    # Hash the item type to a number
    df_all['itemType'] = (df_all['itemType'].apply(lambda x: util.type_hash[x])).astype(float)

    LABEL_COLUMN = util.LABEL_COLUMN

    # Split the data 80/20 training/test.  Floor division keeps n an int
    # under Python 3 (the original '/' would produce a float and break
    # head()/tail()).
    percent_test = 20
    n = (len(df_all) * percent_test) // 100
    df_train = df_all.head(len(df_all) - n)
    df_test = df_all.tail(n)

    # .loc / .values replace the removed pandas .ix / .as_matrix() APIs.
    train_x = df_train.loc[:, df_train.columns != LABEL_COLUMN].values.astype(float)
    train_y = df_train[[LABEL_COLUMN]].values
    test_x = df_test.loc[:, df_test.columns != LABEL_COLUMN].values.astype(float)
    test_y = df_test[[LABEL_COLUMN]].values

    deep_columns = tf.contrib.learn.infer_real_valued_columns_from_input(train_x)
    hidden_units = util.get_hidden_units(len(df_train.columns) - 1)
    model = DNNClassifier(model_dir=model_dir, feature_columns=deep_columns,
                          hidden_units=hidden_units,
                          n_classes=len(util.bins), enable_centered_bias=True)

    # Train in 500-step sessions, evaluating on the test split after each.
    # Floor division again so range() receives an int.
    steps = len(df_train) // 75
    sessions = (steps // 500) + 2
    for _ in range(sessions):
        model.fit(train_x, train_y, steps=500, batch_size=5000)
        # batch_size is the test ROW count; the original df_test.size was
        # rows*columns, which over-stated the batch.
        model.evaluate(test_x, test_y, steps=1, batch_size=len(df_test))

    # Print some predictions from the test data
    sample = df_test.sample(10)
    probabilities = model.predict_proba(
        sample.loc[:, sample.columns != LABEL_COLUMN].values.astype(float),
        batch_size=10)

    price_map = []
    for row in probabilities:
        # take the top 5 most likely price ranges
        top_five = row.argsort()[-5:][::-1]
        price_map.append({util.get_bin_label(p): float(round(100 * row[p], 1))
                          for p in top_five})

    for r in price_map:
        print(r)
Example no. 2
0
                        type=str)
    parse = parser.parse_args()
    TRAIN_DATASET = parse.train
    TEST_DATASET = parse.test
    OUTPUT_PATH = parse.output
    np.random.seed(19260817)

    train_set = pandas.read_csv(TRAIN_DATASET)
    test_set = pandas.read_csv(TEST_DATASET)
    encoder = LabelEncoder().fit(train_set["species"])
    train = train_set.drop(["species", "id"], axis=1).values
    label = encoder.transform(train_set["species"])
    test = test_set.drop(["id"], axis=1).values
    scaler = StandardScaler().fit(train)
    train = scaler.transform(train)
    scaler = StandardScaler().fit(test)
    test = scaler.transform(test)

    feature_columns = [real_valued_column("", dimension=192)]
    classifier = DNNClassifier(feature_columns=feature_columns,
                               n_classes=99,
                               hidden_units=[1024, 512, 256],
                               optimizer=tf.train.AdamOptimizer)
    classifier.fit(x=train, y=label, steps=1000)
    output = classifier.predict(test)
    output_prob = classifier.predict_proba(test)
    test_id = test_set.pop("id")
    result = pandas.DataFrame(output_prob,
                              index=test_id,
                              columns=encoder.classes_)
    result.to_csv(OUTPUT_PATH)