Beispiel #1
0
def main():
    """Cross-validate an SGD classifier (logistic loss) on a training file.

    Command line: ``<script> file_train model_file``.  If the number of
    arguments is wrong, a usage line is written to stderr and the process
    exits with status 1.

    ``processData`` is called twice on the training file: first without
    feature indexes to obtain them, then with them to obtain the features,
    targets and item ids.  The value returned by ``utils.xvalidation`` for
    the (unfitted) classifier is printed to stdout.

    Fitting and persisting the model is currently disabled (see the
    commented-out lines), so ``model_file`` is required but never written.
    """
    if len(sys.argv) != 3:
        # Usage errors go to stderr so they never mix with the result output.
        sys.stderr.write("Usage " + sys.argv[0] + " file_train model_file\n")
        sys.exit(1)

    train_filename = sys.argv[1]
    # Only needed by the disabled joblib.dump(clf, ...) step below.
    model_filename = sys.argv[2]

    # First pass yields the feature indexes, second pass uses them.
    featureIndexes = processData(train_filename)
    trainFeatures, trainTargets, trainItemIds = processData(train_filename, featureIndexes)

    # Optional caching of the prepared data between runs (disabled):
    # joblib.dump((trainFeatures, trainTargets, trainItemIds), os.path.join(dataFolder, "train_data.pkl"))
    # trainFeatures, trainTargets, trainItemIds = joblib.load(os.path.join(dataFolder, "train_data.pkl"))

    trainTargets = np.asarray(trainTargets)

    logging.info("Feature preparation done, fitting model...")
    # NOTE(review): loss="log" and class_weight="auto" are spellings from
    # older scikit-learn releases; newer releases use "log_loss" and
    # "balanced".  Left unchanged here -- confirm against the pinned version.
    clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")

    # Final training and model dump (disabled):
    # clf.fit(trainFeatures, trainTargets)
    # joblib.dump(clf, model_filename)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(utils.xvalidation(trainFeatures, trainTargets, clf))
Beispiel #2
0
    return c_features, y#, ids, [vectorizer]


if __name__ == '__main__':
    # Script entry point: run utils.xvalidation with a fresh MultinomialNB for
    # each index 0..9 and print, per index, the label from category_dict_r,
    # that index's share of the total first-dimension size, and the
    # cross-validation result.
    if len(sys.argv) != 3:
        # Usage errors go to stderr so they never mix with the result output.
        sys.stderr.write("Usage " + sys.argv[0] + " file_train model_file\n")
        sys.exit(1)

    train_filename = sys.argv[1]
    # Only needed by the disabled joblib.dump(...) step at the bottom.
    model_filename = sys.argv[2]

    # X and y are indexed by 0..9 below -- presumably one feature matrix and
    # one target vector per category; confirm against read_train_nb.
    X, y = read_train_nb(train_filename)
    xv = []
    sz = []
    for i in range(10):
        sz.append(X[i].shape[0])
        # A new, unfitted estimator is handed to every cross-validation run.
        nb = MultinomialNB()
        xv.append(utils.xvalidation(X[i], y[i], nb))

    # float() forces true division for the shares under Python 2 as well.
    total_sz = float(sum(sz))
    for i in range(10):
        # %s formatting gives the same space-separated str() output as the
        # former multi-item print statement, on both Python 2 and Python 3.
        print("%s %s %s" % (category_dict_r[i], sz[i] / total_sz, xv[i]))

    # Per-item cross-validation report (disabled):
    #utils.xvalidation_result(train_filename, 'nb.ng2.txt', X, y, ids, nb)

    # Final training and model dump (disabled):
    # nb.fit(X, y)
    # joblib.dump((nb, transformers), model_filename)