# Apply a previously saved classifier to every .txt file in ``args.in_dir``
# and write one "<token>\t<labels>" line per instance into ``args.out_dir``.
args = parser.parse_args()

if not os.path.exists(args.out_dir):
    os.makedirs(args.out_dir)

# Load the pickled classifier pipeline.
clf = joblib.load(args.classifier)

text_files = [fi for fi in os.listdir(args.in_dir) if fi.endswith(".txt")]
for i, text_file in enumerate(text_files):
    in_file = os.path.join(args.in_dir, text_file)
    # FIX: converted Python 2 print statements to print() calls, consistent
    # with the print() usage elsewhere in this file.
    print("{} of {}".format(i + 1, len(text_files)))
    print("In:", in_file)

    # Load data; the train file is passed so get_data can reconstruct the
    # same feature space / label set (classes_) used at training time.
    X_train, X_data, Y_train, Y_data, classes_ = get_data(args.train_file,
                                                          in_file)

    # classify
    pred = clf.predict(X_data)

    # Save results: re-read the raw instances (with their ids) and pair them
    # with the predicted label rows.
    out_file = os.path.join(args.out_dir, text_file)
    print("Out:", out_file)
    X_data_with_ids, Y_data = load_data(in_file)
    with codecs.open(out_file, "wb", "utf8") as f:
        for x, y in zip(X_data_with_ids, pred):
            # classes_[y] selects the names of the predicted labels; an
            # empty prediction is written as the literal string "None".
            f.write(u"{}\t{}\n".format(x.decode("utf8"),
                                       "_".join(classes_[y]) or "None"))
    print()
import string
import os

# Command line: train file in, pickled classifier out.
parser = argparse.ArgumentParser()
parser.add_argument('train_file', help='file containing the train data')
parser.add_argument('output_dir', help='directory to save the classifier to')
args = parser.parse_args()

# Make sure the destination directory exists before anything expensive runs.
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)
classifier_file = '{}/classifier.pkl'.format(args.output_dir)

# Dutch stop words plus every ASCII punctuation character.
stopwords = sw.words('dutch') + list(string.punctuation)

# Train and "test" on the same file: only the train split and the label
# inventory (classes_) are actually consumed here.
X_train, X_test, Y_train, Y_test, classes_ = get_data(args.train_file,
                                                      args.train_file)

# NOTE(review): class_weight='auto' was deprecated in scikit-learn 0.17 in
# favour of 'balanced' and removed in later releases — confirm against the
# pinned sklearn version before upgrading.
base_estimator = LinearSVC(class_weight='auto')
labelset_clf = RandomKLabelsets(base_estimator,
                                n_estimators=Y_train.shape[1] * 2,
                                labels_per_estimator=3)
clf = make_pipeline(TfidfVectorizer(analyzer=split, stop_words=stopwords),
                    labelset_clf)
clf.fit(X_train, Y_train)

# Persist the fitted pipeline for the prediction script.
joblib.dump(clf, classifier_file)
print('saved', classifier_file)
# Run a 10-fold experiment: for each fold k, train on train_k.txt, predict
# on test_k.txt and write an evaluation report to output_k.txt.
out_dir = args.out_dir
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

for run in range(1, 11):
    print("Run", run)
    train_file = '{}/train_{}.txt'.format(args.input_dir, run)
    test_file = '{}/test_{}.txt'.format(args.input_dir, run)
    out_file = '{}/output_{}.txt'.format(out_dir, run)

    X_train, X_test, Y_train, Y_test, classes_ = get_data(train_file,
                                                          test_file)

    # One-vs-labelset ensemble over a tf-idf representation; ensemble size
    # scales with the number of labels in this fold's training data.
    clf = make_pipeline(TfidfVectorizer(analyzer=split,
                                        stop_words=stopwords),
                        RandomKLabelsets(LinearSVC(class_weight='auto'),
                                         n_estimators=Y_train.shape[1] * 2,
                                         labels_per_estimator=3))
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)

    # FIX: the original passed open(out_file, 'w') directly and never closed
    # it, leaking one file handle per fold (and risking unflushed output);
    # a context manager guarantees the file is closed.
    with open(out_file, 'w') as results_f:
        print_results(Y_test, Y_pred, classes_, results_f)