"""Compare FastText bag-of-means sentence features against several
classifiers (Gaussian/Multinomial Naive Bayes, RF, SVM) on annotated
Facebook messages.

Bug fix: the original used `sys`, `np`, `pylab` and `train_test_split`
without importing them, which raises NameError at runtime.
"""
import sys

import numpy as np
import pylab
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import fasttext

# Project-local modules live under libs/, so extend the search path first.
sys.path.append('libs/')
import textfeatures
import fileio
import classification

# Labels for the feature/classifier combinations this script evaluates.
methods = ('FT+ GNB', 'FT + RF', 'FT + SVM', 'BOW + GNB', 'BOW + MNB',
           'BOW + RF', 'BOW + SVM')

# Interactive plotting mode; figures update without blocking.
pylab.ion()
pylab.figure()

# Load labels, raw messages and class names from FastText-format training data.
y, messages, classes = fileio.read_fasttext_train_file(
    'data/train/annotated_fb_messages.txt')
y = np.array(y)

#
# FastText features
#
# Pretrained Finnish word vectors; each message becomes the mean of its
# word embeddings (bag-of-means).
model = fasttext.load_model('../fastText/data/wiki.fi.bin')
x = textfeatures.fasttext_bag_of_means(messages, model)

# Fixed random_state keeps the 80/20 split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=42)

## Naive Bayes
# Train, predict and evaluate
clf = GaussianNB().fit(x_train, y_train)
def main(argv):
    """Train a text classifier on annotated messages and store it to disk.

    Loads FastText-format training data (optionally extended with extra
    annotated files), extracts text features with the configured extractor,
    fits the requested classifier, and serializes the model with joblib to
    ``<outputdir>/<featurename>_<classifier>.pkl``.

    Args:
        argv: command-line argument strings (e.g. ``sys.argv[1:]``).

    Raises:
        ValueError: if ``--classifier`` is not one of the supported names
            ('RF' or 'SVM', case-insensitive).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Input')
    parser.add_argument(
        '--annotations',
        help='Directory for annotated files (extends training material)',
        default='')
    parser.add_argument('--outputdir', help='Output directory', required=True)
    parser.add_argument('--featurename', help='Feature extraction name',
                        required=True)
    parser.add_argument('--featurefile', help='Feature extraction file',
                        required=True)
    parser.add_argument('--classifier', help='Predictor file', required=True)
    args = parser.parse_args(argv)

    print('Inputs:')
    print(args)

    # Load training data
    # TODO: This data should come from real database
    print('Loading training data')
    y, messages, classes = fileio.read_fasttext_train_file(args.input)
    y = np.array(y)
    print(len(messages), y.shape)

    # Optionally extend the training material with extra annotated files.
    if len(args.annotations) > 0:
        newmessages, labels = fileio.read_annotated_files(args.annotations)
        y = np.hstack((y, labels))
        messages += newmessages
        print(len(messages), y.shape)

    # TODO: We need to have BoW Features here too..

    # Load FastText textfeatures
    print('Loading text feature extractor')
    feature_extractor = textfeatures.FeatureExtractor(
        method=args.featurename, filename=args.featurefile)

    # Extract text features from training data
    print('Extracting text features from training data')
    x = feature_extractor.extract(messages)

    # Train the model
    # TODO: It would make sense to define training as a pipeline so that all
    # the parameters could be given in
    print('Training a new model..')
    if args.classifier.upper() == 'RF':
        clf = RF().fit(x, y)
    elif args.classifier.upper() == 'SVM':
        clf = SVC(kernel='linear', probability=True).fit(x, y)
    else:
        # Bug fix: an unrecognised classifier previously left `clf` unbound,
        # crashing later with a confusing NameError at joblib.dump. Fail
        # fast with a clear message instead.
        raise ValueError('Unknown classifier: %s' % args.classifier)

    # Save the model
    # TODO: The name of the file should be also depend on the method
    predictor_model_file = os.path.join(
        args.outputdir, args.featurename + '_' + args.classifier + '.pkl')
    if not os.path.exists(os.path.dirname(predictor_model_file)):
        os.makedirs(os.path.dirname(predictor_model_file))
    print('Storing the result file in %s' % predictor_model_file)
    joblib.dump(clf, predictor_model_file)