Example #1
import sys

import numpy as np
import pylab
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import fasttext

sys.path.append('libs/')
import textfeatures
import fileio
import classification

methods = ('FT + GNB', 'FT + RF', 'FT + SVM', 'BOW + GNB', 'BOW + MNB',
           'BOW + RF', 'BOW + SVM')

pylab.ion()
pylab.figure()

y, messages, classes = fileio.read_fasttext_train_file(
    'data/train/annotated_fb_messages.txt')
y = np.array(y)

#
# FastText features
#
model = fasttext.load_model('../fastText/data/wiki.fi.bin')
x = textfeatures.fasttext_bag_of_means(messages, model)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

## Naive Bayes
# Train, predict and evaluate
clf = GaussianNB().fit(x_train, y_train)
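# The listing breaks off after fitting; below is a minimal sketch of the
# "predict and evaluate" step promised in the comment above. The choice of
# accuracy_score as the metric is an assumption, not taken from the original.
from sklearn.metrics import accuracy_score
y_pred = clf.predict(x_test)
print('FT + GNB accuracy: %.3f' % accuracy_score(y_test, y_pred))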
Example #2
import argparse
import os
import sys

import joblib
import numpy as np
# RF is used below as a shorthand; the random forest classifier is assumed
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC

sys.path.append('libs/')  # local helper modules, as in Example #1
import textfeatures
import fileio


def main(argv):

    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='Input')
    parser.add_argument(
        '--annotations',
        help='Directory for annotated files (extends training material)',
        default='')
    parser.add_argument('--outputdir', help='Output directory', required=True)
    parser.add_argument('--featurename',
                        help='Feature extraction name',
                        required=True)
    parser.add_argument('--featurefile',
                        help='Feature extraction file',
                        required=True)
    parser.add_argument('--classifier', help='Predictor file', required=True)

    args = parser.parse_args(argv)

    print('Inputs:')
    print(args)

    # Load training data
    # TODO: This data should come from real database
    print('Loading training data')
    y, messages, classes = fileio.read_fasttext_train_file(args.input)
    y = np.array(y)

    print(len(messages), y.shape)
    if len(args.annotations) > 0:
        newmessages, labels = fileio.read_annotated_files(args.annotations)
        y = np.hstack((y, labels))
        messages += newmessages

    print(len(messages), y.shape)

    #TODO: We need to have BoW Features here too..
    # Load FastText textfeatures
    print('Loading text feature extractor')
    feature_extractor = textfeatures.FeatureExtractor(
        method=args.featurename, filename=args.featurefile)

    # Extract text features from training data
    print('Extracting text features from training data')
    x = feature_extractor.extract(messages)

    # Train the model
    # TODO: It would make sense to define training as a pipeline so that all the
    # parameters could be given in
    print('Training a new model..')
    if args.classifier.upper() == 'RF':
        clf = RF().fit(x, y)
    elif args.classifier.upper() == 'SVM':
        clf = SVC(kernel='linear', probability=True).fit(x, y)
    else:
        raise ValueError('Unknown classifier: %s' % args.classifier)

    # Save the model
    #TODO: The name of the file should also depend on the method
    predictor_model_file = os.path.join(
        args.outputdir, args.featurename + '_' + args.classifier + '.pkl')
    if not os.path.exists(os.path.dirname(predictor_model_file)):
        os.makedirs(os.path.dirname(predictor_model_file))
    print('Storing the result file in %s' % predictor_model_file)
    joblib.dump(clf, predictor_model_file)
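# A minimal entry-point sketch, not part of the original listing. The script
# name and the feature-extractor names in the usage line are assumptions; the
# data and model paths are taken from Example #1.
#
#   python train_model.py data/train/annotated_fb_messages.txt \
#       --outputdir models/ --featurename fasttext \
#       --featurefile ../fastText/data/wiki.fi.bin --classifier SVM
if __name__ == '__main__':
    main(sys.argv[1:])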