Ejemplo n.º 1
0
def test_accuracy(db_name='',
                  test_samples=0,
                  neutral_range=0,
                  offset=0,
                  redis_db=5):
    """
    Measure the accuracy of the trained classifier on stored samples.

    Returns a tuple ``(nltk_accuracy, manual_accuracy, classifier)``, or None
    (after printing a hint) when no trained classifier is found in redis.

    NLTK accuracy is the internal accuracy of the classifier, as a percentage.
    Manual Accuracy is the percentage of guesses whose sign agrees with the
    pre-flagged/known label of the sample (labels starting with 'pos' count as positive).

    Keyword Arguments:
    db_name (str) -- Samples database to use, by default this is the same as your trained database
                     with an offset to ensure unseen data. Should be a string database name located in ~/.synt.

    test_samples (int) -- Amount of samples to use, by default this will be 25% of the training set amount.

    neutral_range (float) -- Will be used to drop "neutrals" to see how real-world accuracy will look.
                             For example in the case where neutral range is 0.2 if the sentiment
                             guessed is not greater than 0.2 or less than -0.2 it is not considered.
                             Leaving this set to 0 will not cause the special case drops and will by default
                             categorize text as either positive or negative. This may be undesired as the classifier
                             will treat 0.0001 as positive even though it is not a strong indication.

    offset (int) -- By default the offset is decided from the end of the trained amount, i.e
                    if you've trained on 1000 and you have 250 testing samples the samples retrieved
                    will be from 1000-1250, you can override this offset if you wish to use a different
                    subset.

    redis_db (int) -- The redis database to use.

    Raises:
    AssertionError -- when no sample produced a guess outside the neutral range.
    """

    m = RedisManager(db=redis_db)
    trained_classifier = m.r.get(
        'trained_classifier')  #retrieve the trained classifier

    if not trained_classifier:
        print("Accuracy needs a classifier, have you trained?")
        return None

    #NOTE(review): pickle_load presumably unpickles the stored classifier;
    #only point this at a redis instance you trust.
    classifier = m.pickle_load(trained_classifier)

    #we want to make sure we are testing on a new set of samples therefore
    #we use the trained_to as our offset and proceed to use the samples
    #thereafter, unless an offset is otherwise specified
    trained_to = int(m.r.get('trained_to'))

    if not offset:
        offset = trained_to

    if test_samples <= 0:  #if no testing samples provided use 25% of our training number
        test_samples = int(trained_to * .25)

    if not db_name:
        db_name = m.r.get('trained_db')  #use the trained samples database

    #keep the requested count (test_samples) and the fetched rows under
    #separate names instead of rebinding the parameter
    samples = get_samples(db_name,
                          test_samples,
                          offset=offset,
                          redis_db=redis_db)

    testfeats = []
    trained_ext = m.r.get('trained_extractor')

    feat_ex = get_extractor(trained_ext)()

    #normalization and extraction; samples that yield no features are skipped
    for text, label in samples:
        tokens = normalize_text(text)
        bag_of_words = feat_ex.extract(tokens)

        if bag_of_words:
            testfeats.append((bag_of_words, label))

    nltk_accuracy = nltk.classify.util.accuracy(
        classifier, gold=testfeats) * 100  # percentify

    total_guessed = 0
    total_correct = 0

    g = Guesser(extractor_type=trained_ext)

    #compare the guessed sentiments with our samples database to determine manual accuracy
    for text, label in samples:
        guessed = g.guess(text)
        if abs(guessed) < neutral_range:
            continue  #too close to neutral to count as a guess

        total_guessed += 1

        if (guessed > 0) == label.startswith('pos'):
            total_correct += 1

    #raised explicitly (rather than via the assert statement) so the check
    #still guards the division below when python runs with -O
    if not total_guessed:
        raise AssertionError(
            "There were no guesses, make sure you've trained on the same database you're testing.")

    manual_accuracy = total_correct * 100.0 / total_guessed

    #TODO: precision and recall

    return (nltk_accuracy, manual_accuracy, classifier)
Ejemplo n.º 2
0
def main():
    """
    Command line entry point for synt.

    On a first run (no config.SYNT_PATH directory yet) the directory is
    created, the bundled user_config.py is copied to config.USER_CONFIG_PATH
    and the function returns so the user can review the config.

    Otherwise the command line is parsed and dispatched to one of the
    sub-commands: train, collect, fetch, guess or accuracy.
    """

    if not os.path.exists(config.SYNT_PATH):
        os.makedirs(config.SYNT_PATH)

        #copy user config for first time run
        if not os.path.exists(config.USER_CONFIG_PATH):
            user_config = os.path.join(config.PROJECT_PATH, 'user_config.py')
            target_config = config.USER_CONFIG_PATH
            shutil.copy(user_config, target_config)

            print("First time run created a config in ~/.synt that Synt will use. Please make sure everything is ok then re-run your previous commands.")
            return

    parser = argparse.ArgumentParser(description='Tool to interface with synt, provides a way to train, collect and guess from the command line.')
    subparsers = parser.add_subparsers(dest='parser')

    #Train Parser
    train_parser = subparsers.add_parser(
        'train',
        help='Train a classifier.'
    )
    train_parser.add_argument(
        'db_name',
        help="The name of the training database to use. They are stored/retreived from ~/.synt/"
    )
    train_parser.add_argument(
        'samples',
        type=int,
        help="The amount of samples to train on. Uses the samples.db",
    )
    train_parser.add_argument(
        '--classifier_type',
        default='naivebayes',
        choices=('naivebayes',),
        help="The classifier to use. See help for currently supported classifier.",
    )
    train_parser.add_argument(
        '--extractor_type',
        default='stopwords',
        choices=('words', 'stopwords', 'bestwords'),
        help="The feature extractor to use. By default this uses stopwords filtering.",
    )
    train_parser.add_argument(
        '--best_features',
        type=int,
        default=0,
        help="The amount of best words to use, or best features. This should be used in conjunction with bestwords extractor.",
    )
    train_parser.add_argument(
        '--purge',
        default='no',
        choices=('yes', 'no'),
        help="Yes to purge the redis database. By default no."
    )
    train_parser.add_argument(
        '--processes',
        default=4,
        type=int,  #without this a value given on the command line arrives as a string
        help="Will utilize multiprocessing if available with this number of processes. By default 4."
    )

    #Collect parser
    d = datetime.datetime.now()
    db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    collect_parser = subparsers.add_parser(
        'collect',
        help='Collect samples.'
    )
    collect_parser.add_argument(
        '--db_name',
        default=db_name,
        help="Optional database name to store as.",
    )
    collect_parser.add_argument(
        '--commit_every',
        default=200,
        type=int,
        help="Write to sqlite database after every 'this number'. Default is 200",
    )
    collect_parser.add_argument(
        '--max_collect',
        default=2000000,
        type=int,
        help="The amount to stop collecting at. Default is 2 million",
    )
    collect_parser.add_argument(
        '--query_file',
        default='',
        type=str,
        help="Absolute path to query file to use.",
    )

    #Fetch parser
    fetch_parser = subparsers.add_parser(
        'fetch',
        help='Fetches premade sample database.'
    )
    fetch_parser.add_argument(
        '--db_name',
        help="Fetches the default samples database from github and stores it as 'db' in ~/.synt/. Default db name is 'samples.db'.",
        default='samples.db',
    )

    #Guess parser
    guess_parser = subparsers.add_parser(
        'guess',
        help='Guess sentiment'
    )
    guess_parser.add_argument(
        'guess',
        nargs='?',
        default=True,
        help="Starts the guess prompt.",
    )
    guess_parser.add_argument(
        '--text',
        default='',
        help="Given text, will guess the sentiment on it.",
    )

    #Accuracy parser
    accuracy_parser = subparsers.add_parser(
        'accuracy',
        help="Test accuracy of classifier.",
    )
    accuracy_parser.add_argument(
        '--db_name',
        default='',
        help="""The samples database to use, if left empty the same database that was used for training is used for testing (with fresh samples). Specify db with with a database name located in ~/.synt.""",
    )
    accuracy_parser.add_argument(
        '--test_samples',
        type=int,
        help="""The amount of samples to test on. By default this is figured out internally and amounts to 25%
        of the training sample count. You can override this.""",
        default=0,
    )
    accuracy_parser.add_argument(
        '--neutral_range',
        default=0.2,
        type=float,
        help="Neutral range to use. By default this is 0.2.",
    )
    accuracy_parser.add_argument(
        '--offset',
        default=0,
        type=int,
        help="""By default the test samples are taken from the offset of the trained samples. i.e if 100 samples are trained and we
        are testing on 25 it will start from 100-125 to ensure the testing samples are new. You can override what offset to use
        with this argument.""",
    )

    args = parser.parse_args()

    if args.parser == 'train':
        print("Beginning train on {} database with {} samples.".format(args.db_name, args.samples))

        start = time.time()

        purge = args.purge == 'yes'

        train(
            db_name         = args.db_name,
            samples         = args.samples,
            classifier_type = args.classifier_type,
            extractor_type  = args.extractor_type,
            best_features   = args.best_features,
            processes       = args.processes,
            purge           = purge,
        )

        print("Finished training in {}.".format(time.time() - start))

    elif args.parser == 'collect':
        print("Beginning collecting {} samples to {}.".format(args.max_collect, args.db_name))

        start = time.time()

        collect(
            db_name      = args.db_name,
            commit_every = args.commit_every,
            max_collect  = args.max_collect,
            query_file   = args.query_file,
        )

        print("Finished collecting samples in {} seconds.".format(time.time() - start))

    elif args.parser == 'fetch':
        print("Beginning fetch to '{}' database.".format(args.db_name))
        fetch(args.db_name)
        print("Finished fetch.")

    elif args.parser == 'guess':
        g = Guesser()

        if args.text:
            #single formatted argument: prints the same text whether print is
            #the python 2 statement or the function, and matches the prompt loop
            print('Guessed: {}'.format(g.guess(args.text)))
            sys.exit()

        #raw_input only exists on python 2; on python 3 the equivalent is input
        try:
            read_line = raw_input
        except NameError:
            read_line = input

        print("Enter something to calculate the synt of it!")
        print("Press enter to quit.")

        while True:
            try:
                text = read_line("synt> ")
            except EOFError:
                #closed stdin (e.g. Ctrl-D): leave the prompt without a traceback
                break
            if not text:
                break
            print('Guessed: {}'.format(g.guess(text)))

    elif args.parser == 'accuracy':
        print("Beginning accuracy test with neutral range {}.".format(args.neutral_range))

        start = time.time()

        result = test_accuracy(
            db_name       = args.db_name,
            test_samples  = args.test_samples,
            neutral_range = args.neutral_range,
            offset        = args.offset,
        )

        #test_accuracy returns None (after printing why) when there is no
        #trained classifier; unpacking that would raise a TypeError
        if result is None:
            return

        n_accur, m_accur, classifier = result

        print("NLTK Accuracy: {}".format(n_accur))
        print("Manual Accuracy: {}".format(m_accur))

        classifier.show_most_informative_features(50)

        print("Finished testing in {} seconds.".format(time.time() - start))
Ejemplo n.º 3
0
        return score

if __name__ == '__main__':
    #example usage of guess: a small interactive prompt that prints the
    #guessed score for each line typed, until an empty line is entered

    g = Guesser()

    #raw_input only exists on python 2; on python 3 the equivalent is input
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("Enter something to calculate the synt of it!")
    print("Just press enter to quit.")

    while True:
        try:
            text = read_line("synt> ")
        except EOFError:
            #closed stdin (e.g. Ctrl-D): leave the prompt without a traceback
            break
        if not text:
            break
        print('Guessed: {}'.format(g.guess(text)))

########NEW FILE########
__FILENAME__ = tests
# -*- coding: utf-8 -*-
import unittest
from synt.trainer import train
from synt.guesser import Guesser
from synt import config

class TrainerTestCase(unittest.TestCase):

    def test_train_success(self):
        #smoke test: train on 1000 rows of samples.db against the test redis
        #database with purge enabled; passes as long as train() does not raise
        train_options = {
            'best_features': None,
            'purge': True,
            'redis_db': config.REDIS_TEST_DB,
        }
        train('samples.db', 1000, **train_options)

    def test_train_bestwords_success(self):
Ejemplo n.º 4
0
def test_accuracy(db_name='', test_samples=0, neutral_range=0, offset=0, redis_db=5):
    """
    Measure the accuracy of the trained classifier on stored samples.

    Returns a tuple ``(nltk_accuracy, manual_accuracy, classifier)``, or None
    (after printing a hint) when no trained classifier is found in redis.

    NLTK accuracy is the internal accuracy of the classifier, as a percentage.
    Manual Accuracy is the percentage of guesses whose sign agrees with the
    pre-flagged/known label of the sample (labels starting with 'pos' count as positive).

    Keyword Arguments:
    db_name (str) -- Samples database to use, by default this is the same as your trained database
                     with an offset to ensure unseen data. Should be a string database name located in ~/.synt.

    test_samples (int) -- Amount of samples to use, by default this will be 25% of the training set amount.

    neutral_range (float) -- Will be used to drop "neutrals" to see how real-world accuracy will look.
                             For example in the case where neutral range is 0.2 if the sentiment
                             guessed is not greater than 0.2 or less than -0.2 it is not considered.
                             Leaving this set to 0 will not cause the special case drops and will by default
                             categorize text as either positive or negative. This may be undesired as the classifier
                             will treat 0.0001 as positive even though it is not a strong indication.

    offset (int) -- By default the offset is decided from the end of the trained amount, i.e
                    if you've trained on 1000 and you have 250 testing samples the samples retrieved
                    will be from 1000-1250, you can override this offset if you wish to use a different
                    subset.

    redis_db (int) -- The redis database to use.

    Raises:
    AssertionError -- when no sample produced a guess outside the neutral range.
    """

    m = RedisManager(db=redis_db)
    trained_classifier = m.r.get('trained_classifier') #retrieve the trained classifier

    if not trained_classifier:
        print("Accuracy needs a classifier, have you trained?")
        return None

    #NOTE(review): pickle_load presumably unpickles the stored classifier;
    #only point this at a redis instance you trust.
    classifier = m.pickle_load(trained_classifier)

    #we want to make sure we are testing on a new set of samples therefore
    #we use the trained_to as our offset and proceed to use the samples
    #thereafter, unless an offset is otherwise specified
    trained_to = int(m.r.get('trained_to'))

    if not offset:
        offset = trained_to

    if test_samples <= 0: #if no testing samples provided use 25% of our training number
        test_samples = int(trained_to * .25)

    if not db_name:
        db_name = m.r.get('trained_db') #use the trained samples database

    #keep the requested count (test_samples) and the fetched rows under
    #separate names instead of rebinding the parameter
    samples = get_samples(db_name, test_samples, offset=offset,
        redis_db=redis_db)

    testfeats = []
    trained_ext = m.r.get('trained_extractor')

    feat_ex = get_extractor(trained_ext)()

    #normalization and extraction; samples that yield no features are skipped
    for text, label in samples:
        tokens = normalize_text(text)
        bag_of_words = feat_ex.extract(tokens)

        if bag_of_words:
            testfeats.append((bag_of_words, label))

    nltk_accuracy = nltk.classify.util.accuracy(classifier, gold=testfeats) * 100 # percentify

    total_guessed = 0
    total_correct = 0

    g = Guesser(extractor_type=trained_ext)

    #compare the guessed sentiments with our samples database to determine manual accuracy
    for text, label in samples:
        guessed = g.guess(text)
        if abs(guessed) < neutral_range:
            continue #too close to neutral to count as a guess

        total_guessed += 1

        if (guessed > 0) == label.startswith('pos'):
            total_correct += 1

    #raised explicitly (rather than via the assert statement) so the check
    #still guards the division below when python runs with -O
    if not total_guessed:
        raise AssertionError("There were no guesses, make sure you've trained on the same database you're testing.")

    manual_accuracy = total_correct * 100.0 / total_guessed

    #TODO: precision and recall

    return (nltk_accuracy, manual_accuracy, classifier)
Ejemplo n.º 5
0
        return score


if __name__ == '__main__':
    #example usage of guess: a small interactive prompt that prints the
    #guessed score for each line typed, until an empty line is entered

    g = Guesser()

    #raw_input only exists on python 2; on python 3 the equivalent is input
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("Enter something to calculate the synt of it!")
    print("Just press enter to quit.")

    while True:
        try:
            text = read_line("synt> ")
        except EOFError:
            #closed stdin (e.g. Ctrl-D): leave the prompt without a traceback
            break
        if not text:
            break
        print('Guessed: {}'.format(g.guess(text)))

########NEW FILE########
__FILENAME__ = tests
# -*- coding: utf-8 -*-
import unittest
from synt.trainer import train
from synt.guesser import Guesser
from synt import config


class TrainerTestCase(unittest.TestCase):
    def test_train_success(self):
        train('samples.db',
              1000,
              best_features=None,