        loader = klass(data_params["path"])
        data_args = data_params["args"]
        load_args = data_args.get("load", {})
        data = loader.load_data(**load_args)

        # test all vector models
        for embedder_model in data_args["models"]:

            # identify prebuilt model if exists
            if isinstance(embedder_model, dict):

                # initialize word vector embedder
                embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))  # the model spec is assumed to be a single-entry {name: params} mapping
                prebuilt_path_model = prebuilt_model_params.get("model", None)
                model_args = prebuilt_model_params.get("args", {})
                embedder = WordVectorEmbedder(embedder_model, model_fullpath=prebuilt_path_model, model_args=model_args)

                # update embedder parameters
                if prebuilt_path_model:
                    model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components(
                        prebuilt_path_model
                    )
                    embedder.model_subset = model_path_filename

                # training data (custom or default)
                if prebuilt_model_params.get("train", None):
                    prebuilt_path_train = prebuilt_model_params.get("train")
                else:
                    prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)
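                # load the pre-sampled training split; assumed here to be a pickle
                # written by WordVectorBuilder alongside the prebuilt model (the
                # exact record format is not visible in this snippet)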
                with open(prebuilt_path_train, "rb") as f:
                    data_train = pickle.load(f)
Example #2
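# Command-line driver: parses arguments, fills in per-dataset defaults, and hands
# everything to do_model(). A hypothetical invocation (the script name and paths
# are assumptions, not taken from the source):
#
#   python train_sentiment.py sentiment140 -w /data --lstm --glove /data/glove_twitter_200d.pkl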
def main():
    model_defaults = {
        'imdb': {
            'data_filename' : "",
            'hdf5_name'     : "imdb_split.hd5"},
        'amazon': {
            'data_filename' : "reviews_Health_and_Personal_Care.json.gz",
            'hdf5_name'     : "health_personal_split.hd5"            
            },
        'sentiment140': {
            'data_filename' : "sentiment140.csv",
            'hdf5_name'     : "sentiment140_split.hd5" 
            },
        'open_weiboscope': {
            'data_filename' : "",
            'hdf5_name'     : "open_weiboscope.hd5"
            },
        }

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("dataset", help="Name of dataset (one of amazon, imdb, sentiment140, open_weiboscope)")
    arg_parser.add_argument("--working_dir", "-w", default=".",
	    help="Directory where data and results should be put, default PWD.")
    #arg_parser.add_argument("embedding", choices=('glove','word2vec'), required=False)
    vector_group=arg_parser.add_mutually_exclusive_group()
    vector_group.add_argument("--glove", nargs=1, metavar="LOCATION",help="Use glove, object at this path")
    vector_group.add_argument("--word2vec", nargs=1, metavar="LOCATION",help="Use word2vec, object at this path")
    arg_parser.add_argument("--results_dir", "-r", default=None, help="custom subfolder to store results and weights in (defaults to dataset)")
    arg_parser.add_argument("--data_path", "-d", default=None, help="custom path to original data, partially overrides working_dir")
    model_types=arg_parser.add_mutually_exclusive_group()
    model_types.add_argument("--cnn", default=True, action="store_true", help="Use convolutional model")
    model_types.add_argument("--lstm", action="store_true", help="Use LSTM")
    arg_parser.add_argument("--hdf5_path", "-5", default=None, help="custom path to split data in HDF5")
    arg_parser.add_argument("--weights_path", default=None, help="path to weights to initialize model with")
    arg_parser.add_argument("--gpu_id", "-g", default=0, type=int, help="GPU device ID (integer)")
    arg_parser.add_argument("--learning_rate", default=0.01, type=float, help="Learning rate, default 0.01")
    arg_parser.add_argument("--momentum_coef", default=0.9, type=float, help="Momentum coefficient, default 0.9")
    arg_parser.add_argument("--batch_size", default=128, type=int, help="Batch size")
    arg_parser.add_argument("--nframes", default=256, type=int, help="Frame buffer size for CREPE. 256 or 1024.")
    arg_parser.add_argument("--rng_seed",default=None,type=float, help="Random number seed")
    arg_parser.add_argument("--do_evals", default=False, action="store_true")
    arg_parser.add_argument("--log_level", default=logging.INFO, type=int)

    args = arg_parser.parse_args()
    logging.getLogger().setLevel(args.log_level)
    dataset_name = args.dataset
    args.working_dir = os.path.abspath(args.working_dir)
    if not args.results_dir:
        args.results_dir = dataset_name
    args.results_dir = os.path.join(args.working_dir, args.results_dir)
    if not args.data_path:
        args.data_path = os.path.join(args.working_dir, model_defaults[dataset_name]['data_filename'])
    if not args.hdf5_path:
        args.hdf5_path = os.path.join(args.working_dir, model_defaults[dataset_name]['hdf5_name'])

    # dataset-specific arguments to do_model
    model_args = { 
        'sentiment140' : {
            'min_length'        : 70,
            'max_length'        : 150,
            'normalizer_fun'    : normalize_tweet,
            'variant'           : 'tweet_character',
            },
        'imdb' : {
            'normalizer_fun'    : normalize_imdb,
            },
        'amazon'            : {
            'normalizer_fun'    : data_utils.normalize, 
            },
        'open_weiboscope'   : {
            'normalizer_fun'    : data_utils.normalize,
            'balance_labels'    : True,
            'max_records'       : 2e6,
            },
        }
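    # the selected dataset's entry is extended below (model type, sizes, transformer)
    # and finally unpacked with ** into the do_model() call at the end of main()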
    # embedding-based models use 50 words for microblog datasets
    # (sentiment140, open_weiboscope) and 99 words for the rest
    if dataset_name in ('sentiment140','open_weiboscope'):
        embedding_nr_words = 50
    else:
        embedding_nr_words = 99

    # CNN (default) or LSTM
    # check --lstm first: --cnn defaults to True, so it is set even when --lstm is passed
    if args.lstm:
        model_args[dataset_name]['model_type'] = 'lstm'
        # for character-based LSTM, re-reverse data
        if not (args.glove or args.word2vec):
            model_args[dataset_name]['transformer_fun'] = reverse_one_hot
    else:
        model_args[dataset_name]['model_type'] = 'cnn'
        # character-based by default (overridden for embedding-based, below)
        model_args[dataset_name]['transformer_fun'] = data_utils.to_one_hot

    # set default hidden size (overridden for embedding-based models below)
    model_args[dataset_name]['hidden_size'] = 10
    model_args[dataset_name]['nframes']=args.nframes
    # parameters for embedding-based models
    if args.glove or args.word2vec:
        model_args[dataset_name]['sequence_length'] = embedding_nr_words
        model_args[dataset_name]['crepe_variant'] = 'embedding{}'.format(embedding_nr_words)
        model_args[dataset_name]['hidden_size'] = 200
    if args.glove:
        glove_path = os.path.abspath(args.glove[0])
        if not os.path.isfile(glove_path):
            model_downloader = ModelDownloader('glove')
            glove_url = model_downloader.data_location['twitter-2b']['url']
            glove_dir = os.path.dirname(glove_path)
            glove_file = os.path.basename(glove_path)
            model_downloader.download_and_save_vectors(glove_url, glove_dir, glove_file)
        glove_embedder = WordVectorEmbedder("glove",glove_path) 
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: glove_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 200
    if args.word2vec:
        w2v_embedder = WordVectorEmbedder("word2vec", os.path.abspath(args.word2vec[0]))
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: w2v_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 300
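    # note: 'vocab_size' appears to carry the embedding dimensionality here
    # (200-d GloVe twitter vectors vs. 300-d word2vec), not a vocabulary count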

    try:
        logger.debug(model_args[dataset_name]['normalizer_fun'])
    except KeyError:
        logger.debug("No custom normalization fn specified")
    do_model(dataset_name, 
             args.working_dir,
             args.results_dir,
             args.data_path,
             args.hdf5_path,
             gpu_id=args.gpu_id,
             learning_rate=args.learning_rate,
             momentum_coef=args.momentum_coef,
             batch_size=args.batch_size,
             rng_seed=args.rng_seed,
             **model_args[dataset_name])
    if args.do_evals:
        do_evaluations(args.results_dir)
Example #3
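# This driver assumes the following names are defined at module level (imports are
# not shown in the snippet): argparse, logging, os, data_utils, normalize_tweet,
# normalize_imdb, reverse_one_hot, transform_for_vectors, ModelDownloader,
# WordVectorEmbedder, do_model, do_evaluations, and a module-level logger.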
def main():
    model_defaults = {
        'imdb': {
            'data_filename': "",
            'hdf5_name': "imdb_split.hd5"
        },
        'amazon': {
            'data_filename': "reviews_Health_and_Personal_Care.json.gz",
            'hdf5_name': "health_personal_split.hd5"
        },
        'sentiment140': {
            'data_filename': "sentiment140.csv",
            'hdf5_name': "sentiment140_split.hd5"
        },
        'open_weiboscope': {
            'data_filename': "",
            'hdf5_name': "open_weiboscope.hd5"
        },
    }

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "dataset",
        help=
        "Name of dataset (one of amazon, imdb, sentiment140, open_weiboscope)")
    arg_parser.add_argument(
        "--working_dir",
        "-w",
        default=".",
        help="Directory where data and results should be put, default PWD.")
    #arg_parser.add_argument("embedding", choices=('glove','word2vec'), required=False)
    vector_group = arg_parser.add_mutually_exclusive_group()
    vector_group.add_argument("--glove",
                              nargs=1,
                              metavar="LOCATION",
                              help="Use glove, object at this path")
    vector_group.add_argument("--word2vec",
                              nargs=1,
                              metavar="LOCATION",
                              help="Use word2vec, object at this path")
    arg_parser.add_argument(
        "--results_dir",
        "-r",
        default=None,
        help=
        "custom subfolder to store results and weights in (defaults to dataset)"
    )
    arg_parser.add_argument(
        "--data_path",
        "-d",
        default=None,
        help="custom path to original data, partially overrides working_dir")
    model_types = arg_parser.add_mutually_exclusive_group()
    model_types.add_argument("--cnn",
                             default=True,
                             action="store_true",
                             help="Use convolutional model")
    model_types.add_argument("--lstm", action="store_true", help="Use LSTM")
    arg_parser.add_argument("--hdf5_path",
                            "-5",
                            default=None,
                            help="custom path to split data in HDF5")
    arg_parser.add_argument("--weights_path",
                            default=None,
                            help="path to weights to initialize model with")
    arg_parser.add_argument("--gpu_id",
                            "-g",
                            default=0,
                            type=int,
                            help="GPU device ID (integer)")
    arg_parser.add_argument("--learning_rate",
                            default=0.01,
                            type=float,
                            help="Learning rate, default 0.01")
    arg_parser.add_argument("--momentum_coef",
                            default=0.9,
                            type=float,
                            help="Momentum coefficient, default 0.9")
    arg_parser.add_argument("--batch_size",
                            default=128,
                            type=int,
                            help="Batch size")
    arg_parser.add_argument("--nframes",
                            default=256,
                            type=int,
                            help="Frame buffer size for CREPE. 256 or 1024.")
    arg_parser.add_argument("--rng_seed",
                            default=None,
                            type=float,
                            help="Random number seed")
    arg_parser.add_argument("--do_evals", default=False, action="store_true")
    arg_parser.add_argument("--log_level", default=logging.INFO, type=int)

    args = arg_parser.parse_args()
    logging.getLogger().setLevel(args.log_level)
    dataset_name = args.dataset
    args.working_dir = os.path.abspath(args.working_dir)
    if not args.results_dir:
        args.results_dir = dataset_name
    args.results_dir = os.path.join(args.working_dir, args.results_dir)
    if not args.data_path:
        args.data_path = os.path.join(
            args.working_dir, model_defaults[dataset_name]['data_filename'])
    if not args.hdf5_path:
        args.hdf5_path = os.path.join(
            args.working_dir, model_defaults[dataset_name]['hdf5_name'])

    # dataset-specific arguments to do_model
    model_args = {
        'sentiment140': {
            'min_length': 70,
            'max_length': 150,
            'normalizer_fun': normalize_tweet,
            'variant': 'tweet_character',
        },
        'imdb': {
            'normalizer_fun': normalize_imdb,
        },
        'amazon': {
            'normalizer_fun': data_utils.normalize,
        },
        'open_weiboscope': {
            'normalizer_fun': data_utils.normalize,
            'balance_labels': True,
            'max_records': 2e6,
        },
    }
    # embedding-based models use 50 words for microblog datasets
    # (sentiment140, open_weiboscope) and 99 words for the rest
    if dataset_name in ('sentiment140', 'open_weiboscope'):
        embedding_nr_words = 50
    else:
        embedding_nr_words = 99

    # CNN (default) or LSTM
    # check --lstm first: --cnn defaults to True, so it is set even when --lstm is passed
    if args.lstm:
        model_args[dataset_name]['model_type'] = 'lstm'
        # for character-based LSTM, re-reverse data
        if not (args.glove or args.word2vec):
            model_args[dataset_name]['transformer_fun'] = reverse_one_hot
    else:
        model_args[dataset_name]['model_type'] = 'cnn'
        # character-based by default (overridden for embedding-based, below)
        model_args[dataset_name]['transformer_fun'] = data_utils.to_one_hot

    # set default hidden size (overridden for embedding-based models below)
    model_args[dataset_name]['hidden_size'] = 10
    model_args[dataset_name]['nframes'] = args.nframes
    # parameters for embedding-based models
    if args.glove or args.word2vec:
        model_args[dataset_name]['sequence_length'] = embedding_nr_words
        model_args[dataset_name]['crepe_variant'] = 'embedding{}'.format(
            embedding_nr_words)
        model_args[dataset_name]['hidden_size'] = 200
    if args.glove:
        glove_path = os.path.abspath(args.glove[0])
        if not os.path.isfile(glove_path):
            model_downloader = ModelDownloader('glove')
            glove_url = model_downloader.data_location['twitter-2b']['url']
            glove_dir = os.path.dirname(glove_path)
            glove_file = os.path.basename(glove_path)
            model_downloader.download_and_save_vectors(glove_url, glove_dir,
                                                       glove_file)
        glove_embedder = WordVectorEmbedder("glove", glove_path)
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: glove_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 200
    if args.word2vec:
        w2v_embedder = WordVectorEmbedder("word2vec",
                                          os.path.abspath(args.word2vec[0]))
        model_args[dataset_name]['transformer_fun'] = \
            lambda x: w2v_embedder.embed_words_into_vectors(
                transform_for_vectors(x), embedding_nr_words)
        model_args[dataset_name]['vocab_size'] = 300

    try:
        logger.debug(model_args[dataset_name]['normalizer_fun'])
    except KeyError:
        logger.debug("No custom normalization fn specified")
    do_model(dataset_name,
             args.working_dir,
             args.results_dir,
             args.data_path,
             args.hdf5_path,
             gpu_id=args.gpu_id,
             learning_rate=args.learning_rate,
             momentum_coef=args.momentum_coef,
             batch_size=args.batch_size,
             rng_seed=args.rng_seed,
             **model_args[dataset_name])
    if args.do_evals:
        do_evaluations(args.results_dir)
Example #4
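    # Variant of the loop in Example #1: every entry under data_args['models'] gets its
    # own WordVectorEmbedder, and default train/test split paths are derived from the
    # prebuilt model path via the WordVectorBuilder helpers.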
    loader = klass(data_params['path'])
    data_args = data_params['args']
    load_args = data_args.get('load', {})
    data = loader.load_data(**load_args)

    # test all vector models
    for embedder_model in data_args['models']:

        # identify prebuilt model if exists
        prebuilt_path_model = None
        if isinstance(embedder_model, dict):
            embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))  # single-entry {name: params} mapping assumed
            prebuilt_path_model = prebuilt_model_params.get('model')

        # initialize word vector embedder
        embedder = WordVectorEmbedder(embedder_model, prebuilt_path_model)

        # load pre-sampled data from disk
        if prebuilt_path_model:

            # training data (custom or default)
            if prebuilt_model_params.get('train', None):
                prebuilt_path_train = prebuilt_model_params.get('train')
            else:
                prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)

            # testing data (custom or default)
            if prebuilt_model_params.get('test', None):
                prebuilt_path_test = prebuilt_model_params.get('test')
            else:
                prebuilt_path_test = WordVectorBuilder.filename_test(prebuilt_path_model)
Example #5
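    # A minimal sketch of the `datasets` mapping this loop expects; the keys are
    # inferred from the accesses below, while the loader class, paths, and numbers
    # are purely hypothetical:
    #
    # datasets = {
    #     'sentiment140': {
    #         'class': Sentiment140Loader,   # hypothetical loader class
    #         'path': '/data/sentiment140.csv',
    #         'args': {
    #             'min_samples': 100000,
    #             'models': {
    #                 'word2vec': {'prebuilt_model_path': '/data/w2v_sentiment140.bin'},
    #             },
    #         },
    #     },
    # }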
    for data_source, data_params in datasets.items():

        # prepare data loader
        klass = data_params['class']
        loader = klass(data_params['path'])
        data_args = data_params['args']
        data = loader.load_data()

        # initialize lists (will be converted later into numpy arrays)
        values = []
        labels = []

        # initialize vector embedder
        prebuilt_model_path = data_args.get('models', {}).get(
            embedder_model, {}).get('prebuilt_model_path', None)
        embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)
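        # prebuilt_model_path is None when the config has no prebuilt entry for this
        # embedder; in that case the else branch below re-samples balanced data
        # instead of loading a pickled split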

        # load pre-sampled data from disk
        if prebuilt_model_path:
            with open(WordVectorBuilder.filename_train(prebuilt_model_path),
                      'rb') as f:
                data = pickle.load(f)
        else:

            # get equal-sized subsets of each class
            min_samples = data_args.get('min_samples')  # None if unspecified
            data_sampler = DataSampler(klass,
                                       file_path=data_params['path'],
                                       num_classes=2)
            data = data_sampler.sample_balanced(min_samples)