Example #1
        prebuilt_path_model = None
        if isinstance(embedder_model, dict):
            # unpack the single {model_name: params} entry (dict views have no pop() in Python 3)
            embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))
            prebuilt_path_model = prebuilt_model_params.get('model')

        # initialize word vector embedder
        embedder = WordVectorEmbedder(embedder_model, prebuilt_path_model)

        # load pre-sampled data from disk
        if prebuilt_path_model:

            # training data (custom or default)
            if prebuilt_model_params.get('train', None):
                prebuilt_path_train = prebuilt_model_params.get('train')
            else:
                prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)

            # testing data (custom or default)
            if prebuilt_model_params.get('test', None):
                prebuilt_path_test = prebuilt_model_params.get('test')
            else:
                prebuilt_path_test = WordVectorBuilder.filename_test(prebuilt_path_model)

            # import pickled data
            with open(prebuilt_path_train, 'rb') as f:
                data_train = pickle.load(f)
            with open(prebuilt_path_test, 'rb') as f:
                data_test = pickle.load(f)

            # update embedder parameters
            model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components(prebuilt_path_model)
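For context, the one-entry dict this branch unpacks might look like the sketch below; the model name and paths are placeholders, only the 'model'/'train'/'test' keys come from the code above.

# Hypothetical embedder_model entry consumed by the isinstance(dict) branch
embedder_model = {
    'word2vec': {
        'model': '/path/to/prebuilt.bin',  # prebuilt word vector model
        'train': '/path/to/train.pkl',     # optional custom training pickle
        'test': '/path/to/test.pkl',       # optional custom testing pickle
    }
}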
Example #2
        # test all vector models
        for embedder_model in data_args["models"]:

            # identify prebuilt model if exists
            if isinstance(embedder_model, dict):

                # initialize word vector embedder
                # unpack the single {model_name: params} entry (dict views have no pop() in Python 3)
                embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))
                prebuilt_path_model = prebuilt_model_params.get("model", None)
                model_args = prebuilt_model_params.get("args", {})
                embedder = WordVectorEmbedder(embedder_model, model_fullpath=prebuilt_path_model, model_args=model_args)

                # update embedder parameters
                if prebuilt_path_model:
                    model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components(
                        prebuilt_path_model
                    )
                    embedder.model_subset = model_path_filename

                # training data (custom or default)
                if prebuilt_model_params.get("train", None):
                    prebuilt_path_train = prebuilt_model_params.get("train")
                else:
                    prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)
                with open(prebuilt_path_train, "rb") as f:
                    data_train = pickle.load(f)

                # testing data (custom or default)
                if prebuilt_model_params.get("test", None):
                    prebuilt_path_test = prebuilt_model_params.get("test")
                else:
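The loop above walks data_args["models"], which can mix plain embedder names with one-entry dicts carrying prebuilt-model parameters. A sketch of such a list, with hypothetical names and paths (only the "model"/"args"/"train"/"test" keys are taken from the code):

# Illustrative data_args entry; "glove" and all paths are assumptions
data_args = {
    "models": [
        "glove",  # plain name, handled outside the isinstance(dict) branch
        {
            "word2vec": {
                "model": "/path/to/prebuilt.bin",  # prebuilt model file
                "args": {"binary": True},          # extra args for WordVectorEmbedder
                "train": "/path/to/train.pkl",     # optional custom training pickle
                "test": "/path/to/test.pkl",       # optional custom testing pickle
            }
        },
    ]
}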
Example #3
from src.datasets.data_utils import WordVectorBuilder
from src.datasets.open_weiboscope import OpenWeibo

builder = WordVectorBuilder(OpenWeibo, '/data/openweibo/')
builder.word2vec_save('/data/weibo.bin', min_samples=800000)
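Assuming word2vec_save writes a standard word2vec binary file (an assumption on my part, not stated in the source), the saved vectors could be loaded back with gensim:

# Load the saved binary with gensim and query nearest neighbours
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('/data/weibo.bin', binary=True)
print(vectors.most_similar('word', topn=5))  # cosine-similarity neighbours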
Example #4
from src.datasets.data_utils import WordVectorBuilder
from src.datasets.open_weiboscope import OpenWeibo
from src.datasets.amazon_reviews import AmazonReviews
from src.datasets.sentiment140 import Sentiment140

builder = WordVectorBuilder(Sentiment140, '/data/sentiment140.csv')
builder.word2vec_save('/data/sentiment140.bin')
Example #6
        klass = data_params['class']
        loader = klass(data_params['path'])
        data_args = data_params['args']
        data = loader.load_data()

        # initialize lists (will be converted later into numpy arrays)
        values = []
        labels = []

        # initialize vector embedder
        prebuilt_model_path = data_args.get('models', {}).get(embedder_model, {}).get('prebuilt_model_path', None)
        embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

        # load pre-sampled data from disk
        if prebuilt_model_path:
            with open(WordVectorBuilder.filename_train(prebuilt_model_path), 'rb') as f:
                data = pickle.load(f)
        else:

            # get equal-sized subsets of each class
            min_samples = data_args.get('min_samples')
            data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
            data = data_sampler.sample_balanced(min_samples)

        # load dataset
        logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
        profile_results = timed_dataload(data, data_args, values, labels)

        # store loading time
        seconds_loading = profile_results.timer.total_tt
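DataSampler.sample_balanced is not shown in these excerpts. As a rough sketch of the idea the comment describes (equal-sized subsets per class), assuming the data are (value, label) pairs and that min_samples caps the per-class draw:

import random
from collections import defaultdict

def sample_balanced_sketch(samples, min_samples=None, seed=0):
    """Hypothetical stand-in for DataSampler.sample_balanced."""
    by_class = defaultdict(list)
    for value, label in samples:
        by_class[label].append((value, label))

    # cap at the smallest class so every class contributes equally
    per_class = min(len(group) for group in by_class.values())
    if min_samples is not None:
        per_class = min(per_class, min_samples)

    rng = random.Random(seed)
    balanced = []
    for group in by_class.values():
        balanced.extend(rng.sample(group, per_class))
    rng.shuffle(balanced)
    return balanced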
Example #7
        loader = klass(data_params['path'])
        data_args = data_params['args']
        data = loader.load_data()

        # initialize lists (will be converted later into numpy arrays)
        values = []
        labels = []

        # initialize vector embedder
        prebuilt_model_path = data_args.get('models', {}).get(
            embedder_model, {}).get('prebuilt_model_path', None)
        embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

        # load pre-sampled data from disk
        if prebuilt_model_path:
            with open(WordVectorBuilder.filename_train(prebuilt_model_path),
                      'rb') as f:
                data = pickle.load(f)
        else:

            # get equal-sized subsets of each class
            min_samples = data_args.get('min_samples')
            data_sampler = DataSampler(klass,
                                       file_path=data_params['path'],
                                       num_classes=2)
            data = data_sampler.sample_balanced(min_samples)

        # load dataset
        logger.info("processing {} samples from {}...".format(
            len(data), data_params['path']))
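Example #6 hands the loaded samples to timed_dataload and later reads profile_results.timer.total_tt. The real helper is not shown in these excerpts; a sketch assuming it profiles the load loop with cProfile and exposes the pstats.Stats (whose total_tt attribute is the total profiled time) as .timer:

import cProfile
import pstats

class ProfileResultsSketch(object):
    """Hypothetical wrapper exposing a pstats.Stats as .timer."""
    def __init__(self, stats):
        self.timer = stats

def timed_dataload_sketch(data, data_args, values, labels):
    """Sketch of timed_dataload: profile unpacking (value, label) samples."""
    profiler = cProfile.Profile()
    profiler.enable()
    for value, label in data:
        values.append(value)
        labels.append(label)
    profiler.disable()
    return ProfileResultsSketch(pstats.Stats(profiler))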