prebuilt_path_model = None
if isinstance(embedder_model, dict):
    embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))
    prebuilt_path_model = prebuilt_model_params.get('model')

# initialize word vector embedder
embedder = WordVectorEmbedder(embedder_model, prebuilt_path_model)

# load pre-sampled data from disk
if prebuilt_path_model:

    # training data (custom or default)
    if prebuilt_model_params.get('train'):
        prebuilt_path_train = prebuilt_model_params.get('train')
    else:
        prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)

    # testing data (custom or default)
    if prebuilt_model_params.get('test'):
        prebuilt_path_test = prebuilt_model_params.get('test')
    else:
        prebuilt_path_test = WordVectorBuilder.filename_test(prebuilt_path_model)

    # import pickled data
    with open(prebuilt_path_train, 'rb') as f:
        data_train = pickle.load(f)
    with open(prebuilt_path_test, 'rb') as f:
        data_test = pickle.load(f)

    # update embedder parameters
    model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components(prebuilt_path_model)
# test all vector models
for embedder_model in data_args["models"]:

    # identify prebuilt model if exists
    if isinstance(embedder_model, dict):

        # initialize word vector embedder
        embedder_model, prebuilt_model_params = next(iter(embedder_model.items()))
        prebuilt_path_model = prebuilt_model_params.get("model", None)
        model_args = prebuilt_model_params.get("args", {})
        embedder = WordVectorEmbedder(embedder_model, model_fullpath=prebuilt_path_model, model_args=model_args)

        # update embedder parameters
        if prebuilt_path_model:
            model_path_dir, model_path_filename, model_path_filext = WordVectorBuilder.filename_components(
                prebuilt_path_model
            )
            embedder.model_subset = model_path_filename

            # training data (custom or default)
            if prebuilt_model_params.get("train"):
                prebuilt_path_train = prebuilt_model_params.get("train")
            else:
                prebuilt_path_train = WordVectorBuilder.filename_train(prebuilt_path_model)
            with open(prebuilt_path_train, "rb") as f:
                data_train = pickle.load(f)

            # testing data (custom or default)
            if prebuilt_model_params.get("test"):
                prebuilt_path_test = prebuilt_model_params.get("test")
            else:
                prebuilt_path_test = WordVectorBuilder.filename_test(prebuilt_path_model)
            with open(prebuilt_path_test, "rb") as f:
                data_test = pickle.load(f)
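# The loop above accepts either a bare model name or a one-entry dict keyed
# by model name. A hedged sketch of what such a config might look like: the
# paths and model names below are hypothetical, but the keys ('model',
# 'args', 'train', 'test') mirror exactly what the loop reads.
data_args = {
    "models": [
        "glove",                                   # plain model name, no prebuilt file
        {
            "word2vec": {                          # prebuilt model with optional overrides
                "model": "/data/weibo.bin",        # prebuilt binary on disk
                "args": {},                        # extra args passed to WordVectorEmbedder
                "train": "/data/weibo_train.pkl",  # custom pre-sampled training pickle
                "test": "/data/weibo_test.pkl",    # custom pre-sampled testing pickle
            }
        },
    ]
}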
from src.datasets.data_utils import WordVectorBuilder
from src.datasets.open_weiboscope import OpenWeibo

builder = WordVectorBuilder(OpenWeibo, '/data/openweibo/')
builder.word2vec_save('/data/weibo.bin', min_samples=800000)
from src.datasets.data_utils import WordVectorBuilder
from src.datasets.sentiment140 import Sentiment140

builder = WordVectorBuilder(Sentiment140, '/data/sentiment140.csv')
builder.word2vec_save('/data/sentiment140.bin')
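# Assuming word2vec_save writes a standard word2vec binary (suggested by the
# .bin extension), the saved model can be loaded back with gensim for a quick
# sanity check. KeyedVectors.load_word2vec_format is a real gensim API; the
# query word is just an illustration.
from gensim.models import KeyedVectors

vectors = KeyedVectors.load_word2vec_format('/data/sentiment140.bin', binary=True)
print(vectors.most_similar('good', topn=5))  # nearest neighbors in embedding space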
klass = data_params['class']
loader = klass(data_params['path'])
data_args = data_params['args']
data = loader.load_data()

# initialize lists (will be converted later into numpy arrays)
values = []
labels = []

# initialize vector embedder
prebuilt_model_path = data_args.get('models', {}).get(embedder_model, {}).get('prebuilt_model_path', None)
embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

# load pre-sampled data from disk
if prebuilt_model_path:
    with open(WordVectorBuilder.filename_train(prebuilt_model_path), 'rb') as f:
        data = pickle.load(f)
else:
    # get equal-sized subsets of each class
    min_samples = data_args.get('min_samples')
    data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
    data = data_sampler.sample_balanced(min_samples)

# load dataset
logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
profile_results = timed_dataload(data, data_args, values, labels)

# store loading time
seconds_loading = profile_results.timer.total_tt
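# Not the project's DataSampler, just a minimal sketch of what balanced
# sampling amounts to, assuming samples arrive as (text, label) pairs: cap
# every class at the size of the smallest class (or at min_samples, if
# given) so the classifier sees equal class frequencies.
import random
from collections import defaultdict

def sample_balanced_sketch(samples, min_samples=None):
    """Return an equal-sized, shuffled subset of each label class."""
    by_label = defaultdict(list)
    for text, label in samples:
        by_label[label].append((text, label))

    # size of the smallest class bounds every class
    per_class = min(len(group) for group in by_label.values())
    if min_samples is not None:
        per_class = min(per_class, min_samples)

    balanced = []
    for group in by_label.values():
        balanced.extend(random.sample(group, per_class))
    random.shuffle(balanced)
    return balanced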