from src.datasets import data_utils
from src.datasets.data_utils import TextTooShortException  # assumed to live alongside the tokenizers


def timed_dataload(loader, data, args, embedder, values, labels):
    # use a separate counter to account for invalid input along the way
    counter = 0
    for text, sentiment in data:
        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))
            # normalize if requested
            if "normalize" in args:
                text_normalized = data_utils.normalize(text, **args["normalize"])
            else:
                text_normalized = text
            # tokenize according to the source form
            form = args.get("load", {}).get("form")
            if form == "hanzi":
                tokens = data_utils.tokenize_hanzi(text_normalized)
            elif form == "arabic":
                text_stripped = loader.twitter_strip(text_normalized)
                tokens = loader.tokenize_arabic(text_stripped)
            else:
                tokens = data_utils.tokenize(text_normalized)
            # choose embedding type; unknown types yield no vector and are skipped
            vector = None
            if args["embed"]["type"] == "concatenated":
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args["embed"])
            elif args["embed"]["type"] == "averaged":
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            # data labeled by sentiment score (thread-safe with lock)
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
                counter += 1
        except TextTooShortException:
            pass
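# Hedged usage sketch for timed_dataload, not a confirmed API: the `loader`
# and `embedder` objects and the exact `args` layout are assumptions inferred
# from the function body, and `data` is assumed to be an in-memory list of
# (text, sentiment) pairs. The "(thread-safe with lock)" note above suggests
# `values`/`labels` are shared across worker threads; under CPython the GIL
# makes list.append atomic, which is why bare appends suffice here.
import threading

values, labels = [], []
args = {
    "load": {"form": "hanzi"},
    "embed": {"type": "averaged"},
}

def load_chunk(chunk):
    timed_dataload(loader, chunk, args, embedder, values, labels)

# split the corpus into four interleaved chunks, one per worker thread
chunks = [data[i::4] for i in range(4)]
threads = [threading.Thread(target=load_chunk, args=(c,)) for c in chunks]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("loaded {} vectors".format(len(values)))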
from gensim.models import Word2Vec

from src.datasets.open_weiboscope import OpenWeibo
from src.datasets.data_utils import tokenize, tokenize_hanzi

# save raw or romanized form
form = 'hanzi'  # or 'pinyin'

# load data
data = OpenWeibo('/data/openweibo/').load_data(form=form, keep_retweets=True)

# get input sentences for vector model
if form == 'hanzi':
    sentences = [tokenize_hanzi(text) for text, sentiment in data]
else:
    sentences = [tokenize(text) for text, sentiment in data]
print("loaded {} sentences".format(len(sentences)))

# build and train model (pre-1.0 gensim API: `size`, `model.vocab`, and
# `model.save_word2vec_format` were renamed or moved in later releases)
model = Word2Vec(size=200, window=5, min_count=1, workers=32)
model.build_vocab(sentences)
model.train(sentences)

# save model in word2vec format (text format by default, despite the .bin suffix)
model.save_word2vec_format(
    '/data/openweibo/openweibo_fullset_{}_vocab{}.bin'.format(
        form, len(model.vocab)))
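# Hedged follow-up sketch: reloading the saved vectors before the embedding
# step. This keeps the same pre-1.0 gensim API as the training script above
# (on gensim >= 1.0 the equivalent is KeyedVectors.load_word2vec_format).
# The path below is hypothetical; substitute the filename the training
# script actually wrote, and keep binary=False since save_word2vec_format
# defaults to the text format.
from gensim.models import Word2Vec

model_path = '/data/openweibo/openweibo_fullset_hanzi_vocabN.bin'  # hypothetical: N is the vocab size baked into the filename
model = Word2Vec.load_word2vec_format(model_path, binary=False)

# sanity check: nearest neighbours of a common token
print(model.most_similar(u'中国', topn=5))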