def timed_dataload(loader, data, args, embedder, values, labels):
    """Normalize, tokenize, and embed each (text, sentiment) pair in `data`,
    appending the embedding vectors to `values` and the sentiment scores to `labels`."""

    # use a separate counter to account for invalid input along the way
    counter = 0

    for text, sentiment in data:

        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))

            # normalize and tokenize if necessary
            if "normalize" in args:
                text_normalized = data_utils.normalize(text, **args["normalize"])
            else:
                text_normalized = text

            # tokenize
            form = args.get("load", {}).get("form")
            if form == "hanzi":
                tokens = data_utils.tokenize_hanzi(text_normalized)
            elif form == "arabic":
                text_stripped = loader.twitter_strip(text_normalized)
                tokens = loader.tokenize_arabic(text_stripped)
            else:
                tokens = data_utils.tokenize(text_normalized)

            # choose embedding type
            vector = None
            if args["embed"]["type"] == "concatenated":
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args["embed"])
            elif args["embed"]["type"] == "averaged":
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            else:
                # unrecognized embedding type: leave vector as None so the record is skipped
                pass

            # data labeled by sentiment score (guard these paired appends with a lock
            # if values/labels are shared across threads)
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
                counter += 1

        except TextTooShortException:
            # skip inputs that are too short to tokenize and embed
            pass
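A minimal sketch of how this loader might be driven; the args layout, the toy data, and the already-constructed loader and embedder objects are assumptions for illustration, not part of the example above.

# hypothetical wiring: loader and embedder stand in for the project's real objects
args = {
    "normalize": {},                 # kwargs forwarded to data_utils.normalize
    "load": {"form": "hanzi"},       # selects the hanzi tokenizer branch
    "embed": {"type": "averaged"},   # average the word vectors of each text
}

values, labels = [], []
data = [(u"今天天气很好", 1.0), (u"糟糕的一天", 0.0)]  # (text, sentiment) pairs

timed_dataload(loader, data, args, embedder, values, labels)
print("{} vectors / {} labels loaded".format(len(values), len(labels)))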
Example #2
def timed_dataload(data, args, embedder, values, labels):

    # use a separate counter to account for invalid input along the way
    counter = 0

    for text, sentiment in data:

        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))

            # normalize and tokenize if necessary
            if 'normalize' in args:
                text_normalized = data_utils.normalize(text, **args['normalize'])
            else:
                text_normalized = text

            # tokenize
            if args.get('load', {}).get('form') == 'hanzi':
                tokens = data_utils.tokenize_hanzi(text_normalized)
            else:
                tokens = data_utils.tokenize(text_normalized)

            # choose embedding type
            vector = None
            if args['embed']['type'] == 'concatenated':
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args['embed'])
            elif args['embed']['type'] == 'averaged':
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            else:
                # unrecognized embedding type: leave vector as None so the record is skipped
                pass

            # data labeled by sentiment score (guard these paired appends with a lock
            # if values/labels are shared across threads)
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
                counter += 1

        except TextTooShortException:
            # skip inputs that are too short to tokenize and embed
            pass
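The thread-safety note above suggests the loader was meant to run concurrently. Below is one possible sketch of sharding the input across threads, with per-thread output lists merged after joining so the paired appends never interleave; the shard count and the data, args, and embedder names are assumptions carried over from the earlier sketch.

import threading

# hypothetical parallel driver: each thread fills its own lists, merged after join
def load_shard(shard, out_values, out_labels):
    timed_dataload(shard, args, embedder, out_values, out_labels)

n_workers = 4
shards = [data[i::n_workers] for i in range(n_workers)]   # round-robin split of the list
results = [([], []) for _ in shards]

threads = [
    threading.Thread(target=load_shard, args=(shard, vals, labs))
    for shard, (vals, labs) in zip(shards, results)
]
for t in threads:
    t.start()
for t in threads:
    t.join()

# flatten the per-thread results back into shared values/labels lists
values = [v for vals, _ in results for v in vals]
labels = [l for _, labs in results for l in labs]
print("loaded {} examples".format(len(values)))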
Example #3
from gensim.models import Word2Vec
from src.datasets.open_weiboscope import OpenWeibo
from src.datasets.data_utils import tokenize, tokenize_hanzi

# save raw (hanzi) or romanized (pinyin) form
form = 'hanzi'  # or 'pinyin'

# load data
data = OpenWeibo('/data/openweibo/').load_data(form=form, keep_retweets=True)

# get input sentences for vector model
if form == 'hanzi':
    sentences = [tokenize_hanzi(text) for text, sentiment in data]
else:
    sentences = [tokenize(text) for text, sentiment in data]
print("loaded {} sentences".format(len(sentences)))

# build and train model (written against the older, pre-1.0 gensim API)
model = Word2Vec(size=200, window=5, min_count=1, workers=32)
model.build_vocab(sentences)
model.train(sentences)

# save model
model.save_word2vec_format(
    '/data/openweibo/openweibo_fullset_{}_vocab{}.bin'.format(
        form, len(model.vocab)))
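To reload the exported vectors for querying, something like the following should work; note it uses KeyedVectors, the gensim >= 1.0 counterpart of the older API used above, and the lookup token is just an illustrative guess at a vocabulary entry.

from gensim.models import KeyedVectors

# path as written by the save call above (vocab size is filled in at save time)
path = '/data/openweibo/openweibo_fullset_{}_vocab{}.bin'.format(form, len(model.vocab))

# binary must match how the vectors were saved; save_word2vec_format defaults
# to the text format, despite the .bin extension used here
wv = KeyedVectors.load_word2vec_format(path, binary=False)

# nearest neighbours of a token (assuming it appears in the training vocabulary)
print(wv.most_similar(u'好', topn=5))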