def cross_validation_ah():
    import random
    random.seed(1234567)

    import tensorflow

    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session

    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = JSONPerLineDocumentReader(
        'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
        True)
    # e = ClassificationExperiment(reader, RandomTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, MajorityClassTokenizedDocumentClassifier(), ClassificationEvaluator())
    # e = ClassificationExperiment(reader, SimpleLSTMTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())
    e = ClassificationExperiment(
        reader, StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
        ClassificationEvaluator())
    # e = ClassificationExperiment(reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings), ClassificationEvaluator())
    e.run()
def cross_validation_ah(model_type):
    # classification without context
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')
        reader = JSONPerLineDocumentReader(
            'data/experiments/ah-classification1/exported-3621-sampled-positive-negative-ah-no-context.json',
            True)
        e = None
        if model_type == 'cnn':
            e = ClassificationExperiment(
                reader, CNNTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())
        else:
            e = ClassificationExperiment(
                reader,
                StackedLSTMTokenizedDocumentClassifier(vocabulary, embeddings),
                ClassificationEvaluator())
        e.run()
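
# Commented-out usage sketch for the variant above: any model_type other than
# 'cnn' falls back to the stacked LSTM, so a comparison run could look like
# this (kept commented out, like the alternative experiments earlier):
#
# cross_validation_ah('cnn')
# cross_validation_ah('stacked-lstm')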
def train_test_model_with_context(train_dir, indir, outdir, model_type):
    '''Custom training and testing of the SSAE model.

    :param train_dir: path to the JSON file containing training examples
    :param indir: path to the LOG file containing examples as Comment() objects
        (already classified by BERT)
    :param outdir: path to the LOG file to be created, with this model's
        predictions added as well
    :param model_type: label under which this model's predictions are stored
        via Comment.add_model()'''

    import pickle
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')
        reader = JSONPerLineDocumentReader(train_dir, True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(vocabulary, embeddings),
            ClassificationEvaluator())
        test_comments = TokenizedDocumentReader(indir)
        result = e.label_external(test_comments)

    for k in result.keys():
        print(f'{k}: {result[k]}')

    # Load all previously classified Comment() instances from indir, keyed by id
    instances = dict()

    with open(indir, 'rb') as f:
        try:
            while True:
                e = pickle.load(f)
                print(e)
                instances[str(e.id)] = e
        except EOFError:
            pass

    f = open(outdir, 'wb')

    # Attach this model's prediction to each comment and serialize the result
    for key in result.keys():
        model_label, model_score = result[key]
        model_label = model_label.lower()
        # take the score of the predicted class (index 0 corresponds to 'none')
        score = model_score[1]
        if model_label == 'none':
            score = model_score[0]
        instances[key].add_model(model_type, model_label, score, None)
        e = instances[key]
        print(e)
        print(e.labels)
        print(e.scores)
        print('=' * 20)
        pickle.dump(instances[key], f)

    f.close()
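
# Commented-out usage sketch for the function above; the file paths are
# hypothetical placeholders, not paths from the original project:
#
# train_test_model_with_context(
#     'data/experiments/ah-classification1/train.json',  # hypothetical JSON training file
#     'comments-bert.log',       # hypothetical pickle log of Comment() objects
#     'comments-bert-ssae.log',  # hypothetical output log with added predictions
#     'ssae')                    # label under which the predictions are stored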
    def __init__(self):
        self.vocabulary = Vocabulary.deserialize(
            'en-top100k.vocabulary.pkl.gz')
        self.embeddings = WordEmbeddings.deserialize(
            'en-top100k.embeddings.pkl.gz')

        assert isinstance(self.vocabulary, Vocabulary)
        assert isinstance(self.embeddings, WordEmbeddings)

        # for caching computed average word vectors (it's expensive)
        # dictionary = (str, np.ndarray)
        # key = text, value = average word vector
        self._average_word_vector_cache = dict()
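
    def average_word_vector(self, text):
        # Minimal sketch of the caching pattern described above; this method
        # is not part of the original class, and the per-token lookup
        # self.embeddings.word_vector(token) (assumed to return a 1-D numpy
        # array, or None for out-of-vocabulary tokens) is an assumption.
        import numpy as np

        if text not in self._average_word_vector_cache:
            vectors = [v for v in (self.embeddings.word_vector(t)
                                   for t in text.split())
                       if v is not None]
            self._average_word_vector_cache[text] = (
                np.mean(vectors, axis=0) if vectors else None)
        return self._average_word_vector_cache[text]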
Example #5
def run_it_all(dat, tok, rm_s, size, window, skipgram, workers, min_count):
    """
    Return a WhiskyEmbeddings object which allows for some neat tricks,
    such as finding similar whiskies, describing whiskies, and finding
    similar wordings ('synonyms') in the whisky-tasting vocabulary.

    This function does it all from beginning to end:
    1) Transform the scraped whisky reviews into a well-structured object
    2) Use all whisky reviews to build a corpus and train a whisky-specific
        word2vec model.
    3) Use the word embeddings to create whisky embeddings.

    The methods in WhiskyEmbeddings can then be used.
    All of this takes approx. 30-60 seconds.

    :param dat: input data
    :param tok: (bool) apply tokenization and gensim preprocessing?
    :param rm_s: (bool) remove stopwords?
    :param size: (int) the number of word2vec dimensions
    :param window: (int) window size of the context while training word2vec
    :param skipgram: (bool) use the skipgram model (otherwise CBOW)?
    :param workers: (int) number of workers to train word2vec
    :param min_count: (int) min. number of occurrences in the corpus; words
        with fewer occurrences are dropped.
    :return: (WhiskyEmbeddings instance)
    """
    # 1) Transform whisky reviews into well-structured objects:
    all_reviews = [WC(x, tokenize=tok, rm_stopwords=rm_s) for x in dat[1:]]

    # 2) Build a corpus and train a word2vec model:
    w2v = WordEmbeddings(all_reviews)
    word_vectors = w2v.train(size, window, skipgram, workers, min_count)

    # 3) Create whisky embeddings
    w_embedding = WhiskyEmbeddings(all_reviews, word_vectors)
    return w_embedding
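
# Commented-out usage sketch for run_it_all(); 'dat' stands for the scraped
# review data with a header row (here loaded from a hypothetical CSV file),
# and the hyperparameters are illustrative values, not ones from the original
# project:
#
# import csv
# with open('whisky_reviews.csv') as fh:  # hypothetical input file
#     dat = list(csv.reader(fh))
# whisky_emb = run_it_all(dat, tok=True, rm_s=True, size=100, window=5,
#                         skipgram=True, workers=4, min_count=5)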
def cross_validation_thread_ah_delta_context3():
    import random
    random.seed(1234567)

    import tensorflow

    sess_config = tensorflow.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    from tensorflow.python.keras.backend import set_session
    set_session(tensorflow.Session(config=sess_config))

    vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
    embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')

    reader = AHVersusDeltaThreadReader(
        'data/sampled-threads-ah-delta-context3', True)
    e = ClassificationExperiment(
        reader,
        StructuredSelfAttentiveSentenceEmbedding(
            vocabulary, embeddings, '/tmp/visualization-context3'),
        ClassificationEvaluator())

    e.run()
def cross_validation_thread_ah_delta_context3():
    # classification with context
    import random
    random.seed(1234567)

    import tensorflow as tf
    if tf.test.is_gpu_available():
        strategy = tf.distribute.MirroredStrategy()
        print('Using GPU')
    else:
        raise ValueError('CPU not recommended.')

    with strategy.scope():
        vocabulary = Vocabulary.deserialize('en-top100k.vocabulary.pkl.gz')
        embeddings = WordEmbeddings.deserialize('en-top100k.embeddings.pkl.gz')
        reader = AHVersusDeltaThreadReader(
            'data/sampled-threads-ah-delta-context3', True)
        e = ClassificationExperiment(
            reader,
            StructuredSelfAttentiveSentenceEmbedding(
                vocabulary, embeddings, '/tmp/visualization-context3'),
            ClassificationEvaluator())
        e.run()
	alt_loss_val, alt_accuracy_val = discriminator_1.train_fn(X, target_mat) if not skip_discriminator else discriminator_1.eval_fn(X, target_mat)

	if batch_id == 1:
		accumulators[:] = np.array([accuracy_val, loss_val, alt_accuracy_val, alt_loss_val, gen_loss_val, recon_gen_loss_val, adv_gen_loss_val, cos_gen_loss_val, float(skip_generator), float(skip_discriminator), preout_grad_norm_val])
	else:
		accumulators[:] = ACCUMULATOR_EXPAVG * np.array([accuracy_val, loss_val, alt_accuracy_val, alt_loss_val, gen_loss_val, recon_gen_loss_val, adv_gen_loss_val, cos_gen_loss_val, float(skip_generator), float(skip_discriminator), preout_grad_norm_val]) + (1.0 - ACCUMULATOR_EXPAVG) * accumulators

	if batch_id % print_every_n == 0:
		print >> sys.stderr, 'batch: %s, acc: %s, loss: %s, alt acc: %s, alt loss: %s, gloss: %s, grloss: %s, galoss: %s, gcloss: %s, gskip: %s, dskip: %s, gn: %s' % tuple([batch_id] + accumulators.tolist())

def save_model():
	params_vals = lasagne.layers.get_all_param_values([discriminator_0.l_out, discriminator_1.l_out, gen_l_out])
	cPickle.dump(params_vals, open(MODEL_FILENAME, 'wb'), protocol=cPickle.HIGHEST_PROTOCOL)

print >> sys.stderr, 'Loading Italian embeddings...'
we_it = WordEmbeddings()
we_it.load_from_word2vec('./it')
we_it.downsample_frequent_words()
skn_it = StandardScaler()
we_it.vectors = skn_it.fit_transform(we_it.vectors).astype(theano.config.floatX)
we_batches_it = we_it.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Loading English embeddings...'
we_en = WordEmbeddings()
we_en.load_from_word2vec('./en')
we_en.downsample_frequent_words()
skn_en = StandardScaler()
we_en.vectors = skn_en.fit_transform(we_en.vectors).astype(theano.config.floatX)
we_batches_en = we_en.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Ready to train.'
Example #9
    if batch_id % print_every_n == 0:
        print >> sys.stderr, 'batch: %s, acc: %s, loss: %s, alt acc: %s, alt loss: %s, gloss: %s, grloss: %s, galoss: %s, gcloss: %s, gskip: %s, dskip: %s, gn: %s' % tuple(
            [batch_id] + accumulators.tolist())


def save_model():
    params_vals = lasagne.layers.get_all_param_values(
        [discriminator_0.l_out, discriminator_1.l_out, gen_l_out])
    cPickle.dump(params_vals,
                 open(MODEL_FILENAME, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)


print >> sys.stderr, 'Loading Italian embeddings...'
we_it = WordEmbeddings()
we_it.load_from_word2vec('./it')
we_it.downsample_frequent_words()
skn_it = StandardScaler()
we_it.vectors = skn_it.fit_transform(we_it.vectors).astype(
    theano.config.floatX)
we_batches_it = we_it.sample_batches(batch_size=HALF_BATCH_SIZE,
                                     random_state=rng)

print >> sys.stderr, 'Loading English embeddings...'
we_en = WordEmbeddings()
we_en.load_from_word2vec('./en')
we_en.downsample_frequent_words()
skn_en = StandardScaler()
we_en.vectors = skn_en.fit_transform(we_en.vectors).astype(
    theano.config.floatX)
Example #10
                                                   W=lasagne.init.Orthogonal(),
                                                   b=None,
                                                   name='gen_l_out')

        self.dec_l_out = lasagne.layers.DenseLayer(self.gen_l_out,
                                                   num_units=n_input,
                                                   nonlinearity=None,
                                                   W=self.gen_l_out.W.T,
                                                   b=None,
                                                   name='dec_l_out')


dataDir = './'
rng = check_random_state(0)

we1 = WordEmbeddings()
we1.load_from_word2vec(dataDir, 'zh')
we1.downsample_frequent_words()
we1.vectors = normalize(we1.vectors)
we_batches1 = we1.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

we2 = WordEmbeddings()
we2.load_from_word2vec(dataDir, 'en')
we2.downsample_frequent_words()
we2.vectors = normalize(we2.vectors)
we_batches2 = we2.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

assert we1.embedding_dim == we2.embedding_dim
d = we1.embedding_dim

discriminator = Discriminator()
Example #11
args = parser.parse_args()

DISCR_NUM_HIDDEN_LAYERS = args.Dlayers
DISCR_HIDDEN_DIM = args.Ddim
HALF_BATCH_SIZE = 128

MODEL_FILENAME = 'model.pkl'

rng = check_random_state(0)

lang1 = args.lang1
lang2 = args.lang2
dataDir = 'data/' + args.config + '/'

print >> sys.stderr, 'Loading', lang1, 'embeddings...'
we1 = WordEmbeddings()
we1.load_from_word2vec(dataDir, lang1)
we1.downsample_frequent_words()
we1.vectors = normalize(we1.vectors).astype(theano.config.floatX)
we_batches1 = we1.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

print >> sys.stderr, 'Loading', lang2, 'embeddings...'
we2 = WordEmbeddings()
we2.load_from_word2vec(dataDir, lang2)
we2.downsample_frequent_words()
we2.vectors = normalize(we2.vectors).astype(theano.config.floatX)
we_batches2 = we2.sample_batches(batch_size=HALF_BATCH_SIZE, random_state=rng)

assert we1.embedding_dim == we2.embedding_dim
d = we1.embedding_dim