Example #1
def get_pair_to_tids():
    print('Initializing Data Loader...')
    dl = Data_loader()
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}
    pair2tids = {}
    for record in dl.all_data():
        if record['tweet_id'] not in test_ids:
            involved = set()
            involved.add(record['user_post'])
            if 'user_retweet' in record:
                involved.add(record['user_retweet'])
            if 'user_mentions' in record:
                for user in record['user_mentions']:
                    involved.add(user)
            involved = sorted(involved)

            for i, u1 in enumerate(involved):
                for u2 in involved[i + 1:]:
                    pair_id = str(u1) + '_' + str(u2)
                    pair2tids.setdefault(pair_id, []).append(record['tweet_id'])

    return pair2tids
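
A hedged usage sketch (it assumes get_pair_to_tids and Data_loader are importable from this module): build the pair-to-tweet-id index and list the user pairs that co-occur in the most non-test tweets.

pair2tids = get_pair_to_tids()
# rank user pairs by how many non-test tweets they co-occur in
top_pairs = sorted(pair2tids.items(), key=lambda kv: len(kv[1]), reverse=True)[:10]
for pair_id, tids in top_pairs:
    print(pair_id, 'co-occurs in', len(tids), 'tweets')
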
Example #2
def main(args):
    # params for data loader
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))

    mode = args['mode']
    assert mode in ('w2v', 'svd', 'd2v')
    if mode == 'w2v':
        sentences = []
        for tweet in all_data:
            # need indices split
            if tweet['tweet_id'] not in test_ids and tweet['tweet_id'] not in ensemble_ids:
                sentences.append([str(x) for x in tweet['int_arr']])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_w2v_embs(sentences, option)
    elif mode == 'svd':
        sentences = []
        for tweet in all_data:
            # need indices joined
            if tweet['tweet_id'] not in test_ids and tweet['tweet_id'] not in ensemble_ids:
                sentences.append(' '.join([str(x) for x in tweet['int_arr']]))
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_svd_embs(sentences, option)
    else:  # mode == d2v
        sentences = []
        tags = []
        for tweet in all_data:
            if tweet['tweet_id'] not in test_ids and tweet['tweet_id'] not in ensemble_ids:
                # need indices split and use id's as tags
                sentences.append([str(x) for x in tweet['int_arr']])
                tags.append([str(tweet['tweet_id'])])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        print('Check tag0:', tags[0])
        generate_d2v_embs(sentences, tags, option)
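
generate_w2v_embs is defined elsewhere in this project; the sketch below is only an assumption of what it might look like with gensim 4.x, using the hyperparameters implied by the w2v file name referenced later (../data/w2v_word_s300_w5_mc5_ep20.bin: vector size 300, window 5, min_count 5, 20 epochs).

from gensim.models import Word2Vec

def generate_w2v_embs(sentences, option):
    # sketch only: sentences is a list of token lists (word/char indices as strings)
    model = Word2Vec(sentences=sentences, vector_size=300, window=5,
                     min_count=5, epochs=20, workers=4)
    out_file = '../data/w2v_%s_s300_w5_mc5_ep20.bin' % option
    model.wv.save_word2vec_format(out_file, binary=True)
    return out_file
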
Example #3
def make_user_embeds(num_users):
    dim = 300
    embeds = np.random.rand(num_users, dim)

    print('Initializing Data Loader...')
    dl = Data_loader()
    tl = init_tl('w2v')
    test_ids = {tweet['tweet_id'] for tweet in dl.test_data()}
    pretrained_count = 0
    # user index 0 is reserved for padding (i.e. no user), 1 for the unknown user
    for user_idx in range(2, num_users):
        tweet_dicts = dl.tweets_by_user(user_idx)  # all tweets WRITTEN by this user
        if tweet_dicts is not None and len(tweet_dicts) > 0:
            tweet_count = 0
            all_tweets_sum = np.zeros(dim, dtype=np.float64)
            for tweet_dict in tweet_dicts:
                tid = tweet_dict['tweet_id']
                if tid not in test_ids:
                    tweet_count += 1
                    tweet_avg = tl.get_representation(tid, mode='avg')
                    all_tweets_sum += tweet_avg
            if tweet_count > 0:
                pretrained_count += 1
                all_tweets_avg = all_tweets_sum / tweet_count
                embeds[user_idx] = all_tweets_avg
    print('Found tweets for {} out of {} users'.format(pretrained_count,
                                                       num_users - 2))

    embeds = StandardScaler().fit_transform(embeds)  # mean 0, variance 1
    embeds[0] = np.zeros(dim)  # make sure padding is all 0's

    save_file = str(num_users) + '_user_emb.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)
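
The saved matrix can be reloaded with numpy; a small hypothetical sketch (the value of num_users must match the one passed to make_user_embeds):

import numpy as np

num_users = 1000  # hypothetical value; must match the call to make_user_embeds
embeds = np.loadtxt(str(num_users) + '_user_emb.np')
print('shape:', embeds.shape)                      # (num_users, 300)
print('padding row is all zeros:', not embeds[0].any())
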
Example #4
                        type=int,
                        default=20,
                        help='iterations for word2vec; ignored if svd')

    args = vars(parser.parse_args())
    print(args)

    # main(args)
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    all_tids = set([str(tweet['tweet_id']) for tweet in all_data])
    print(list(all_tids)[:10])
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))
    print(list(ensemble_ids)[:10])
    assert (len(ensemble_ids.intersection(all_tids)) == 0)

    # w2v_file = '../data/w2v_word_s300_w5_mc5_ep20.bin'
    # svd_file = '../data/svd_word_s300.pkl'
    # sample_usage(w2v_file, svd_file)

    # test_sents = [['2', '254', '440', '192', '94', '57', '72', '77'],
    # 			  ['2', '16', '60', '10', '219', '259', '16', '142', '538'],
    # 			  ['6', '132', '130', '11646', '47', '6', '25', '4', '132', '130', '3934', '73', '12', '163', '3035', '545', '221', '545']]
    # test_tags = [['740043438788345856'], ['258662084089368576'], ['842801723001487360']]
    # generate_d2v_embs(test_sents, test_tags, 'word')
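
generate_d2v_embs is not shown in these snippets; below is a hedged sketch of what it could look like with gensim's Doc2Vec, matching the (sentences, tags, option) call in the commented test above. The output file name is a hypothetical convention.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def generate_d2v_embs(sentences, tags, option):
    # sketch only: sentences is a list of token lists,
    # tags a list of single-element tweet-id lists
    documents = [TaggedDocument(words=s, tags=t) for s, t in zip(sentences, tags)]
    model = Doc2Vec(documents, vector_size=300, min_count=5, epochs=20, workers=4)
    out_file = '../data/d2v_%s_s300.model' % option  # hypothetical file name
    model.save(out_file)
    return out_file
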
Example #5
class Experiment:

    def __init__(self, experiment_dir, input_name2id2np=None, adapt_train_vocab=False,
                 comments='', epochs=100, patience=4, noise_function=None, filter_function=None, fold=5,
                 predict_ens_test=True, by_fold=False,
                 **kwargs):
        """
        An experiment class that runs cross validation.
        Designed to enable easy experiments with combinations of:
        1) context representation:
            handled by input_name2id2np
        2) pre-training methods:
            handled by pretrained_weight_dirs in the kwargs argument
            (None if no pretrained weights are available)
        3) char vs. word:
            specified in "options";
            options = ['char', 'word'] if you want to include both
            (the word-level content is supplied through the input named "word_content_input")

        Parameters
        ----------
        input_name2id2np: a nested dict mapping each context input name to a dict from tweet id to numpy array
        experiment_dir: the directory where the experiment weights and results will be saved
        adapt_train_vocab: under supervised training without pretraining,
                            some vocab will not be seen (at least twice) in the training set;
                            if set to True, vocab occurring fewer than two times in the training set is removed.
        comments: the comments that will be written to the README
        epochs: maximum number of training epochs during cross validation
        patience: number of epochs without improvement on the validation set before early stopping
        kwargs: arguments passed to the neural network model constructor (shown below)

        ========== below is the parameters needed by the neural network model ==========

        options: an array of the input options considered by the neural network model, e.g. ['char', 'word']
                    (possibly splex in the future);
                    each option's input is mapped to a lower-dimensional representation,
                    the lower-dimensional representations are concatenated,
                    and the concatenation is followed by the final classification layer
        word_vocab_size: size of the word-level vocabulary
        word_max_len: maximum number of words in a tweet
        char_vocab_size: size of the character-level vocabulary
        char_max_len: maximum number of characters in a tweet
        drop_out: dropout rate for regularization
        filter: number of filters for each kernel size
        dense_size: size of the dense layer following the max pooling layer
        embed_dim: embedding dimension for the character and word levels
        kernel_range: range of kernel sizes
        pretrained_weight_dirs: a dictionary mapping each option to its pretrained weight file,
                    e.g. {'char': '../weights/char_ds.weights'} means the pretrained weights for the
                    character-level model are in ../weights/char_ds.weights
        weight_in_keras: whether the weights are in Keras format
        context_dim: the dimension of the context representation
        context_dense_size: the size of the dense layer right before the context representation
        splex_dense_size: the size of the dense layer right before the splex representation
        """
        # creating the experiment dir
        # automatically generate a README
        if experiment_dir[-1] != '/':
            experiment_dir += '/'
        experiment_dir = '../experiments/' + experiment_dir
        self.experiment_dir, self.kwargs = experiment_dir, kwargs
        subprocess.call(['rm', '-rf', experiment_dir])
        subprocess.call(['mkdir', experiment_dir])
        self.adapt_train_vocab = adapt_train_vocab
        self.predict_ens_test = predict_ens_test
        '''
        with open(self.experiment_dir + 'README', 'w') as readme:
            
            readme.write(comments + '\n')
            for key in kwargs:
                readme.write("%s: %s\n" % (str(key), str(kwargs[key])))
        '''
        if input_name2id2np is None:
            input_name2id2np = {}
        self.input_name2id2np = input_name2id2np
        self.fold = fold
        self.dl = Data_loader(option='both', labeled_only=True, **kwargs)
        self.epochs, self.patience = epochs, patience
        self.noise_function, self.filter_function = noise_function, filter_function
        self.pretrained_weight_dirs = self.kwargs.get('pretrained_weight_dirs')
        self.by_fold = by_fold

    # cross validation
    # write all results to the directory
    # see read_results for retrieving the performance
    def cv(self):
        results = []

        for fold_idx in range(self.fold):
            print('cross validation fold %d.' % (fold_idx + 1))

            # retrieving cross validation data
            fold_data = self.dl.cv_data(fold_idx)
            ((X_train, y_train), (X_val, y_val), (X_test, y_test)) = \
                create_clf_data(self.input_name2id2np, fold_data, return_generators=False)
        
            if self.predict_ens_test:
                # retrieving the ensemble data
                ensemble_data = self.dl.ensemble_data()
                X_ensemble, y_ensemble = create_data(self.input_name2id2np, ensemble_data)
            
                # retrieving the held-out test data
                held_out_data = self.dl.test_data()
                X_held_out, y_held_out = create_data(self.input_name2id2np, held_out_data)
            
            if self.filter_function is not None:
                def apply_filter(X):
                    X_filtered = {}
                    for key in X:
                        if 'char' in key and 'input' in key:
                            X_filtered[key] = np.array([self.filter_function(x[:])
                                                        for x in X[key]])
                        else:
                            X_filtered[key] = X[key]
                    return X_filtered
            
                X_train, X_val, X_test = apply_filter(X_train), apply_filter(X_val), apply_filter(X_test)
                if self.predict_ens_test:
                    X_ensemble, X_held_out = apply_filter(X_ensemble), apply_filter(X_held_out)

            # if there are no pretrained weights, adapt the vocabulary so that tokens
            # appearing fewer than two times in X_train are not counted
            if self.adapt_train_vocab:
                if self.predict_ens_test:
                    adapt_vocab(X_train, (X_val, X_test, X_ensemble, X_held_out))
                else:
                    adapt_vocab(X_train, (X_val, X_test))

            class_weight = calculate_class_weight(y_train)

            # initializing model, train and predict
            K.clear_session()
            self.kwargs['input_dim_map'] = extract_dim_input_name2id2np(self.input_name2id2np)

            # convert the one-hot cross validation test labels to class indices
            y_test = np.argmax(y_test, axis=-1)
            
            # OBSOLETE
            if self.kwargs.get('mode') == 'ternary':
                if not self.by_fold:
                    self.model = NN_architecture(**self.kwargs).model
                else:
                    self.kwargs['pretrained_weight_dirs'] = self.pretrained_weight_dirs[fold_idx]
                    self.model = NN_architecture(**self.kwargs).model
                self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[macro_f1])

                # call backs
                es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
                weight_dir = self.experiment_dir + str(fold_idx) + '.weight'
                mc = ModelCheckpoint(weight_dir, save_best_only=True, save_weights_only=True)
                callbacks = [es, mc]

                # fit for at least 1 epoch
                self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), class_weight=class_weight)

                # training
                self.model.fit(x=X_train, y=y_train, validation_data=(X_val, y_val), callbacks=callbacks,
                               epochs=self.epochs, class_weight=class_weight)
                self.model.load_weights(weight_dir)

                # prediction
                y_pred = self.model.predict(x=X_test)
                y_pred_val = self.model.predict(x=X_val)

                # saving predictions for ensembles
                np.savetxt(self.experiment_dir + 'pred_test' + str(fold_idx) + '.np', y_pred)
                np.savetxt(self.experiment_dir + 'pred_val' + str(fold_idx) + '.np', y_pred_val)
                np.savetxt(self.experiment_dir + 'truth_test' + str(fold_idx) + '.np', y_test)
                np.savetxt(self.experiment_dir + 'truth_val' + str(fold_idx) + '.np', y_val)

                # convert predicted probabilities to class indices
                y_pred = np.argmax(y_pred, axis=-1)

            # only the cascade model is used, and it is the default
            elif self.kwargs.get('mode') is None or self.kwargs.get('mode') == 'cascade':
                # initialize the predictions
                num_train, num_val, num_test = y_train.shape[0], y_val.shape[0], y_test.shape[0]
                y_pred_val, y_pred_test = [None] * num_val, [None] * num_test

                for class_idx in range(2):
                    # time the training
                    start = int(round(time.time() * 1000))

                    # create a layer-name prefix, since for each fold we train
                    # the aggression and loss models separately
                    if class_idx == 0:
                        self.kwargs['prefix'] = 'aggression'
                    else:
                        self.kwargs['prefix'] = 'loss'
                
                    # initialize a model
                    if not self.by_fold:
                        self.model = NN_architecture(**self.kwargs).model
                    else:
                        self.kwargs['pretrained_weight_dirs'] = self.pretrained_weight_dirs[fold_idx]
                        self.model = NN_architecture(**self.kwargs).model

                    self.model.compile(optimizer='adam', loss='binary_crossentropy')

                    # create the label for this binary classification task
                    _y_train_, _y_val_ = y_train[:,class_idx], y_val[:,class_idx]
                    num_positive_train = sum(_y_train_)

                    # call backs
                    es = EarlyStopping(patience=self.patience, monitor='val_loss', verbose=1)
                    weight_dir = self.experiment_dir + str(fold_idx) + '_' + str(class_idx) + '.weight'
                    mc = ModelCheckpoint(weight_dir, save_best_only=True, save_weights_only=True)
                    callbacks = [es, mc]

                    # training
                    if self.noise_function is None:
                        # fit for at least 1 epoch before attaching the early-stopping callbacks
                        self.model.fit(x=X_train, y=_y_train_,
                                       validation_data=(X_val, _y_val_))
                        history = self.model.fit(x=X_train, y=_y_train_,
                                                 validation_data=(X_val, _y_val_),
                                                 callbacks=callbacks, epochs=self.epochs)
                    else:
                        def add_noise2data(X_train):
                            X_train_noised = {}
                            for key in X_train:
                                if 'char' in key and 'input' in key:
                                    X_train_noised[key] = np.array([self.noise_function(x[:])
                                                                    for x in X_train[key]])
                                else:
                                    X_train_noised[key] = X_train[key]
                            return X_train_noised
        
                        # fit for at least 1 epoch before starting manual early stopping
                        self.model.fit(x=add_noise2data(X_train), y=_y_train_)
                        best_val_loss, best_epoch = float('inf'), 0
                        for epoch_idx in range(1, self.epochs + 1):
                            self.model.fit(x=add_noise2data(X_train), y=_y_train_)
                            val_loss = self.model.evaluate(x=X_val, y=_y_val_)
                            print('validation loss for epoch %d: %.3f' % (epoch_idx, val_loss))
                            if val_loss < best_val_loss:
                                best_epoch = epoch_idx
                                best_val_loss = val_loss
                                self.model.save_weights(weight_dir)

                            if epoch_idx - best_epoch >= self.patience:
                                break
            
                    self.model.load_weights(weight_dir)

                    _y_pred_val_score, _y_pred_test_score = (self.model.predict(X_val).flatten(),
                                                             self.model.predict(X_test).flatten())
                    if self.predict_ens_test:
                        _y_pred_ensemble_score, _y_pred_held_out_score = (self.model.predict(X_ensemble).flatten(),
                                                                          self.model.predict(X_held_out).flatten())
                    
                    prefix = self.experiment_dir + 'fold_%d_class_%d_' % (fold_idx, class_idx)
                    
                    np.savetxt(prefix + 'pred_val.np', _y_pred_val_score)
                    np.savetxt(prefix + 'pred_test.np', _y_pred_test_score)
                    
                    if self.predict_ens_test:
                        np.savetxt(prefix + 'pred_ensemble.np', _y_pred_ensemble_score)
                        np.savetxt(prefix + 'pred_held_out.np', _y_pred_held_out_score)

                    # threshold tuning
                    best_t, best_f_val = 0, -1
                    for t in np.arange(0.01, 1, 0.01):
                        y_val_pred_ = [0] * num_val
                        for idx in range(num_val):
                            if y_pred_val[idx] is None and _y_pred_val_score[idx] >= t:
                                y_val_pred_[idx] = 1
                        f = f1_score(_y_val_, y_val_pred_)
                        if f > best_f_val:
                            best_f_val = f
                            best_t = t
                        # reset the temporary prediction list so its value is not
                        # accidentally reused outside this loop
                        y_val_pred_ = None

                    # a prediction is made only where no earlier model has predicted
                    # and the score is at least the best threshold;
                    # this applies to both val_pred and test_pred
                    for idx in range(num_val):
                        if y_pred_val[idx] is None and _y_pred_val_score[idx] >= best_t:
                            y_pred_val[idx] = class_idx


                    for idx in range(num_test):
                        if y_pred_test[idx] is None and _y_pred_test_score[idx] >= best_t:
                            y_pred_test[idx] = class_idx
                    
                    end = int(round(time.time() * 1000))

                    # record how long this fold/class run took in the README
                    duration = (end - start) // 1000  # milliseconds -> seconds
                    with open(self.experiment_dir + 'README', 'a') as readme:
                        readme.write('fold %d class %d takes %d seconds\n'
                                     % (fold_idx, class_idx, duration))

                # predict the rest as the "Other" class
                for idx in range(num_test):
                    if y_pred_test[idx] is None:
                        y_pred_test[idx] = 2
                
                for idx in range(num_val):
                    if y_pred_val[idx] is None:
                        y_pred_val[idx] = 2

            np.savetxt(self.experiment_dir + 'fold_%d_pred_val.np' % fold_idx, y_pred_val)
            np.savetxt(self.experiment_dir + 'fold_%d_pred_test.np' % fold_idx, y_pred_test)
            np.savetxt(self.experiment_dir + 'fold_%d_truth_val.np' % fold_idx, y_val)
            np.savetxt(self.experiment_dir + 'fold_%d_truth_test.np' % fold_idx, y_test)
            
            # append the result on this fold to results
            results.append(precision_recall_fscore_support(y_test, y_pred_test))

        # saving results
        results = np.array(results)
        np.savetxt(self.experiment_dir + 'result_by_fold.np', results.flatten())
        np.savetxt(self.experiment_dir + 'result_averaged.np', np.mean(results, axis=0))
        np.savetxt(self.experiment_dir + 'result_std.np', np.std(results, axis=0))

        avg_macro_f = np.mean(np.mean(results, axis=0)[2])
        with open(self.experiment_dir + 'README', 'a') as readme:
            readme.write('macro F-score: %.4f\n' % avg_macro_f)
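
A hedged usage sketch (the argument values are hypothetical; the full set of kwargs accepted by NN_architecture is defined elsewhere in the project): run cross validation with word and char inputs and no extra context features.

# hypothetical driver for the Experiment class above
experiment = Experiment(experiment_dir='word_char_baseline',
                        comments='word + char CNN, cascade mode',
                        options=['char', 'word'])
experiment.cv()
# per-fold predictions, truths, and the averaged precision/recall/F-scores
# are written under ../experiments/word_char_baseline/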