コード例 #1
0
 def prepare_data(self, rumourIds, tweetIds, times, labels=None, make_target=True):
     """Build zero-padded token-index arrays (and optional one-hot targets).

     One row is produced per rumour sequence returned by
     ``rumours.extract_rumour_seqs``; each row holds at most
     ``_MAX_TWEET_SEQ_LENGTH`` tweets and each tweet at most
     ``_MAX_TWEET_LENGTH`` token indices. Unused positions stay 0.

     Args:
         rumourIds: rumour id of every tweet (parallel to ``tweetIds``).
         tweetIds: tweet ids; position ``i`` here indexes both the
             preprocessed tweets and ``labels``.
         times: passed through to ``rumours.extract_rumour_seqs``.
         labels: class index (0-3) for every entry of ``tweetIds``; only
             read when ``make_target`` is true.
         make_target: when true, also build and return the target array.

     Returns:
         ``x_train`` of shape (rumours, _MAX_TWEET_SEQ_LENGTH,
         _MAX_TWEET_LENGTH), or the pair ``(x_train, y_train)`` where
         ``y_train`` has shape (rumours, _MAX_TWEET_SEQ_LENGTH, 4).

     Raises:
         ValueError: if a tweet id found in a sequence is not in ``tweetIds``.
     """
     if labels is None:
         labels = []

     tweets = tweet.preprocessTexts(
         tweet.readTweetContents(map(int, rumourIds), map(int, tweetIds), self.topic),
         vocabSize=_VOCABULARY_SIZE, remove_stop_words=True, mask_zero=True)
     rumour_seqs = rumours.extract_rumour_seqs(rumourIds, tweetIds, times)

     num_rumours = len(set(rumourIds))
     # Zero is the masking value, so untouched cells act as padding.
     x_train = np.zeros((num_rumours, _MAX_TWEET_SEQ_LENGTH, _MAX_TWEET_LENGTH), dtype=int)
     y_train = None
     if make_target:
         y_train = np.zeros((num_rumours, _MAX_TWEET_SEQ_LENGTH, 4), dtype=int)

     # Map every tweet id to its first position once, instead of copying and
     # scanning the whole id list for each tweet inside the loops.
     position_of = {}
     for pos, known_id in enumerate(tweetIds):
         position_of.setdefault(known_id, pos)

     for rumourId, seq in enumerate(rumour_seqs):
         for tweet_ind, tweetId in enumerate(seq):
             if tweet_ind >= _MAX_TWEET_SEQ_LENGTH:
                 break  # sequence is longer than the array can hold
             try:
                 pos = position_of[tweetId]
             except KeyError:
                 # Same exception type list.index() used to raise.
                 raise ValueError('%r is not in tweetIds' % (tweetId,))
             # Truncate over-long tweets; assigning more tokens than the
             # slice can hold would fail with a broadcasting error.
             tweet_tokens = tweets[pos][:_MAX_TWEET_LENGTH]
             x_train[rumourId, tweet_ind, :len(tweet_tokens)] = tweet_tokens
             if make_target:
                 y_train[rumourId, tweet_ind, int(labels[pos])] = 1

     if make_target:
         return x_train, y_train
     return x_train
コード例 #2
0
    def train(self, rememberEmbeddings=False):
        """Fit one binary kerasRNN language model per target class (0-3).

        The four models are stored as ``self.lm_s``, ``self.lm_d``,
        ``self.lm_q`` and ``self.lm_c`` (target classes 0, 1, 2 and 3).
        When ``rememberEmbeddings`` is true, ``predict_test`` is run over the
        training tweets afterwards with ``rememberEmbeddings=True``.
        Progress and elapsed time are written to stderr.
        """
        print >> sys.stderr, 'LMRNN Training...'
        started = time.time()

        rumour_ids = map(int, self.rumourIds)
        tweet_ids = map(int, self.tweetIds)
        tweetTexts = tweet.readTweetContents(rumour_ids, tweet_ids)

        def build_model(target_class):
            # All four models share every setting except the target class.
            return kerasRNN(texts=tweetTexts,
                            labels=self.labels,
                            binary=True,
                            targetClass=target_class,
                            maxlen=self.maxlen,
                            vocabulary_size=self.vocabulary_size,
                            embedding_size=self.embedding_size)

        self.lm_s = build_model(0)
        self.lm_d = build_model(1)
        self.lm_q = build_model(2)
        self.lm_c = build_model(3)

        if rememberEmbeddings:
            # Only the tweet ids and rumour ids are meaningful for this pass.
            self.predict_test(None, None, None,
                              self.tweetIds, self.rumourIds,
                              None, None, [],
                              rememberEmbeddings=True)

        finished = time.time()
        print >> sys.stderr, "Training Time: %f seconds" % ((finished - started) * 1.)
コード例 #3
0
 def makeFeaturedData(self,
                      rumourIds,
                      tweetIds,
                      labels=None,
                      makeTarget=True):
     """Turn tweets into doc2vec training vectors, optionally with labels.

     Args:
         rumourIds: rumour ids, converted with ``int`` before lookup.
         tweetIds: tweet ids, converted with ``int`` before lookup.
         labels: targets returned unchanged as ``y_train``; defaults to a
             fresh empty list.
         makeTarget: when true return ``(x_train, y_train)``, otherwise
             only ``x_train``.

     Returns:
         The result of ``self.doc2vec.extract_train_vec`` on the tweet
         contents, alone or paired with ``labels``.
     """
     texts = tweet.readTweetContents(map(int, rumourIds), map(int, tweetIds),
                                     self.topic)
     x_train = self.doc2vec.extract_train_vec(texts)
     if not makeTarget:
         return x_train
     if labels is None:
         # A new list per call: the labels are handed back to the caller, so
         # a shared default object could be mutated and leak between calls.
         labels = []
     return x_train, labels
コード例 #4
0
    def train(self, rememberHistory=False):
        """Train the seq2seq RNN on doc2vec features of the training tweets.

        ``self.binary`` and ``self.useTree`` are both forced to False here, so
        the single multi-class model ``self.model`` is the path normally
        taken; the tree and per-class binary branches are kept as written.
        When ``rememberHistory`` is true, ``predict_test`` is run over the
        training data afterwards with ``rememberHistory=True``.
        Progress and elapsed time are written to stderr.
        """
        print >> sys.stderr, 'seq2seqRNN Training...'
        started = time.time()

        self.binary = False
        self.useTree = False

        tweetTexts = tweet.readTweetContents(map(int, self.ememes),
                                             map(int, self.infecting_vec))
        features = self.doc2vec.extract_train_vec(tweetTexts)
        # The last dimension is the length of one feature vector.
        self.input_shape = (1, 1, len(features[0]))

        if self.useTree:
            inputs, targets = self.extract_threads_targets(
                features, self.infecting_vec, self.infected_vec, self.node_vec,
                makeTarget=True)
        elif self.binary:
            inputs, targets_s, targets_d, targets_q, targets_c = self.extract_seq_targets(
                features, self.etimes, self.ememes, self.node_vec,
                makeTarget=True, binary=True)
        else:
            inputs, targets = self.extract_seq_targets(
                features, self.etimes, self.ememes, self.node_vec,
                makeTarget=True)

        if not self.binary:
            self.model = self.trainModel(inputs, targets, binary=False)
        else:
            # One binary model per class.
            self.model_s = self.trainModel(inputs, targets_s, binary=True)
            self.model_d = self.trainModel(inputs, targets_d, binary=True)
            self.model_q = self.trainModel(inputs, targets_q, binary=True)
            self.model_c = self.trainModel(inputs, targets_c, binary=True)

        if rememberHistory:
            self.predict_test(None, self.etimes, self.infected_vec,
                              self.infecting_vec, self.ememes, None, None, [],
                              rememberHistory=True)

        finished = time.time()
        print >> sys.stderr, "Training Time: %f seconds" % ((finished - started) * 1.)
コード例 #5
0
    def predict_test(self,
                     testN,
                     testtimes,
                     testinfected_vec,
                     testinfecting_vec,
                     testeventmemes,
                     testW,
                     testT,
                     testnode_vec,
                     rememberEmbeddings=False):
        """Label each test tweet with the class whose language model scores it highest.

        Only ``testeventmemes`` and ``testinfecting_vec`` are read; the other
        positional parameters are unused in this method.

        Returns:
            A list with one entry per element of ``testinfecting_vec``: the
            index (0-3, in the order s, d, q, c) of the best-scoring model,
            or ``None`` for positions no tweet text was produced for.

        When ``rememberEmbeddings`` is true, the embeddings each model yields
        for a tweet are stored in ``self.embeddings_s/d/q/c`` keyed by the
        tweet's entry in ``testinfecting_vec``.
        """
        predictednode_vec = [None] * len(testinfecting_vec)
        tweetTexts = tweet.readTweetContents(map(int, testeventmemes),
                                             map(int, testinfecting_vec))

        for index, tt in enumerate(tweetTexts):
            scores = [
                self.lm_s.predict(tt, score=True),
                self.lm_d.predict(tt, score=True),
                self.lm_q.predict(tt, score=True),
                self.lm_c.predict(tt, score=True),
            ]
            # On ties max() keeps the first (lowest) class index.
            best_class, _ = max(enumerate(scores), key=operator.itemgetter(1))
            predictednode_vec[index] = best_class

            if not rememberEmbeddings:
                continue
            self.embeddings_s[
                testinfecting_vec[index]] = self.lm_s.getEmbeddings(tt)
            self.embeddings_d[
                testinfecting_vec[index]] = self.lm_d.getEmbeddings(tt)
            self.embeddings_q[
                testinfecting_vec[index]] = self.lm_q.getEmbeddings(tt)
            self.embeddings_c[
                testinfecting_vec[index]] = self.lm_c.getEmbeddings(tt)

        return predictednode_vec
コード例 #6
0
    def predict_test(
            self,
            testN,
            testtimes,
            testinfected_vec,
            testinfecting_vec,
            testeventmemes,
            testW,
            testT,
            testnode_vec,
            rememberHistory=False
    ):
        """Predict a class index for every test tweet.

        Tweets are converted to doc2vec features and fed to the model(s) one
        step at a time; ``reset_states()`` is called after each thread
        (tree mode) or sequence (sequence mode).

        * ``self.useTree``: a tweet may appear in several threads; its final
          label is the most frequent label predicted for it.
        * ``self.binary``: the four per-class models (s, d, q, c) each score
          the step and the index of the highest score is taken.
        * otherwise: ``self.model.predict_classes`` gives the label directly
          and, when ``rememberHistory`` is true, a model state is stored in
          ``self.histories`` keyed by the tweet's ``testinfecting_vec`` entry.

        ``testN``, ``testW`` and ``testT`` are not used; ``testnode_vec`` is
        only printed for comparison.

        Returns:
            An integer numpy array of length ``len(testtimes)``.
        """
        predictednode_vec = np.zeros((len(testtimes),), dtype=int)

        tweetTexts = tweet.readTweetContents(map(int, testeventmemes), map(int, testinfecting_vec))
        features = self.doc2vec.infer_test_vec(tweetTexts)

        if self.useTree:
            predictions = {}
            # Convert once; the list is only needed for position lookups.
            infecting_ids = list(testinfecting_vec)
            x_test = self.extract_threads_targets(features, testinfecting_vec, testinfected_vec, [],
                                                  makeTarget=False, return_full=True)
            for thread in x_test:
                for point in thread:
                    feature = point.auxilaries[0]
                    ind = infecting_ids.index(int(point.data))
                    label = self.model.predict_classes(np.asarray([[list(feature)]]), batch_size=1)[0]
                    predictions.setdefault(ind, []).append(label)
                self.model.reset_states()
            assert len(predictions) == len(testtimes)
            for ind, votes in predictions.items():
                # Majority vote over every thread the tweet appeared in.
                predictednode_vec[ind] = np.argmax(np.bincount(np.asarray(votes)))

        else:
            # NOTE(review): 'retunr_indices' is kept exactly as spelled at the
            # original call site because it must match the parameter name of
            # extract_seq_targets, which is defined elsewhere - TODO confirm.
            x_test, indices = self.extract_seq_targets(features, testtimes, testeventmemes, [], makeTarget=False,
                                                       retunr_indices=True)

            for i, sequence in enumerate(x_test):
                for j, step in enumerate(sequence):
                    batch = np.asarray([[list(step)]])
                    target = indices[i][j]
                    if self.binary:
                        # Each class is scored by its own model. All four
                        # scores previously came from model_s (copy-paste
                        # error), which made the comparison meaningless.
                        scores = [
                            self.model_s.predict_on_batch(batch)[0],
                            self.model_d.predict_on_batch(batch)[0],
                            self.model_q.predict_on_batch(batch)[0],
                            self.model_c.predict_on_batch(batch)[0],
                        ]
                        max_index, _ = max(enumerate(scores), key=operator.itemgetter(1))
                        predictednode_vec[target] = max_index
                    else:
                        predictednode_vec[target] = self.model.predict_classes(batch, batch_size=1)[0]
                        if rememberHistory:
                            self.histories[testinfecting_vec[target]] = list(
                                K.get_value(self.model.layers[0].states[1])[0])

                if self.binary:
                    self.model_s.reset_states()
                    self.model_d.reset_states()
                    self.model_q.reset_states()
                    self.model_c.reset_states()
                else:
                    self.model.reset_states()

        if rememberHistory is not True:
            print >> sys.stderr, predictednode_vec
            print >> sys.stderr, testnode_vec

        return predictednode_vec