Example #1
def __init__(self, topic, group):
    modelpath = "./runs/best_model_" + topic + "_" + group + ".pth"
    self.topic = topic
    self.group = group
    self.USE_GPU = torch.cuda.is_available()
    self.EMBEDDING_DIM = 300
    self.HIDDEN_DIM = 150
    self.BATCH_SIZE = 1000
    self.id_field = data.Field(sequential=False, use_vocab=False)
    self.text_field = data.Field(lower=True)
    self.label_field = data.Field(sequential=False)
    self.train_iter, self.dev_iter, self.test_iter = self.load_sst(
        self.text_field, self.label_field, self.BATCH_SIZE)
    self.model = LSTMSentiment(embedding_dim=self.EMBEDDING_DIM,
                               hidden_dim=self.HIDDEN_DIM,
                               vocab_size=len(self.text_field.vocab),
                               label_size=len(self.label_field.vocab) - 1,
                               use_gpu=self.USE_GPU,
                               batch_size=self.BATCH_SIZE)
    self.model.load_state_dict(torch.load(modelpath))
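All of these snippets assume the same surrounding imports, which the listing does not show. A minimal header that makes them resolve might look like the following; the local module names (model, utils) are assumptions inferred from how the symbols are used:

    import argparse
    import csv
    import math
    import os
    import pickle
    import random
    import time

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.autograd import Variable
    from torchtext import data  # legacy torchtext (<= 0.8) API
    from tqdm import tqdm

    # Assumed local modules -- the names are guesses based on usage.
    from model import LSTMSentiment, BiLSTMSentiment
    from utils import load_bin_vec, train_epoch_progress, evaluate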
Example #2
def train_root(topic, group, test_on_annotated_data=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--m', dest='model', default='lstm',
                        help='specify the model to use (default: lstm)')
    args = parser.parse_args()

    EPOCHS = 1
    USE_GPU = torch.cuda.is_available()
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 150

    BATCH_SIZE = 50
    timestamp = str(int(time.time()))
    best_dev_acc = 0.0


    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE, topic, group, test_on_annotated_data=test_on_annotated_data)

    # Debug check: an (presumably) out-of-vocabulary token should map to the <unk> index.
    print(text_field.vocab.stoi["zzzzzzlove"])
    model = LSTMSentiment(embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM,
                          vocab_size=len(text_field.vocab),
                          label_size=len(label_field.vocab) - 1,
                          use_gpu=USE_GPU,
                          batch_size=BATCH_SIZE)


    if USE_GPU:
        model = model.cuda()


    print('Load word embeddings...')
    # # GloVe alternative:
    # text_field.vocab.load_vectors('glove.6B.100d')

    # word2vec: random-uniform init, then overwrite rows for known words.
    word_to_idx = text_field.vocab.stoi
    pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
    pretrained_embeddings[0] = 0  # zero vector for index 0 (the <unk> special)
    word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300-SLIM.bin', word_to_idx)
    for word, vector in word2vec.items():
        # NOTE: embedding rows are normally indexed by stoi directly, so this
        # -1 shift looks like an off-by-one; it is preserved here as written.
        pretrained_embeddings[word_to_idx[word] - 1] = vector

    # text_field.vocab.load_vectors(wv_type='', wv_dim=300)

    model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
    # model.embeddings.weight.data = text_field.vocab.vectors
    # model.embeddings.embed.weight.requires_grad = False


    best_model = model
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_function = nn.NLLLoss()

    print('Training...')
    #out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs"))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    model_path = os.path.join(out_dir, 'best_model_' + topic + '_' + group + '.pth')
    for epoch in range(EPOCHS):
        avg_loss, acc = train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch)
        tqdm.write('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))

        # if epoch == 1:
        #     ans = []
        #     test = doc_complete[:1000]
        #     for x in test:
        #         tl = []
        #         for w in x:
        #             tl.append(text_field.vocab.stoi[w])
        #         t = evaluate2(model, torch.Tensor(tl).view(-1, 1))
        #         ans.append(t[0][1])
        #     sort_idx = [i[0] for i in sorted(enumerate(ans), key=lambda x: x[1], reverse=True)]
        #     top100 = sort_idx[:100]
        #     pos_tweet = [test[t] for t in top100]
        #     for idx, p in enumerate(pos_tweet):
        #         p = " ".join(p)
        #         print(str(idx) + " : " + p)
        #         print(" ")
        # evaluate1(model, torch.Tensor(tl).view(-1, 1))

        dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
        test_acc = evaluate(model, test_iter, loss_function, 'Test')
        if dev_acc > best_dev_acc:
            if best_dev_acc > 0:
                os.remove(model_path)  # drop the previous best checkpoint
            best_dev_acc = dev_acc
            torch.save(model.state_dict(), model_path)
            print("Best model is saved.")

    model.load_state_dict(torch.load(model_path))
    test_acc = evaluate(model, test_iter, loss_function, 'Final Test')
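Example #2 calls load_bin_vec, which none of these snippets define. A minimal sketch of a compatible reader for the binary word2vec format, under the assumption that it returns a dict of numpy vectors restricted to words present in the vocabulary:

    import numpy as np

    def load_bin_vec(fname, vocab):
        # Binary word2vec format: a text header "vocab_size dim", then for each
        # entry a space-terminated word followed by dim float32 values.
        word_vecs = {}
        with open(fname, 'rb') as f:
            vocab_size, layer_size = map(int, f.readline().split())
            binary_len = np.dtype('float32').itemsize * layer_size
            for _ in range(vocab_size):
                word = b''
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # some files prepend newlines to words
                        word += ch
                word = word.decode('utf-8', errors='ignore')
                if word in vocab:
                    word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)  # skip vectors for out-of-vocab words
        return word_vecs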
Example #3
USE_GPU = torch.cuda.is_available()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

BATCH_SIZE = 32
timestamp = str(int(time.time()))
best_dev_acc = 0.0


text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE)

if args.model == 'lstm':
    model = LSTMSentiment(embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM,
                          vocab_size=len(text_field.vocab),
                          label_size=len(label_field.vocab) - 1,
                          use_gpu=USE_GPU,
                          batch_size=BATCH_SIZE)
elif args.model == 'bilstm':
    model = BiLSTMSentiment(embedding_dim=EMBEDDING_DIM,
                            hidden_dim=HIDDEN_DIM,
                            vocab_size=len(text_field.vocab),
                            label_size=len(label_field.vocab) - 1,
                            use_gpu=USE_GPU,
                            batch_size=BATCH_SIZE)

print('Load word embeddings...')
# GloVe
text_field.vocab.load_vectors('glove.6B.300d')

# word2vec alternative:
# word_to_idx = text_field.vocab.stoi
# pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
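Example #3 reads args.model, but the snippet cuts off before showing where args comes from. Based on the parser in Example #2, the setup was presumably along these lines (the flag name and default are assumptions carried over from that example):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--m', dest='model', default='lstm',
                        help='specify the model to use (default: lstm)')
    args = parser.parse_args()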
Example #4
class TweetSearch:
    def __init__(self, topic, group):
        modelpath = "./runs/best_model_" + topic + "_" + group + ".pth"
        self.topic = topic
        self.group = group
        self.USE_GPU = torch.cuda.is_available()
        self.EMBEDDING_DIM = 300
        self.HIDDEN_DIM = 150
        self.BATCH_SIZE = 1000
        self.id_field = data.Field(sequential=False, use_vocab=False)
        self.text_field = data.Field(lower=True)
        self.label_field = data.Field(sequential=False)
        self.train_iter, self.dev_iter, self.test_iter = self.load_sst(
            self.text_field, self.label_field, self.BATCH_SIZE)
        self.model = LSTMSentiment(embedding_dim=self.EMBEDDING_DIM,
                                   hidden_dim=self.HIDDEN_DIM,
                                   vocab_size=len(self.text_field.vocab),
                                   label_size=len(self.label_field.vocab) - 1,
                                   use_gpu=self.USE_GPU,
                                   batch_size=self.BATCH_SIZE)
        self.model.load_state_dict(torch.load(modelpath))

    def evaluate(self, model, sent):
        model.eval()
        model.batch_size = 1
        model.hidden = model.init_hidden()
        pred = model(Variable(sent.long(), requires_grad=False))
        pred_label = pred.data.numpy().tolist()
        #print (pred_label)
        return pred_label

    def load_sst(self, text_field, label_field, batch_size):
        train, dev, test = data.TabularDataset.splits(
            path='./data/',
            train='train_' + self.topic + '_' + self.group + '.csv',
            validation='test_' + self.topic + '_' + self.group + '.csv',
            test=self.topic + '_' + self.group + '_evaluate_dataset.csv',
            format='tsv',
            fields=[('text', text_field), ('label', label_field)])
        text_field.build_vocab(train, dev, test)
        label_field.build_vocab(train, dev, test)
        train_iter, dev_iter, test_iter = data.BucketIterator.splits(
            (train, dev, test),
            batch_sizes=(batch_size, len(dev), len(test)),
            sort_key=lambda x: len(x.text),
            repeat=False,
            device=-1)
        # For GPU runs (legacy torchtext), pass device=None instead of -1:
        # train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        #     (train, dev, test), batch_sizes=(batch_size, len(dev), len(test)),
        #     sort_key=lambda x: len(x.text), repeat=False, device=None)
        return train_iter, dev_iter, test_iter

    def preprocess(self):
        all_data = data.TabularDataset(path='tweet_slim.csv',
                                       format='csv',
                                       fields=[('grp', None),
                                               ('id', self.id_field),
                                               ('text', self.text_field)],
                                       skip_header=True)
        data_iter = data.BucketIterator(all_data,
                                        batch_size=self.BATCH_SIZE,
                                        repeat=False,
                                        device=-1)
        return data_iter, all_data

    def load_test_data(self):
        tweets = []
        labels = []
        with open(
                os.path.join(
                    "data",
                    self.topic + '_' + self.group + '_evaluate_dataset.csv'),
                'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter='\t',
                                 escapechar='\\')
            for row in readCSV:
                tweets.append(row[0])
                labels.append(int(row[1]))

        with open(
                os.path.join(
                    "data", self.topic + '_' + self.group +
                    '_evaluate_dataset_withID.csv'), 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter='\t',
                                    escapechar='\\')
            for i in range(len(labels)):
                # id, tweet, label
                spamwriter.writerow([i, tweets[i], labels[i]])

        test_data = data.TabularDataset(path=os.path.join(
            "data",
            self.topic + '_' + self.group + '_evaluate_dataset_withID.csv'),
                                        format='tsv',
                                        fields=[('id', self.id_field),
                                                ('text', self.text_field)])
        test_iter = data.Iterator(test_data,
                                  batch_size=self.BATCH_SIZE,
                                  repeat=False,
                                  device=-1,
                                  shuffle=False)
        return test_data, test_iter, tweets, labels

    def predict(self):
        test_data, test_iter, tweets, labels = self.load_test_data()
        # Debug: peek at the first few parsed examples.
        for i in range(3):
            print(test_data[i].text)
            print(test_data[i].id)
        self.model.eval()
        print("begin to predict....")
        pred_res = []
        ids = []
        for batch in test_iter:
            sent = batch.text
            idd = batch.id
            ids += (idd.data.numpy().tolist())
            self.model.batch_size = len(idd.data)
            self.model.hidden = self.model.init_hidden()
            pred = self.model(sent)
            pred_label = pred.data.numpy()
            pred_res += pred_label[:, 1].tolist()

        # The model is trained with NLLLoss, so outputs are presumably
        # log-probabilities; exp() recovers the positive-class probability.
        ori_score = [math.exp(x) for x in pred_res]

        with open(
                os.path.join(
                    "data", 'predict_' + self.topic + '_' + self.group +
                    '_evaluate_dataset.csv'), 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter='\t',
                                    escapechar='\\')
            for tweet, label, score in zip(tweets, labels, ori_score):
                spamwriter.writerow([tweet, label, score])

    def search2(self):
        print("loading data....")
        data_iter, ori_data = self.preprocess()
        # Debug: peek at the first few parsed examples.
        for i in range(3):
            print(ori_data[i].text)
        self.model.eval()
        id_list = []
        pred_res = []
        n_batches = 0
        print("begin to search....")
        for batch in data_iter:
            n_batches += 1
            if n_batches % 10 == 0:
                print(n_batches)
            sent = batch.text
            idd = batch.id
            #id_list = np.append(id_list, idd.data.numpy())
            id_list += idd.data.numpy().tolist()
            self.model.batch_size = len(idd.data)
            self.model.hidden = self.model.init_hidden()
            pred = self.model(sent)
            pred_label = pred.data.numpy()
            #pred_res = np.append(pred_res, pred_label[:, 1])
            pred_res += pred_label[:, 1].tolist()
        #print (id_list)
        #print (pred_res)
        #pos = pred_res>-0.69
        #id_list = id_list[pos].tolist()
        #pred_res = pred_res[pos].tolist()
        #ss = sorted(pred_res, reverse=True)
        #print ([math.exp(s) for s in ss[:10]])
        print("finish searching.")
        print("begin to ranking....")
        # Rank tweets by predicted score, descending.
        sort_index = [
            i[0] for i in sorted(
                enumerate(pred_res), key=lambda x: x[1], reverse=True)
        ]
        sort_id = [id_list[i] for i in sort_index]
        # Scores are log-probabilities; exp() maps them back to [0, 1].
        ori_score = [math.exp(pred_res[i]) for i in sort_index]
        # Save sorted id and corresponding scores
        pickle.dump(
            sort_id,
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "wb"))
        pickle.dump(
            ori_score,
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "wb"))
        #ori_score_pos = [s for s in ori_score if s>0.5]
        #plt.hist(ori_score_pos, normed=True, bins=100)
        #plt.ylabel('Probability')
        #plt.show()
        pos_ids, pos_h = self.get_tweets_id_score(threshold=0.5)
        return pos_ids, pos_h

    def show_scores_hist(self):
        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        print(ori_score[:10])
        hist = np.histogram(ori_score, np.linspace(0, 1, 11))
        print(hist)
        pickle.dump(
            hist,
            open("scoreHist_" + self.topic + "_" + self.group + ".pkl", "wb"))

    def get_tweets_id_score(self, threshold):
        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        pos_ids = []
        pos_h = {}
        # pos_ids = [sort_id[idx] for idx, s in enumerate(ori_score) if s>0.5]
        for idx, s in enumerate(ori_score):
            if s > threshold:
                pos_ids.append(sort_id[idx])
                pos_h[sort_id[idx]] = s
        # print (sort_id[:10])
        # print (ori_score[:10])
        # print (ori_data[0].text)
        print(pos_ids)
        return pos_ids, pos_h

    def sample_tweets_id_score(self, num, path):
        """
        :num: randomly sample num of tweets, equally distributed in each 10 bin.
        :return:
        """
        # Get id for each group
        ids_by_group = {"con": set(), "lib": set()}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                if first_line:  # skip the header row
                    first_line = False
                    continue
                ids_by_group[row[0]].add(int(row[1]))

        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        bins = 10
        size = num // bins  # integer count per bin (random.sample needs an int)
        all_pos_ids = []
        pos_ids = []
        pos_h = {}
        threshold = 0.9
        # Walk the scores in descending order, closing a bin each time the
        # score drops below the current threshold.
        for idx, s in enumerate(ori_score):
            if s >= threshold:
                tid = sort_id[idx]
                if tid in ids_by_group[self.group]:
                    pos_ids.append(tid)
                    pos_h[tid] = s
            else:
                print(
                    "There are {} tweets with a score more than {} in group {}"
                    .format(len(pos_ids), threshold, self.group))
                all_pos_ids.append(pos_ids)
                pos_ids = []
                threshold -= 1.0 / bins
        print(
            "There are {} tweets with a score more than {} in group {}".format(
                len(pos_ids), threshold, self.group))
        all_pos_ids.append(pos_ids)
        # Sample the number "size" of tweets in each sub-group
        sample_pos_ids = []

        for pos_ids in all_pos_ids:
            sampled_ids = random.sample(pos_ids, size)
            sample_pos_ids += sampled_ids

        random.shuffle(sample_pos_ids)

        return sample_pos_ids, pos_h

    def get_id_already(self, file_path):
        id_already = []
        with open(file_path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            for row in readCSV:
                status = row[2].strip()
                tweet_id = status.split("/")[-1]  # last path segment of the status URL
                id_already.append(int(tweet_id))
        print("id_already: {}".format(id_already))
        return id_already

    def sample_tweets_id_score2(self, num, path, id_already):
        """
        :num: randomly sample num of tweets, equally distributed in each 10 bin.
        :id_already: is the list of ID which is already sampled last time
        :return:
        """
        # Get id for each group
        ids_by_group = {"con": set(), "lib": set()}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                if first_line:  # skip the header row
                    first_line = False
                    continue
                ids_by_group[row[0]].add(int(row[1]))

        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        bins = 5
        size = num // bins  # integer count per bin (random.sample needs an int)
        all_pos_ids = []
        pos_ids = []
        pos_h = {}
        threshold = 0.95
        # Group ids and scores by bin, skipping ids sampled previously.
        for idx, s in enumerate(ori_score):
            if s >= threshold:
                tid = sort_id[idx]
                if tid in ids_by_group[self.group] and tid not in id_already:
                    pos_ids.append(tid)
                    pos_h[tid] = s
            else:
                print(
                    "There are {} tweets with a score more than {} in group {}"
                    .format(len(pos_ids), threshold, self.group))
                all_pos_ids.append(pos_ids)
                pos_ids = []
                threshold -= 0.05
                if threshold < 0.74:  # TODO: modify the lowest threshold for sampling
                    break
        # Sample the number "size" of tweets in each sub-group
        sample_pos_ids = []

        for pos_ids in all_pos_ids:
            sampled_ids = random.sample(pos_ids, size)
            sample_pos_ids += sampled_ids

        random.shuffle(sample_pos_ids)

        return sample_pos_ids, pos_h

    def read_tweet_by_id(self, path, ids, ids_h, topic, group, sample=False):
        print(len(ids))
        id_set = set(ids)
        print(len(id_set))
        ans = {}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                #print (row)
                if first_line:  # skip the header row
                    first_line = False
                    continue
                if int(row[1]) in id_set and row[0] == group:
                    ans[int(row[1])] = row
        print(len(ans))
        if sample:
            tweet_name = 'sampled_tweet_'
        else:
            tweet_name = 'tweet_'
        with open(tweet_name + topic + '_' + group + '.csv', 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter=',',
                                    escapechar='\\')
            for x in ids:
                if x in ans:
                    t = ans[x]
                    tweet = [
                        t[0], t[2], 'https://twitter.com/a/status/' + t[1],
                        ids_h[int(t[1])]
                    ]
                    #print (ans[x])
                    #print ("")
                    spamwriter.writerow(tweet)
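A sketch of how the class above might be driven end to end; the topic value is illustrative, not taken from the source, and a trained checkpoint plus the data files that load_sst() expects are assumed to exist:

    # Assumes ./runs/best_model_<topic>_<group>.pth and the CSVs under ./data/.
    ts = TweetSearch(topic="guncontrol", group="lib")

    # Score the annotated evaluation set and write predictions under ./data/.
    ts.predict()

    # Score the full tweet_slim.csv corpus, rank by score, and keep
    # everything above the 0.5 threshold.
    pos_ids, pos_scores = ts.search2()
    ts.show_scores_hist()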