Example #1
def __init__(self, topic, group):
    modelpath = "./runs/best_model_" + topic + "_" + group + ".pth"
    self.topic = topic
    self.group = group
    self.USE_GPU = torch.cuda.is_available()
    self.EMBEDDING_DIM = 300
    self.HIDDEN_DIM = 150
    self.BATCH_SIZE = 1000
    self.id_field = data.Field(sequential=False, use_vocab=False)
    self.text_field = data.Field(lower=True)
    self.label_field = data.Field(sequential=False)
    self.train_iter, self.dev_iter, self.test_iter = self.load_sst(
        self.text_field, self.label_field, self.BATCH_SIZE)
    self.model = LSTMSentiment(embedding_dim=self.EMBEDDING_DIM,
                               hidden_dim=self.HIDDEN_DIM,
                               vocab_size=len(self.text_field.vocab),
                               label_size=len(self.label_field.vocab) - 1,
                               use_gpu=self.USE_GPU,
                               batch_size=self.BATCH_SIZE)
    self.model.load_state_dict(torch.load(modelpath))
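All of these snippets assume the same surrounding imports, which the listing does not show. A minimal header that makes them resolve might look like the following; the local module names (model, utils) are assumptions inferred from how the symbols are used:

    import argparse
    import csv
    import math
    import os
    import pickle
    import random
    import time

    import numpy as np
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.autograd import Variable
    from torchtext import data  # legacy torchtext (<= 0.8) API
    from tqdm import tqdm

    # Assumed local modules -- the names are guesses based on usage.
    from model import LSTMSentiment, BiLSTMSentiment
    from utils import load_bin_vec, train_epoch_progress, evaluate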
Example #2
def train_root(topic, group, test_on_annotated_data=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--m', dest='model', default='lstm',
                        help='specify the model to use (default: lstm)')
    args = parser.parse_args()

    EPOCHS = 1
    USE_GPU = torch.cuda.is_available()
    EMBEDDING_DIM = 300
    HIDDEN_DIM = 150

    BATCH_SIZE = 50
    timestamp = str(int(time.time()))
    best_dev_acc = 0.0


    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE, topic, group, test_on_annotated_data=test_on_annotated_data)

    # Debug check: an (presumably) out-of-vocabulary token should map to the <unk> index.
    print(text_field.vocab.stoi["zzzzzzlove"])
    model = LSTMSentiment(embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM,
                          vocab_size=len(text_field.vocab),
                          label_size=len(label_field.vocab) - 1,
                          use_gpu=USE_GPU,
                          batch_size=BATCH_SIZE)


    if USE_GPU:
        model = model.cuda()


    print('Load word embeddings...')
    # # GloVe alternative:
    # text_field.vocab.load_vectors('glove.6B.100d')

    # word2vec: random-uniform init, then overwrite rows for known words.
    word_to_idx = text_field.vocab.stoi
    pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
    pretrained_embeddings[0] = 0  # zero vector for index 0 (the <unk> special)
    word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300-SLIM.bin', word_to_idx)
    for word, vector in word2vec.items():
        # NOTE: embedding rows are normally indexed by stoi directly, so this
        # -1 shift looks like an off-by-one; it is preserved here as written.
        pretrained_embeddings[word_to_idx[word] - 1] = vector

    # text_field.vocab.load_vectors(wv_type='', wv_dim=300)

    model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
    # model.embeddings.weight.data = text_field.vocab.vectors
    # model.embeddings.embed.weight.requires_grad = False


    best_model = model
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    loss_function = nn.NLLLoss()

    print('Training...')
    #out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs"))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    model_path = os.path.join(out_dir, 'best_model_' + topic + '_' + group + '.pth')
    for epoch in range(EPOCHS):
        avg_loss, acc = train_epoch_progress(model, train_iter, loss_function, optimizer, text_field, label_field, epoch)
        tqdm.write('Train: loss %.2f acc %.1f' % (avg_loss, acc*100))

        # if epoch == 1:
        #     ans = []
        #     test = doc_complete[:1000]
        #     for x in test:
        #         tl = []
        #         for w in x:
        #             tl.append(text_field.vocab.stoi[w])
        #         t = evaluate2(model, torch.Tensor(tl).view(-1, 1))
        #         ans.append(t[0][1])
        #     sort_idx = [i[0] for i in sorted(enumerate(ans), key=lambda x: x[1], reverse=True)]
        #     top100 = sort_idx[:100]
        #     pos_tweet = [test[t] for t in top100]
        #     for idx, p in enumerate(pos_tweet):
        #         p = " ".join(p)
        #         print(str(idx) + " : " + p)
        #         print(" ")
        # evaluate1(model, torch.Tensor(tl).view(-1, 1))

        dev_acc = evaluate(model, dev_iter, loss_function, 'Dev')
        test_acc = evaluate(model, test_iter, loss_function, 'Test')
        if dev_acc > best_dev_acc:
            if best_dev_acc > 0:
                os.remove(model_path)  # drop the previous best checkpoint
            best_dev_acc = dev_acc
            torch.save(model.state_dict(), model_path)
            print("Best model is saved.")

    model.load_state_dict(torch.load(model_path))
    test_acc = evaluate(model, test_iter, loss_function, 'Final Test')
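Example #2 calls load_bin_vec, which none of these snippets define. A minimal sketch of a compatible reader for the binary word2vec format, under the assumption that it returns a dict of numpy vectors restricted to words present in the vocabulary:

    import numpy as np

    def load_bin_vec(fname, vocab):
        # Binary word2vec format: a text header "vocab_size dim", then for each
        # entry a space-terminated word followed by dim float32 values.
        word_vecs = {}
        with open(fname, 'rb') as f:
            vocab_size, layer_size = map(int, f.readline().split())
            binary_len = np.dtype('float32').itemsize * layer_size
            for _ in range(vocab_size):
                word = b''
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch != b'\n':  # some files prepend newlines to words
                        word += ch
                word = word.decode('utf-8', errors='ignore')
                if word in vocab:
                    word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
                else:
                    f.read(binary_len)  # skip vectors for out-of-vocab words
        return word_vecs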
Example #3
USE_GPU = torch.cuda.is_available()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EMBEDDING_DIM = 300
HIDDEN_DIM = 150

BATCH_SIZE = 32
timestamp = str(int(time.time()))
best_dev_acc = 0.0


text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, dev_iter, test_iter = load_sst(text_field, label_field, BATCH_SIZE)

if args.model == 'lstm':
    model = LSTMSentiment(embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM,
                          vocab_size=len(text_field.vocab),
                          label_size=len(label_field.vocab) - 1,
                          use_gpu=USE_GPU,
                          batch_size=BATCH_SIZE)
elif args.model == 'bilstm':
    model = BiLSTMSentiment(embedding_dim=EMBEDDING_DIM,
                            hidden_dim=HIDDEN_DIM,
                            vocab_size=len(text_field.vocab),
                            label_size=len(label_field.vocab) - 1,
                            use_gpu=USE_GPU,
                            batch_size=BATCH_SIZE)

print('Load word embeddings...')
# GloVe
text_field.vocab.load_vectors('glove.6B.300d')

# word2vec alternative:
# word_to_idx = text_field.vocab.stoi
# pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
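Example #3 reads args.model, but the snippet cuts off before showing where args comes from. Based on the parser in Example #2, the setup was presumably along these lines (the flag name and default are assumptions carried over from that example):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--m', dest='model', default='lstm',
                        help='specify the model to use (default: lstm)')
    args = parser.parse_args()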
Example #4
class TweetSearch:
    def __init__(self, topic, group):
        modelpath = "./runs/best_model_" + topic + "_" + group + ".pth"
        self.topic = topic
        self.group = group
        self.USE_GPU = torch.cuda.is_available()
        self.EMBEDDING_DIM = 300
        self.HIDDEN_DIM = 150
        self.BATCH_SIZE = 1000
        self.id_field = data.Field(sequential=False, use_vocab=False)
        self.text_field = data.Field(lower=True)
        self.label_field = data.Field(sequential=False)
        self.train_iter, self.dev_iter, self.test_iter = self.load_sst(
            self.text_field, self.label_field, self.BATCH_SIZE)
        self.model = LSTMSentiment(embedding_dim=self.EMBEDDING_DIM,
                                   hidden_dim=self.HIDDEN_DIM,
                                   vocab_size=len(self.text_field.vocab),
                                   label_size=len(self.label_field.vocab) - 1,
                                   use_gpu=self.USE_GPU,
                                   batch_size=self.BATCH_SIZE)
        self.model.load_state_dict(torch.load(modelpath))

    def evaluate(self, model, sent):
        model.eval()
        model.batch_size = 1
        model.hidden = model.init_hidden()
        pred = model(Variable(sent.long(), requires_grad=False))
        pred_label = pred.data.numpy().tolist()
        #print (pred_label)
        return pred_label

    def load_sst(self, text_field, label_field, batch_size):
        train, dev, test = data.TabularDataset.splits(
            path='./data/',
            train='train_' + self.topic + '_' + self.group + '.csv',
            validation='test_' + self.topic + '_' + self.group + '.csv',
            test=self.topic + '_' + self.group + '_evaluate_dataset.csv',
            format='tsv',
            fields=[('text', text_field), ('label', label_field)])
        text_field.build_vocab(train, dev, test)
        label_field.build_vocab(train, dev, test)
        train_iter, dev_iter, test_iter = data.BucketIterator.splits(
            (train, dev, test),
            batch_sizes=(batch_size, len(dev), len(test)),
            sort_key=lambda x: len(x.text),
            repeat=False,
            device=-1)
        # For GPU runs (legacy torchtext), pass device=None instead of -1:
        # train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        #     (train, dev, test), batch_sizes=(batch_size, len(dev), len(test)),
        #     sort_key=lambda x: len(x.text), repeat=False, device=None)
        return train_iter, dev_iter, test_iter

    def preprocess(self):
        all_data = data.TabularDataset(path='tweet_slim.csv',
                                       format='csv',
                                       fields=[('grp', None),
                                               ('id', self.id_field),
                                               ('text', self.text_field)],
                                       skip_header=True)
        data_iter = data.BucketIterator(all_data,
                                        batch_size=self.BATCH_SIZE,
                                        repeat=False,
                                        device=-1)
        return data_iter, all_data

    def load_test_data(self):
        tweets = []
        labels = []
        with open(
                os.path.join(
                    "data",
                    self.topic + '_' + self.group + '_evaluate_dataset.csv'),
                'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter='\t',
                                 escapechar='\\')
            for row in readCSV:
                tweets.append(row[0])
                labels.append(int(row[1]))

        with open(
                os.path.join(
                    "data", self.topic + '_' + self.group +
                    '_evaluate_dataset_withID.csv'), 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter='\t',
                                    escapechar='\\')
            for i in range(len(labels)):
                # id, tweet, label
                spamwriter.writerow([i, tweets[i], labels[i]])

        test_data = data.TabularDataset(path=os.path.join(
            "data",
            self.topic + '_' + self.group + '_evaluate_dataset_withID.csv'),
                                        format='tsv',
                                        fields=[('id', self.id_field),
                                                ('text', self.text_field)])
        test_iter = data.Iterator(test_data,
                                  batch_size=self.BATCH_SIZE,
                                  repeat=False,
                                  device=-1,
                                  shuffle=False)
        return test_data, test_iter, tweets, labels

    def predict(self):
        test_data, test_iter, tweets, labels = self.load_test_data()
        # Debug: peek at the first few parsed examples.
        for i in range(3):
            print(test_data[i].text)
            print(test_data[i].id)
        self.model.eval()
        print("begin to predict....")
        pred_res = []
        ids = []
        for batch in test_iter:
            sent = batch.text
            idd = batch.id
            ids += (idd.data.numpy().tolist())
            self.model.batch_size = len(idd.data)
            self.model.hidden = self.model.init_hidden()
            pred = self.model(sent)
            pred_label = pred.data.numpy()
            pred_res += pred_label[:, 1].tolist()

        # The model is trained with NLLLoss, so outputs are presumably
        # log-probabilities; exp() recovers the positive-class probability.
        ori_score = [math.exp(x) for x in pred_res]

        with open(
                os.path.join(
                    "data", 'predict_' + self.topic + '_' + self.group +
                    '_evaluate_dataset.csv'), 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter='\t',
                                    escapechar='\\')
            for tweet, label, score in zip(tweets, labels, ori_score):
                spamwriter.writerow([tweet, label, score])

    def search2(self):
        print("loading data....")
        data_iter, ori_data = self.preprocess()
        # Debug: peek at the first few parsed examples.
        for i in range(3):
            print(ori_data[i].text)
        self.model.eval()
        id_list = []
        pred_res = []
        n_batches = 0
        print("begin to search....")
        for batch in data_iter:
            n_batches += 1
            if n_batches % 10 == 0:
                print(n_batches)
            sent = batch.text
            idd = batch.id
            #id_list = np.append(id_list, idd.data.numpy())
            id_list += idd.data.numpy().tolist()
            self.model.batch_size = len(idd.data)
            self.model.hidden = self.model.init_hidden()
            pred = self.model(sent)
            pred_label = pred.data.numpy()
            #pred_res = np.append(pred_res, pred_label[:, 1])
            pred_res += pred_label[:, 1].tolist()
        #print (id_list)
        #print (pred_res)
        #pos = pred_res>-0.69
        #id_list = id_list[pos].tolist()
        #pred_res = pred_res[pos].tolist()
        #ss = sorted(pred_res, reverse=True)
        #print ([math.exp(s) for s in ss[:10]])
        print("finish searching.")
        print("begin to ranking....")
        # Rank tweets by predicted score, descending.
        sort_index = [
            i[0] for i in sorted(
                enumerate(pred_res), key=lambda x: x[1], reverse=True)
        ]
        sort_id = [id_list[i] for i in sort_index]
        # Scores are log-probabilities; exp() maps them back to [0, 1].
        ori_score = [math.exp(pred_res[i]) for i in sort_index]
        # Save sorted id and corresponding scores
        pickle.dump(
            sort_id,
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "wb"))
        pickle.dump(
            ori_score,
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "wb"))
        #ori_score_pos = [s for s in ori_score if s>0.5]
        #plt.hist(ori_score_pos, normed=True, bins=100)
        #plt.ylabel('Probability')
        #plt.show()
        pos_ids, pos_h = self.get_tweets_id_score(threshold=0.5)
        return pos_ids, pos_h

    def show_scores_hist(self):
        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        print(ori_score[:10])
        hist = np.histogram(ori_score, np.linspace(0, 1, 11))
        print(hist)
        pickle.dump(
            hist,
            open("scoreHist_" + self.topic + "_" + self.group + ".pkl", "wb"))

    def get_tweets_id_score(self, threshold):
        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        pos_ids = []
        pos_h = {}
        # pos_ids = [sort_id[idx] for idx, s in enumerate(ori_score) if s>0.5]
        for idx, s in enumerate(ori_score):
            if s > threshold:
                pos_ids.append(sort_id[idx])
                pos_h[sort_id[idx]] = s
        # print (sort_id[:10])
        # print (ori_score[:10])
        # print (ori_data[0].text)
        print(pos_ids)
        return pos_ids, pos_h

    def sample_tweets_id_score(self, num, path):
        """
        :num: randomly sample num of tweets, equally distributed in each 10 bin.
        :return:
        """
        # Get id for each group
        ids_by_group = {"con": set(), "lib": set()}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                if first_line:  # skip the header row
                    first_line = False
                    continue
                ids_by_group[row[0]].add(int(row[1]))

        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        bins = 10
        size = num // bins  # integer count per bin (random.sample needs an int)
        all_pos_ids = []
        pos_ids = []
        pos_h = {}
        threshold = 0.9
        # Walk the scores in descending order, closing a bin each time the
        # score drops below the current threshold.
        for idx, s in enumerate(ori_score):
            if s >= threshold:
                tid = sort_id[idx]
                if tid in ids_by_group[self.group]:
                    pos_ids.append(tid)
                    pos_h[tid] = s
            else:
                print(
                    "There are {} tweets with a score more than {} in group {}"
                    .format(len(pos_ids), threshold, self.group))
                all_pos_ids.append(pos_ids)
                pos_ids = []
                threshold -= 1.0 / bins
        print(
            "There are {} tweets with a score more than {} in group {}".format(
                len(pos_ids), threshold, self.group))
        all_pos_ids.append(pos_ids)
        # Sample the number "size" of tweets in each sub-group
        sample_pos_ids = []

        for pos_ids in all_pos_ids:
            sampled_ids = random.sample(pos_ids, size)
            sample_pos_ids += sampled_ids

        random.shuffle(sample_pos_ids)

        return sample_pos_ids, pos_h

    def get_id_already(self, file_path):
        id_already = []
        with open(file_path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            for row in readCSV:
                status = row[2].strip()
                tweet_id = status.split("/")[-1]  # last path segment of the status URL
                id_already.append(int(tweet_id))
        print("id_already: {}".format(id_already))
        return id_already

    def sample_tweets_id_score2(self, num, path, id_already):
        """
        :num: randomly sample num of tweets, equally distributed in each 10 bin.
        :id_already: is the list of ID which is already sampled last time
        :return:
        """
        # Get id for each group
        ids_by_group = {"con": set(), "lib": set()}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                if first_line:  # skip the header row
                    first_line = False
                    continue
                ids_by_group[row[0]].add(int(row[1]))

        ori_score = pickle.load(
            open("oriScore_" + self.topic + "_" + self.group + ".pkl", "rb"))
        sort_id = pickle.load(
            open("sortId_" + self.topic + "_" + self.group + ".pkl", "rb"))
        bins = 5
        size = num // bins  # integer count per bin (random.sample needs an int)
        all_pos_ids = []
        pos_ids = []
        pos_h = {}
        threshold = 0.95
        # Group ids and scores by bin, skipping ids sampled previously.
        for idx, s in enumerate(ori_score):
            if s >= threshold:
                tid = sort_id[idx]
                if tid in ids_by_group[self.group] and tid not in id_already:
                    pos_ids.append(tid)
                    pos_h[tid] = s
            else:
                print(
                    "There are {} tweets with a score more than {} in group {}"
                    .format(len(pos_ids), threshold, self.group))
                all_pos_ids.append(pos_ids)
                pos_ids = []
                threshold -= 0.05
                if threshold < 0.74:  # TODO: modify the lowest threshold for sampling
                    break
        # Sample the number "size" of tweets in each sub-group
        sample_pos_ids = []

        for pos_ids in all_pos_ids:
            sampled_ids = random.sample(pos_ids, size)
            sample_pos_ids += sampled_ids

        random.shuffle(sample_pos_ids)

        return sample_pos_ids, pos_h

    def read_tweet_by_id(self, path, ids, ids_h, topic, group, sample=False):
        print(len(ids))
        id_set = set(ids)
        print(len(id_set))
        ans = {}
        with open(path, 'r') as csvfile:
            readCSV = csv.reader(csvfile,
                                 quoting=csv.QUOTE_ALL,
                                 delimiter=',',
                                 escapechar='\\')
            first_line = True
            for row in readCSV:
                #print (row)
                if first_line:  # skip the header row
                    first_line = False
                    continue
                if int(row[1]) in id_set and row[0] == group:
                    ans[int(row[1])] = row
        print(len(ans))
        if sample:
            tweet_name = 'sampled_tweet_'
        else:
            tweet_name = 'tweet_'
        with open(tweet_name + topic + '_' + group + '.csv', 'w') as csvfile:
            spamwriter = csv.writer(csvfile,
                                    quoting=csv.QUOTE_ALL,
                                    delimiter=',',
                                    escapechar='\\')
            for x in ids:
                if x in ans:
                    t = ans[x]
                    tweet = [
                        t[0], t[2], 'https://twitter.com/a/status/' + t[1],
                        ids_h[int(t[1])]
                    ]
                    #print (ans[x])
                    #print ("")
                    spamwriter.writerow(tweet)
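A sketch of how the class above might be driven end to end; the topic value is illustrative, not taken from the source, and a trained checkpoint plus the data files that load_sst() expects are assumed to exist:

    # Assumes ./runs/best_model_<topic>_<group>.pth and the CSVs under ./data/.
    ts = TweetSearch(topic="guncontrol", group="lib")

    # Score the annotated evaluation set and write predictions under ./data/.
    ts.predict()

    # Score the full tweet_slim.csv corpus, rank by score, and keep
    # everything above the 0.5 threshold.
    pos_ids, pos_scores = ts.search2()
    ts.show_scores_hist()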