Example #1
    def train(self):
        files = listdir(self.path)
        reader = InputData(self.dataset_type, self.path)
        corpus = plsa.Corpus()
        for filename in files:
            vectors = []
            question = reader.readFile(filename)
            qid = question["id"]
            self.data[qid] = [ref["text"] for ref in question["referenceAnswers"]]
            for r in question["referenceAnswers"]:
                references = [sr["text"] for sr in r["studentAnswers"]]
                # Alternative: stem each student answer first, e.g.
                # references = [self.stemmer.stem(sr["text"]) for sr in r["studentAnswers"]]
                references.append(r["text"])
                corpus.addBaseline(references)
                vectors.append(corpus.getVector())
                corpus.reset()

            self.model[qid] = vectors
Example #2
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        # Optionally resume from a saved checkpoint:
        # self.model.load_state_dict(torch.load("../results/skipgram_nge.pkl"))
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        # 5 passes over the estimated number of batches
        process_bar = tqdm(range(int(5 * batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()
            process_bar.set_postfix(loss=loss.data)
        torch.save(self.model.state_dict(), "../results/skipgram_nge.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example #3
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=5):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
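A minimal usage sketch for this constructor (assumptions on my part: the enclosing class is a Word2Vec trainer like the neighboring examples, it exposes a train() method, and the file paths are placeholders):

w2v = Word2Vec('corpus.txt', 'embeddings.txt',
               emb_dimension=100, batch_size=50,
               window_size=5, iteration=1,
               initial_lr=0.025, min_count=5)  # hypothetical paths
w2v.train()  # assumed companion method, as in Examples #2 and #5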
Example #4
def main():
    # Define the shape of the input data
    x = nn.variable.Variable(
        [BATCH_SIZE, IMAGE_DEPTH * IMAGE_WIDTH * IMAGE_HEIGHT])
    # Define the shape of the labels
    t = nn.variable.Variable([BATCH_SIZE, LABEL_NUM])

    pred = convolution(x)
    loss_ = loss(pred, t)

    solver = S.Adam()
    solver.set_parameters(nn.get_parameters())

    data = InputData()

    for i in range(NUM_STEP):
        # Run evaluation on the test set every 100 steps
        if i % 100 == 0:
            l = 0
            a = 0
            for k, (t.d, x.d) in enumerate(data.test_data()):
                loss_.forward()
                l += loss_.d
                a += accuracy(pred, t)
            print("Step: %05d Test loss: %0.05f Test accuracy: %0.05f" %
                  (i, l / k, a / k))
        t.d, x.d = data.next_batch()
        loss_.forward()
        solver.zero_grad()
        loss_.backward()
        solver.weight_decay(DECAY_RATE)
        solver.update()
        if i % 10 == 0:
            print("Step: %05d Train loss: %0.05f Train accuracy: %0.05f" %
                  (i, loss_.d, accuracy(pred, t)))
Example #5
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_w = [int(pair[0]) for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_v = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_w, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            # Periodically decay the learning rate (roughly every 100000 pairs)
            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr

        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
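A note on the decay step above (my reading, not the repository's docs): because the update rescales the current self.lr, the decay compounds multiplicatively across updates. The schedule in Example #6 below instead anchors on the initial rate, giving the linear form familiar from word2vec:

\eta_i = \eta_0 \left( 1 - \frac{i}{\text{batch\_count}} \right)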
Example #6
class Word2Vec:
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):
        """Multiple training.
        Returns:
            None.
        """
        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)
            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, 5)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u))
            pos_v = Variable(torch.LongTensor(pos_v))
            neg_v = Variable(torch.LongTensor(neg_v))
            if self.use_cuda:
                pos_u = pos_u.cuda()
                pos_v = pos_v.cuda()
                neg_v = neg_v.cuda()

            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.data,
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
        self.skip_gram_model.save_embedding(
            self.data.id2word, self.output_file_name, self.use_cuda)
Example #7
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION)
        self.lr = LR
        self.optimizer = optim.SparseAdam(self.model.parameters(), lr=self.lr)

    def train(self):
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        max_accuracy = 0
        for epoch in range(5000):
            all_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(all_pairs)

            # "pos" pairs: Huffman code bit 1
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]  # inner nodes whose code bit is 1

            # "neg" pairs: Huffman code bit 0
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]  # inner nodes whose code bit is 0

            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()  # gradient update
            # mid_end = time.perf_counter()
            # print('one time: %s seconds' % (mid_end - start))
            if epoch % 100 == 0:

                print("Epoch : %d, loss : %.02f" % (epoch, loss))
                ac = self.model.predict(all_pairs, self.data.huffman_tree)
                if ac > max_accuracy:
                    max_accuracy = ac

        end = time.perf_counter()
        print('time:%s seconds' % (end - start))
        print('accuracy:%.06f' % (max_accuracy))
        #self.model.save_embedding(self.data.id2word_dict, self.output_file_name)
        tsne = TSNE(perplexity=30, n_components=2, init='pca',
                    n_iter=500)  # t-SNE plot of the word vectors
        embed_two = tsne.fit_transform(
            self.model.u_embeddings.weight.cpu().detach().numpy())
        labels = [self.data.id2word_dict[i] for i in range(200)]
        plt.figure(figsize=(15, 12))
        for i, label in enumerate(labels):
            x, y = embed_two[i, :]
            plt.scatter(x, y)
            plt.annotate(label, (x, y), ha='center', va='top')
        plt.savefig('HS.png')
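Background for the Huffman-coded pairs above (a sketch of standard hierarchical softmax, not taken from this repository's code): each target word w corresponds to a root-to-leaf path in the Huffman tree, and every inner node n_j on the path contributes one sigmoid factor whose sign is set by the next code bit:

p(w \mid u) = \prod_{j=1}^{L(w)-1} \sigma\!\left( s_j \, v_{n_j}^{\top} u \right), \qquad s_j = +1 \text{ if the bit is } 1, \; -1 \text{ otherwise}

which is why the snippet separates bit-1 (pos) pairs from bit-0 (neg) pairs before the forward pass.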
Example #8
    def __init__(self, wikidump_filename, output_text_filename, emb_dimension,
                 batch_size, window_size, iteration, initial_lr, min_count):

        self.data = InputData(wikidump_filename, min_count,
                              output_text_filename)
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)

class Word2Vec:
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)

    def train(self):

        pair_count = self.data.evaluate_pair_count(self.window_size)
        batch_count = self.iteration * pair_count / self.batch_size
        process_bar = tqdm(range(int(batch_count)))
        count = int(batch_count) // 3
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(self.batch_size,
                                                  self.window_size)

            neg_v = self.data.get_neg_v_neg_sampling(pos_pairs, self.neg_num)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [pair[1] for pair in pos_pairs]

            pos_u = Variable(torch.LongTensor(pos_u)).cuda()
            pos_v = Variable(torch.LongTensor(pos_v)).cuda()
            neg_v = Variable(torch.LongTensor(neg_v)).cuda()
            self.optimizer.zero_grad()
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            self.optimizer.step()

            process_bar.set_description("Loss: %0.8f, lr: %0.6f" %
                                        (loss.item(),
                                         self.optimizer.param_groups[0]['lr']))
            if i * self.batch_size % 100000 == 0:
                lr = self.initial_lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr
            # Save an intermediate checkpoint every third of training
            if i != 0 and i % count == 0:
                self.skip_gram_model.save_embedding(self.data.id2word,self.output_file_name + str(i))
        self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name + 'final')
Example #10
def evaluate():
    '''Run evaluation on the test data.
    '''
    data = InputData(test_data_path=FLAGS.test_data, train=False)
    input_ph = tf.placeholder(tf.int32, [None, data.max_len])
    training_ph = tf.placeholder(tf.bool, [])
    label_ph = tf.placeholder(tf.float32, [None, data.num_category])
    with tf.Session() as sess:
        output = convolution(input_ph, training_ph, data.num_chars,
                             data.num_category)
        values, indices = tf.nn.top_k(output, k=10)
        saver = tf.train.Saver()
        load_checkpoint(sess, saver)
        with open(FLAGS.output_dir + '/evaluate.tsv', 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for test_labels, test_texts, unique_ids, item_names in data.next_batch_evaluation_data(
            ):
                values_, indices_ = sess.run([values, indices],
                                             feed_dict={
                                                 input_ph: test_texts,
                                                 training_ph: False
                                             })
                for (value, index, test_label, unique_id,
                     item_name) in zip(values_, indices_, test_labels,
                                       unique_ids, item_names):
                    row = [unique_id] + [
                        data.category_dict[np.argmax(test_label)]
                    ] + [data.chars_to_unknown(item_name)
                         ] + list(value) + list(
                             map(lambda x: data.category_dict[x], index)) + [
                                 index[0] == np.argmax(test_label)
                             ] + [np.argmax(test_label) in index[0:3]]
                    writer.writerow(row)

        with open(FLAGS.output_dir + '/evaluate.tsv', 'r') as f:
            num_records = len(f.readlines())
        with open(FLAGS.output_dir + '/evaluate.tsv', 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            # columns 23 and 24 hold the top-1 and top-3 hit flags written above
            accuracy_count = [(line[23], line[24]) for line in reader]
            accuracy_top1 = len(
                list(filter(lambda x: x[0] == 'True',
                            accuracy_count))) / num_records
            accuracy_top3 = len(
                list(filter(lambda x: x[1] == 'True',
                            accuracy_count))) / num_records

        with open(FLAGS.output_dir + '/test_accuracy.tsv', 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow([accuracy_top1, accuracy_top3])
Example #11
    def __call__(self, data: InputData) -> ResultData:
        (x, y) = data.initial
        path = [(x, y)]

        grad = (data.df_dx1(x, y), data.df_dx2(x, y))
        calls_count = 2

        while (grad[0] * grad[0] + grad[1] * grad[1]) >= data.eps * data.eps:
            (x, y) = (x - self.alpha * grad[0], y - self.alpha * grad[1])
            grad = (data.df_dx1(x, y), data.df_dx2(x, y))
            calls_count += 2
            path.append((x, y))

        return ResultData(self.name, (x, y), data.function(x, y), 0,
                          calls_count, path)
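A reading note, not from the source: this __call__ is plain fixed-step gradient descent,

(x, y)_{k+1} = (x, y)_k - \alpha \, \nabla f\big((x, y)_k\big),

iterating until \|\nabla f\| < \varepsilon; calls_count tallies the two partial-derivative evaluations per step.
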
    def __init__(self, input_file_name, output_file_name, emb_dimension=100, batch_size=50,
                 window_size=5, iteration=5, initial_lr=0.025, neg_num=5, min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.neg_num = neg_num
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
        self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(), lr=self.initial_lr)
Example #13
    def __init__(self, args):
        # data class
        self.data = InputData(args.train, args.min_count, args.minn, args.maxn,
                              args.thread)

        self.outfile = args.output
        self.save_model = args.save_model
        self.load_model = args.load_model
        self.emb_dim = args.size
        self.bs = args.batch_size
        self.win_size = args.window
        self.iters = args.iter
        self.lr = args.lr
        self.neg_n = args.negative
        self.sub_samp_th = args.sample
        # Subsampling: probability of keeping each word
        self.sub_samp_probs = np.sqrt(self.sub_samp_th / self.data.idx2freq)
        self.thread = args.thread
        self.use_cuda = args.cuda

        print('Initializing model...')
        self.init_model(args)
        if self.use_cuda:
            self.model.cuda()
        self.model.share_memory()
Example #14
File: Main.py Project: dx88968/SRA
    def test(self, mode, inputdir, outputdir):
        head = ["id", "grade", "Accuracy", "Predicted"]
        self.mode = mode
        rsl = []
        files = listdir(inputdir)
        reader = InputData(self.dataset_type, inputdir)
        for filename in files:
            question = reader.readFile(filename)
            qid = question["id"]
            stuAns = []
            for r in question["referenceAnswers"]:
                for sr in r["studentAnswers"]:
                    stuAns.append(sr)
            for sr in question["otherStudentAnswers"]:
                stuAns.append(sr)

            for sr in stuAns:
                grade = ""
                if self.nonDomain.test(sr["text"]):
                    if mode == 2 or mode == 3:
                        grade = "incorrect"
                    if mode == 5:
                        grade = "non_domain"
                    rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"], "Predicted": grade, "grade": "NA"})
                    print(rsl[-1])
                    continue

                if self.contradictBigram.isContradictory(qid, sr["text"]) or self.contradict.isContradictory(self.modeler.getReferences(qid), sr["text"]):
                    if mode == 2:
                        grade = "incorrect"
                    if mode == 3 or mode == 5:
                        grade = "contradictory"
                    rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"], "Predicted": grade, "grade": "NA"})
                    print(rsl[-1])
                    continue

                score = self.modeler.grade(qid, sr["text"])
                if self.datamode == "beetle":
                    self.irr.build(self.modeler.getReferences(qid))
                    if self.irr.isIrrelevent(sr["text"]):
                        score = -1
                grade = self.predict(score)
                rsl.append({"id": sr["id"], "Accuracy": sr["accuracy"], "Predicted": grade, "grade": score})
                print(rsl[-1])

            output(outputdir, head, rsl)
Example #15
    def __call__(self, data: InputData) -> ResultData:
        (x, y) = data.initial
        path = [(x, y)]

        grad = (data.df_dx1(x, y), data.df_dx2(x, y))
        calls_count = 2
        f_calls = 0

        while (grad[0]**2 + grad[1]**2) >= data.eps**2:

            (t_x, t_y) = (x - self.alpha * grad[0], y - self.alpha * grad[1])
            f_calls += 2
            if data.function(t_x, t_y) > data.function(
                    x,
                    y) - self.delta * self.alpha * (grad[0]**2 + grad[1]**2):
                self.alpha *= self.delta
                continue

            (x, y) = (t_x, t_y)
            grad = (data.df_dx1(x, y), data.df_dx2(x, y))
            calls_count += 2
            path.append((x, y))

        return ResultData(self.name, (x, y), data.function(x, y), f_calls,
                          calls_count, path)
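A reading note on the loop above (standard line-search terminology, not the repository's own docs): the inner test is a backtracking condition of Armijo type. The trial point is rejected, and the step size shrunk, whenever

f\big(x_k - \alpha \nabla f(x_k)\big) > f(x_k) - \delta \, \alpha \, \|\nabla f(x_k)\|^2, \qquad \alpha \leftarrow \delta \alpha,

which is exactly the comparison made against data.function(t_x, t_y).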
Example #16
    def __call__(self, data: InputData) -> ResultData:
        (x, y) = data.initial

        x0 = (x, y)
        x1 = (x + self.length, y)
        x2 = (x, y + self.length)

        xk = [x0, x1, x2]

        f_calls = 3
        path = [x0]
        fk = [data.function(xi[0], xi[1]) for xi in xk]

        while sum([(xk[i][0] - xk[0][0])**2 + (xk[i][1] - xk[0][1])**2
                   for i in range(1, 3)]) / 2 > data.eps**2:

            min_i = min(range(3), key=lambda i: fk[i])
            max_i = max(range(3), key=lambda i: fk[i])

            c_x = 0
            c_y = 0

            for i in range(3):
                if i != max_i:
                    c_x += xk[i][0]
                    c_y += xk[i][1]
            c_x /= 2
            c_y /= 2

            u_k = (2 * c_x - xk[max_i][0], 2 * c_y - xk[max_i][1])
            f_u = data.function(u_k[0], u_k[1])

            if f_u < fk[max_i]:
                xk[max_i] = u_k
                fk[max_i] = f_u
                f_calls += 1
            else:
                for i in range(0, 3):
                    xk[i] = ((xk[i][0] + xk[min_i][0]) / 2,
                             (xk[i][1] + xk[min_i][1]) / 2)
                    fk[i] = data.function(xk[i][0], xk[i][1])
                    f_calls += 2
            path.append(xk[0])

        return ResultData(self.name, xk[0], fk[0], f_calls, 0, path)
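A reading note: this implements a Nelder-Mead-style simplex search. With c the centroid of the two vertices other than the worst vertex x_max, the candidate is the reflection

u_k = 2c - x_{\max},

accepted when f(u_k) improves on f(x_{\max}); otherwise every vertex shrinks halfway toward the best vertex, x_i \leftarrow (x_i + x_{\min})/2. The loop stops once the mean squared distance from the first vertex to the others drops below \varepsilon^2.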
Example #17
    def __init__(self,
                 input_user_file_name,
                 input_links_file_name,
                 output_file_name,
                 emb_dimension=100,
                 num_batch=30000,
                 batch_size=100,
                 initial_lr=0.025):
        """Initilize class parameters.

        Args:
            input_user_file_name: 用户数据文件
            input_links_file_name: 关系数据文件
            output_file_name:保存文件
            emb_dimention: 向量维度
            num_batch:处理次数
            batch_size:批处理大小
            initial_lr: 初始学习率


        Returns:
            None.
        """
        ##处理数据
        self.data = InputData(input_user_file_name, input_links_file_name)
        self.output_file_name = output_file_name
        ##emb_size为embed的大小,等于顶点个数
        self.emb_size = self.data.vertex_count
        self.emb_dimension = emb_dimension
        ##batch_size是每次更新时的数据规模
        self.batch_size = batch_size
        self.initial_lr = initial_lr
        self.num_batch = num_batch
        ##调用模型,+1的原因是顶点是从1开始的,所以我们把0位置的向量保存下来,但其实没啥意思
        self.NetModel = NetModel(self.emb_size + 1, self.emb_dimension)
        ##是否使用cuda加速
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.NetModel.cuda()

        ##使用随机梯度下降的方法来更新参数
        self.optimizer = optim.SGD(self.NetModel.parameters(),
                                   lr=self.initial_lr)
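A minimal usage sketch (everything here is hypothetical: the enclosing class name Net2Vec, the companion train() method, and the file paths are placeholders, since the example shows only the constructor):

trainer = Net2Vec('users.txt', 'links.txt', 'vertex_embeddings.txt',
                  emb_dimension=100, num_batch=30000, batch_size=100)
trainer.train()  # assumed companion method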
Example #18
    def __init__(self,
                 input_file_name,
                 output_file_name):
        self.min_count = 5
        self.emb_dimension = 100
        self.batch_size = 64
        self.window_size = 5
        self.iteration = 1
        self.initial_lr = 0.001
        self.data = InputData(input_file_name, self.min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension, self.batch_size, self.window_size,
                                             self.iteration, self.initial_lr, self.min_count)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)
Example #19
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=100,
                 window_size=5,
                 iteration=5,
                 initial_lr=0.025,
                 min_count=5,
                 using_hs=False,
                 using_neg=False,
                 context_size=2,
                 hidden_size=128,
                 cbow=None,
                 skip_gram=None):

        print("\nInput File loading......\n")
        self.data = InputData(input_file_name, min_count)
        print("\nInput File loaded.\n")
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.context_size = context_size
        self.hidden_size = hidden_size
        self.using_hs = using_hs
        self.using_neg = using_neg
        self.cbow = cbow
        self.skip_gram = skip_gram
        if self.skip_gram:
            self.skip_gram_model = SkipGramModel(self.emb_size,
                                                 self.emb_dimension)
            print("skip_gram_model", self.skip_gram_model)
            self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                       lr=self.initial_lr)
        if self.cbow:
            self.cbow_model = CBOW(self.emb_size, self.emb_dimension)
            print("CBOW_model", self.cbow_model)
            self.optimizer = optim.SGD(self.cbow_model.parameters(),
                                       lr=self.initial_lr)
Example #20
    def __call__(self, data: InputData) -> ResultData:
        (x, y) = data.initial
        h = self.h

        calls_count = 0
        path = [(x, y)]

        while h > self.delta:
            x1 = x
            y1 = y

            f = data.function(x, y)
            if data.function(x + h, y) < f:
                x1 += h
            if data.function(x - h, y) < f:
                x1 -= h

            if data.function(x, y + h) < f:
                y1 += h
            if data.function(x, y - h) < f:
                y1 -= h

            calls_count += 5

            if (x1, y1) == (x, y):
                h /= 2
                continue

            x += self.lambd * (x1 - x)
            y += self.lambd * (y1 - y)
            path.append((x, y))
        return ResultData(self.name, (x, y), data.function(x, y), calls_count,
                          0, path)
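A reading note: this is a compass (pattern) search. Each iteration evaluates f at the four axis-aligned neighbors at distance h, moves by the relaxation factor \lambda toward the winning probe,

(x, y) \leftarrow (x, y) + \lambda \, \big( (x_1, y_1) - (x, y) \big),

and halves h whenever no probe improves on f(x, y), terminating once h \le \delta.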
Example #21
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("CBOW Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        loss = -1  # sentinel: the running loss is initialized on the first batch
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_w = [int(pair[1]) for pair in pos_pairs]
            neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

            self.optimizer.zero_grad()
            loss_now = self.model.forward(pos_u, pos_w, neg_w)
            if loss == -1:
                loss = loss_now.data.item()
            else:
                # Exponential moving average of the loss for smoother reporting
                loss = 0.95 * loss + 0.05 * loss_now.data.item()
            loss_now.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss)

        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example #22
def user_clustering():
    users = []
    with open(FILE_DIR + file_path[0]) as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(users)  # cluster the collected user feature vectors
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
Example #23
def user_clustering():
    users = []
    with open('../analytics/stdev/usercluster200.csv') as f:
        for line in f:
            _, _, _, user_data, _ = InputData.split_data(line)
            if user_data not in users:
                users.append(user_data)
            if len(users) % 10000 == 0:
                print(len(users))
    print(len(users))

    kmeans = UserCluster(N_CLUSTERING)
    kmeans.fit(users)  # cluster the collected user feature vectors
    joblib.dump(kmeans, 'kmeans.pkl')
    return kmeans
Example #24
    def test_write(self):
        sents = [["a", "b", "c"], ["b", "c"], ["a"], []]
        sents_ids = [[1, 3, 5], [3, 5], [1], []]
        input_data = InputData(self.vocab, sents)
        with tempfile.NamedTemporaryFile() as fp:
            input_data.write(fp.name)

            filenames = [fp.name]
            dataset = tf.contrib.data.TFRecordDataset(filenames)
            dataset = dataset.map(input_data.get_single_example)
            iterator = dataset.make_initializable_iterator()
            sentence = iterator.get_next()

            with tf.Session() as sess:
                sentences = []
                sess.run(iterator.initializer)
                while True:
                    try:
                        sentences.append(sess.run(sentence))
                    except tf.errors.OutOfRangeError:
                        break
            self.assertEqual([s[1].tolist() for s in sentences], sents_ids)
Example #25
    def __init__(
        self,
        input_path,
        output_dir,
        wordsim_path,
        dimension=100,
        batch_size=batch_size,
        window_size=5,
        epoch_count=1,
        initial_lr=1e-6,
        min_count=5,
    ):
        self.data = InputData(input_path, min_count)
        self.output_dir = output_dir
        self.vocabulary_size = len(self.data.id_from_word)
        self.dimension = dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.epoch_count = epoch_count
        self.initial_lr = initial_lr
        self.model = SkipGramModel(self.vocabulary_size, self.dimension)
        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')
        self.model = nn.DataParallel(self.model.to(self.device))
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

        if wordsim_path:
            self.wordsim_verification_tuples = []
            with open(wordsim_path, 'r') as f:
                f.readline()  # skip the header line
                for line in f:
                    word1, word2, actual_similarity = line.split(',')
                    self.wordsim_verification_tuples.append(
                        (word1, word2, float(actual_similarity))
                    )
        else:
            self.wordsim_verification_tuples = None
Example #26
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = CBOWModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        for _ in range(1, EPOCH + 1):
            print("CBOW Training......")
            pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
            print("pairs_count", pairs_count)
            batch_count = int(np.ceil(pairs_count / BATCH_SIZE))
            print("batch_count", batch_count)
            process_bar = tqdm(range(batch_count))
            for i in process_bar:
                pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
                pos_u = [pair[0] for pair in pos_pairs]
                pos_w = [int(pair[1]) for pair in pos_pairs]
                neg_w = self.data.get_negative_sampling(pos_pairs, NEG_COUNT)

                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_w, neg_w)
                loss.backward()
                self.optimizer.step()

                if i * BATCH_SIZE % 100000 == 0:
                    self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.lr
                process_bar.set_postfix(loss=loss.data)
            print('\n')
        torch.save(self.model.state_dict(),
                   "../results/url_with_location_cbow_neg.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example #27
    def __init__(self,
                 infile,
                 outfile,
                 emb_dim=100,
                 batch_size=128,
                 window_size=5,
                 epochs=5,
                 initial_lr=1,
                 min_count=5):
        self.data = InputData(infile, min_count)
        self.outfile = outfile
        self.emb_size = len(self.data.id2word)
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.window_size = window_size
        self.epochs = epochs
        self.initial_lr = initial_lr
        self.wv_model = SkipgramModel(self.emb_size, self.emb_dim)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.wv_model.cuda()
        self.optimizer = optim.SGD(self.wv_model.parameters(),
                                   lr=self.initial_lr)
Example #28
    def __call__(self, data: InputData) -> ResultData:
        (x, y) = data.initial
        path = [(x, y)]

        dx = data.df_dx1(x, y)
        x1 = x - self.alpha * dx
        dy = data.df_dx2(x1, y)
        y1 = y - self.alpha * dy
        calls_count = 2
        f_calls = 0

        while ((x1 - x)**2 + (y1 - y)**2) >= data.eps**2:
            (x, y) = (x1, y1)
            dx = data.df_dx1(x, y)
            x1 = x - self.alpha * dx
            dy = data.df_dx2(x1, y)
            y1 = y - self.alpha * dy
            calls_count += 2
            path.append((x, y))

        path.append((x1, y1))
        return ResultData(self.name, (x1, y1), data.function(x1, y1), f_calls,
                          calls_count, path)
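A reading note: unlike the fixed-step variant in Example #11, this method updates the coordinates sequentially, Gauss-Seidel style; the y-step already sees the updated x:

x_{k+1} = x_k - \alpha \, \frac{\partial f}{\partial x}(x_k, y_k), \qquad y_{k+1} = y_k - \alpha \, \frac{\partial f}{\partial y}(x_{k+1}, y_k).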
Example #29
class Word2Vec:
    def __init__(self, input_file_name, output_file_name):
        self.output_file_name = output_file_name
        self.data = InputData(input_file_name, MIN_COUNT)
        self.model = SkipGramModel(self.data.word_count, EMB_DIMENSION).cuda()
        self.lr = LR
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print("SkipGram Training......")
        pairs_count = self.data.evaluate_pairs_count(WINDOW_SIZE)
        print("pairs_count", pairs_count)
        batch_count = pairs_count / BATCH_SIZE
        print("batch_count", batch_count)
        process_bar = tqdm(range(int(batch_count)))
        for i in process_bar:
            pos_pairs = self.data.get_batch_pairs(BATCH_SIZE, WINDOW_SIZE)
            pos_pairs, neg_pairs = self.data.get_pairs(pos_pairs)
            pos_u = [pair[0] for pair in pos_pairs]
            pos_v = [int(pair[1]) for pair in pos_pairs]
            neg_u = [pair[0] for pair in neg_pairs]
            neg_v = [int(pair[1]) for pair in neg_pairs]
            self.optimizer.zero_grad()
            loss = self.model.forward(pos_u, pos_v, neg_u, neg_v)
            loss.backward()
            self.optimizer.step()

            if i * BATCH_SIZE % 100000 == 0:
                self.lr = self.lr * (1.0 - 1.0 * i / batch_count)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = self.lr
            process_bar.set_postfix(loss=loss.data.cpu().numpy())
        torch.save(self.model.state_dict(), "../results/skipgram_hs.pkl")
        self.model.save_embedding(self.data.id2word_dict,
                                  self.output_file_name)
Example #30
    def __init__(self,
                 input_file_name,
                 output_file_name,
                 emb_dimension=100,
                 batch_size=50,
                 window_size=5,
                 iteration=1,
                 initial_lr=0.025,
                 min_count=5):

        self.data = InputData(input_file_name, min_count)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.skip_gram_model = SGModel(self.emb_size, self.emb_dimension)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(
            self.skip_gram_model.parameters(), lr=self.initial_lr)
Example #31
def run_enviroment(algorithms, cluster_model):
    ite = 0
    for file_name in file_path:
        with open(FILE_DIR + file_name) as f:
            old = {'LinearedUCB': 0, 'OriginalUCB': 0}
            old_imp = {'LinearedUCB': 0, 'OriginalUCB': 0}
            click_count = 0
            click_eq_count = 0
            ctr_list = {'LinearedUCB': [], 'OriginalUCB': []}
            for line in f:
                _, click_article_id, click, user_data, article_pool = InputData.split_data(
                    line)
                click_count += click
                userID = cluster_model.predict_cluster(user_data)[0]
                for name, alg in algorithms.items():
                    decide_id = alg.decide(userID, user_data, article_pool)
                    if evaluate(click_article_id, decide_id, click, name, ite,
                                line):
                        # if click == 1: print(click, algorithms['OriginalUCB'].get_prob_check(userID, article_pool[decide_id]))
                        ctr_list[name].append((reward[name] - old[name]) /
                                              (count[name] - old_imp[name]))
                        alg.update(userID, user_data, article_pool[decide_id],
                                   click)
                    if count[name] % 2000 == 0:
                        # print(ite, name, reward[name], count[name], reward[name]/count[name], reward[name]-old[name], count[name]-old_imp[name], (reward[name]-old[name])/(count[name]-old_imp[name]), click_count)
                        print(name, np.mean(ctr_list[name]),
                              reward[name] - old[name],
                              count[name] - old_imp[name], reward[name],
                              count[name])
                        old[name] = reward[name]
                        old_imp[name] = count[name]
                        click_count = 0
                        click_eq_count = 0
                        ctr_list[name] = []
                        count[name] += 1
                        # alg.save_weight(name + '_weight_' + str(N_CLUSTERING) + '_3.csv')
                ite += 1
    return
Example #32
    def test_prepeare_one_hot_input(self):
        sents1 = [["a", "b", "c"], ["b", "c"], ["a"], []]
        sents2 = [["d", "e"], ["d"]]
        voc, (input_data_1, input_data_2) = InputData.prepeare_one_hot_input(sents1, sents2, min_word_count=0)
        self.assertEqual(set(voc.ids.keys()), {"a", "b", "c", "d", "e"})
        with tempfile.NamedTemporaryFile() as fp:
            input_data_1.write(fp.name)

            filenames = [fp.name]
            dataset = tf.contrib.data.TFRecordDataset(filenames)
            dataset = dataset.map(input_data_1.get_single_example)
            iterator = dataset.make_initializable_iterator()
            sentence = iterator.get_next()

            with tf.Session() as sess:
                sentences = []
                sess.run(iterator.initializer)
                while True:
                    try:
                        sentences.append(sess.run(sentence))
                    except tf.errors.OutOfRangeError:
                        break
            self.assertEqual([[voc.id2word(w) for w in s[1].tolist()] for s in sentences], sents1)
Example #33
    def train_dir(self, dataset, path):
        reader = InputData(dataset, path)
        dictList = reader.readDir()
        self.train_all(dictList)
Example #34

config = get_config()
eval_config = get_config()
eval_config.batch_size = 1
eval_config.num_steps = 1

# use the reduced "_min" data files
use_min = True

suffix = '.txt'
if use_min:
    suffix = '_min.txt'
train_path = train_file + suffix
test_path = test_file + suffix

data = InputData(model_fields=model_fields, file_path=train_path)
test = InputData(model_fields=model_fields, file_path=test_path)
test_data = test

if config.valid_on_test_data:
    train_data = data
    valid_data = test_data
else:
    train_data, valid_data = data.random_pick(0.7)

print("valid data count:" + str(len(valid_data.data())))

with tf.Graph().as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)