Code example #1
File: score.py  Project: MorLong/bimu
    def __init__(self, embedding_file, vocab_file, oov=-1, hidden1=50, hidden2=50, hidden3=50, win=7, alpha=0.025):
        # Sizes of the three hidden layers of the MLP built in Sequence_Level.
        self.hidden1, self.hidden2, self.hidden3 = hidden1, hidden2, hidden3
        self.win = win  # context-window width (tokens per instance)
        self.alpha = alpha  # initial learning rate
        self.oov = oov  # index reserved for out-of-vocabulary tokens
        self.embedding_matrix = self.load_embeddings(embedding_file)
        self.dimension = self.embedding_matrix.shape[-1]
        log.debug("Embedding matrix n rows: {}".format(self.embedding_matrix.shape[0]))
        log.debug("Embedding dimension: {}".format(self.dimension))
Code example #2
File: text.py  Project: fangzheng354/bimu
    def create(self, reader):
        # Count tokens in the corpus, then sort words by descending frequency.
        w_cn_lst = sort_vocab(update_counts(reader, downcase=self.downcase, sep=self.sep))
        log.debug("Vocabulary size after sorting: {}.".format(len(w_cn_lst)))
        # Drop the discard_n_top_freq most frequent words, then cap the vocabulary size.
        w_cn_lst = w_cn_lst[self.discard_n_top_freq:self.max_n_words]

        if self.min_freq > 1:
            w_cn_lst = prune_freq(w_cn_lst, self.min_freq)

        self.corpus_size = sum(i[1] for i in w_cn_lst)

        # Create the word-to-index mapping and its inverse.
        self.w_index["<s>"] = 0  # for padding
        self.inv_w_index[0] = "<s>"
        for idx, (w, _) in enumerate(w_cn_lst, 1):
            self.w_index[w] = idx
            self.inv_w_index[idx] = w

        self.w_cn = dict(w_cn_lst)
        log.debug("Vocabulary size: {}.".format(len(self.w_index)))
Code example #3
File: score.py  Project: MorLong/bimu
    def Sequence_Level(self, train_file, test_file, num_label, epochs):
        log.debug("Declaring theano vars.")
        random.seed(5)
        # Weights of a 3-hidden-layer MLP, initialized uniformly in [-0.1, 0.1).
        W1 = theano.shared(0.2 * random.random([self.win * self.dimension, self.hidden1]) - 0.1)
        W2 = theano.shared(0.2 * random.random([self.hidden1, self.hidden2]) - 0.1)
        W3 = theano.shared(0.2 * random.random([self.hidden2, self.hidden3]) - 0.1)
        U = theano.shared(0.2 * random.random([self.hidden3, num_label]) - 0.1)  # output layer

        x = T.dmatrix("x")  # len(l) by win*dimension
        y = T.lvector("y")
        learn_rate = T.scalar("learn_rate")

        # Forward pass: three ReLU hidden layers followed by a softmax output.
        A1 = T.dot(x, W1)
        B1 = A1 * (A1 > 0)  # ReLU
        A2 = T.dot(B1, W2)
        B2 = A2 * (A2 > 0)  # ReLU
        A3 = T.dot(B2, W3)
        B3 = A3 * (A3 > 0)  # ReLU
        G = T.dot(B3, U)
        L1 = T.nnet.softmax(G)  # len(l) by num_label

        #L1=T.nnet.softmax(T.dot(T.tanh(T.dot(T.tanh(T.dot(T.tanh(T.dot(x,W1)),W2)),W3)),U))

        # Mean cross-entropy over the tokens of the sequence, and its gradients.
        cost = T.nnet.categorical_crossentropy(L1, y).mean()
        gw1, gw2, gw3, gu = T.grad(cost, [W1, W2, W3, U])
        #gw_x = T.grad(cost, [x])

        log.info("Compiling theano model.")
        # One SGD step: update all weights with the given learning rate.
        f1 = theano.function(inputs=[x, y, learn_rate], outputs=[cost], updates=(
            (W1, W1 - learn_rate * gw1), (W2, W2 - learn_rate * gw2), (W3, W3 - learn_rate * gw3),
            (U, U - learn_rate * gu)))

        #f2 = theano.function(inputs=[x, y], outputs=cost)
        prediction = T.argmax(L1, axis=1)
        discrepancy = prediction - y  # zero wherever the prediction matches the gold label
        f3 = theano.function(inputs=[x, y], outputs=[discrepancy, prediction])
        #f4 = theano.function(inputs=[x, y], outputs=gw_x)

        alpha = self.alpha
        log.info("Reading in the training and test data.")
        with open(train_file, "r") as f_in:
            train_lines = f_in.readlines()
        with open(test_file, "r") as f_in:
            test_lines = f_in.readlines()

        log.info("Start training.")
        counter = 0
        start = time.time()
        iter_ = epochs
        for j in range(iter_):
            log.info("Epoch: {}...".format(j + 1))
            for i in range(len(train_lines)):
                if i % 1000 == 0:
                    log.debug(i)
                counter += 1
                # Linearly decay the learning rate over all updates, with a floor of 0.01.
                current_alpha = alpha * (iter_ * len(train_lines) - counter) / (iter_ * len(train_lines))
                current_alpha = max(current_alpha, 0.01)
                # Each line holds space-separated token ids, "|", then one label id per token.
                parts = train_lines[i].split("|")
                token_line = parts[0]
                label_line = parts[1]
                token_list = list(fromstring(token_line, dtype=int, sep=' '))
                x_ = self.contextwin(token_list)  # len(l) by win*dimension
                y_ = fromstring(label_line, dtype=int, sep=' ')
                f1(x_, y_, current_alpha)

            # Evaluate on the test set after each epoch.
            total_num = 0
            total_value = 0
            goldlabels = []
            predictions = []
            for i in range(len(test_lines)):
                parts = test_lines[i].split("|")
                token_line = parts[0].strip()
                label_line = parts[1].strip()

                y_ = fromstring(label_line, dtype=int, sep=' ')
                x_ = self.contextwin(list(fromstring(token_line, dtype=int, sep=' ')))
                total_num += x_.shape[0]
                discrep, preds = f3(x_, y_)
                goldlabels.extend(list(y_))
                predictions.extend(list(preds))
                # Count tokens whose prediction matches the gold label (zero discrepancy).
                total_value += x_.shape[0] - count_nonzero(discrep)

            assert len(goldlabels) == len(predictions)
            log.info("f1 {}".format(f1_score(goldlabels, predictions, average="weighted")))
            acc = 1.00 * total_value / total_num
            log.info("acc " + str(acc))
        log.info("Training completed: {}s/epoch".format((time.time()-start)/iter_))
Code example #4
                    e1 = embs2[inst.w1_idx]
                    e2 = embs2[inst.w2_idx]
                rel_embs2_1.append(e1)
                rel_embs2_2.append(e2)
                rel_rats2.append(inst.avg_rat)

            log.info("Calculating distances and correlation.")
            assert len(rel_embs2_1) == len(rel_embs2_2) == len(rel_rats2)
            scores2 = []
            for e1, e2 in zip(rel_embs2_1, rel_embs2_2):
                scores2.append(cosine(e1, e2))

        assert len(scores2) == len(rel_rats2)

    # Spearman correlation between cosine scores and human similarity ratings.
    corr = spearman(scores, rel_rats)
    log.debug("{} embedded words found out of {}.".format(len(scores), len(d)))
    log.info("Correlation: {0[0]}, p-value: {0[1]}.".format(corr))
    if args.ci:
        ci = bootstrap.ci((scores, rel_rats), statfunction=spearman, method="pi")
        log.info("CI: {0[0]} ({1}), {0[1]} (+{2}).".format(ci[:, 0], ci[:, 0][0] - corr[0], ci[:, 0][1] - corr[0]))
    if args.input_dir2:
        corr2 = spearman(scores2, rel_rats2)
        log.debug("Model2: {} embedded words found out of {}.".format(len(scores2), len(d)))
        log.info("Model2: Correlation: {0[0]}, p-value: {0[1]}.".format(corr2))
        if args.ci:
            ci2 = bootstrap.ci((scores2, rel_rats2), statfunction=spearman, method="pi")
            log.info("Model2: CI: {0[0]}, {0[1]}.".format(ci2[:, 0], ci2[:, 0][0] - corr2[0], ci2[:, 0][1] - corr2[0]))
        #corr_between = spearman(scores, scores2)
        #log.info("Between-models: Correlation: {0[0]}, p-value: {0[1]}.".format(corr_between))
        #sign = dependent_corr(corr[0], corr2[0], corr_between[0], n=len(rel_rats), twotailed=True, conf_level=0.95)
        #log.info("Significance: Test score: {0[0]}, p-value: {0[1]}.".format(sign))