Code Example #1
    def init_rules(self):
        '''
        Remember all unary chains that appear in the treebank and index them.
        In the model, a whole unary chain is expanded all at once, so the only
        thing we need to decide is which unary chain to expand.
        During parsing, we need a dictionary of lists recording, for each
        bottom nonterminal B, which unary chains U can be appended on top of
        it to reach a top nonterminal A.
        '''
        all_trees = list(ptb(["train", "dev", "test"]))

        self.idx2u = []
        self.unary = {}
        # number of unary chains
        self.nunary = 0
        # we need to recover the unary chain during print_parse, so a simple
        # and clever way is to store the prefix and suffix of each unary
        # chain. For example, for a unary chain:
        #
        #   A -> B -> C -> .... that appears in the middle of a tree
        #   prefix = (A (B
        #   suffix = ))
        #   so if we have the parse starting at C, call it parse_C,
        #   the parse starting at A becomes: prefix + parse_C + suffix
        self.unary_prefix = []
        self.unary_suffix = []

        # the word -> preterminal unary chain dictionary
        self.w_U = {}

        self.B_AC = {}
        self.A2B_bias_mask = torch.FloatTensor(self.nnt,
                                               self.nnt).fill_(-10000)

        for (_, gold_tree) in all_trees:
            head, unary_chain = self.find_rules(gold_tree)
            if len(unary_chain) > 1:
                if unary_chain not in self.idx2u:
                    self.add_unary_chain(unary_chain)

        for ur in self.idx2u:
            reverse = list(reversed(ur))
            #print "-".join([self.idx2nt[x] for x in reverse])
            tmp = reverse[:-1]
            pre = " (".join([self.idx2nt[x] for x in tmp])
            self.unary_prefix.append("(" + pre + " ")
            self.unary_suffix.append(")" * len(tmp))

        print " # unary chains: ", len(self.idx2u)
Code Example #2
    def make_trainset(self):
        '''
        Make the trainset. In other words, we create a more compact and
        convenient representation for each tree in our trainset: we store
        the binary and unary rules of a tree separately, and each rule
        carries the info of its parent and children as well as its positions.
        '''
        begin_time = time.time()

        train_trees = list(
            ptb("train",
                minlength=3,
                maxlength=constants.MAX_SEN_LENGTH,
                n=2000))

        f = open(self.train_file, 'w')
        num_sen = 0
        counter = 0

        self.headify_trees(train_trees)

        with open(constants.HEADIFIED_FILE, 'r') as f_head:
            for line in f_head:
                counter += 1
                if counter == 1:
                    continue
                unbinarized_tree = Tree.fromstring(
                    line.replace("@(", "(^").replace("@", ""))

                d = self.encode_head_tree(head_binarize(unbinarized_tree))

                if num_sen == 0:
                    f.write(str(d))
                else:
                    f.write("\n" + str(d))
                num_sen += 1

        f.close()
        # DEBUG: print self.convert_tree_to_encoded_list(nltk.Tree.fromstring("(ROOT (S (@S (NP I) (VP (VBP live)))(. .)))"))
        end_time = time.time()

        print "-- Making trainset takes %.4f s" \
            % round(end_time - begin_time, 5)
        print " # sentences ", num_sen
Code Example #3
File: utils.py  Project: skokec/TF_TCN
def data_generator(args):
    #file, testfile, valfile = getattr(observations, args.dataset)('data/')
    if args.dataset == 'ptb':
        file, testfile, valfile = ptb('./data')
    file_len = len(file)
    valfile_len = len(valfile)
    testfile_len = len(testfile)
    corpus = Corpus(file + " " + valfile + " " + testfile)

    #############################################################
    # Use the following if you want to pickle the loaded data
    #
    # pickle_name = "{0}.corpus".format(args.dataset)
    # if os.path.exists(pickle_name):
    #     corpus = pickle.load(open(pickle_name, 'rb'))
    # else:
    #     corpus = Corpus(file + " " + valfile + " " + testfile)
    #     pickle.dump(corpus, open(pickle_name, 'wb'))
    #############################################################

    return file, file_len, valfile, valfile_len, testfile, testfile_len, corpus
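
A possible call site for data_generator is sketched below; the argparse flag name --dataset is an assumption inferred from args.dataset above, not taken from the project:

# Hypothetical usage sketch; only args.dataset is required by data_generator.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='ptb')
args = parser.parse_args()

(train_text, train_len, valid_text, valid_len,
 test_text, test_len, corpus) = data_generator(args)
print "train text length:", train_len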
Code Example #4
    def make_data(self, dataset, num=None):
        '''
        Make the given dataset. In other words, we create a more compact and
        convenient representation for each tree: we store the binary and
        unary rules of a tree separately, and each rule carries the info of
        its parent and children as well as its positions.
        '''
        begin_time = time.time()

        trees = list(ptb(dataset, maxlength=constants.MAX_SEN_LENGTH, n=num))

        filename = self.data_file + dataset + ".txt"

        f = open(filename, 'w')
        num_sen = 0
        counter = 0

        for (sentence, gold_tree) in trees:
            counter += 1

            d = self.encode_tree(gold_tree)

            if num_sen == 0:
                f.write(str(d))
            else:
                f.write("\n" + str(d))

            num_sen += 1

        f.close()

        end_time = time.time()

        print "-- Making %sset takes %.4f s".format(
            dataset, round(end_time - begin_time, 5))
        print " # sentences ", num_sen
Code Example #5
    def init_word_indices(self):
        '''
        Temporarily we train our own word embeddings. Experiments show that
        training a new terminal embedding is no worse than using an existing
        word2vec embedding, and it is faster, since a standard word2vec
        embedding has at least 100 dimensions (300 if case sensitive), which
        slows things down.

        Also, for prediction, we originally wanted something like:

        Let E be the terminal embedding, dim(E) = num_words * word_dim
            W be some weight matrix,     dim(W) = word_dim * conditions_dim

        P( book | conditions ) = softmax(E*W*conditions).select(index_of_book)

        But experiments show that it is equally good to simply compute:

        P( book | conditions ) = softmax( g1(relu( g2(conditions) )) ).select(index_of_book)

        That is, we pass the conditions through a single-hidden-layer network
        and apply the softmax directly. This keeps the neural net tractable,
        whereas the original formulation is computationally very intensive.
        '''

        if not self.check_file_exists(constants.GLOVE_FILE,
                                      "GloVe embeddings"):
            return

        begin_time = time.time()

        # train_trees are all the trees we can use in training -- the trainset
        # from WSJ sections 2-21
        train_trees = list(ptb("train"))

        # here we follow the old convention from the Berkeley parser to record
        # the frequency of each word, but we simplify the process to a single
        # threshold, RARE_THRESHOLD (10). About 7000+ words appear more than
        # 10 times in WSJ sections 2-21, and we create signatures for those
        # that do not. Details on creating signatures are in Signature in util.py
        word_frequencies = {}

        num_train_tokens = 0
        num_train_sens = len(train_trees)
        max_length_sen = 0
        for (sentence, _) in train_trees:

            sen_split = sentence.split()
            length = len(sen_split)
            num_train_tokens += length
            if length > max_length_sen:
                max_length_sen = length

            for word in sen_split:
                word = word.rstrip()
                if word in word_frequencies:
                    word_frequencies[word] += 1
                else:
                    word_frequencies[word] = 1

        # the dimension of terminals -- self.dt
        self.dt = 128
        self.w2idx = {}
        self.idx2w = []
        # there's a special symbol here called BOS, we add this symbol
        # so that we have "some left context" even at the very beginning
        # of the sentence
        self.words_set = set()
        self.w2idx[constants.BOS] = 0
        self.idx2w.append(constants.BOS)

        # the number of terminals -- self.nt
        self.nt = 1
        oov_set = set()

        for word, freq in word_frequencies.iteritems():
            if freq <= constants.RARE_THRESHOLD:
                if word.lower() in word_frequencies:
                    knownlc = (word_frequencies[word.lower()] >
                               constants.RARE_THRESHOLD)
                else:
                    knownlc = False
                oov_set.add(sig(word, False, knownlc))
            else:
                self.words_set.add(word)
                self.w2idx[word] = self.nt
                self.idx2w.append(word)
                self.nt += 1

        num_train_oov = len(oov_set)

        print " - In train set: Sequences: {} Tokens: {} Token types: {} " \
            "Unknown types: {} Max sen length: {} ".format(
            num_train_sens, num_train_tokens, self.nt, num_train_oov, max_length_sen)

        # we also want to include all signatures that we haven't covered in
        # training set.
        rest_trees = list(ptb("dev", "test"))

        for (sentence, _) in rest_trees:
            sen_split = sentence.split()
            for word in sen_split:
                word = word.rstrip()
                if word in self.words_set:
                    continue
                knownlc = word.lower() in self.words_set
                oov_set.add(sig(word, False, knownlc))

        for oov in oov_set:
            self.w2idx[oov] = self.nt
            self.idx2w.append(oov)
            self.nt += 1

        self.dt = 300
        self.word_emb = torch.zeros(self.nt, self.dt)

        with open(constants.GLOVE_FILE, 'r') as f:
            for line in f:
                emb = line.split()
                word = emb.pop(0)
                if word in self.w2idx:
                    idx = self.w2idx[word]
                    self.word_emb[idx] = torch.FloatTensor(
                        [float(i) for i in emb])

        print " - There are {} number of OOVs. ".format(len(oov_set))

        end_time = time.time()

        if self.verbose:
            print "-- Initializing word indices takes %.4f s" % round(
                end_time - begin_time, 5)
        return
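
The simplified prediction head described in the docstring, P( word | conditions ) = softmax( g1(relu( g2(conditions) )) ), can be sketched as a small PyTorch module. The class name, dimensions and layer names below are illustrative assumptions, not the model's actual code:

import torch
import torch.nn as nn
import torch.nn.functional as F

class WordPredictor(nn.Module):
    # Illustrative sketch of the docstring's g1(relu(g2(conditions))) head.
    def __init__(self, conditions_dim, hidden_dim, num_words):
        super(WordPredictor, self).__init__()
        self.g2 = nn.Linear(conditions_dim, hidden_dim)
        self.g1 = nn.Linear(hidden_dim, num_words)

    def forward(self, conditions, word_idx):
        # one score per word in the vocabulary, then P(word_idx | conditions)
        scores = self.g1(F.relu(self.g2(conditions)))
        probs = F.softmax(scores, dim=-1)
        return probs[..., word_idx]

predictor = WordPredictor(conditions_dim=64, hidden_dim=128, num_words=7000)
conditions = torch.randn(1, 64)
print predictor(conditions, 42)   # P( word #42 | conditions )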
Code Example #6
def test(dataset):

    model.parse_setup()
    test_data = list(ptb(dataset, minlength=3, maxlength=constants.MAX_TEST_SEN_LENGTH, n=200))

    return eval_unofficial(dataset, test_data)