Example 1
    def rulesToCode(rules):
        stack = []
        code = []
        for i in range(0, len(rules)):
            if not CDDataset._is_terminal_rule(rules[i]):
                stack.extend(rhs(rules[i]).split('___')[::-1])
            else:
                code.append(rhs(rules[i]))

            try:
                top = stack.pop()

                while not top[0].isupper():
                    code.append(top)
                    if len(stack) == 0:
                        break
                    top = stack.pop()
            except IndexError:
                # The stack was already empty; nothing is pending to emit.
                pass

        return code
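For orientation, here is a minimal sketch of how rulesToCode might be driven, assuming it is callable as a plain function. The rhs helper, the CDDataset._is_terminal_rule check, and the toy rules below are hypothetical stand-ins reconstructed from how the snippet uses them (rules as 'Lhs --> sym1___sym2' strings, non-terminals starting with an uppercase letter); they are not the project's actual definitions.

    # Hypothetical stand-ins, not the real project helpers.
    def rhs(rule):
        return rule.split(' --> ')[1]

    class CDDataset:
        @staticmethod
        def _is_terminal_rule(rule):
            # Assume a rule is terminal when no right-hand-side symbol starts uppercase.
            return not any(sym[0].isupper() for sym in rhs(rule).split('___'))

    # Pre-order rule sequence for the toy derivation: Stmt -> Expr ';', Expr -> 'x'.
    rules = ['Stmt --> Expr___;', 'Expr --> x']
    print(rulesToCode(rules))  # expected: ['x', ';']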
Example 2
    def rulesToCode(rules):
        stack = []
        code = []
        for i in range(0, len(rules)):
            if not CDDataset._is_terminal_rule(rules[i]):
                # Removing concode_idiom. We introduced this so that we could
                # color rules in the tree in order to identify idioms.
                stack.extend(
                    rhs(rules[i]).replace('concode_idiom___', '').split('___')[::-1])
            else:
                code.append(rhs(rules[i]))

            try:
                top = stack.pop()

                while not top.endswith('_NT'):
                    code.append(top)
                    if len(stack) == 0:
                        break
                    top = stack.pop()
            except IndexError:
                # The stack was already empty; nothing is pending to emit.
                pass

        return code
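Example 2 differs from Example 1 in two ways: the 'concode_idiom___' marker is stripped from the right-hand side before pushing, and non-terminals are recognized by an '_NT' suffix rather than an uppercase first letter. A small standalone illustration of both, using a made-up rule string:

    rule_rhs = 'concode_idiom___Expr_NT___;'
    symbols = rule_rhs.replace('concode_idiom___', '').split('___')[::-1]
    print(symbols)                               # [';', 'Expr_NT']
    print([s.endswith('_NT') for s in symbols])  # [False, True]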
Example 3
    def getCurrentState(self):
        "Get the outputs for the current timestep."
        # We need to return a batch here
        # the batch should contain nt, prev_rule, parent_rule, parent_states
        batch = {
            'nt': self.tt.LongTensor(self.size, 1),
            'prev_rules': self.tt.LongTensor(self.size, 1),
            # has to be padded
            'prev_rules_split': self.tt.LongTensor(self.size, 200, 1).fill_(
                self.vocabs['nt'].stoi['<blank>']),
            'parent_rules': self.tt.LongTensor(self.size, 1),
            'parentpos': self.tt.LongTensor(self.size, 1),
            'parent_rules_split': self.tt.LongTensor(self.size, 200, 1).fill_(
                self.vocabs['nt'].stoi['<blank>']),
            'parent_states': {}
        }

        max_prev_rules_split = 0
        max_parent_rules_split = 0
        for i in range(0, len(self.nextYs[-1])):  # this is over the beam

            # Here, we are taking the rule that was best in the previous step, and converting it into a prev_rule
            # for the next decoding step
            if len(self.prevKs) == 0:  # In the beginning
                prev_rule = '<s>'
            elif self.nextYs[-1][i] >= len(self.vocabs['next_rules']):
                # The best Y is a copy operation. How do we convert a copy
                # operation into a prev_rule?
                prevNt = self.vocabs['nt'].itos[self.nextNts[-1][i]]
                # What happens if prevNt is not one of the valid NTs that can generate a copy
                prev_rule = CDDataset._unk_rule_from_Nt(prevNt)
            else:
                prev_rule = self.vocabs['next_rules'].itos[self.nextYs[-1][i]]

            try:
                str_prev_rule = CDDataset.getAnonRule(prev_rule)
                if "-->" not in str_prev_rule:
                    prev_rule_str_splits = [str_prev_rule]
                else:
                    prev_rule_str_splits = ([lhs(str_prev_rule)] + ['<sep>'] +
                                            rhs(str_prev_rule).split('___'))
                if len(prev_rule_str_splits) > max_prev_rules_split:
                    max_prev_rules_split = len(prev_rule_str_splits)
                for k in range(0, len(prev_rule_str_splits)):
                    batch['prev_rules_split'][i][k][0] = self.vocabs[
                        'nt'].stoi[prev_rule_str_splits[k]]
                batch['prev_rules'][i][0] = self.vocabs['prev_rules'].stoi[
                    str_prev_rule]
            except:
                import ipdb
                ipdb.set_trace()

            # if the stack is empty put a placeholder
            if len(self.stacks[i]) == 0:
                (nt, parent_rule, parent_pos,
                 parent_state) = (self.start_symbol, '<s>', 0,
                                  Variable(self.tt.FloatTensor(
                                      1, 1, self.rnn_size).zero_(),
                                           requires_grad=False))
            else:
                (nt, parent_rule, parent_pos,
                 parent_state) = self.stacks[i][-1]  #.top()

            batch['parent_rules'][i][0] = self.vocabs['prev_rules'].stoi[
                parent_rule]
            if "-->" not in parent_rule:
                parent_rule_str_splits = [parent_rule]
            else:
                parent_rule_str_splits = ([lhs(parent_rule)] + ['<sep>'] +
                                          rhs(parent_rule).split('___'))
            if len(parent_rule_str_splits) > max_parent_rules_split:
                max_parent_rules_split = len(parent_rule_str_splits)
            for k in range(0, len(parent_rule_str_splits)):
                batch['parent_rules_split'][i][k][0] = self.vocabs['nt'].stoi[
                    parent_rule_str_splits[k]]

            try:
                batch['nt'][i][0] = self.vocabs['nt'].stoi[nt]
                batch['parentpos'][i][0] = parent_pos
            except:
                import ipdb
                ipdb.set_trace()

            batch['parent_states'][i] = {}
            batch['parent_states'][i][0] = parent_state

        # The LSTM doesn't like a batch with unnecessary extra length. The batch
        # should only be as long as the longest sequence, not longer.
        batch['parent_rules_split'] = batch[
            'parent_rules_split'][:, :max_parent_rules_split, ].contiguous()
        batch['prev_rules_split'] = batch[
            'prev_rules_split'][:, :max_prev_rules_split, ].contiguous()
        return batch
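The 'prev_rules_split' and 'parent_rules_split' entries above are built by splitting a rule string into its left-hand side, a '<sep>' marker, and the right-hand-side symbols. A standalone illustration of that decomposition, with a made-up rule and inlined stand-ins for the lhs/rhs helpers:

    rule = 'Expr_NT --> Term_NT___+___Term_NT'
    if "-->" not in rule:
        splits = [rule]
    else:
        lhs_part, rhs_part = rule.split(' --> ')  # stand-in for lhs()/rhs()
        splits = [lhs_part] + ['<sep>'] + rhs_part.split('___')
    print(splits)  # ['Expr_NT', '<sep>', 'Term_NT', '+', 'Term_NT']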
Example 4
    def advance(self, wordLk, attnOut, rnn_output):
        """
        Given prob over words for every last beam `wordLk` and attention
        `attnOut`: Compute and update the beam search.

        Parameters:

        * `wordLk`- probs of advancing from the last step (K x words)
        * `attnOut`- attention at the last step

        Returns: True if beam search is complete.
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)

            # Don't let EOS have children.
            for i in range(self.nextYs[-1].size(0)):
                if len(self.stacks[i]) == 0:
                    beamLk[i] = -1e20
        else:
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        oldStacks = self.stacks
        self.stacks = [[] for i in range(0, self.size)]  # stacks for non terminals

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        prevK = bestScoresId // numWords  # integer division: recover the beam index
        self.prevKs.append(prevK)
        self.nextYs.append((bestScoresId - prevK * numWords))
        self.attn.append(attnOut.index_select(0, prevK))
        self.stacks = [copy.deepcopy(oldStacks[k]) for k in prevK]
        for i in range(0, self.size):
            currentRule = (bestScoresId[i] - prevK[i] * numWords)
            # currentRule can be a copy index
            if currentRule >= len(self.vocabs['next_rules']):
                rule = '<unk>'
            else:
                rule = self.vocabs['next_rules'].itos[currentRule]
            try:
                # This rule has been processed. This should not error out.
                self.stacks[i].pop()
            except IndexError:
                # This can error out if there are very few options for the previous
                # rules (the rest are -inf) and a beam whose empty-stack score was
                # set to -1e20 is also chosen by topk.
                pass

            # If it's a terminal rule, we don't need its parents anymore
            if not CDDataset._is_terminal_rule(rule):
                # in the beginning, MemberDeclaration has only 2 options
                # so the third best in the beam is -inf
                # it should get eliminated later because the score is -inf
                if rule != '<blank>':
                    for elem in rhs(rule).split('___')[::-1]:
                        if elem[0].isupper():
                            self.stacks[i].append(
                                (elem, rule,
                                 rnn_output[prevK[i]].unsqueeze(0)))

        for i in range(self.nextYs[-1].size(0)):
            if len(self.stacks[i]) == 0:
                s = self.scores[i]
                self.finished.append((s, len(self.nextYs) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if len(self.stacks[0]) == 0:
            self.eosTop = True
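A minimal sketch of the flattened top-k bookkeeping at the core of advance(): the beam-by-vocabulary score matrix is viewed as one flat vector, and each selected index is decomposed back into the beam it came from (prevK) and the word within that beam (nextY). The beam size, vocabulary size, and scores below are made up.

    import torch

    size, numWords = 3, 5
    beamLk = torch.tensor([[0.1, 0.9, 0.2, 0.0, 0.3],
                           [0.8, 0.1, 0.7, 0.2, 0.1],
                           [0.4, 0.5, 0.6, 0.3, 0.2]])
    bestScores, bestScoresId = beamLk.view(-1).topk(size, 0, True, True)
    prevK = bestScoresId // numWords          # which beam each pick came from
    nextY = bestScoresId - prevK * numWords   # which word within that beam
    print(prevK.tolist(), nextY.tolist())     # [0, 1, 1] [1, 0, 2]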
Example 5
    def advance(self, wordLk, attnOut, rnn_output, inp):
        """
        Given prob over words for every last beam `wordLk` and attention
        `attnOut`: Compute and update the beam search.

        Parameters:

        * `wordLk`- probs of advancing from the last step (K x words)
        * `attnOut`- attention at the last step

        Returns: True if beam search is complete.
        """
        numWords = wordLk.size(1)

        # Sum the previous scores.
        if len(self.prevKs) > 0:
            beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)

            # Don't let EOS have children.
            for i in range(self.nextYs[-1].size(0)):
                if len(self.stacks[i]) == 0:
                    beamLk[i] = -1e20
        else:
            beamLk = wordLk[0]
        flatBeamLk = beamLk.view(-1)
        bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)

        self.scores = bestScores

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        oldStacks = self.stacks
        self.stacks = [[] for i in range(0, self.size)]  # stacks for non terminals

        # bestScoresId is flattened beam x word array, so calculate which
        # word and beam each score came from
        prevK = bestScoresId // numWords  # integer division: recover the beam index
        self.prevKs.append(prevK)
        self.nextYs.append((bestScoresId - prevK * numWords))

        self.nextNts.append([])
        for i in range(0, self.size):
            self.nextNts[-1].append(inp['nt'][self.prevKs[-1][i]][0])

        def copyStack(stacks):
            return [(copy.deepcopy(stack[0]), copy.deepcopy(stack[1]),
                     copy.deepcopy(stack[2]), stack[3].clone())
                    for stack in stacks]

        self.attn.append(attnOut.index_select(0, prevK))
        self.stacks = [copyStack(oldStacks[k]) for k in prevK]
        for i in range(0, self.size):
            currentRule = (bestScoresId[i] - prevK[i] * numWords)

            try:
                # This rule has been processed. This should not error out.
                self.stacks[i].pop()
            except IndexError:
                # This can error out if there are very few options for the previous
                # rules (the rest are -inf) and a beam whose empty-stack score was
                # set to -1e20 is also chosen by topk.
                pass

            # currentRule can be a copy index. We need the non-terminal to determine
            # which unk it is
            if currentRule < len(self.vocabs['next_rules']):
                rule = self.vocabs['next_rules'].itos[currentRule]

                # If it's a terminal rule, we don't need its parents anymore
                if not CDDataset._is_terminal_rule(rule) and rule != '<blank>':
                    # in the beginning, MemberDeclaration has only 2 options
                    # so the third best in the beam is -inf
                    # it should get eliminated later because the score is -inf
                    rhs_split = rhs(rule).split('___')
                    for idx, elem in enumerate(rhs_split[::-1]):  # reverse it
                        if elem.endswith('_NT'):
                            pos = 2 + len(rhs_split) - idx - 1
                            self.stacks[i].append(
                                (elem, rule, pos,
                                 rnn_output[prevK[i]].unsqueeze(0)))
            else:
                # currentRule is a copy index; there is nothing to push onto the stack.
                pass

        for i in range(self.nextYs[-1].size(0)):
            if len(self.stacks[i]) == 0:
                s = self.scores[i]
                # This can happen in the first step, when the first rule only has
                # 2 legitimate following rules, resulting in the third being -inf.
                if s != float('-inf'):
                    self.finished.append((s, len(self.nextYs) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if len(self.stacks[0]) == 0:
            self.eosTop = True
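Compared with Example 4, each stack entry here additionally records the position of the non-terminal within the split rule, i.e. its index in [lhs, '<sep>'] followed by the right-hand-side symbols, which is what pos = 2 + len(rhs_split) - idx - 1 computes while the symbols are pushed in reverse. A standalone illustration with a made-up rule:

    rule = 'A_NT --> B_NT___c___D_NT'
    rhs_split = rule.split(' --> ')[1].split('___')
    for idx, elem in enumerate(rhs_split[::-1]):  # pushed in reverse order
        if elem.endswith('_NT'):
            pos = 2 + len(rhs_split) - idx - 1
            print(elem, pos)  # D_NT 4, then B_NT 2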