Example #1
0
    def __init__(self):
        """Set up empty AMR containers and the string-to-int index maps."""
        # Parsed AMRs, kept separately per data partition.
        self.amrs = []
        self.amrs_dev = []

        # Index dictionaries mapping strings to contiguous integer ids,
        # filled incrementally while the data files are read.
        for index_name in ('nodes2Ints', 'words2Ints',
                           'chars2Ints', 'labels2Ints'):
            setattr(self, index_name, {})

        print_log('amr', 'Starts reading data')
Example #2
0
    def __init__(self,
                 oracle_stats,
                 embedding_dim,
                 action_embedding_dim,
                 char_embedding_dim,
                 hidden_dim,
                 char_hidden_dim,
                 rnn_layers,
                 dropout_ratio,
                 pretrained_dim=1024,
                 amrs=None,
                 experiment=None,
                 use_gpu=False,
                 use_chars=False,
                 use_bert=False,
                 use_attention=False,
                 use_function_words=False,
                 use_function_words_rels=False,
                 parse_unaligned=False,
                 weight_inputs=False,
                 attend_inputs=False):
        """Build the stack-LSTM AMR parsing model.

        Args:
            oracle_stats: dict produced by the oracle, with keys
                'possible_predicates', 'char2idx', 'word2idx', 'node2idx',
                'word_counter', 'labelsO2idx', 'labelsA2idx', 'pred2idx'
                and 'action2idx'.
            embedding_dim: word-embedding size (also the stack-LSTM
                input size).
            action_embedding_dim: embedding size for actions, labels and
                predicates.
            char_embedding_dim: character-embedding size.
            hidden_dim: hidden size of the stack-LSTM cells.
            char_hidden_dim: hidden size of the character LSTMs
                (bidirectional pair, so char features are 2x this).
            rnn_layers: number of layers for the char/attention LSTMs.
            dropout_ratio: dropout probability used throughout.
            amrs: optional list of training AMRs kept on the model.
            experiment: optional experiment-tracking handle.
            use_gpu: move every module to CUDA at the end when True.
            use_chars, use_bert, use_attention, use_function_words,
            use_function_words_rels, parse_unaligned, weight_inputs,
            attend_inputs: feature flags that enable optional sub-modules.
        """
        super(AMRModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.char_embedding_dim = char_embedding_dim
        # Fix: attribute name was misspelled ('char_hidde_dim'). Expose
        # the correctly-spelled attribute and keep the old name so any
        # external reader of the misspelled attribute keeps working.
        self.char_hidden_dim = char_hidden_dim
        self.char_hidde_dim = char_hidden_dim
        self.action_embedding_dim = action_embedding_dim
        self.hidden_dim = hidden_dim
        self.exp = experiment
        self.pretrained_dim = pretrained_dim
        self.rnn_layers = rnn_layers
        self.use_bert = use_bert
        self.use_chars = use_chars
        self.use_attention = use_attention
        self.use_function_words_all = use_function_words
        self.use_function_words_rels = use_function_words_rels
        self.use_function_words = use_function_words or use_function_words_rels
        self.parse_unaligned = parse_unaligned
        self.weight_inputs = weight_inputs
        self.attend_inputs = attend_inputs

        self.warm_up = False

        self.possible_predicates = oracle_stats["possible_predicates"]

        # Load spacy lemmatizer if needed
        self.lemmatizer = get_spacy_lemmatizer()

        # Parser-state feature size: stack/buffer/action states, plus
        # optional attention and function-word components.
        self.state_dim = 3 * hidden_dim + (hidden_dim if use_attention else 0) \
            + (hidden_dim if self.use_function_words_all else 0)

        # Number of hidden_dim-sized components making up the state.
        self.state_size = self.state_dim // hidden_dim

        # When inputs are weighted/attended, components are combined
        # instead of concatenated, so the state stays one hidden_dim wide.
        if self.weight_inputs or self.attend_inputs:
            self.state_dim = hidden_dim

        self.use_gpu = use_gpu

        # Vocab and indices

        self.char2idx = oracle_stats['char2idx']
        self.word2idx = oracle_stats['word2idx']
        self.node2idx = oracle_stats['node2idx']
        word_counter = oracle_stats['word_counter']

        self.amrs = amrs

        # Words seen exactly once in training, stored as word *indices*.
        self.singletons = {
            self.word2idx[w]
            for w in word_counter if word_counter[w] == 1
        }
        # Fix: special tokens must be removed by their index — the
        # original discarded the raw strings, which was a no-op on a set
        # of integers. `.get` keeps this safe if a token is missing.
        for special in ('<unk>', '<eof>', '<ROOT>', '<unaligned>'):
            self.singletons.discard(self.word2idx.get(special))

        self.labelsO2idx = oracle_stats["labelsO2idx"]
        self.labelsA2idx = oracle_stats["labelsA2idx"]
        self.pred2idx = oracle_stats["pred2idx"]
        self.action2idx = oracle_stats["action2idx"]

        self.vocab_size = len(self.word2idx)
        self.action_size = len(self.action2idx)

        self.labelA_size = len(self.labelsA2idx)
        self.labelO_size = len(self.labelsO2idx)
        self.pred_size = len(self.pred2idx)

        # Inverse (index -> string) maps for decoding.
        self.idx2labelO = {v: k for k, v in self.labelsO2idx.items()}
        self.idx2labelA = {v: k for k, v in self.labelsA2idx.items()}
        self.idx2node = {v: k for k, v in self.node2idx.items()}
        self.idx2pred = {v: k for k, v in self.pred2idx.items()}
        self.idx2action = {v: k for k, v in self.action2idx.items()}
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        self.idx2char = {v: k for k, v in self.char2idx.items()}
        # self.ner_map = ner_map

        # Plain index lists (idiomatic replacement for the manual
        # append loops over .items()).
        self.labelsA = list(self.labelsA2idx.values())
        self.labelsO = list(self.labelsO2idx.values())
        self.preds = list(self.pred2idx.values())

        utils.print_log('parser',
                        f'Number of characters: {len(self.char2idx)}')
        utils.print_log('parser', f'Number of words: {len(self.word2idx)}')
        utils.print_log('parser', f'Number of nodes: {len(self.node2idx)}')
        utils.print_log('parser', f'Number of actions: {len(self.action2idx)}')
        for action in self.action2idx:
            print('\t', action)
        utils.print_log('parser', f'Number of labels: {len(self.labelsO2idx)}')
        utils.print_log('parser',
                        f'Number of labelsA: {len(self.labelsA2idx)}')
        utils.print_log('parser',
                        f'Number of predicates: {len(self.pred2idx)}')

        # Parameters
        self.word_embeds = nn.Embedding(self.vocab_size, embedding_dim)
        self.action_embeds = nn.Embedding(self.action_size,
                                          action_embedding_dim)
        self.labelA_embeds = nn.Embedding(self.labelA_size,
                                          action_embedding_dim)
        self.labelO_embeds = nn.Embedding(self.labelO_size,
                                          action_embedding_dim)
        self.pred_embeds = nn.Embedding(self.pred_size, action_embedding_dim)
        # Learned fallback embedding for unknown predicates.
        self.pred_unk_embed = nn.Parameter(torch.randn(
            1, self.action_embedding_dim),
                                           requires_grad=True)
        # Learned representation of an empty stack/buffer slot.
        self.empty_emb = nn.Parameter(torch.randn(1, hidden_dim),
                                      requires_grad=True)

        # Stack-LSTMs
        self.buffer_lstm = nn.LSTMCell(self.embedding_dim, hidden_dim)
        self.stack_lstm = nn.LSTMCell(self.embedding_dim, hidden_dim)
        self.action_lstm = nn.LSTMCell(action_embedding_dim, hidden_dim)
        # Initial (h, c) pair shared by the stack-LSTMs.
        self.lstm_initial_1 = utils.xavier_init(self.use_gpu, 1,
                                                self.hidden_dim)
        self.lstm_initial_2 = utils.xavier_init(self.use_gpu, 1,
                                                self.hidden_dim)
        self.lstm_initial = (self.lstm_initial_1, self.lstm_initial_2)

        if self.use_chars:
            # Character-level token representations (forward + backward
            # LSTMs, hence the 2 * char_hidden_dim feature width).
            self.char_embeds = nn.Embedding(len(self.char2idx),
                                            char_embedding_dim)
            self.unaligned_char_embed = nn.Parameter(torch.randn(
                1, 2 * char_hidden_dim),
                                                     requires_grad=True)
            self.root_char_embed = nn.Parameter(torch.randn(
                1, 2 * char_hidden_dim),
                                                requires_grad=True)
            self.pad_char_embed = nn.Parameter(
                torch.zeros(1, 2 * char_hidden_dim))
            self.char_lstm_forward = nn.LSTM(char_embedding_dim,
                                             char_hidden_dim,
                                             num_layers=rnn_layers,
                                             dropout=dropout_ratio)
            self.char_lstm_backward = nn.LSTM(char_embedding_dim,
                                              char_hidden_dim,
                                              num_layers=rnn_layers,
                                              dropout=dropout_ratio)

            # Project word + char features back to embedding_dim.
            self.tok_2_embed = nn.Linear(
                self.embedding_dim + 2 * char_hidden_dim, self.embedding_dim)

        if self.use_bert:
            # bert embeddings to LSTM input
            self.pretrained_2_embed = nn.Linear(
                self.embedding_dim + self.pretrained_dim, self.embedding_dim)

        if use_attention:
            self.forward_lstm = nn.LSTM(self.embedding_dim,
                                        hidden_dim,
                                        num_layers=rnn_layers,
                                        dropout=dropout_ratio)
            self.backward_lstm = nn.LSTM(self.embedding_dim,
                                         hidden_dim,
                                         num_layers=rnn_layers,
                                         dropout=dropout_ratio)

            self.attention_weights = nn.Parameter(torch.randn(
                2 * hidden_dim, 2 * hidden_dim),
                                                  requires_grad=True)

            self.attention_ff1_1 = nn.Linear(2 * hidden_dim, hidden_dim)

        self.dropout_emb = nn.Dropout(p=dropout_ratio)
        self.dropout = nn.Dropout(p=dropout_ratio)

        # Two-layer classifiers over the parser state: actions, node
        # labels (A), edge labels (O) and predicates.
        self.action_softmax1 = nn.Linear(self.state_dim, hidden_dim)
        self.labelA_softmax1 = nn.Linear(self.state_dim, hidden_dim)
        self.pred_softmax1 = nn.Linear(self.state_dim, hidden_dim)
        if not self.use_function_words_rels:
            self.label_softmax1 = nn.Linear(self.state_dim, hidden_dim)
        else:
            # Edge-label classifier also sees the function-word state.
            self.label_softmax1 = nn.Linear(self.state_dim + hidden_dim,
                                            hidden_dim)

        self.action_softmax2 = nn.Linear(hidden_dim, len(self.action2idx) + 2)
        self.labelA_softmax2 = nn.Linear(hidden_dim, len(self.labelsA2idx) + 2)
        self.label_softmax2 = nn.Linear(hidden_dim, len(self.labelsO2idx) + 2)
        self.pred_softmax2 = nn.Linear(hidden_dim, len(self.pred2idx) + 2)

        # composition functions
        self.arc_composition_head = nn.Linear(
            2 * self.embedding_dim + self.action_embedding_dim,
            self.embedding_dim)
        self.merge_composition = nn.Linear(2 * self.embedding_dim,
                                           self.embedding_dim)
        self.dep_composition = nn.Linear(
            self.embedding_dim + self.action_embedding_dim, self.embedding_dim)
        self.addnode_composition = nn.Linear(
            self.embedding_dim + self.action_embedding_dim, self.embedding_dim)
        self.pred_composition = nn.Linear(
            self.embedding_dim + self.action_embedding_dim, self.embedding_dim)

        # experiments
        if self.use_function_words:
            self.functionword_lstm = nn.LSTMCell(self.embedding_dim,
                                                 hidden_dim)

        if self.parse_unaligned:
            # Separate predicate classifier for unaligned nodes.
            self.pred_softmax1_unaligned = nn.Linear(self.state_dim,
                                                     hidden_dim)
            self.pred_softmax2_unaligned = nn.Linear(hidden_dim,
                                                     len(self.pred2idx) + 2)

        if self.weight_inputs:
            # One learned scalar weight per state component.
            self.action_attention = nn.Parameter(torch.zeros(self.state_size),
                                                 requires_grad=True)
            self.label_attention = nn.Parameter(torch.zeros(self.state_size),
                                                requires_grad=True)
            self.labelA_attention = nn.Parameter(torch.zeros(self.state_size),
                                                 requires_grad=True)
            self.pred_attention = nn.Parameter(torch.zeros(self.state_size),
                                               requires_grad=True)
            if self.parse_unaligned:
                self.pred_attention_unaligned = nn.Parameter(
                    torch.zeros(self.state_size), requires_grad=True)
        elif self.attend_inputs:
            # Input-dependent component weights instead of fixed scalars.
            self.action_attention = torch.nn.Linear(self.state_size * 2,
                                                    self.state_size)
            self.label_attention = torch.nn.Linear(self.state_size * 2,
                                                   self.state_size)
            self.labelA_attention = torch.nn.Linear(self.state_size * 2,
                                                    self.state_size)
            self.pred_attention = torch.nn.Linear(self.state_size * 2,
                                                  self.state_size)
            if self.parse_unaligned:
                self.pred_attention_unaligned = torch.nn.Linear(
                    self.state_size * 2, self.state_size)
            self.prevent_overfitting = torch.nn.Linear(hidden_dim,
                                                       self.state_size * 2)

        # stats and accuracy
        self.action_acc = utils.Accuracy()
        self.label_acc = utils.Accuracy()
        self.labelA_acc = utils.Accuracy()
        self.pred_acc = utils.Accuracy()

        self.action_confusion_matrix = utils.ConfusionMatrix(
            self.action2idx.keys())
        self.label_confusion_matrix = utils.ConfusionMatrix(
            self.labelsO2idx.keys())

        self.action_loss = 0
        self.label_loss = 0
        self.labelA_loss = 0
        self.pred_loss = 0

        self.epoch_loss = 0

        self.rand_init()
        if self.use_gpu:
            for m in self.modules():
                m.cuda()
Example #3
0
    def load_amrs(self, amr_file_name, training=True, verbose=False):
        """Read a JAMR-style annotated file and append the parsed AMRs.

        The file is expected to contain '# ::tok', '# ::scr', '# ::node',
        '# ::edge' and '# ::root' annotation lines, with AMRs separated
        by blank lines.

        Args:
            amr_file_name: path to the annotation file (utf8).
            training: if True, fill self.amrs and update the index
                dictionaries; otherwise fill self.amrs_dev only.
            verbose: if True, print each AMR once fully read.
        """

        # Choose which partition to append to.
        amrs = self.amrs if training else self.amrs_dev

        amrs.append(AMR())

        # NOTE(review): the file handle is never closed — consider a
        # with-statement.
        fp = open(amr_file_name, encoding='utf8')
        for line in fp:
            # empty line, prepare to read next amr in dataset
            if len(line.strip()) == 0:
                if verbose:
                    print(amrs[-1])
                amrs.append(AMR())
            # amr tokens
            elif line.startswith("# ::tok"):
                tokens = line[len('# ::tok '):]
                tokens = tokens.split()
                amrs[-1].tokens.extend(tokens)
                for tok in tokens:
                    # TODO: update dictionaries after entire AMR is read
                    if training:
                        # Assign the next free integer id to unseen
                        # words/characters.
                        self.words2Ints.setdefault(tok, len(self.words2Ints))
                        for char in tok:
                            self.chars2Ints.setdefault(char,
                                                       len(self.chars2Ints))
            # amr score
            elif line.startswith("# ::scr"):
                score = line.strip()[len('# ::scr '):]
                score = float(score)
                amrs[-1].score = score
            # an amr node
            elif line.startswith("# ::node"):
                # Tab-separated columns: 1 = node id, 2 = node label,
                # 3 = token alignment span "start-end".
                node_id = ''
                for col, tab in enumerate(line.split("\t")):
                    # node id
                    if col == 1:
                        node_id = tab.strip()
                        # node label
                    elif col == 2:
                        node = tab.strip()
                        amrs[-1].nodes[node_id] = node
                        # TODO: update dictionaries after entire AMR is read
                        if training:
                            self.nodes2Ints.setdefault(node,
                                                       len(self.nodes2Ints))
                    # alignment
                    elif col == 3:
                        if '-' not in tab:
                            continue
                        start_end = tab.strip().split("-")
                        start = int(start_end[0])  # inclusive
                        end = int(start_end[1])  # exclusive
                        word_idxs = list(
                            range(start + 1,
                                  end + 1))  # off by one (we start at index 1)
                        amrs[-1].alignments[node_id] = word_idxs

            # an amr edge
            elif line.startswith("# ::edge"):
                # edge = [source id, ':'-prefixed label, target id]
                edge = ['', '', '']
                in_quotes = False
                # NOTE(review): quote_offset shifts the expected column
                # index when a quoted field spans multiple tab-separated
                # pieces — confirm against the file format.
                quote_offset = 0
                for col, tab in enumerate(line.split("\t")):
                    if tab.startswith('"'):
                        in_quotes = True
                    if tab.endswith('"'):
                        in_quotes = False
                    # edge label
                    if col == 2 + (quote_offset):
                        edge[1] = ':' + tab.strip()
                        # TODO: update dictionaries after entire AMR is read
                        if training:
                            # NOTE(review): the raw (unstripped) `tab` is
                            # used as the key here, while edge[1] stores
                            # ':' + stripped label — confirm this
                            # asymmetry is intentional.
                            self.labels2Ints.setdefault(
                                tab, len(self.labels2Ints))
                    # edge source id
                    elif col == 4 + (quote_offset):
                        edge[0] = tab.strip()
                    # edge target id
                    elif col == 5 + (quote_offset):
                        edge[2] = tab.strip()
                    if in_quotes:
                        quote_offset += 1
                amrs[-1].edges.append(tuple(edge))
            # amr root
            elif line.startswith("# ::root"):
                splinetabs = line.split("\t")
                root = splinetabs[1]
                root = root.strip()
                amrs[-1].root = root

        # Drop the trailing empty AMR created after the final blank line.
        if len(amrs[-1].nodes) == 0:
            amrs.pop()
        print_log('amr', "Training Data" if training else "Dev Data")
        if training:
            print_log('amr', "Number of labels: " + str(len(self.labels2Ints)))
            print_log('amr', "Number of nodes: " + str(len(self.nodes2Ints)))
            print_log('amr', "Number of words: " + str(len(self.words2Ints)))
        print_log('amr', "Number of sentences: " + str(len(amrs)))
Example #4
0
    def toJAMRString(self, only_penman=False, allow_incomplete=False):
        """Serialize this AMR as JAMR metadata plus a PENMAN graph string.

        Args:
            only_penman: if True, strip every '#'-prefixed metadata line
                from the output.
            allow_incomplete: if True, skip completeness validation and
                do not append the graph string.

        Returns:
            str: the serialized AMR.

        Raises:
            Exception: if some nodes could not be printed and
                allow_incomplete is False.
        """
        output = str(self)

        # amr string: start from a [[root]] placeholder and expand it.
        amr_string = f'[[{self.root}]]'

        # Assign short, unique PENMAN variable names: first letter of the
        # concept when it is a lowercase letter, else x0, x1, ...
        new_ids = {}
        for n in self.nodes:
            new_id = self.nodes[n][0] if self.nodes[n] else 'x'
            if new_id.isalpha() and new_id.islower():
                if new_id in new_ids.values():
                    j = 2
                    while f'{new_id}{j}' in new_ids.values():
                        j += 1
                    new_id = f'{new_id}{j}'
            else:
                j = 0
                while f'x{j}' in new_ids.values():
                    j += 1
                new_id = f'x{j}'
            new_ids[n] = new_id

        # Breadth-first expansion of [[node]] placeholders.
        depth = 1
        nodes = {self.root}
        completed = set()
        while '[[' in amr_string:
            tab = '      ' * depth
            for n in nodes.copy():
                # Renamed from `id` to avoid shadowing the builtin.
                node_id = new_ids[n] if n in new_ids else 'r91'
                concept = self.nodes[
                    n] if n in new_ids and self.nodes[n] else 'None'
                edges = sorted([e for e in self.edges if e[0] == n],
                               key=lambda x: x[1])
                targets = set(t for s, r, t in edges)
                edges = [f'{r} [[{t}]]' for s, r, t in edges]
                children = f'\n{tab}'.join(edges)
                if children:
                    children = f'\n{tab}' + children
                if n not in completed:
                    # Real concepts get '(var / concept ...)'; bare
                    # constants (numbers, quoted strings, sentence-mood
                    # keywords) are printed without a variable.
                    if (concept[0].isalpha() and concept not in [
                            'imperative', 'expressive', 'interrogative'
                    ]) or targets:
                        amr_string = amr_string.replace(
                            f'[[{n}]]', f'({node_id} / {concept}{children})',
                            1)
                    else:
                        amr_string = amr_string.replace(
                            f'[[{n}]]', f'{concept}')
                    completed.add(n)
                # Re-entrant references print just the variable name.
                amr_string = amr_string.replace(f'[[{n}]]', f'{node_id}')
                nodes.remove(n)
                nodes.update(targets)
            depth += 1

        if not allow_incomplete:
            if len(completed) < len(self.nodes):
                # Fixed: log BEFORE raising — in the original this call
                # sat after the raise and was unreachable.
                print_log(
                    'amr',
                    'Failed to print AMR, ' + str(len(completed)) + ' of ' +
                    str(len(self.nodes)) + ' nodes printed:\n ' + amr_string)
                raise Exception("Tried to print an uncompleted AMR")
            # A constant cannot stand alone at the top level; wrap it.
            if amr_string.startswith(
                    '"') or amr_string[0].isdigit() or amr_string[0] == '-':
                amr_string = '(x / ' + amr_string + ')'
            if not amr_string.startswith('('):
                amr_string = '(' + amr_string + ')'
            if len(self.nodes) == 0:
                amr_string = '(a / amr-empty)'

            output += amr_string + '\n\n'

        if only_penman:
            output = '\n'.join([
                line for line in output.split('\n') if line and line[0] != '#'
            ])

        return output