Example #1
# Excerpt-level imports, added for completeness (Vocabulary comes from the
# surrounding project and is not shown here):
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense, Activation
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2


class Classifier:
    def __init__(self, max_words=500):
        self.model = None
        self.vocab = Vocabulary()
        self.max_words = max_words

    def build(self):
        self.vocab.build()
        model = Sequential()
        model.add(Embedding(self.vocab.size(), 128))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        # Keras 2 renames: W_regularizer -> kernel_regularizer; the old
        # class_mode="binary" compile flag was removed, metrics replaces it.
        model.add(Dense(1, kernel_regularizer=l2(0.01)))
        model.add(Activation('sigmoid'))

        model.load_weights("lib/imdb_lstm.w")
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        self.model = model
        return self

    def pad(self, X):
        return pad_sequences(X, maxlen=self.max_words)

    def classify(self, X):
        inp = [self.vocab.vectorize(X)]
        inp = np.array(self.pad(inp))
        y = self.model.predict(inp)[0][0]
        return (round(y), y)
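A minimal usage sketch for the classifier above (assuming the project's Vocabulary class and the bundled lib/imdb_lstm.w weights file are available; the review text is invented):

clf = Classifier(max_words=500).build()
label, score = clf.classify("A surprisingly moving film with a terrific cast.")
print(label, score)  # e.g. (1, 0.87); 1 is the positive class in the training layout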
Example #2
def get_dataloader(args):
    src_sents, tgt_sents = DataReader(args, train=True).load_dataset()
    src_vocab = Vocabulary(src_sents, args, args.src_vocab_size)
    tgt_vocab = Vocabulary(tgt_sents, args, args.tgt_vocab_size)
    train_dataset = Dataset(src_sents,
                            tgt_sents,
                            args,
                            src_vocab,
                            tgt_vocab,
                            train=True)

    src_sents, tgt_sents = DataReader(args, train=False).load_dataset()
    test_dataset = Dataset(src_sents,
                           tgt_sents,
                           args,
                           src_vocab,
                           tgt_vocab,
                           train=False)

    train_dataloader = data.DataLoader(train_dataset,
                                       args.batch_size,
                                       sampler=RandomSampler(train_dataset))
    test_dataloader = data.DataLoader(test_dataset,
                                      args.batch_size,
                                      shuffle=False)

    return train_dataloader, test_dataloader
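For reference, passing sampler=RandomSampler(train_dataset) is equivalent to the more common shuffle=True flag, since DataLoader constructs exactly that sampler internally; a sketch of the shorter form:

from torch.utils import data

# Equivalent to sampler=RandomSampler(train_dataset) (without replacement):
train_dataloader = data.DataLoader(train_dataset,
                                   args.batch_size,
                                   shuffle=True)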
Example #3
def print_predictions(datasets: List,
                      filename: str,
                      vocab: Vocabulary) -> None:
    with open(filename, "w", encoding="utf8") as f:
        for instance in datasets:
            seq_len = int((instance['ent_span_labels'] >= 0).sum())
            for idx, true_label, pred_label in zip(instance['tokens'][:seq_len],
                                                   instance['ent_labels'][:seq_len],
                                                   instance['all_ent_pred'][:seq_len]):
                token = vocab.get_token_from_index(idx, "tokens")
                true_label = vocab.get_token_from_index(true_label, "ent_labels")
                pred_label = vocab.get_token_from_index(pred_label, "ent_labels")
                print("{}\t{}\t{}".format(token, true_label, pred_label), file=f)

            for (s, e), r in zip(instance['candi_rels'], instance['rel_labels']):
                r = vocab.get_token_from_index(r, "rel_labels")

                assert r != "None"

                if r[-3:] == "<--":
                    s, e = e, s
                r = r[:-3]
                print("Rel-True\t{}\t{}\t{}".format(s, e, r), file=f)

            for (s, e), r in zip(instance['all_candi_rels'], instance['all_rel_pred']):
                r = vocab.get_token_from_index(r, "rel_labels")
                if r == "None":
                    continue
                if r[-3:] == "<--":
                    s, e = e, s
                r = r[:-3]
                print("Rel-Pred\t{}\t{}\t{}".format(s, e, r), file=f)
            print(file=f)
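The relation labels evidently carry a three-character direction suffix (the code checks for "<--", so presumably "-->" marks the other direction); when the arrow points left, the span pair is swapped so the printed triple is always head-first. A toy illustration with an invented label string:

s, e, r = (0, 1), (3, 4), "lives_in<--"
if r[-3:] == "<--":
    s, e = e, s
r = r[:-3]
print("Rel-True\t{}\t{}\t{}".format(s, e, r))  # Rel-True  (3, 4)  (0, 1)  lives_in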
Example #4
def print_ent_span_predictions(datasets: List,
                               filename: str,
                               vocab: Vocabulary) -> None:
    with open(filename, "w", encoding="utf8") as f:
        for instance in datasets:
            seq_len = int((instance['ent_span_labels'] >= 0).sum())
            for idx, true_label, pred_label in zip(instance['tokens'][:seq_len],
                                                   instance['ent_span_labels'][:seq_len],
                                                   instance['ent_span_pred'][:seq_len]):
                token = vocab.get_token_from_index(idx, "tokens")
                true_label = vocab.get_token_from_index(true_label, "ent_span_labels")
                pred_label = vocab.get_token_from_index(pred_label, "ent_span_labels")
                if true_label != "O":
                    true_label = true_label + "-ENT"
                if pred_label != "O":
                    pred_label = pred_label + "-ENT"
                print("{}\t{}\t{}".format(token, true_label, pred_label), file=f)
            for (s, e), r in zip(instance['candi_rels'], instance['rel_labels']):
                r = vocab.get_token_from_index(r, "rel_labels")

                assert r != "None"
                r = "YES"
                print("Rel-True\t{}\t{}\t{}".format(s, e, r), file=f)

            for (s, e), r in zip(instance['all_candi_rels'], instance['all_bin_rel_pred']):
                if r == 0:
                    continue
                print("Rel-Pred\t{}\t{}\t{}".format(s, e, "YES"), file=f)
            print(file=f)
Example #6
class Classifier(ABC):
    def __init__(self, name, max_words=500):
        self.model = None
        self.graph = None
        self.name = name
        self.vocab = Vocabulary()
        self.max_words = max_words

    def build(self):
        self.vocab.build()
        if not os.path.isfile(self.name):
            print("No stored configuration for " + self.name +
                  " has been found.")
            model = self.architecture()
            print("Model has been built.")
            model = self.train(model)
            print("Model has been trained.")
            model.save(self.name)
            print("Model has been stored.")
        else:
            print("Stored configuration for " + self.name + " has been found.")
            model = load_model(self.name)
            model._make_predict_function()
            self.graph = tf.get_default_graph()
            print("Model has been loaded.")
        self.model = model
        return self

    def train(self, model):
        pos_dir = 'dataset/train/pos'
        neg_dir = 'dataset/train/neg'
        data = load_dir(pos_dir, 1, 12500) + load_dir(neg_dir, 0, 12500)
        random.shuffle(data)

        features = list()
        labels = list()

        for X, y in data:
            features.append(self.pad([self.vocab.vectorize(X)])[0])
            labels.append(y)

        model.fit(np.array(features), np.array(labels))
        return model

    def pad(self, X):
        return pad_sequences(X, maxlen=self.max_words)

    def classify(self, X):
        inp = [self.vocab.vectorize(X)]
        inp = np.array(self.pad(inp))
        with self.graph.as_default():
            y = self.model.predict(inp)[0][0]
            return round(y), y

    @abstractmethod
    def architecture(self):
        pass
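Since architecture() is abstract, this class is meant to be subclassed; a minimal sketch of a concrete subclass (the name and layer choices are assumptions that mirror the non-abstract classifier earlier in this listing):

class LSTMClassifier(Classifier):
    def architecture(self):
        # Same stack as Example #1 above.
        model = Sequential()
        model.add(Embedding(self.vocab.size(), 128))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

clf = LSTMClassifier("lstm_sentiment.h5").build()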
Example #7
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    features = []
    list1 = []

    s1 = configuration.get_stack(0)
    s2 = configuration.get_stack(1)
    s3 = configuration.get_stack(2)

    s1lc1 = configuration.get_left_child(s1, 1)
    s1rc1 = configuration.get_right_child(s1, 1)

    s1lc2 = configuration.get_left_child(s1, 2)
    s1rc2 = configuration.get_right_child(s1, 2)

    s2lc1 = configuration.get_left_child(s2, 1)
    s2rc1 = configuration.get_right_child(s2, 1)

    s2lc2 = configuration.get_left_child(s2, 2)
    s2rc2 = configuration.get_right_child(s2, 2)

    s1lc1lc1 = configuration.get_left_child(s1lc1, 1)
    s1rc1rc1 = configuration.get_right_child(s1rc1, 1)

    s2lc1lc1 = configuration.get_left_child(s2lc1, 1)
    s2rc1rc1 = configuration.get_right_child(s2rc1, 1)

    b1 = configuration.get_buffer(0)
    b2 = configuration.get_buffer(1)
    b3 = configuration.get_buffer(2)

    list1.extend([
        s1, s2, s3, b1, b2, b3, s1lc1, s1rc1, s1lc2, s1rc2, s2lc1, s2rc1,
        s2lc2, s2rc2, s1lc1lc1, s1rc1rc1, s2lc1lc1, s2rc1rc1
    ])

    for token in list1:
        features.append(vocabulary.get_word_id(configuration.get_word(token)))
        features.append(vocabulary.get_pos_id(configuration.get_pos(token)))

    for token in list1[6:]:
        features.append(vocabulary.get_label_id(configuration.get_label(token)))
    # TODO(Students) End
    assert len(features) == 48
    return features
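Every get_configuration_features variant in this listing targets the same 48-feature layout from Chen & Manning (2014): 18 token positions (top 3 of the stack, top 3 of the buffer, and 12 children of the top two stack items), each contributing a word id and a POS id, plus arc-label ids for the 12 child positions only. A quick arithmetic check of that layout:

# Sanity check of the 48-feature layout (counts follow the paper):
n_tokens = 3 + 3 + 2 * (4 + 2)       # stack + buffer + children of top-2 stack items
n_words, n_pos = n_tokens, n_tokens  # one word id and one POS id per token
n_labels = n_tokens - 6              # arc labels only for the 12 child tokens
assert n_words + n_pos + n_labels == 48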
Example #8
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    li = []
    li0 = []
    feat1 = []
    feat2 = []
    feat3 = []
    li0.append(configuration.get_buffer(0))
    li0.append(configuration.get_buffer(1))
    li0.append(configuration.get_buffer(2))
    li0.append(configuration.get_stack(0))
    li0.append(configuration.get_stack(1))
    li0.append(configuration.get_stack(2))

    li.append(configuration.get_left_child(configuration.get_stack(0), 1))
    li.append(configuration.get_right_child(configuration.get_stack(0), 1))
    li.append(configuration.get_left_child(configuration.get_stack(0), 2))
    li.append(configuration.get_right_child(configuration.get_stack(0), 2))
    li.append(configuration.get_left_child(configuration.get_stack(1), 1))
    li.append(configuration.get_right_child(configuration.get_stack(1), 1))
    li.append(configuration.get_left_child(configuration.get_stack(1), 2))
    li.append(configuration.get_right_child(configuration.get_stack(1), 2))

    li.append(configuration.get_left_child(configuration.get_left_child(configuration.get_stack(0), 1), 1))
    li.append(configuration.get_right_child(configuration.get_right_child(configuration.get_stack(0), 1), 1))
    li.append(configuration.get_left_child(configuration.get_left_child(configuration.get_stack(1), 1), 1))
    li.append(configuration.get_right_child(configuration.get_right_child(configuration.get_stack(1), 1), 1))
    li1 = li0 + li
    for p in li1:
        feat1.append(vocabulary.get_pos_id(configuration.get_pos(p)))
    for w in li1:
        feat2.append(vocabulary.get_word_id(configuration.get_word(w)))
    for l in li:
        feat3.append(vocabulary.get_label_id(configuration.get_label(l)))
    features = feat1 + feat2 + feat3

    # TODO(Students) End

    assert len(features) == 48
    return features
Example #9
def seqchar2number(instance: Dict,
                   vocab: Vocabulary,
                   lower_case: bool) -> List[List]:
    nums = []
    for token in instance['tokens']:
        nums.append([vocab.get_token_index(item.lower() if lower_case else item, 'token_chars')
                     for item in token])
    return nums
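The output shape is easiest to see with a tiny stand-in vocabulary (a sketch; the real Vocabulary indexes characters under the 'token_chars' namespace):

class _ToyVocab:
    # Stand-in for Vocabulary: maps each character to its 1-based alphabet position.
    def get_token_index(self, item, namespace):
        return ord(item) - ord('a') + 1

print(seqchar2number({'tokens': ["Hi", "ho"]}, _ToyVocab(), lower_case=True))
# [[8, 9], [8, 15]] -- one list of character indices per token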
Example #10
def get_minibatch(batch: List[Dict], vocab: Vocabulary, use_cuda: bool) -> Dict[str, Any]:
    batch = sorted(batch, key=lambda x: len(x['tokens']), reverse=True)
    batch_seq_len = [len(instance['tokens']) for instance in batch]
    max_seq_len = max(batch_seq_len)
    # max_char_seq_len = max([len(tok) for instance in batch for tok in instance['token_chars']])

    outputs = defaultdict(list)
    token_padding_idx = vocab.get_token_index(vocab._padding_token, 'tokens')
    char_padding_idx = vocab.get_token_index(vocab._padding_token, 'token_chars')
    label_padding_idx = -1
    for instance in batch:
        cur_seq_len = len(instance['tokens'])

        outputs['tokens'].append(instance['tokens'] + [token_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['ent_labels'].append(instance['ent_labels'] + [label_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['ent_span_labels'].append(
            instance['ent_span_labels'] + [label_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['candi_rels'].append(instance['candi_rels'])
        outputs['ent_ids'].append(instance['ent_ids'])
        outputs['ent_ids_labels'].append(instance['ent_ids_labels'])
        outputs['rel_labels'].append(instance['rel_labels'])
        # char_pad = []
        # for char_seq in instance['token_chars']:
        #     char_pad.append(char_seq + [char_padding_idx] * (max_char_seq_len - len(char_seq)))
        # char_pad = char_pad + [[char_padding_idx] * max_char_seq_len] * (max_seq_len - cur_seq_len)
        # outputs['token_chars'].append(char_pad)
    outputs['tokens'] = torch.LongTensor(outputs['tokens'])
    # outputs['token_chars'] = torch.LongTensor(outputs['token_chars'])
    outputs['ent_labels'] = torch.LongTensor(outputs['ent_labels'])
    outputs['ent_span_labels'] = torch.LongTensor(outputs['ent_span_labels'])
    outputs['seq_lens'] = batch_seq_len
    if use_cuda:
        outputs['tokens'] = outputs['tokens'].cuda(non_blocking=True)
        # outputs['token_chars'] = outputs['token_chars'].cuda(non_blocking=True)
        outputs['ent_labels'] = outputs['ent_labels'].cuda(non_blocking=True)
        outputs['ent_span_labels'] = outputs['ent_span_labels'].cuda(non_blocking=True)
    return outputs
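Two details worth noting: the batch is sorted by descending length because packed-sequence utilities such as torch.nn.utils.rnn.pack_padded_sequence expect sorted lengths by default, and every sequence-shaped field is right-padded to the batch maximum. The padding step in isolation, with toy values:

max_seq_len = 5
tokens = [4, 8, 15]                      # toy token indices
token_padding_idx = 0                    # stand-in for the vocab's actual pad index
padded = tokens + [token_padding_idx] * (max_seq_len - len(tokens))
assert padded == [4, 8, 15, 0, 0]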
Example #11
def load_word_vectors(vector_file: str,
                      ndims: int,
                      vocab: Vocabulary,
                      namespace: str = 'tokens') -> List[List]:
    token_vocab_size = vocab.get_vocab_size(namespace)
    oov_idx = vocab.get_token_index(vocab._oov_token, namespace)
    padding_idx = vocab.get_token_index(vocab._padding_token, namespace)
    W = np.random.uniform(-0.25, 0.25, (token_vocab_size, ndims))
    W[padding_idx, :] = 0.0
    total, found = 0, 0
    with open(vector_file) as fp:
        for i, line in enumerate(fp):
            line = line.rstrip().split()
            if line:
                total += 1
                try:
                    assert len(line) == ndims + 1, (
                        "Line[{}] {} vector dims {} doesn't match ndims={}".
                        format(i, line[0],
                               len(line) - 1, ndims))
                except AssertionError as e:
                    print(e)
                    continue
                word = line[0]
                idx = vocab.get_token_index(word, namespace)
                if idx != oov_idx:
                    found += 1
                    vecs = np.array(list(map(float, line[1:])))
                    W[idx, :] = vecs
    print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".
          format(found, found * 100 / token_vocab_size, total, vector_file,
                 ndims))
    #  norm_W = np.sqrt((W*W).sum(axis=1, keepdims=True))
    #  valid_idx = norm_W.squeeze() != 0
    #  W[valid_idx, :] /= norm_W[valid_idx]
    return W
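A plausible way to wire the returned matrix into a model (a sketch: the vector file name and dimensionality are assumptions, and nn.Embedding.from_pretrained is the standard PyTorch hook for preloaded embeddings):

import torch
import torch.nn as nn

W = load_word_vectors("glove.6B.100d.txt", 100, vocab)
embedding = nn.Embedding.from_pretrained(torch.FloatTensor(W), freeze=False)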
Example #12
def data2number(corpus: List[Dict], vocab: Vocabulary) -> List[Dict]:
    instances = []
    oov_idx = vocab.get_token_index(vocab._oov_token, 'tokens')
    for e in corpus:
        instance = {}
        instance['tokens'] = seq2number(e, vocab, 'tokens', True)
        instance['token_chars'] = seqchar2number(e, vocab, False)
        instance['ent_labels'] = seq2number(e, vocab, 'ent_labels', False)
        instance['ent_span_labels'] = seq2number(e, vocab, 'ent_span_labels', False)
        instance['ent_ids_labels'] = seq2number(e, vocab, 'ent_ids_labels', False)
        instance['rel_labels'] = seq2number(e, vocab, 'rel_labels', False)
        instance['candi_rels'] = e['candi_rels']
        instance['ent_ids'] = e['ent_ids']

        assert all([oov_idx != n for n in instance['tokens']])
        assert all([oov_idx != m for n in instance['token_chars'] for m in n])

        instances.append(instance)
    return instances
Example #13
    parser.add_argument('load_serialization_dir',
                        type=str,
                        help='serialization directory of the trained model. Used only for vocab.')
    parser.add_argument('gold_data_path',
                        type=str,
                        help='gold data file path.')
    parser.add_argument('prediction_data_path',
                        type=str,
                        help='predictions data file path.')

    args = parser.parse_args()

    print("Reading data")
    sentences, label_trees = read_conll_data(args.gold_data_path)
    _, predicted_trees = read_conll_data(args.prediction_data_path)

    print("Reading vocabulary")
    vocabulary_path = os.path.join(args.load_serialization_dir, "vocab.pickle")
    vocabulary = Vocabulary.load(vocabulary_path)
    sorted_labels = [
        item[0] for item in sorted(vocabulary.label_token_to_id.items(),
                                   key=lambda e: e[1])
    ]
    non_null_sorted_labels = sorted_labels[1:]
    parsing_system = ParsingSystem(non_null_sorted_labels)

    print("Evaluating")
    report = evaluate(sentences, parsing_system, predicted_trees, label_trees)

    print(report)
Example #14
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    fWord = []
    fPos = []
    fLabel = []
    feature = []

    for j in range(2, -1, -1):
        index = configuration.get_stack(j)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))

    for j in range(0, 3, 1):
        index = configuration.get_buffer(j)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))

    for j in range(0, 2, 1):
        k = configuration.get_stack(j)
        index = configuration.get_left_child(k, 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(k, 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_left_child(k, 2)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(k, 2)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_left_child(
            configuration.get_left_child(k, 1), 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(
            configuration.get_right_child(k, 1), 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

    feature.extend(fWord)
    feature.extend(fPos)
    feature.extend(fLabel)
    return feature
Example #16
def seq2number(instance: Dict,
               vocab: Vocabulary,
               namespace: str,
               lower_case: bool) -> List:
    return [vocab.get_token_index(item.lower() if lower_case else item, namespace)
            for item in instance[namespace]]
Example #17
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    fWord = []
    fPos = []
    fLabel = []
    features = []

    for j in range(2, -1, -1):
        index = configuration.get_stack(j)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))

    for j in range(0, 3, 1):
        index = configuration.get_buffer(j)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))

    for j in range(0, 2, 1):
        k = configuration.get_stack(j)
        index = configuration.get_left_child(k, 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(k, 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_left_child(k, 2)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(k, 2)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_left_child(
            configuration.get_left_child(k, 1), 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

        index = configuration.get_right_child(
            configuration.get_right_child(k, 1), 1)
        fWord.append(vocabulary.get_word_id(configuration.get_word(index)))
        fPos.append(vocabulary.get_pos_id(configuration.get_pos(index)))
        fLabel.append(vocabulary.get_label_id(configuration.get_label(index)))

    features.extend(fWord)
    features.extend(fPos)
    features.extend(fLabel)

    # TODO(Students) End

    assert len(features) == 48
    return features
Example #18
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """

    # TODO(Students) Start
    features = []
    postags = []
    arclabels = []

    # Children (and grandchildren) of the top two words on the stack
    for item in range(2):
        features.append(
            configuration.get_left_child(configuration.get_stack(item), 1))
        features.append(
            configuration.get_left_child(configuration.get_stack(item), 2))
        features.append(
            configuration.get_right_child(configuration.get_stack(item), 1))
        features.append(
            configuration.get_right_child(configuration.get_stack(item), 2))
        features.append(
            configuration.get_left_child(
                configuration.get_left_child(configuration.get_stack(item), 1),
                1))
        features.append(
            configuration.get_right_child(
                configuration.get_right_child(configuration.get_stack(item),
                                              1), 1))

    for item in features:
        arclabels.append(configuration.get_label(item))
    for item in range(0, 3):
        features.append(configuration.get_stack(item))
    for item in range(0, 3):
        features.append(configuration.get_buffer(item))
    for item in features:
        postags.append(configuration.get_pos(item))

    # Initialize empty lists
    posids = []
    labelids = []
    wordids = []
    featureid = []
    # Append the final tags
    for i in postags:
        posids.append(vocabulary.get_pos_id(i))
    for i in arclabels:
        labelids.append(vocabulary.get_label_id(i))
    for i in features:
        wordids.append(vocabulary.get_word_id(configuration.get_word(i)))

    # Append the final data
    featureid.extend(wordids)
    featureid.extend(labelids)
    featureid.extend(posids)
    features = featureid

    # TODO(Students) End
    assert len(features) == 48
    return features
Example #20
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    features = []
    stacks = []
    buffers = []

    val_1 = configuration.get_stack(0)
    stacks.append(val_1)
    val_2 = configuration.get_stack(1)
    stacks.append(val_2)
    val_3 = configuration.get_stack(2)
    stacks.append(val_3)

    bval_1 = configuration.get_buffer(0)
    buffers.append(bval_1)
    bval_2 = configuration.get_buffer(1)
    buffers.append(bval_2)
    bval_3 = configuration.get_buffer(2)
    buffers.append(bval_3)

    ##word id stack and buffer
    for i in stacks:
        features.append(vocabulary.get_word_id(configuration.get_word(i)))
    for i in buffers:
        features.append(vocabulary.get_word_id(configuration.get_word(i)))

    #word id left
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_word_id(
                    configuration.get_word(
                        configuration.get_left_child(stacks[i], j))))

    ##word id right
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_word_id(
                    configuration.get_word(
                        configuration.get_right_child(stacks[i], j))))

    #word id left
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_word_id(
                configuration.get_word(
                    configuration.get_left_child(
                        configuration.get_left_child(stacks[i], 1), 1))))

    #word id right
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_word_id(
                configuration.get_word(
                    configuration.get_right_child(
                        configuration.get_right_child(stacks[i], 1), 1))))

    # POS id: stack and buffer
    for i in stacks:
        features.append(vocabulary.get_pos_id(configuration.get_pos(i)))
    for i in buffers:
        features.append(vocabulary.get_pos_id(configuration.get_pos(i)))

    # POS id: first and second leftmost children
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_pos_id(
                    configuration.get_pos(
                        configuration.get_left_child(stacks[i], j))))

    # POS id: first and second rightmost children
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_pos_id(
                    configuration.get_pos(
                        configuration.get_right_child(stacks[i], j))))

    # POS id: leftmost child of the leftmost child
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_pos_id(
                configuration.get_pos(
                    configuration.get_left_child(
                        configuration.get_left_child(stacks[i], 1), 1))))

    # POS id: rightmost child of the rightmost child
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_pos_id(
                configuration.get_pos(
                    configuration.get_right_child(
                        configuration.get_right_child(stacks[i], 1), 1))))

    #label id left
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_label_id(
                    configuration.get_label(
                        configuration.get_left_child(stacks[i], j))))

    #label id right
    for i in range(len(stacks) - 1):
        for j in range(1, 3):
            features.append(
                vocabulary.get_label_id(
                    configuration.get_label(
                        configuration.get_right_child(stacks[i], j))))

    #label id left
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_label_id(
                configuration.get_label(
                    configuration.get_left_child(
                        configuration.get_left_child(stacks[i], 1), 1))))

    #label id right
    for i in range(len(stacks) - 1):
        features.append(
            vocabulary.get_label_id(
                configuration.get_label(
                    configuration.get_right_child(
                        configuration.get_right_child(stacks[i], 1), 1))))
    # TODO(Students) End

    assert len(features) == 48
    return features
Example #21
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """

    #Reference: Understood the features from the github implementation of:
    #akjindal53244/dependency_parsing_tf/utils/feature_extraction.py

    # TODO(Students) Start
    features = []
    direct_tokens = []
    children_token = []

    direct_tokens.extend([configuration.get_stack(i) for i in range(3)])
    direct_tokens.extend([configuration.get_buffer(i) for i in range(3)])

    for i in range(2):
        children_token.extend(
            [configuration.get_left_child(configuration.get_stack(i), 1)])
        children_token.extend(
            [configuration.get_right_child(configuration.get_stack(i), 1)])

        children_token.extend(
            [configuration.get_left_child(configuration.get_stack(i), 2)])
        children_token.extend(
            [configuration.get_right_child(configuration.get_stack(i), 2)])

        # Grandchildren of the current stack item: each iteration appends six
        # children, so this iteration's lc1/rc1 live at offsets 6*i and 6*i+1
        # (the original indexed children_token[0] and [1], which is only
        # correct for i == 0).
        children_token.extend(
            [configuration.get_left_child(children_token[6 * i], 1)])
        children_token.extend(
            [configuration.get_right_child(children_token[6 * i + 1], 1)])

    features.extend([
        vocabulary.get_word_id(configuration.get_word(i))
        for i in direct_tokens
    ])
    features.extend([
        vocabulary.get_word_id(configuration.get_word(i))
        for i in children_token
    ])

    features.extend([
        vocabulary.get_pos_id(configuration.get_pos(i)) for i in direct_tokens
    ])
    features.extend([
        vocabulary.get_pos_id(configuration.get_pos(i)) for i in children_token
    ])

    features.extend([
        vocabulary.get_label_id(configuration.get_label(i))
        for i in children_token
    ])
    # TODO(Students) End

    assert len(features) == 48
    return features
Example #22
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    # First we get the top three elements of the stack
    st1 = configuration.get_stack(0)
    st2 = configuration.get_stack(1)
    st3 = configuration.get_stack(2)

    # Next, we get the top three elements of the buffer
    buf1 = configuration.get_buffer(0)
    buf2 = configuration.get_buffer(1)
    buf3 = configuration.get_buffer(2)

    # The left children at level 1 and 2 of the topmost element of the stack
    left_ch1_st1 = configuration.get_left_child(st1, 1)
    left_ch2_st1 = configuration.get_left_child(st1, 2)

    # The left children at level 1 and 2 of the second topmost element of the stack
    left_ch1_st2 = configuration.get_left_child(st2, 1)
    left_ch2_st2 = configuration.get_left_child(st2, 2)

    # The right children at level 1 and 2 of the topmost element of the stack
    right_ch1_st1 = configuration.get_right_child(st1, 1)
    right_ch2_st1 = configuration.get_right_child(st1, 2)

    # The right children at level 1 and 2 of the second topmost element of the stack
    right_ch1_st2 = configuration.get_right_child(st2, 1)
    right_ch2_st2 = configuration.get_right_child(st2, 2)

    # The leftmost children of the topmost and the second topmost element of the stack
    left_ch1_left_ch1_st1 = configuration.get_left_child(left_ch1_st1, 1)
    left_ch1_left_ch1_st2 = configuration.get_left_child(left_ch1_st2, 1)

    # The rightmost children of the topmost and the second topmost element of the stack
    right_ch1_right_ch1_st1 = configuration.get_right_child(right_ch1_st1, 1)
    right_ch1_right_ch1_st2 = configuration.get_right_child(right_ch1_st2, 1)

    # Appending all of this in a list
    childs = [
        st1, st2, st3, buf1, buf2, buf3, left_ch1_st1, right_ch1_st1,
        left_ch2_st1, right_ch2_st1, left_ch1_st2, right_ch1_st2, left_ch2_st2,
        right_ch2_st2, left_ch1_left_ch1_st1, right_ch1_right_ch1_st1,
        left_ch1_left_ch1_st2, right_ch1_right_ch1_st2
    ]
    pos = []
    # We now get the respective parts of speech tags and labels for the ids
    for idx in childs:
        pos.append(configuration.get_pos(idx))

    for idx in childs[6:]:
        pos.append(configuration.get_label(idx))

    temp = childs + pos

    features = []
    # Get words, POS tags and Labels and append them to features.
    for word in temp[0:18]:
        features.append(vocabulary.get_word_id(configuration.get_word(word)))

    for tag in temp[18:36]:
        features.append(vocabulary.get_pos_id(tag))

    for label in temp[36:48]:
        features.append(vocabulary.get_label_id(label))
    # TODO(Students) End

    assert len(features) == 48
    return features
Example #23
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    stack_words = []
    buffer_words = []
    children = []

    for i in range(3):
        stack_words.append(configuration.get_stack(i))
        buffer_words.append(configuration.get_buffer(i))

        if i < 2:
            children.append(configuration.get_left_child(stack_words[i], 1))
            children.append(configuration.get_right_child(stack_words[i], 1))
            children.append(configuration.get_left_child(stack_words[i], 2))
            children.append(configuration.get_right_child(stack_words[i], 2))
            children.append(configuration.get_left_child(
                configuration.get_left_child(stack_words[i], 1), 1))
            children.append(configuration.get_right_child(
                configuration.get_right_child(stack_words[i], 1), 1))

    final = stack_words + buffer_words + children

    word_ids = [vocabulary.get_word_id(configuration.get_word(t)) for t in final]
    pos_tags = [vocabulary.get_pos_id(configuration.get_pos(t)) for t in final]
    arc_labels = [vocabulary.get_label_id(configuration.get_label(t))
                  for t in final[6:18]]

    features = word_ids + pos_tags + arc_labels

    # TODO(Students) End

    assert len(features) == 48

    return features
Example #25
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start

    Sw = []
    St = []
    Sl = []

    # s1, s2, s3, b1, b2, b3
    for idx in [0, 1, 2]:
        Sw.append(
            vocabulary.get_word_id(
                configuration.get_word(configuration.get_stack(idx))))
        St.append(
            vocabulary.get_pos_id(
                configuration.get_pos(configuration.get_stack(idx))))

    for idx in [0, 1, 2]:
        Sw.append(
            vocabulary.get_word_id(
                configuration.get_word(configuration.get_buffer(idx))))
        St.append(
            vocabulary.get_pos_id(
                configuration.get_pos(configuration.get_buffer(idx))))

    # lc1(si), rc1(si), lc2(si), rc2(si), i = 1, 2
    for idx in [0, 1]:
        wrd = configuration.get_stack(idx)
        for child in (configuration.get_left_child(wrd, 1),
                      configuration.get_right_child(wrd, 1),
                      configuration.get_left_child(wrd, 2),
                      configuration.get_right_child(wrd, 2)):
            Sw.append(vocabulary.get_word_id(configuration.get_word(child)))
            St.append(vocabulary.get_pos_id(configuration.get_pos(child)))
            Sl.append(vocabulary.get_label_id(configuration.get_label(child)))

    # lc1(lc1(si)), rc1(rc1(si)), i = 1, 2
    for idx in [0, 1]:
        wrd = configuration.get_stack(idx)
        for child in (configuration.get_left_child(
                          configuration.get_left_child(wrd, 1), 1),
                      configuration.get_right_child(
                          configuration.get_right_child(wrd, 1), 1)):
            Sw.append(vocabulary.get_word_id(configuration.get_word(child)))
            St.append(vocabulary.get_pos_id(configuration.get_pos(child)))
            Sl.append(vocabulary.get_label_id(configuration.get_label(child)))

    # print("***********************")
    # print(configuration.get_str())

    # print(Sw)
    # print(St)
    # print(Sl)

    features = []
    features.extend(Sw)
    features.extend(St)
    features.extend(Sl)
    assert len(features) == 48

    return features
Example #26
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start

    features = []
    #Step 1: Take top 3 from stack and buffer
    for i in range(3):
        features.append(configuration.get_stack(i))

    for i in range(3):
        features.append(configuration.get_buffer(i))

    #Step2: first and second leftmost/rightmost children of the top two words on the stack
    for i in range(2):
        #first leftmost children
        left1 = configuration.get_left_child(features[i], 1)
        right1 = configuration.get_right_child(features[i], 1)

        #second leftmost children
        left2 = configuration.get_left_child(features[i], 2)
        right2 = configuration.get_right_child(features[i], 2)

        #As part of Step 3
        lc1_lc1_s_i = configuration.get_left_child(left1, 1)
        rc1_rc1_s_i = configuration.get_right_child(right1, 1)

        features.extend(
            [left1, right1, left2, right2, lc1_lc1_s_i, rc1_rc1_s_i])
        #print(features)

    num_of_features = len(features)

    #Extracting POS of the words extracted
    for i in range((num_of_features)):
        features.append(configuration.get_pos(features[i]))

    #Extracting arc labels excluding the 6 words on the stack/buffer
    for i in range(6, 18):
        features.append(configuration.tree.get_label(features[i]))

    #Extracting ID's of the features in the form of S_w,S_t,S_l
    for i in range(18):
        features[i] = vocabulary.get_word_id(
            configuration.get_word(features[i]))

    for i in range(18, 36):
        features[i] = vocabulary.get_pos_id(features[i])

    for i in range(36, 48):
        features[i] = vocabulary.get_label_id(features[i])

    # TODO(Students) End
    assert len(features) == 48
    return features
Example #27
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start

    features = []  # Store list of all features (initially only word features)
    pos_features = []  # Store list of pos tag features
    label_features = []  # Store list of label features

    top3Stack = [configuration.get_stack(i)
                 for i in range(3)]  # top 3 elements of stack
    top3Buffer = [configuration.get_buffer(i)
                  for i in range(3)]  # top 3 elements of buffer

    for token_index in top3Stack + top3Buffer:  # Iterate over top 3 words in stack and top 3 words in buffer
        # Add word to the features
        features.append(
            vocabulary.get_word_id(configuration.get_word(token_index)))
        # Add pos tag of corresponding word to the pos_features
        pos_features.append(
            vocabulary.get_pos_id(configuration.get_pos(token_index)))

    for token_index in top3Stack[:2]:  # Iterate over top 2 words in stack
        # Iterate over 1 and 2 to get 1st leftmost, 1st rightmost, 2nd leftmost and 2nd rightmost child
        # of corresponding word in stack.
        for i in range(1, 3):
            ith_left_child = configuration.get_left_child(
                token_index, i)  # Get ith_leftmost_child of word in stack
            # Add child to the features
            features.append(
                vocabulary.get_word_id(configuration.get_word(ith_left_child)))
            # Add pos tag of corresponding child to the pos_features
            pos_features.append(
                vocabulary.get_pos_id(configuration.get_pos(ith_left_child)))
            # Add label of corresponding child to the label_features
            label_features.append(
                vocabulary.get_label_id(
                    configuration.get_label(ith_left_child)))

            # Similarly for rightmost child add child word, pos tag and label to respective features list
            ith_right_child = configuration.get_right_child(token_index, i)
            features.append(
                vocabulary.get_word_id(
                    configuration.get_word(ith_right_child)))
            pos_features.append(
                vocabulary.get_pos_id(configuration.get_pos(ith_right_child)))
            label_features.append(
                vocabulary.get_label_id(
                    configuration.get_label(ith_right_child)))

    for token_index in top3Stack[:2]:  # Iterate over top 2 words in stack
        # Get leftmost child of leftmost child of word in stack
        left_left_child = configuration.get_left_child(
            configuration.get_left_child(token_index, 1), 1)
        # Add the corresponding child word, pos tag and label to respective features list
        features.append(
            vocabulary.get_word_id(configuration.get_word(left_left_child)))
        pos_features.append(
            vocabulary.get_pos_id(configuration.get_pos(left_left_child)))
        label_features.append(
            vocabulary.get_label_id(configuration.get_label(left_left_child)))

        # Similarly for rightmost child of rightmost child add child word, pos tag and label to respective features list
        right_right_child = configuration.get_right_child(
            configuration.get_right_child(token_index, 1), 1)
        features.append(
            vocabulary.get_word_id(configuration.get_word(right_right_child)))
        pos_features.append(
            vocabulary.get_pos_id(configuration.get_pos(right_right_child)))
        label_features.append(
            vocabulary.get_label_id(
                configuration.get_label(right_right_child)))

    features += pos_features + label_features  # Append the pos and label features to the word features

    # TODO(Students) End

    assert len(features) == 48
    return features
Example #28
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    words = []
    posTags = []
    labels = []

    # Get the words and pos tags of the top 3 elements of the stack.
    for idx in range(2, -1, -1):
        stack = configuration.get_stack(idx)
        words.append(vocabulary.get_word_id(configuration.get_word(stack)))
        posTags.append(vocabulary.get_pos_id(configuration.get_pos(stack)))

    # Get the words and pos tags of the top 3 elements of the buffer.
    for idx in range(3):
        buffer = configuration.get_buffer(idx)
        words.append(vocabulary.get_word_id(configuration.get_word(buffer)))
        posTags.append(vocabulary.get_pos_id(configuration.get_pos(buffer)))

    # Get the words, labels, and pos tags of the first and second left child and right child of the top two elements
    # on the stack, and
    # Get the words, labels, and pos tags of the leftmost of the leftmost and rightmost of the rightmost child
    # of the top two elements on the stack
    for idx in range(2):
        stack = configuration.get_stack(idx)
        firstLeftChild = configuration.get_left_child(stack, 1)
        words.append(
            vocabulary.get_word_id(configuration.get_word(firstLeftChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(firstLeftChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(firstLeftChild)))

        firstRightChild = configuration.get_right_child(stack, 1)
        words.append(
            vocabulary.get_word_id(configuration.get_word(firstRightChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(firstRightChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(firstRightChild)))

        secondLeftChild = configuration.get_left_child(stack, 2)
        words.append(
            vocabulary.get_word_id(configuration.get_word(secondLeftChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(secondLeftChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(secondLeftChild)))

        secondRightChild = configuration.get_right_child(stack, 2)
        words.append(
            vocabulary.get_word_id(configuration.get_word(secondRightChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(secondRightChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(secondRightChild)))

        leftLeftChild = configuration.get_left_child(
            configuration.get_left_child(stack, 1), 1)
        words.append(
            vocabulary.get_word_id(configuration.get_word(leftLeftChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(leftLeftChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(leftLeftChild)))

        rightRightChild = configuration.get_right_child(
            configuration.get_right_child(stack, 1), 1)
        words.append(
            vocabulary.get_word_id(configuration.get_word(rightRightChild)))
        labels.append(
            vocabulary.get_label_id(configuration.get_label(rightRightChild)))
        posTags.append(
            vocabulary.get_pos_id(configuration.get_pos(rightRightChild)))

    features = []
    features += words + posTags + labels
    # TODO(Students) End

    assert len(features) == 48
    return features
Example #29
    # Setup Serialization dir
    save_serialization_dir = os.path.join("serialization_dirs",
                                          args.experiment_name)
    if not os.path.exists(save_serialization_dir):
        os.makedirs(save_serialization_dir)

    # Setup Training / Validation data
    print("Reading training data")
    train_sentences, train_trees = read_conll_data(args.train_data_file_path)

    print("Reading validation data")
    validation_sentences, validation_trees = read_conll_data(
        args.validation_data_file_path)

    vocabulary = Vocabulary(train_sentences, train_trees)

    sorted_labels = [
        item[0] for item in sorted(vocabulary.label_token_to_id.items(),
                                   key=lambda e: e[1])
    ]
    non_null_sorted_labels = sorted_labels[1:]

    parsing_system = ParsingSystem(non_null_sorted_labels)

    # Generating training instances takes ~20 minutes every time. So once you finalize the
    # feature generation and want to try different configs for experiments, you can use caching.
    if args.use_cached_data:
        print("Loading cached training instances")
        cache_processed_data_path = args.train_data_file_path.replace(
            "conll", "jsonl")
Example #30
def get_configuration_features(configuration: Configuration,
                               vocabulary: Vocabulary) -> List[List[int]]:
    """
    =================================================================

    Implement feature extraction described in
    "A Fast and Accurate Dependency Parser using Neural Networks"(2014)

    =================================================================
    """
    # TODO(Students) Start
    words = []
    pos_tags = []
    arc_labels = []
    for index in range(0, 3):
        stackIndex = configuration.get_stack(index)
        bufferIndex = configuration.get_buffer(index)

        # Push the top 3 words from the stack and buffer: s1, s2, s3, b1, b2, b3
        words.append(vocabulary.get_word_id(
            configuration.get_word(stackIndex)))
        words.append(
            vocabulary.get_word_id(configuration.get_word(bufferIndex)))

        pos_tags.append(
            vocabulary.get_pos_id(configuration.get_pos(stackIndex)))
        pos_tags.append(
            vocabulary.get_pos_id(configuration.get_pos(bufferIndex)))

    for index in range(0, 2):
        stackIndex = configuration.get_stack(index)

        #Getting indexes of the first and second leftmost / rightmost children of the top two words on the stack: lc1(si); rc1(si); lc2(si); rc2(si), i = 1; 2.
        child1 = [
            configuration.get_left_child(stackIndex, 1),
            configuration.get_right_child(stackIndex, 1),
            configuration.get_left_child(stackIndex, 2),
            configuration.get_right_child(stackIndex, 2)
        ]

        #Getting indexes of leftmost of leftmost / rightmost of rightmost children of the top two words on the stack: lc1(lc1(si)); rc1(rc1(si)), i = 1; 2.
        child2 = [
            configuration.get_left_child(
                configuration.get_left_child(stackIndex, 1), 1),
            configuration.get_right_child(
                configuration.get_right_child(stackIndex, 1), 1)
        ]

        #Merging child1 and child2 to single children list.
        children = child1 + child2

        #Pushing children to features_list:
        for childIndex in children:
            words.append(
                vocabulary.get_word_id(configuration.get_word(childIndex)))
            pos_tags.append(
                vocabulary.get_pos_id(configuration.get_pos(childIndex)))
            arc_labels.append(
                vocabulary.get_label_id(configuration.get_label(childIndex)))

    features = words + pos_tags + arc_labels

    # TODO(Students) End

    assert len(features) == 48
    return features