Example #1
    def __get_instances_from_file(self, file_name):
        """
        Helper to convert an input file into two parallel lists:
        one of word sequences, one of tag sequences.
        """
        data = list(read_conll_file(file_name))
        words = [words for (words, _) in data]
        tags = [tags for (_, tags) in data]
        return words, tags
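
Every example on this page consumes read_conll_file, whose source is not shown here. A minimal sketch of what it presumably does, assuming the usual CoNLL-style layout of one word<TAB>tag pair per line with blank lines separating sentences:

def read_conll_file(file_name):
    """
    Yield (words, tags) tuples, one per sentence, from a CoNLL-style
    file: one word<TAB>tag pair per line, blank lines between sentences.
    (Sketch only; the real helper may differ.)
    """
    words, tags = [], []
    with open(file_name, encoding="utf-8") as conll_file:
        for line in conll_file:
            line = line.strip()
            if not line:
                if words:
                    yield words, tags
                    words, tags = [], []
            else:
                word, tag = line.split("\t")
                words.append(word)
                tags.append(tag)
    if words:  # flush the final sentence if there is no trailing blank line
        yield words, tags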
Example #2
    def make_data(self, file_name, w2i=None, t2i=None, freeze=False):
        """
        transform data to features (map words to indices via w2i); index 0 is reserved for PADDING, 1 for UNK
        map tags to indices (t2i) [in Keras, labels need to be integers]
        :param freeze: True = test data (do not add new words; pass in the w2i/t2i built on the training data)
        """
        if not w2i:
            w2i = {"<pad>": 0, "_UNK": 1}
            t2i = {}

        X = []
        Y = []
        X_org = [] # keep original words for type-constr.
        num_sentences = 0
        num_tokens = 0

        for instance_idx, (words, tags) in enumerate(read_conll_file(file_name)):

            num_sentences += 1
            instance_feats_indices = []  # sequence of word indices
            instance_tags_indices = []  # sequence of tag indices

            for i, (word, tag) in enumerate(zip(words, tags)):
                num_tokens += 1
                # map words and tags to indices
                if word not in w2i:
                    if not freeze:
                        w2i[word] = len(w2i)
                        instance_feats_indices.append(w2i[word])
                    else:
                        # set to UNK
                        instance_feats_indices.append(w2i["_UNK"])
                else:
                    instance_feats_indices.append(w2i[word])

                if not freeze:
                    if tag not in t2i:
                        t2i[tag] = len(t2i)  # tag indices start at 0

                instance_tags_indices.append(t2i.get(tag))

            X.append(instance_feats_indices)
            Y.append(instance_tags_indices)
            X_org.append(words)

        if not freeze:  # when reading train data
            i2t = {idx: tag for tag, idx in t2i.items()}  # inverse mapping (unused here)
            print("%s sentences %s tokens" % (num_sentences, num_tokens))
            print("%s features" % len(w2i))

        assert (len(X) == len(Y))  # make sure lengths match
        if not freeze:
            return X, Y, w2i, t2i  # return token/tag indices
        else:
            return X, Y, X_org
Example #3
    def make_data(self, file_name, w2i=None, t2i=None, freeze=False):
        """
        transform data to features (map words to indices via w2i); index 0 is reserved for PADDING, 1 for UNK
        map tags to indices (t2i) [in Keras, labels need to be integers]
        :param freeze: True = test data (do not add new words; pass in the w2i/t2i built on the training data)
        """
        if not w2i:
            w2i = {"<pad>": 0, "_UNK": 1}
            #t2i = {"<padtag>": 0} # get rid of padtag and use masks!
            t2i = {}

        X = []
        Y = []
        X_org = [] # keep original words for type-constr.
        num_sentences = 0
        num_tokens = 0

        for instance_idx, (words, tags) in enumerate(read_conll_file(file_name)):

            num_sentences += 1
            instance_feats_indices = []  # sequence of word indices
            instance_tags_indices = []  # sequence of tag indices

            for i, (word, tag) in enumerate(zip(words, tags)):
                num_tokens += 1
                # map words and tags to indices
                if word not in w2i:
                    if not freeze:
                        w2i[word] = len(w2i)
                        instance_feats_indices.append(w2i[word])
                    else:
                        # set to UNK
                        instance_feats_indices.append(w2i["_UNK"])
                else:
                    instance_feats_indices.append(w2i[word])

                if not freeze:
                    if tag not in t2i:
                        t2i[tag] = len(t2i)  # tag indices start at 0

                instance_tags_indices.append(t2i.get(tag))

            X.append(instance_feats_indices)
            Y.append(instance_tags_indices)
            X_org.append(words)

        if not freeze: # when reading train data
            i2t = {idx: tag for tag, idx in t2i.items()}  # inverse mapping (unused here)
            print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
            print("%s features" % len(w2i), file=sys.stderr)

        assert (len(X) == len(Y))  # make sure lengths match
        if not freeze:
            return X, Y, w2i, t2i  # return token/tag indices
        else:
            return X, Y, X_org
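
Index 0 is reserved for padding, so the sequences returned by make_data are typically padded to a common length before they go into a Keras model. A hedged usage sketch; the tagger instance, file names, and the choice of pad_sequences are illustrative, not part of the original code:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# build the mappings on the training data, then freeze them for test data
train_X, train_Y, w2i, t2i = tagger.make_data("train.conll")
test_X, test_Y, test_org = tagger.make_data("test.conll", w2i=w2i, t2i=t2i, freeze=True)

# pad word-index sequences with the reserved index 0 ("<pad>")
train_X_pad = pad_sequences(train_X, padding="post", value=w2i["<pad>"])
test_X_pad = pad_sequences(test_X, padding="post", value=w2i["<pad>"])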
Example #4
    def get_data_as_indices(self, file_name):
        """
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        X, Y = [], []
        org_X, org_Y = [], []

        for (words, tags) in read_conll_file(file_name):
            word_indices, word_char_indices = self.get_features(words)
            tag_indices = [self.tag2idx.get(tag) for tag in tags]
            X.append((word_indices, word_char_indices))
            Y.append(tag_indices)
            org_X.append(words)
            org_Y.append(tags)
        return X, Y  #, org_X, org_Y - for now don't use
Example #6
    def get_data_as_indices(self, folder_name, task):
        """
        X = list of (word_indices, word_char_indices)
        Y = list of tag indices
        """
        X, Y = [], []
        org_X, org_Y = [], []
        task_labels = []
        for (words, tags) in read_conll_file(folder_name):
            word_indices, word_char_indices = self.get_features(words)
            tag_indices = [self.task2tag2idx[task].get(tag) for tag in tags]
            X.append((word_indices, word_char_indices))
            Y.append(tag_indices)
            org_X.append(words)
            org_Y.append(tags)
            task_labels.append(task)
        return X, Y, org_X, org_Y, task_labels
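
A hedged usage sketch of this multi-task variant. It assumes the mappings returned by the get_train_data variants below are stored on the tagger (e.g. as self.task2tag2idx, mirroring set_indices in Example #7); the file paths are illustrative:

train_X, train_Y, task_labels, w2i, c2i, task2tag2idx = tagger.get_train_data(
    ["data/pos/train.conll", "data/chunk/train.conll"])

# encode the dev set of the first task; tags unseen during training
# come back as None, since .get() is used for the lookup
dev_X, dev_Y, org_X, org_Y, dev_tasks = tagger.get_data_as_indices(
    "data/pos/dev.conll", "task0")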
Example #7
    def get_train_data(self, train_data):
        """
        transform training data to features (word indices)
        map tags to integers
        """
        X = []
        Y = []

        # word 2 indices and tag 2 indices
        w2i = {} # word to index
        c2i = {} # char to index
        tag2idx = {} # tag2idx

        w2i["_UNK"] = 0  # unk word / OOV
        c2i["_UNK"] = 0  # unk char
        c2i["<w>"] = 1   # word start
        c2i["</w>"] = 2  # word end index
        
        
        num_sentences = 0
        num_tokens = 0
        for instance_idx, (words, tags) in enumerate(read_conll_file(train_data)):
            instance_word_indices = [] #sequence of word indices
            instance_char_indices = [] #sequence of char indices
            instance_tags_indices = [] #sequence of tag indices

            for i, (word, tag) in enumerate(zip(words, tags)):

                # map words and tags to indices
                if word not in w2i:
                    w2i[word] = len(w2i)
                instance_word_indices.append(w2i[word])

                if self.c_in_dim > 0:
                    chars_of_word = [c2i["<w>"]]
                    for char in word:
                        if char not in c2i:
                            c2i[char] = len(c2i)
                        chars_of_word.append(c2i[char])
                    chars_of_word.append(c2i["</w>"])
                    instance_char_indices.append(chars_of_word)

                if tag not in tag2idx:
                    tag2idx[tag]=len(tag2idx)

                instance_tags_indices.append(tag2idx.get(tag))

                num_tokens+=1

            num_sentences+=1

            X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices
            Y.append(instance_tags_indices)


        print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
        print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr)
        if self.c_in_dim == 0:
            print("char features disabled", file=sys.stderr)

        assert(len(X)==len(Y))

        # store mappings of words and tags to indices
        self.set_indices(w2i, c2i, tag2idx)

        return X, Y
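
A hedged usage sketch of the single-task reader above; the constructor arguments mirror Example #9, and the values and file paths are illustrative:

tagger = SimpleBiltyTagger(64, 100, 100, 1)  # in_dim, h_dim, c_in_dim, h_layers
train_X, train_Y = tagger.get_train_data("data/train.conll")
# get_train_data stored w2i/c2i/tag2idx via set_indices, so held-out
# data can now be encoded with the same mappings
dev_X, dev_Y = tagger.get_data_as_indices("data/dev.conll")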
Example #9
vocab = Vocab(vocabfile)

if "embeds" in config:
    tagger = SimpleBiltyTagger(
        config.in_dim,
        config.h_dim,
        config.c_in_dim,
        config.h_layers,
        embeds_file=config.embeds,
        word2id=vocab.word2id,
    )
else:
    tagger = SimpleBiltyTagger(config.in_dim,
                               config.h_dim,
                               config.c_in_dim,
                               config.h_layers,
                               embeds_file=None,
                               word2id=vocab.word2id)

tagger = load_tagger(model)

test_X, test_Y = tagger.get_data_as_indices(testfile)

correct, total = tagger.evaluate(test_X, test_Y)
print("accuracy", correct / total)

dev_test_labels = []
for _, tags in read_conll_file(testfile):
    dev_test_labels.append(tags)
tagger.get_predictions_output(test_X, dev_test_labels, "dev.xxx.out")
Example #10
    def get_train_data(self, list_folders_name):
        """

        :param list_folders_name: list of folders names
        :param lower: whether to lowercase tokens

        transform training data to features (word indices)
        map tags to integers
        """
        X = []
        Y = []
        task_labels = [] #keeps track of where instances come from "task1" or "task2"..
        self.tasks_ids = [] #record the id of the tasks

        # word 2 indices and tag 2 indices
        w2i = {} # word to index
        c2i = {} # char to index
        task2tag2idx = {} # id of the task -> tag2idx

        w2i["_UNK"] = 0  # unk word / OOV
        c2i["_UNK"] = 0  # unk char
        c2i["<w>"] = 1   # word start
        c2i["</w>"] = 2  # word end index
        
        
        for i, folder_name in enumerate( list_folders_name ):
            num_sentences=0
            num_tokens=0
            task_id = 'task'+str(i)
            self.tasks_ids.append( task_id )
            if task_id not in task2tag2idx:
                task2tag2idx[task_id] = {}
            for instance_idx, (words, tags) in enumerate(read_conll_file(folder_name)):
                num_sentences += 1
                instance_word_indices = [] #sequence of word indices
                instance_char_indices = [] #sequence of char indices 
                instance_tags_indices = [] #sequence of tag indices

                for i, (word, tag) in enumerate(zip(words, tags)):
                    num_tokens += 1

                    # map words and tags to indices
                    if word not in w2i:
                        w2i[word] = len(w2i)
                    instance_word_indices.append(w2i[word])

                    chars_of_word = [c2i["<w>"]]
                    for char in word:
                        if char not in c2i:
                            c2i[char] = len(c2i)
                        chars_of_word.append(c2i[char])
                    chars_of_word.append(c2i["</w>"])
                    instance_char_indices.append(chars_of_word)
                            
                    if tag not in task2tag2idx[task_id]:
                        task2tag2idx[task_id][tag] = len(task2tag2idx[task_id])

                    instance_tags_indices.append(task2tag2idx[task_id].get(tag))

                X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices
                Y.append(instance_tags_indices)
                task_labels.append(task_id)

            if num_sentences == 0 or num_tokens == 0:
                sys.exit( "No data read from: "+folder_name )
            print("TASK "+task_id+" "+folder_name, file=sys.stderr )
            print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
            print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr)

        assert(len(X)==len(Y))
        return X, Y, task_labels, w2i, c2i, task2tag2idx  #sequence of features, sequence of labels, necessary mappings
Example #11
    def get_train_data(self, list_folders_name):
        """
        Get train data: read each train set (linked to a task)

        :param list_folders_name: list of folder names (one per task)

        transform training data to features (word indices)
        map tags to integers
        """
        X = []
        Y = []
        task_labels = [] # keeps track of where instances come from "task1" or "task2"..
        self.tasks_ids = [] # record ids of the tasks

        # word 2 indices and tag 2 indices
        w2i = {} # word to index
        c2i = {} # char to index
        task2tag2idx = {} # id of the task -> tag2idx

        w2i[UNK] = 0  # unk word / OOV
        c2i[UNK] = 0  # unk char
        c2i["<w>"] = 1   # word start
        c2i["</w>"] = 2  # word end index

        if self.max_vocab_size is not None:
            word_counter = Counter()
            print('Reading files to create vocabulary of size %d.' %
                  self.max_vocab_size)
            for i, folder_name in enumerate(list_folders_name):
                for words, _ in read_conll_file(folder_name):
                    word_counter.update(words)
            word_count_pairs = word_counter.most_common(self.max_vocab_size-1)
            for word, _ in word_count_pairs:
                w2i[word] = len(w2i)

        for i, folder_name in enumerate(list_folders_name):
            num_sentences=0
            num_tokens=0
            task_id = 'task'+str(i)
            self.tasks_ids.append( task_id )
            if task_id not in task2tag2idx:
                task2tag2idx[task_id] = {}
            for instance_idx, (words, tags) in enumerate(read_conll_file(folder_name)):
                num_sentences += 1
                instance_word_indices = [] #sequence of word indices
                instance_char_indices = [] #sequence of char indices 
                instance_tags_indices = [] #sequence of tag indices

                for i, (word, tag) in enumerate(zip(words, tags)):
                    num_tokens += 1

                    # map words and tags to indices
                    if word not in w2i and self.max_vocab_size is not None:
                        # if word is not in the created vocab, add an UNK token
                        instance_word_indices.append(w2i[UNK])
                    else:
                        if word not in w2i:
                            w2i[word] = len(w2i)
                        instance_word_indices.append(w2i[word])

                    if self.c_in_dim > 0:
                        chars_of_word = [c2i["<w>"]]
                        for char in word:
                            if char not in c2i:
                                c2i[char] = len(c2i)
                            chars_of_word.append(c2i[char])
                        chars_of_word.append(c2i["</w>"])
                        instance_char_indices.append(chars_of_word)
                            
                    if tag not in task2tag2idx[task_id]:
                        task2tag2idx[task_id][tag]=len(task2tag2idx[task_id])

                    instance_tags_indices.append(task2tag2idx[task_id].get(tag))

                X.append((instance_word_indices, instance_char_indices)) # list of word indices, for every word list of char indices
                Y.append(instance_tags_indices)
                task_labels.append(task_id)

            if num_sentences == 0 or num_tokens == 0:
                sys.exit( "No data read from: "+folder_name )

            print("TASK "+task_id+" "+folder_name, file=sys.stderr )
            print("%s sentences %s tokens" % (num_sentences, num_tokens), file=sys.stderr)
            print("%s w features, %s c features " % (len(w2i),len(c2i)), file=sys.stderr)

        assert(len(X)==len(Y))
        return X, Y, task_labels, w2i, c2i, task2tag2idx  #sequence of features, sequence of labels, necessary mappings
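
The max_vocab_size branch asks Counter.most_common for max_vocab_size - 1 words because index 0 is already taken by the UNK entry. A tiny self-contained illustration of that accounting (UNK's value is assumed to be "_UNK", as in the earlier examples):

from collections import Counter

UNK = "_UNK"        # assumed value of the UNK constant used above
max_vocab_size = 4

w2i = {UNK: 0}      # UNK claims index 0 up front
counts = Counter(["the", "the", "cat", "sat", "on", "the", "mat"])
# only max_vocab_size - 1 slots remain for real words
for word, _ in counts.most_common(max_vocab_size - 1):
    w2i[word] = len(w2i)

print(w2i)          # {'_UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}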