def get_w_reps(idx, w_reps, vocab):
    ws = []
    reps = []
    if not idx:
        return ws, reps

    w_dict = LabelDictionary(read_vocab(vocab))
    for w, rep in w_reps:
        if w_dict.get_label_id(w) in idx:
            assert not np.isnan(np.sum(rep))
            ws.append(w)
            reps.append(rep)

    return ws, reps
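
A brief usage sketch (not part of the original listing; the vocabulary file path and the representations are hypothetical): filtering word representations down to a set of target vocabulary ids with get_w_reps.

import numpy as np

idx = {0, 2}  # vocab ids of the words we care about
# toy (word, representation) pairs; real reps would come from a trained wordrep model
w_reps = [("walk", np.zeros(10)), ("shop", np.ones(10)), ("clean", np.full(10, 0.5))]
ws, reps = get_w_reps(idx, w_reps, "corpus.vocab")  # keeps only words whose vocab id is in idx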
Example #2
def get_w_reps(idx, w_reps, vocab):
    ws = []
    reps = []
    if not idx:
        return ws, reps

    w_dict = LabelDictionary(read_vocab(vocab))
    for w, rep in w_reps:
        if w_dict.get_label_id(w) in idx:
            assert not np.isnan(np.sum(rep))
            ws.append(w)
            reps.append(rep)

    return ws, reps
Example #3
    def __init__(self, dataset):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.feature_list = []

        self.add_features = False
        self.dataset = dataset

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.features_used = set()
Example #4
    def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True, spec_rels=None, dirname=None,
                 eval_spec_rels=False, lr=False):

        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        #read built vocab
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                self.r_dict = pickle.load(open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id((set(r_dict.names) - set(spec_rels)), self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")
Example #5
def load_embed(embed_f, vocab_f):
    """
    Reads the embedding file and returns the numpy matrix, where row ids correspond to vocab ids.
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        m, n = map(int, in_f.readline().strip().split())  # header: number of vectors and their dimensionality
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n

        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
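
A hedged sketch (assumptions, not from the original page) of the embedding file format that load_embed expects, inferred from the code above: a word2vec-style text file whose first line gives the number of vectors and their dimensionality, followed by one "word v1 ... vn" line per vector; rows of the returned matrix are indexed by the vocabulary ids from vocab_f.

# embeddings.txt (hypothetical contents):
# 3 4
# </s> 0.0 0.0 0.0 0.0
# walk 0.1 -0.2 0.3 0.4
# shop 0.5 0.6 -0.7 0.8
e_m = load_embed("embeddings.txt", "corpus.vocab")  # shape (m - 1, n); the </s> row is left out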
Example #6
def load_embed(embed_f, vocab_f):
    """
    Reads the embedding file and returns the numpy matrix, where row ids correspond to vocab ids.
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        m, n = map(int, in_f.readline().strip().split())  # header: number of vectors and their dimensionality
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n

        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
Example #7
    def __init__(self, wordrep_dict=None, eval_spec_rel=False, dirname=None, lr=False, use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None
Example #8
    def __init__(self):
        self.x_dict = LabelDictionary(
            ["write", "that", "code", "ROOT", "don't"])
        self.train_trees = TreeList()
        tree_ex1 = Tree()  # container for node_list and edge_list
        idx = self.x_dict.get_label_id("write")
        n0 = Node(len(tree_ex1), idx)  # len is 0
        tree_ex1.add_node(n0)
        idx = self.x_dict.get_label_id("that")
        n1 = Node(len(tree_ex1), idx)
        tree_ex1.add_node(n1)
        idx = self.x_dict.get_label_id("code")
        n2 = Node(len(tree_ex1), idx)
        tree_ex1.add_node(n2)
        idx = self.x_dict.get_label_id("ROOT")
        n3 = Node(len(tree_ex1), idx)
        tree_ex1.add_node(n3)

        tree_ex1.add_edge(Edge(n0, n2))
        tree_ex1.add_edge(Edge(n2, n1))
        tree_ex1.add_edge(Edge(n3, n0))

        self.train_trees.add_tree(tree_ex1)

        tree_ex2 = Tree()
        idx = self.x_dict.get_label_id("don't")
        n0 = Node(len(tree_ex2), idx)  # len is 0
        tree_ex2.add_node(n0)
        idx = self.x_dict.get_label_id("write")
        n1 = Node(len(tree_ex2), idx)
        tree_ex2.add_node(n1)
        idx = self.x_dict.get_label_id("code")
        n2 = Node(len(tree_ex2), idx)
        tree_ex2.add_node(n2)
        idx = self.x_dict.get_label_id("ROOT")
        n3 = Node(len(tree_ex2), idx)
        tree_ex2.add_node(n3)

        tree_ex2.add_edge(Edge(n0, n1))
        tree_ex2.add_edge(Edge(n1, n2))
        tree_ex2.add_edge(Edge(n3, n0))

        self.train_trees.add_tree(tree_ex2)
Example #9
    def __init__(self):
        #observation vocabulary
        self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])

        #training sequences
        train_seqs = SequenceList(self.x_dict)
        train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
        train_seqs.add_sequence(["walk", "walk", "shop", "clean"])
        train_seqs.add_sequence(["walk", "shop", "shop", "clean"])

        self.train = train_seqs
Example #10
    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab".format(
            self.corpus_file)  # file of form: w\tf\n

        self.minfreq = minfreq
        self.howbig = howbig
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))

        print("LabelDictionary created.")
Example #11
    def prepare_seqs_nl(self, vocab_f):
        self.ner_corpus = Conll2002NerCorpus(
            wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

        mapper_corpus(train_seq, self.embeddings)
        mapper_corpus(dev_seq, self.embeddings)
        mapper_corpus(test_seq, self.embeddings)

        return train_seq, dev_seq, test_seq
Example #12
    def __init__(self,
                 wordrep_dict=None,
                 eval_spec_rel=False,
                 dirname=None,
                 lr=False,
                 use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None
Example #13
    def prepare_seqs_en(self, vocab_f):
        self.ner_corpus = Conll2003NerCorpus(
            wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(
            muc_test) if self.use_muc else None

        mapper_corpus(train_seq, self.embeddings)
        mapper_corpus(dev_seq, self.embeddings)
        mapper_corpus(test_seq, self.embeddings)
        if self.use_muc:
            mapper_corpus(muc_seq, self.embeddings)

        return train_seq, dev_seq, test_seq, muc_seq
Example #14
class IDFeatures:
    '''
        Base class to extract features from a particular dataset.

        feature_dict --> Dictionary of all existing features; maps feature_name (string) --> feature_id (int)
        feature_names --> List of feature names. Each position is the feature_id and contains the feature name
        nr_feats --> Total number of features
        feature_list --> For each sentence in the corpus, contains its initial, transition, final and emission features
        dataset --> The original dataset for which the features were extracted

        Caches (for speedup):
        initial_state_feature_cache --> features of the initial tag, keyed by tag id
        node_feature_cache --> emission (word-tag) features, keyed by word id and tag id
        edge_feature_cache --> transition features, keyed by tag id and previous tag id
        final_state_feature_cache --> features of the final tag, keyed by previous tag id
    '''

    def __init__(self, dataset):
        '''dataset is a sequence list.'''
        self.feature_dict = LabelDictionary()
        self.feature_list = []

        self.add_features = False
        self.dataset = dataset

        #Speed up
        self.node_feature_cache = {}
        self.initial_state_feature_cache = {}
        self.final_state_feature_cache = {}
        self.edge_feature_cache = {}

        self.features_used = set()

    def get_num_features(self):
        return len(self.feature_dict)

    def build_features(self):
        '''
        Generic function to build features for a given dataset.
        Iterates through all sentences in the dataset and extracts its features,
        saving the node/edge features in feature list.
        '''
        self.add_features = True
        for sequence in self.dataset.seq_list:
            initial_features, transition_features, final_features, emission_features = \
                self.get_sequence_features(sequence)
            self.feature_list.append([initial_features, transition_features, final_features, emission_features])
        self.add_features = False

    def get_sequence_features(self, sequence):
        '''
        Returns the features for a given sequence.
        For a sequence of size N returns:
        Node_feature a list of size N. Each entry contains the node potentials for that position.
        Edge_features a list of size N+1.
        - Entry 0 contains the initial features
        - Entry N contains the final features
        - Entry i contains entries mapping the transition from i-1 to i.
        '''
        emission_features = []
        initial_features = []
        transition_features = []
        final_features = []

        ## Take care of first position
        features = []
        features = self.add_initial_features(sequence, sequence.y[0], features)
        initial_features.append(features)


        ## Take care of middle positions
        for pos, tag in enumerate(sequence.y):
            features = []
            features = self.add_emission_features(sequence, pos, sequence.y[pos], features)
            emission_features.append(features)
            if pos > 0:
                prev_tag = sequence.y[pos-1]
                features = []
                features = self.add_transition_features(sequence, pos-1, tag, prev_tag, features)            
                transition_features.append(features)
            """
            if pos > 1:
                prev_tag = sequence.y[pos-1]
                prev_prev_tag = sequence.y[pos-2]
                features = []
                features = self.add_transition_features(sequence, pos-1, tag, prev_tag, prev_prev_tag, features)
                transition_features.append(features)
            """
        ## Take care of final position
        features = []
        features = self.add_final_features(sequence, sequence.y[-1], features)
        final_features.append(features)

        return initial_features, transition_features, final_features, emission_features

    #f(t,y_t,X)
    # Add the word identity and if position is
    # the first also adds the tag position
    def get_emission_features(self, sequence, pos, y):
        all_feat = []
        x = sequence.x[pos]
        if x not in self.node_feature_cache:
            self.node_feature_cache[x] = {}
        if y not in self.node_feature_cache[x]:
            node_idx = []
            node_idx = self.add_emission_features(sequence, pos, y, node_idx)
            self.node_feature_cache[x][y] = node_idx
        idx = self.node_feature_cache[x][y]
        all_feat = idx[:]
        return all_feat

    #f(t,y_t,y_(t-1),X)
    ##Speed up of code
    def get_transition_features(self, sequence, pos, y, y_prev):
        assert (pos >= 0 and pos < len(sequence.x))

        if y not in self.edge_feature_cache:
            self.edge_feature_cache[y] = {}
        if y_prev not in self.edge_feature_cache[y]:
            edge_idx = []
            edge_idx = self.add_transition_features(sequence, pos, y, y_prev, edge_idx)            
            self.edge_feature_cache[y][y_prev] = edge_idx
        return self.edge_feature_cache[y][y_prev]

    def get_initial_features(self, sequence, y):
        if y not in self.initial_state_feature_cache:
            edge_idx = []
            edge_idx = self.add_initial_features(sequence, y, edge_idx)
            self.initial_state_feature_cache[y] = edge_idx
        return self.initial_state_feature_cache[y]

    def get_final_features(self, sequence, y_prev):
        if y_prev not in self.final_state_feature_cache:
            edge_idx = []
            edge_idx = self.add_final_features(sequence, y_prev, edge_idx)            
            self.final_state_feature_cache[y_prev] = edge_idx
        return self.final_state_feature_cache[y_prev]

    def add_initial_features(self, sequence, y, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Generate feature name.
        feat_name = "init_tag:{}".format(y_name)
        self.features_used.add("init_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_final_features(self, sequence, y_prev, features):
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "final_prev_tag:{}".format(y_name)
        self.features_used.add("final_prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_emission_features(self, sequence, pos, y, features):
        '''Add word-tag pair feature.'''
        x = sequence.x[pos]
        # Get tag name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get word name from ID.
        x_name = self.dataset.x_dict.get_label_name(x)
        # Generate feature name.
        feat_name = "id:{}::{}".format(x_name,y_name)
        self.features_used.add("id")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_transition_features(self, sequence, pos, y, y_prev, features):
        """ Adds a feature to the edge feature list.
        Creates a unique id if it's the first time the feature is visited,
        or returns the existing id otherwise.
        """
        assert pos < len(sequence.x)-1
        # Get label name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)
        # Get previous label name from ID.
        y_prev_name = self.dataset.y_dict.get_label_name(y_prev)
        # Generate feature name.
        feat_name = "prev_tag:{}::{}".format(y_prev_name, y_name)
        self.features_used.add("prev_tag")
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        return features

    def add_feature(self, feat_name):
        """
        Builds a dictionary of feature name to feature id
        If we are at test time and we don't have the feature
        we return -1.
        """
        # Check if feature exists and if so, return the feature ID. 
        if feat_name in self.feature_dict:
            return self.feature_dict[feat_name]
        # If 'add_features' is True, add the feature to the feature 
        # dictionary and return the feature ID. Otherwise return -1.
        if not self.add_features:
            return -1
        return self.feature_dict.add(feat_name)
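
A hedged usage sketch (not part of the original listing): how IDFeatures is typically driven, assuming train_seqs is a sequence list whose sequences expose x/y id arrays and whose dataset carries the x_dict/y_dict label dictionaries used above.

feature_mapper = IDFeatures(train_seqs)   # train_seqs: hypothetical SequenceList-style dataset
feature_mapper.build_features()           # fills feature_dict and feature_list from the training data
print(feature_mapper.get_num_features())  # total number of distinct features observed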
Example #15
def get_w_indices(targets, vocab):
    if not targets:
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
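
A short usage sketch (the vocabulary path is a hypothetical placeholder): turning target word strings into vocabulary ids; only words present in the vocabulary survive the lookup.

targets = {"walk", "shop"}                    # hypothetical target words
idx = get_w_indices(targets, "corpus.vocab")  # set of vocab ids for the targets found in the vocab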
Example #16
class Conll2002NerCorpus():
    """
    Optionally reads text to which we want to apply a wordrep such as an HMM.
    - no update of the wordrep_dict; every word not in it (from x_dict)
      gets the *unk* id needed for successful decoding
    """
    def __init__(self,
                 wordrep_dict=None,
                 eval_spec_rel=False,
                 dirname=None,
                 lr=False,
                 use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None

    def read_sequence_list_conll(self,
                                 train_file,
                                 train_file_parsed=None,
                                 train_files_parsed_path=None,
                                 max_sent_len=100000,
                                 max_nr_sent=100000):
        """
        Read a conll2002 or conll2003 file into a sequence list.
        Optionally add a sequence list/tree with *unk* for decoding in wordrep.
        """
        instance_list = self.read_conll_instances(train_file,
                                                  train_file_parsed,
                                                  train_files_parsed_path,
                                                  max_sent_len, max_nr_sent)

        if self.wordrep_dict is not None:

            seq_list = SequenceListLabel(self.word_dict, self.tag_dict,
                                         self.wordrep_dict)  # for indices
            for sent_x, sent_y, sent_ in instance_list:
                # sent_ is a normalized tree
                if self.use_wordrep_tree:
                    seq_list.add_sequence(sent_x, sent_y, None, sent_)
                # sent is a normalized chain
                else:
                    seq_list.add_sequence(sent_x, sent_y, sent_)
        else:
            seq_list = SequenceListLabel(self.word_dict,
                                         self.tag_dict)  # for indices
            for sent_x, sent_y in instance_list:
                seq_list.add_sequence(sent_x, sent_y)

        return seq_list

    def read_conll_instances(self, file, file_parsed, files_parsed_path,
                             max_sent_len, max_nr_sent):
        """
        TODO: refactor the entire method, lots of overlap chain/tree/token/lemma
        """
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed,
                                howbig=1000000,
                                lemmas=True,
                                eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname,
                                lr=self.lr)
            trees.prepare_trees()
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = self.normalize_tree(trees.train[c_append],
                                               trees.x_dict, c_append)
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        if self.use_wordrep_tree:
            if file_parsed is None or files_parsed_path is None:
                sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_y = []
        include_ex_z = (self.wordrep_dict is not None
                        and not self.use_wordrep_tree)
        if include_ex_z:
            ex_z = []

        for line in contents:
            if line.startswith("-DOCSTART"):
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instances.append(
                        [ex_x, ex_y, ex_z] if include_ex_z else [ex_x, ex_y])
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_y = []
            else:
                tag = toks[2]
                word = toks[0]
                if word not in self.word_dict:
                    self.word_dict.add(word)
                if tag not in self.tag_dict:
                    self.tag_dict.add(tag)
                ex_x.append(word)
                ex_y.append(tag)
                if include_ex_z:
                    ex_z.append(self.normalize_word(word))

        # add parsed data to use tree wordreps
        if self.use_wordrep_tree:
            for c, instance in enumerate(get_tree(len(instances))):
                # get parsed data
                inst = instance
                instances[c].append(inst)

        return instances  # try generator

    def prepare_lemmatized_conll2002(self,
                                     train_file,
                                     train_file_parsed=None,
                                     train_files_parsed_path=None,
                                     output_f=None):
        self.use_wordrep_tree = True  # need parsed data
        docstarts, instances = self.prepare_conll_instances(
            train_file, train_file_parsed, train_files_parsed_path)
        if output_f is None:
            return instances
        else:
            header = "-DOCSTART- -DOCSTART- O"
            with open(output_f, "w") as outfile:
                for n, instance in enumerate(instances):
                    # doc headers
                    if n in docstarts:
                        outfile.write("{}\n".format(header))
                    if isinstance(instance, list):
                        for _, postag, tag, lemma in zip(*instance):
                            outfile.write("{} {} {}\n".format(
                                lemma, postag, tag))
                        outfile.write("\n")
                    else:
                        sys.exit("invalid instance")

    def prepare_conll_instances(self, file, file_parsed, files_parsed_path):
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed,
                                howbig=1000000,
                                lemmas=True,
                                eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname,
                                lr=self.lr)
            trees.prepare_trees()
            self.tree_vocab = trees.x_dict
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = trees.train[c_append]
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        max_sent_len = 1000000
        max_nr_sent = 1000000
        if file_parsed is None or files_parsed_path is None:
            sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_x_pos = []
        ex_y = []
        docstarts = set()  # track docstarts header

        for line in contents:
            if line.startswith("-DOCSTART"):
                docstarts.add(nr_sent)
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instance = [ex_x, ex_x_pos, ex_y]
                    instances.append(instance)
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_x_pos = []
                ex_y = []
            else:
                tag = toks[2]
                postag = toks[1]
                word = toks[0]
                ex_x.append(word)
                ex_x_pos.append(postag)
                ex_y.append(tag)

        for c, instance in enumerate(get_tree(len(instances))):
            ex_z = self.get_words(
                instance,
                self.tree_vocab)  # should get lemmas (from ConllCorpus)
            if ex_z is None:
                inst = [i for i in instances[c][0]]
                print("None instance")
            else:
                assert len(ex_z) == len(instances[c][0])
                inst = ex_z
            instances[c].append(inst)

        return docstarts, instances  # try generator

    def normalize_word(self, word):
        if word not in self.wordrep_dict:
            return "*unk*" if word.lower(
            ) not in self.wordrep_dict else word.lower()
        else:
            return word

    def normalize_tree(self, tree, trees_vocab, c):
        """
        Recode the name index based on wordrep_dict.
        Modify tree.name such that *unk* or lowercase words are included.
        """
        for node in tree:
            w = trees_vocab.get_label_name(node.name)
            # if c==0:
            #    print("{}\t{}".format(w, self.normalize_word(w)))
            new_name = self.wordrep_dict.get_label_id(self.normalize_word(w))
            node.set_name(new_name)
        return tree

    def get_words(self, instance, vocab):
        if isinstance(instance, Tree):
            return get_words_from_tree(instance, vocab)
        print("None instance in Conll2002NerCorpus")
        return None

    def write_conll_instances(self, gold, predictions, file, sep=" "):
        """
        Create dataset with appended predictions as the last column.
        """
        assert len(gold) == len(predictions)
        contents = open(file, "w", encoding="iso-8859-1")
        for gold_seq, pred_seq in zip(gold.seq_list, predictions):
            for x, y, y_hat in zip(gold_seq.x, gold_seq.y, pred_seq.y):
                contents.write("{}{sep}{}{sep}{}\n".format(
                    gold_seq.sequence_list.x_dict.get_label_name(x),
                    gold_seq.sequence_list.y_dict.get_label_name(y),
                    pred_seq.sequence_list.y_dict.get_label_name(y_hat),
                    sep=sep))
            contents.write("\n")

    # # Dumps a corpus into a file
    def save_corpus(self, dirname):
        if not os.path.isdir(dirname + "/"):
            os.mkdir(dirname + "/")
        #word_fn = open(dir+"word.dic","w")
        #for word_id,word in enumerate(self.int_to_word):
        #    word_fn.write("{}\t{}\n".format(word_id, word))
        #word_fn.close()
        #tag_fn = open(dir+"tag.dic","w")
        #for tag_id,tag in enumerate(self.int_to_tag):
        #    tag_fn.write("{}\t{}\n".format(tag_id, tag))
        #tag_fn.close()
        #word_count_fn = open(dir+"word.count","w")
        #for word_id,counts in self.word_counts.iteritems():
        #    word_count_fn.write("{}\t{}\n".format(word_id,counts))
        #word_count_fn.close()
        self.sequence_list.save(dirname + "sequence_list")

    ## Loads a corpus from a file
    def load_corpus(self, dirname):
        word_fn = open(dirname + "word.dic")
        for line in word_fn:
            word_nr, word = line.strip().split("\t")
            self.int_to_word.append(word)
            self.word_dict[word] = int(word_nr)
        word_fn.close()
        tag_fn = open(dirname + "tag.dic")
        for line in tag_fn:
            tag_nr, tag = line.strip().split("\t")
            if tag not in self.tag_dict:
                self.int_to_tag.append(tag)
                self.tag_dict[tag] = int(tag_nr)
        tag_fn.close()
        word_count_fn = open(dirname + "word.count")
        for line in word_count_fn:
            word_nr, word_count = line.strip().split("\t")
            self.word_counts[int(word_nr)] = int(word_count)
        word_count_fn.close()
        self.sequence_list.load(dirname + "sequence_list")
Example #17
    def __init__(self):
        self.word_dict = LabelDictionary()
        self.sequence_list = SequenceList(self.word_dict)
Example #18
def get_w_indices(targets, vocab):
    if not targets:
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
Example #19
class ConllCorpus:
    def __init__(self,
                 corpus_file,
                 minfreq=0,
                 howbig=1000,
                 lemmas=True,
                 spec_rels=None,
                 dirname=None,
                 eval_spec_rels=False,
                 lr=False):
        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file,
                                                 howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        #read built vocab
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                self.r_dict = pickle.load(
                    open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id(
                        (set(r_dict.names) - set(spec_rels)),
                        self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")

    def prepare_trees(self):
        self.train = TreeList()
        #print(self.train)
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                self.train.add_tree(t)
                #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare_trees_gen(self):
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                yield t
                #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare(self, sent, lr=False):
        t = BPTree()
        #tracker = ClassTracker()
        #tracker.track_object(t)
        #tracker.create_snapshot()
        #1.pass: create nodes
        elems = sent.getSentenceLemmas() if self.lemmas else sent.getSentence()

        if lr:
            for w, i in zip(elems, sent.getIds()):
                idx = self.x_dict.get_label_id(w)
                t.add_node(BPNode(i, idx))
        else:
            for w, i, r in zip(elems, sent.getIds(), sent.deprel):
                idx = self.x_dict.get_label_id(w)
                ridx = self.r_dict.get_label_id(r)
                t.add_node(BPNode(i, idx, rel=ridx))
        #add root
        #tracker.create_snapshot("add words of sent")
        idx = self.x_dict.get_label_id("*root*")
        t.add_node(BPNode(0, idx))
        #tracker.create_snapshot("add ROOT")
        #2.pass: create edges
        seen = set()  # catch direct loops
        for i, i_head in sent.getHeads():
            # this only catches direct loops; TODO: use is_acyclic check
            if (i, i_head) in seen or (i_head, i) in seen:
                print("Tree with loop caught")
                t = None
                break
            else:
                seen.add((i, i_head))
            if i == i_head:  # not allowed
                print("Skipping sentence: parent is its own child")
                t = None
                break
            parent = t[i_head]
            child = t[i]
            if lr:
                child.rel = self.r_dict.get_label_id(
                    "left") if i_head > i else self.r_dict.get_label_id(
                        "right")  #w occurs left/right of its parent
            if parent is None or child is None:
                print()
            edge = BPEdge(parent, child)
            t.add_edge(edge)
            #tracker.create_snapshot("add edge")
            t.add_edge_to_map(parent, child, edge)
            #tracker.create_snapshot("add edge to map")

        return t

    def prepare_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab_dict = reader.getVocabulary(n_sent=self.howbig,
                                          add_root=True,
                                          lemmas=self.lemmas)

        with open(self.vocab_file, "w") as OUT:
            for w, f in vocab_dict.items():
                OUT.write("{}\t{}\n".format(w, f))

        print("Vocabulary file prepared.")

    def prepare_rel_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab = reader.getRelationVocabulary(n_sent=self.howbig)

        with open(self.rel_file, "w") as OUT:
            for r in vocab:
                OUT.write("{}\n".format(r))

        print("Relation vocabulary file prepared.")
Example #20
    def __init__(self,
                 corpus_file,
                 minfreq=0,
                 howbig=1000,
                 lemmas=True,
                 spec_rels=None,
                 dirname=None,
                 eval_spec_rels=False,
                 lr=False):
        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file,
                                                 howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        #read built vocab
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                self.r_dict = pickle.load(
                    open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary(
                        [l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id(
                        (set(r_dict.names) - set(spec_rels)),
                        self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")
Example #21
class ConllCorpus:
    def __init__(self, corpus_file, minfreq=0, howbig=1000, lemmas=True, spec_rels=None, dirname=None,
                 eval_spec_rels=False, lr=False):

        """
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file, howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        #read built vocab
        try:
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        #except FileNotFoundError:
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                self.r_dict = pickle.load(open("{}/r_dict.pickle".format(dirname), "rb"))
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                try:
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = LabelDictionary([l.strip() for l in open(self.rel_file)])
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    self.r_dict.add_fixed_id((set(r_dict.names) - set(spec_rels)), self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")

    def prepare_trees(self):
        self.train = TreeList()
        #print(self.train)
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                self.train.add_tree(t)
                #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare_trees_gen(self):
        reader = Conll07Reader(self.corpus_file)
        sent = reader.getNext()
        c = 1
        while sent and (c <= self.howbig):
            t = self.prepare(sent, lr=self.lr)
            if t is not None:
                yield t
                #tracker.create_snapshot()
            #tracker.stats.print_summary()
            sent = reader.getNext()
            c += 1

    def prepare(self, sent, lr=False):
        t = BPTree()
        #tracker = ClassTracker()
        #tracker.track_object(t)
        #tracker.create_snapshot()
        #1.pass: create nodes
        elems = sent.getSentenceLemmas() if self.lemmas else sent.getSentence()

        if lr:
            for w, i in zip(elems, sent.getIds()):
                idx = self.x_dict.get_label_id(w)
                t.add_node(BPNode(i, idx))
        else:
            for w, i, r in zip(elems, sent.getIds(), sent.deprel):
                idx = self.x_dict.get_label_id(w)
                ridx = self.r_dict.get_label_id(r)
                t.add_node(BPNode(i, idx, rel=ridx))
        #add root
        #tracker.create_snapshot("add words of sent")
        idx = self.x_dict.get_label_id("*root*")
        t.add_node(BPNode(0, idx))
        #tracker.create_snapshot("add ROOT")
        #2.pass: create edges
        seen = set()  # catch direct loops
        for i, i_head in sent.getHeads():
            # this only catches direct loops; TODO: use is_acyclic check
            if (i, i_head) in seen or (i_head, i) in seen:
                print("Tree with loop caught")
                t = None
                break
            else:
                seen.add((i, i_head))
            if i == i_head:  # not allowed
                print("Skipping sentence: parent is its own child")
                t = None
                break
            parent = t[i_head]
            child = t[i]
            if lr:
                child.rel = self.r_dict.get_label_id("left") if i_head > i else self.r_dict.get_label_id(
                    "right")  #w occurs left/right of its parent
            if parent is None or child is None:
                print()
            edge = BPEdge(parent, child)
            t.add_edge(edge)
            #tracker.create_snapshot("add edge")
            t.add_edge_to_map(parent, child, edge)
            #tracker.create_snapshot("add edge to map")

        return t

    def prepare_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab_dict = reader.getVocabulary(n_sent=self.howbig, add_root=True, lemmas=self.lemmas)

        with open(self.vocab_file, "w") as OUT:
            for w, f in vocab_dict.items():
                OUT.write("{}\t{}\n".format(w, f))

        print("Vocabulary file prepared.")

    def prepare_rel_vocab_dict(self):
        reader = Conll07Reader(self.corpus_file)
        vocab = reader.getRelationVocabulary(n_sent=self.howbig)

        with open(self.rel_file, "w") as OUT:
            for r in vocab:
                OUT.write("{}\n".format(r))

        print("Relation vocabulary file prepared.")
Example #22
class Conll2002NerCorpus():
    """
    Optionally reads text to which we want to apply a wordrep such as an HMM.
    - no update of the wordrep_dict; every word not in it (from x_dict)
      gets the *unk* id needed for successful decoding
    """

    def __init__(self, wordrep_dict=None, eval_spec_rel=False, dirname=None, lr=False, use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """

        self.wordrep_dict = wordrep_dict
        if self.wordrep_dict is not None:
            self.word_dict = self.wordrep_dict.copy()
        else:
            self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None

    def read_sequence_list_conll(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                 max_sent_len=100000, max_nr_sent=100000):
        """
        Read a conll2002 or conll2003 file into a sequence list.
        Optionally add a sequence list/tree with *unk* for decoding in wordrep.
        """
        instance_list = self.read_conll_instances(train_file, train_file_parsed, train_files_parsed_path, max_sent_len,
                                                  max_nr_sent)

        if self.wordrep_dict is not None:

            seq_list = SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)  # for indices
            for sent_x, sent_y, sent_ in instance_list:
                # sent_ is a normalized tree
                if self.use_wordrep_tree:
                    seq_list.add_sequence(sent_x, sent_y, None, sent_)
                # sent is a normalized chain
                else:
                    seq_list.add_sequence(sent_x, sent_y, sent_)
        else:
            seq_list = SequenceListLabel(self.word_dict, self.tag_dict)  # for indices
            for sent_x, sent_y in instance_list:
                seq_list.add_sequence(sent_x, sent_y)

        return seq_list

    def read_conll_instances(self, file, file_parsed, files_parsed_path, max_sent_len, max_nr_sent):
        """
        TODO: refactor the entire method, lots of overlap chain/tree/token/lemma
        """

        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True, eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        if self.use_wordrep_tree:
            if file_parsed is None or files_parsed_path is None:
                sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_y = []
        include_ex_z = (self.wordrep_dict is not None and not self.use_wordrep_tree)
        if include_ex_z:
            ex_z = []

        for line in contents:
            if line.startswith("-DOCSTART"):
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instances.append([ex_x, ex_y, ex_z] if include_ex_z else [ex_x, ex_y])
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_y = []
            else:
                tag = toks[2]
                word = toks[0]
                if word not in self.word_dict:
                    self.word_dict.add(word)
                if tag not in self.tag_dict:
                    self.tag_dict.add(tag)
                ex_x.append(word)
                ex_y.append(tag)
                if include_ex_z:
                    ex_z.append(self.normalize_word(word))

        # add parsed data to use tree wordreps
        if self.use_wordrep_tree:
            for c, instance in enumerate(get_tree(len(instances))):
                # get parsed data
                inst = instance
                instances[c].append(inst)

        return instances  # try generator

    def prepare_lemmatized_conll2002(self, train_file, train_file_parsed=None, train_files_parsed_path=None,
                                     output_f=None):
        self.use_wordrep_tree = True  # need parsed data
        docstarts, instances = self.prepare_conll_instances(train_file, train_file_parsed, train_files_parsed_path)
        if output_f is None:
            return instances
        else:
            header = "-DOCSTART- -DOCSTART- O"
            with open(output_f, "w") as outfile:
                for n, instance in enumerate(instances):
                    # doc headers
                    if n in docstarts:
                        outfile.write("{}\n".format(header))
                    if isinstance(instance, list):
                        for _, postag, tag, lemma in zip(*instance):
                            outfile.write("{} {} {}\n".format(lemma, postag, tag))
                        outfile.write("\n")
                    else:
                        sys.exit("invalid instance")

    def prepare_conll_instances(self, file, file_parsed, files_parsed_path):
        def get_tree(n_inst):
            trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True, eval_spec_rels=self.eval_spec_rel,
                                dirname=self.dirname, lr=self.lr)
            trees.prepare_trees()
            self.tree_vocab = trees.x_dict
            # not every instance has a corresponding tree due to errors in parsing
            conll_idx = ConllFilesIndex(files_parsed_path)
            conll_idx.create_ids_set()
            # extend instances with trees
            c_append = 0
            for i in range(n_inst):
                # we have a parse:
                if i + 1 in conll_idx.fileids:
                    inst = trees.train[c_append]
                    c_append += 1
                # we don't have a parse:
                else:
                    inst = None
                yield inst

        max_sent_len = 1000000
        max_nr_sent = 1000000
        if file_parsed is None or files_parsed_path is None:
            sys.exit("Missing parsed file.")

        contents = open(file, encoding="iso-8859-1")
        nr_sent = 0
        instances = []
        ex_x = []
        ex_x_pos = []
        ex_y = []
        docstarts = set()  # track docstarts header

        for line in contents:
            if line.startswith("-DOCSTART"):
                docstarts.add(nr_sent)
                continue
            toks = line.split()
            if len(toks) < 3:
                if 0 < len(ex_x) < max_sent_len:  # len(ex_x) > 1 # escape one-word sentences
                    nr_sent += 1
                    instance = [ex_x, ex_x_pos, ex_y]
                    instances.append(instance)
                if nr_sent >= max_nr_sent:
                    break
                ex_x = []
                ex_x_pos = []
                ex_y = []
            else:
                tag = toks[2]
                postag = toks[1]
                word = toks[0]
                ex_x.append(word)
                ex_x_pos.append(postag)
                ex_y.append(tag)

        for c, instance in enumerate(get_tree(len(instances))):
            ex_z = self.get_words(instance, self.tree_vocab)  # should get lemmas (from ConllCorpus)
            if ex_z is None:
                inst = [i for i in instances[c][0]]
                print("None instance")
            else:
                assert len(ex_z) == len(instances[c][0])
                inst = ex_z
            instances[c].append(inst)

        return docstarts, instances  # try generator

    def normalize_word(self, word):
        if word not in self.wordrep_dict:
            return "*unk*" if word.lower() not in self.wordrep_dict else word.lower()
        else:
            return word

    def normalize_tree(self, tree, trees_vocab, c):
        """
        Recode the name index based on wordrep_dict.
        Modify tree.name such that *unk* or lowercase words are included.
        """
        for node in tree:
            w = trees_vocab.get_label_name(node.name)
            # if c==0:
            #    print("{}\t{}".format(w, self.normalize_word(w)))
            new_name = self.wordrep_dict.get_label_id(self.normalize_word(w))
            node.set_name(new_name)
        return tree

    def get_words(self, instance, vocab):
        if isinstance(instance, Tree):
            return get_words_from_tree(instance, vocab)
        print("None instance in Conll2002NerCorpus")
        return None

    def write_conll_instances(self, gold, predictions, file, sep=" "):
        """
        Create dataset with appended predictions as the last column.
        """
        assert len(gold) == len(predictions)
        contents = open(file, "w", encoding="iso-8859-1")
        for gold_seq, pred_seq in zip(gold.seq_list, predictions):
            for x, y, y_hat in zip(gold_seq.x, gold_seq.y, pred_seq.y):
                contents.write("{}{sep}{}{sep}{}\n".format(gold_seq.sequence_list.x_dict.get_label_name(x),
                                                           gold_seq.sequence_list.y_dict.get_label_name(y),
                                                           pred_seq.sequence_list.y_dict.get_label_name(y_hat),
                                                           sep=sep))
            contents.write("\n")

    # # Dumps a corpus into a file
    def save_corpus(self, dirname):
        if not os.path.isdir(dirname + "/"):
            os.mkdir(dirname + "/")
        #word_fn = open(dir+"word.dic","w")
        #for word_id,word in enumerate(self.int_to_word):
        #    word_fn.write("{}\t{}\n".format(word_id, word))
        #word_fn.close()
        #tag_fn = open(dir+"tag.dic","w")
        #for tag_id,tag in enumerate(self.int_to_tag):
        #    tag_fn.write("{}\t{}\n".format(tag_id, tag))
        #tag_fn.close()
        #word_count_fn = open(dir+"word.count","w")
        #for word_id,counts in self.word_counts.iteritems():
        #    word_count_fn.write("{}\t{}\n".format(word_id,counts))
        #word_count_fn.close()
        self.sequence_list.save(dirname + "sequence_list")

    ## Loads a corpus from a file
    def load_corpus(self, dirname):
        word_fn = open(dirname + "word.dic")
        for line in word_fn:
            word_nr, word = line.strip().split("\t")
            self.int_to_word.append(word)
            self.word_dict[word] = int(word_nr)
        word_fn.close()
        tag_fn = open(dirname + "tag.dic")
        for line in tag_fn:
            tag_nr, tag = line.strip().split("\t")
            if tag not in self.tag_dict:
                self.int_to_tag.append(tag)
                self.tag_dict[tag] = int(tag_nr)
        tag_fn.close()
        word_count_fn = open(dirname + "word.count")
        for line in word_count_fn:
            word_nr, word_count = line.strip().split("\t")
            self.word_counts[int(word_nr)] = int(word_count)
        word_count_fn.close()
        self.sequence_list.load(dirname + "sequence_list")
Example #23
    def __init__(self, wordrep_dict=None):
        self.word_dict = LabelDictionary()
        self.tag_dict = LabelDictionary()
        self.sequence_list = None