Example #1
0
    def __init__(self, corpus_file, minfreq=0, howbig=10000):
        """
        Set up corpus paths and build the observation vocabulary.

        :param minfreq: minimum frequency of a word in order to be taken into account
        :param howbig: number of sentences to take into account
        """
        self.corpus_file = corpus_file
        # vocabulary file of form: w\tf\n
        self.vocab_file = "{}.vocab".format(self.corpus_file)

        self.minfreq = minfreq
        self.howbig = howbig

        def build_dict():
            return LabelDictionary(read_vocab(self.vocab_file, self.minfreq))

        try:
            self.x_dict = build_dict()
        except IOError:
            # vocab file not there yet: create it first, then retry
            self.prepare_vocab_dict()
            self.x_dict = build_dict()

        print("LabelDictionary created.")
Example #2
0
    def __init__(self):
        """Build a tiny hand-written training set of activity sequences."""
        # observation vocabulary
        self.x_dict = LabelDictionary(["walk", "shop", "clean", "tennis"])

        # training sequences, added in a fixed order
        seqs = SequenceList(self.x_dict)
        for sentence in (["walk", "walk", "shop", "clean"],
                         ["walk", "walk", "shop", "clean"],
                         ["walk", "shop", "shop", "clean"]):
            seqs.add_sequence(sentence)

        self.train = seqs
Example #3
0
    def prepare_seqs_nl(self, vocab_f):
        """
        Read the Dutch CoNLL-2002 NER splits and map them to embeddings.

        :param vocab_f: path to the word-representation vocabulary file
        :return: (train, dev, test) sequence lists
        """
        self.ner_corpus = Conll2002NerCorpus(
            wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        splits = [self.ner_corpus.read_sequence_list_conll(f)
                  for f in (ned_train, ned_dev, ned_test)]

        # attach embedding representations to every split
        for split in splits:
            mapper_corpus(split, self.embeddings)

        train_seq, dev_seq, test_seq = splits
        return train_seq, dev_seq, test_seq
Example #4
0
def get_w_reps(idx, w_reps, vocab):
    """
    Collect the words (and their representations) whose vocab ids are in *idx*.

    :param idx: collection of vocabulary ids to keep; falsy means "keep nothing"
    :param w_reps: iterable of (word, representation) pairs
    :param vocab: path to the vocabulary file
    :return: (words, representations) as two parallel lists
    """
    words, representations = [], []
    if not idx:
        return words, representations

    w_dict = LabelDictionary(read_vocab(vocab))
    for word, rep in w_reps:
        if w_dict.get_label_id(word) not in idx:
            continue
        # guard against corrupt vectors
        assert not np.isnan(np.sum(rep))
        words.append(word)
        representations.append(rep)

    return words, representations
Example #5
0
    def __init__(self,
                 wordrep_dict=None,
                 eval_spec_rel=False,
                 dirname=None,
                 lr=False,
                 use_wordrep_tree=False):
        """
        :param wordrep_dict: x_dictionary from training of word representations
        :param use_wordrep_tree: use parse tree representations
        """
        self.wordrep_dict = wordrep_dict
        # start from a copy of the pretrained vocabulary when one is given,
        # otherwise from an empty dictionary
        self.word_dict = (LabelDictionary() if self.wordrep_dict is None
                          else self.wordrep_dict.copy())
        self.tag_dict = LabelDictionary()  # ner tag
        self.use_wordrep_tree = use_wordrep_tree
        self.sequence_list = None  # SequenceListLabel(self.word_dict, self.tag_dict, self.wordrep_dict)
        self.eval_spec_rel = eval_spec_rel
        self.dirname = dirname
        self.lr = lr
        # for conll2002 lemma format preparation:
        self.tree_vocab = None
Example #6
0
    def __init__(self, dataset):
        '''dataset is a sequence list.'''
        # mapping feature name -> id, plus the reverse list
        self.feature_dict = LabelDictionary()
        self.feature_list = []

        self.add_features = False
        self.dataset = dataset

        # per-position caches so features are computed at most once (speed-up)
        (self.node_feature_cache,
         self.initial_state_feature_cache,
         self.final_state_feature_cache,
         self.edge_feature_cache) = {}, {}, {}, {}

        self.features_used = set()
Example #7
0
def load_embed(embed_f, vocab_f):
    """
    Read a word2vec-style embedding file and return the numpy matrix,
    where row ids correspond to vocab ids.

    :param embed_f: embeddings file; first line is "<n_words> <dim>", then one
        "<word> <v_1> ... <v_dim>" line per word
    :param vocab_f: vocabulary file understood by read_vocab
    :return: numpy array of shape (n_words - 1, dim)
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # header: word count and dimensionality. Parse with int() instead of
        # eval() -- never evaluate file contents as code.
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n

        if w not in w_dict:
            # word absent from the vocabulary: its vector is not needed
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
Example #8
0
    def prepare_seqs_en(self, vocab_f):
        """
        Read the English CoNLL-2003 NER splits (plus MUC when enabled) and
        map them to embeddings.

        :param vocab_f: path to the word-representation vocabulary file
        :return: (train, dev, test, muc) sequence lists; muc is None unless
            self.use_muc is set
        """
        self.ner_corpus = Conll2003NerCorpus(
            wordrep_dict=LabelDictionary(read_vocab(vocab_f)))

        read = self.ner_corpus.read_sequence_list_conll
        train_seq = read(eng_train)
        dev_seq = read(eng_dev)
        test_seq = read(eng_test)
        muc_seq = read(muc_test) if self.use_muc else None

        for seq in (train_seq, dev_seq, test_seq):
            mapper_corpus(seq, self.embeddings)
        if self.use_muc:
            mapper_corpus(muc_seq, self.embeddings)

        return train_seq, dev_seq, test_seq, muc_seq
Example #9
0
    def __init__(self):
        """Build a toy treebank of two hand-written dependency trees."""
        self.x_dict = LabelDictionary(
            ["write", "that", "code", "ROOT", "don't"])
        self.train_trees = TreeList()

        def new_node(tree, word):
            # A node's id is the tree's current size (0, 1, 2, ...);
            # its payload is the word's vocabulary id.
            node = Node(len(tree), self.x_dict.get_label_id(word))
            tree.add_node(node)
            return node

        tree_ex1 = Tree()  # container for node_list and edge_list
        n0 = new_node(tree_ex1, "write")  # id 0
        n1 = new_node(tree_ex1, "that")
        n2 = new_node(tree_ex1, "code")
        n3 = new_node(tree_ex1, "ROOT")

        tree_ex1.add_edge(Edge(n0, n2))
        tree_ex1.add_edge(Edge(n2, n1))
        tree_ex1.add_edge(Edge(n3, n0))

        self.train_trees.add_tree(tree_ex1)

        # BUG FIX: the original sized node ids of the second tree with
        # len(tree_ex1) (always 4 here), so tree 2 got ids 4..7 instead of
        # restarting at 0 as the "# len is 0" comment intended.
        tree_ex2 = Tree()
        n0 = new_node(tree_ex2, "don't")  # id 0
        n1 = new_node(tree_ex2, "write")
        n2 = new_node(tree_ex2, "code")
        n3 = new_node(tree_ex2, "ROOT")

        tree_ex2.add_edge(Edge(n0, n1))
        tree_ex2.add_edge(Edge(n1, n2))
        tree_ex2.add_edge(Edge(n3, n0))

        self.train_trees.add_tree(tree_ex2)
 def __init__(self):
     """Create an empty word dictionary and a sequence list backed by it."""
     vocab = LabelDictionary()
     self.word_dict = vocab
     self.sequence_list = SequenceList(vocab)
Example #11
0
    def __init__(self,
                 corpus_file,
                 minfreq=0,
                 howbig=1000,
                 lemmas=True,
                 spec_rels=None,
                 dirname=None,
                 eval_spec_rels=False,
                 lr=False):
        """
        :param minfreq: minimum frequency of a word to be taken into account
        :param howbig: number of sentences to take into account
        :param lemmas: presumably use lemmas instead of word forms -- only
            stored here; verify against the reader
        :param spec_rels: relations kept as-is; all others are mapped to OTHER
        :param dirname: directory for reading/writing the relation dictionary
        :param eval_spec_rels: evaluation mode: load the pickled relation dict
        :param lr: use only "left"/"right" direction as relations
        """
        self.corpus_file = corpus_file
        self.vocab_file = "{}.vocab{}".format(self.corpus_file, howbig)
        self.rel_file = "{}.rels.vocab{}".format(self.corpus_file,
                                                 howbig)  # dependency labels

        self.minfreq = minfreq
        self.howbig = howbig
        self.lemmas = lemmas
        self.lr = lr
        # read built vocab; IOError also covers FileNotFoundError on py3
        try:
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        except IOError:
            self.prepare_vocab_dict()
            self.x_dict = LabelDictionary(
                read_vocab(self.vocab_file, self.minfreq))
        print("LabelDictionary created.")

        if eval_spec_rels:  # in evaluation
            try:
                import pickle

                # use a context manager so the pickle file handle is closed
                with open("{}/r_dict.pickle".format(dirname), "rb") as pkl_f:
                    self.r_dict = pickle.load(pkl_f)
            except IOError:
                sys.exit("r_dict does not exist.")
        else:
            if self.lr:
                self.r_dict = RelationDictionary(["left", "right"])
                self.r_dict.write("{}/r_dict.pickle".format(dirname))
            else:
                def read_rels():
                    # close the rel file promptly instead of leaking the
                    # handle opened inside the list comprehension
                    with open(self.rel_file) as rel_f:
                        return LabelDictionary(
                            [l.strip() for l in rel_f])

                try:
                    r_dict = read_rels()
                except IOError:
                    self.prepare_rel_vocab_dict()
                    r_dict = read_rels()
                if spec_rels:
                    self.r_dict = RelationDictionary(spec_rels)
                    self.r_dict.add("OTHER")
                    # collapse every non-special relation onto OTHER's id
                    self.r_dict.add_fixed_id(
                        (set(r_dict.names) - set(spec_rels)),
                        self.r_dict.get_label_id("OTHER"))
                    self.r_dict.write("{}/r_dict.pickle".format(dirname))
                else:
                    self.r_dict = r_dict
        print("Relation/LabelDictionary created.")
Example #12
0
def get_w_indices(targets, vocab):
    """
    Map target words to their vocabulary ids.

    :param targets: iterable of words to look up; falsy means none
    :param vocab: path to the vocabulary file
    :return: set of vocab ids for those targets present in the vocabulary
    """
    if not targets:
        # BUG FIX: the original returned {} (an empty *dict*) here while the
        # main path returns a set -- callers got inconsistent types.
        return set()
    w_dict = LabelDictionary(read_vocab(vocab))
    return {w_dict.get_label_id(t) for t in targets if t in w_dict}
Example #13
0
 def __init__(self, wordrep_dict=None):
     """Initialize empty word/tag dictionaries; sequences are loaded later.

     :param wordrep_dict: accepted for interface compatibility; not used here
     """
     self.sequence_list = None
     self.word_dict = LabelDictionary()
     self.tag_dict = LabelDictionary()