Example no. 1
0
    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False, eval_spec_rel=False,
                 logger=None, ignore_rel=None, lr=False, use_muc=False):
        """
        Read a trained HMM's parameters from *path*, load the matching
        corpus, and decode word representations for the NER datasets of
        the given language.

        :param path: directory with the trained-model npy/settings files
        :param lang: evaluation language, "nl" or "en"
        :param decoding: decoding method; when None, defaults to
            "max-product" for tree/relation representations, else "viterbi"
        :param use_wordrep_tree: use tree-structured word representations
        :param use_wordrep_rel: use relation-conditioned word representations
        :param eval_spec_rel: evaluate with specialized relations
        :param logger: optional logger for debug messages (may be None)
        :param ignore_rel: name of a dependency relation to ignore in decoding
        :param lr: left/right relation option passed through to the corpus
        :param use_muc: additionally prepare the MUC sequences (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            # BUG FIX: report the fallback only when we actually defaulted;
            # previously this printed "Using default: ..." even when the
            # caller had explicitly chosen a decoding method.
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None
        self.logger = logger
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # lemmas are used for every language except English
            lemmas = False if self.lang == "en" else True
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
            # map the relation name to its id in the corpus relation dict
            self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen()  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()

        self.ner_corpus = None

        # dispatch on (language, representation type); each prepare_* call
        # reads the NER data and decodes it with the loaded model
        if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")

        # the corpus is no longer needed once the sequences are decoded
        self.dataset = None
Example no. 2
0
    def main(self, path):
        """
        Load the emission probabilities and the corpus of a finished
        experiment from its output directory.

        :param path: path to dir containing npy and settings files from the experiment
        """
        self.path = path
        self.ep = np.load("{}/ep.npy".format(self.path))
        # get some info from the settings file
        with open("{}/settings".format(self.path)) as infile:
            data_name = None
            n_sent = None
            for l in infile:
                if l.startswith("Name of the corpus file: "):
                    data_name = l.strip().split(" ")[-1]
                elif l.startswith("Number of sentences: "):
                    n_sent = l.strip().split(" ")[-1]
            if data_name is None:
                print("Not able to retrieve the dataset name.")
            if n_sent is None:
                print("Not able to retrieve the number of sentences.")

        self.data_name = data_name
        # BUG FIX: use int() instead of eval() -- eval() would execute
        # arbitrary text read from the settings file; the value is a
        # plain integer count.
        self.n_sent = int(n_sent)
        # BUG FIX: the original condition was `if "tree" or "_rel_" in path`,
        # which is always truthy (the literal "tree" is a non-empty string),
        # so the TextCorpus branch was unreachable. Test both substrings.
        if "tree" in path or "_rel_" in path:
            if "_en_" in path:
                self.data = ConllCorpus(self.data_name,
                                        howbig=self.n_sent,
                                        lemmas=False)
            elif "_nl_" in path:
                self.data = ConllCorpus(self.data_name, howbig=self.n_sent)
        else:
            self.data = TextCorpus(self.data_name, howbig=self.n_sent)

        self.prob_thresh = None
        self.n = None  # max n of clusters per w
Example no. 3
0
    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False):
        """
        Debug variant: read a trained HMM's parameters from *path*, load
        the matching corpus, and decode word representations for the NER
        datasets of the given language.

        :param path: directory with the trained-model files
        :param lang: evaluation language, "nl" or "en"
        :param decoding: decoding method; when None, defaults to
            "max-product" for tree representations, else "viterbi"
        :param use_wordrep_tree: use tree-structured word representations
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            # BUG FIX: report the fallback only when we actually defaulted;
            # previously this printed "Using default: ..." even when the
            # caller had explicitly chosen a decoding method.
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None

        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file = \
            self.read_params_from_path()

        if self.use_wordrep_tree:
            # English corpora are loaded without lemmas
            if self.lang == "en":
                self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                           howbig=self.n_sent,
                                           lemmas=False)
            elif self.lang == "nl":
                self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                           howbig=self.n_sent)
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file),
                                      howbig=self.n_sent)
        self.ner_corpus = None

        # dispatch on (language, representation type); debug variants only
        # prepare a subset of the sequences
        if self.lang == "nl" and not self.use_wordrep_tree:
            self.dev_seq, self.test_seq = self.prepare_seqs_nl_dbg(
                self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and self.use_wordrep_tree:
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(
                self.decoding)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not self.use_wordrep_tree:
            self.dev_seq = self.prepare_seqs_en_dbg(self.decoding)

        elif self.lang == "en" and self.use_wordrep_tree:
            self.dev_seq = self.prepare_trees_en_dbg(self.decoding)
Example no. 4
0
    def __init__(self,
                 path,
                 lang,
                 decoding=None,
                 use_wordrep_tree=False,
                 use_wordrep_rel=False,
                 eval_spec_rel=False,
                 logger=None,
                 ignore_rel=None,
                 lr=False,
                 use_muc=False):
        """
        Read a trained HMM's parameters from *path*, load the matching
        corpus, and decode word representations for the NER datasets of
        the given language.

        :param path: directory with the trained-model npy/settings files
        :param lang: evaluation language, "nl" or "en"
        :param decoding: decoding method; when None, defaults to
            "max-product" for tree/relation representations, else "viterbi"
        :param use_wordrep_tree: use tree-structured word representations
        :param use_wordrep_rel: use relation-conditioned word representations
        :param eval_spec_rel: evaluate with specialized relations
        :param logger: optional logger for debug messages (may be None)
        :param ignore_rel: name of a dependency relation to ignore in decoding
        :param lr: left/right relation option passed through to the corpus
        :param use_muc: additionally prepare the MUC sequences (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            # BUG FIX: report the fallback only when we actually defaulted;
            # previously this printed "Using default: ..." even when the
            # caller had explicitly chosen a decoding method.
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None
        self.logger = logger
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # lemmas are used for every language except English
            lemmas = False if self.lang == "en" else True
            self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                       howbig=self.n_sent,
                                       lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel,
                                       dirname=self.path,
                                       lr=lr)
            # map the relation name to its id in the corpus relation dict
            self.ignore_rel = self.dataset.r_dict.get_label_id(
                ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen(
                )  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file),
                                      howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()

        self.ner_corpus = None

        # dispatch on (language, representation type); each prepare_* call
        # reads the NER data and decodes it with the loaded model
        if self.lang == "nl" and not (self.use_wordrep_tree
                                      or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(
                self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(
                self.decoding, lr=lr)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not (self.use_wordrep_tree
                                        or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(
                self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(
                self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")

        # the corpus is no longer needed once the sequences are decoded
        self.dataset = None
Example no. 5
0
class PrepareHmmRep():
    """
    Applying hmm-based representations to the evaluation dataset. This includes decoding.
    """
    def __init__(self,
                 path,
                 lang,
                 decoding=None,
                 use_wordrep_tree=False,
                 use_wordrep_rel=False,
                 eval_spec_rel=False,
                 logger=None,
                 ignore_rel=None,
                 lr=False,
                 use_muc=False):
        """
        Read a trained HMM's parameters from *path*, load the matching
        corpus, and decode word representations for the NER datasets of
        the given language.

        :param path: directory with the trained-model npy/settings files
        :param lang: evaluation language, "nl" or "en"
        :param decoding: decoding method; when None, defaults to
            "max-product" for tree/relation representations, else "viterbi"
        :param use_wordrep_tree: use tree-structured word representations
        :param use_wordrep_rel: use relation-conditioned word representations
        :param eval_spec_rel: evaluate with specialized relations
        :param logger: optional logger for debug messages (may be None)
        :param ignore_rel: name of a dependency relation to ignore in decoding
        :param lr: left/right relation option passed through to the corpus
        :param use_muc: additionally prepare the MUC sequences (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            # BUG FIX: report the fallback only when we actually defaulted;
            # previously this printed "Using default: ..." even when the
            # caller had explicitly chosen a decoding method.
            print("Using default: {}".format(self.decoding))
        self.n_states = None
        self.n_obs = None
        self.n_sent = None
        self.n_toks = None
        self.corpus_file = None
        self.logger = logger
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # lemmas are used for every language except English
            lemmas = False if self.lang == "en" else True
            self.dataset = ConllCorpus("{}".format(self.corpus_file),
                                       howbig=self.n_sent,
                                       lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel,
                                       dirname=self.path,
                                       lr=lr)
            # map the relation name to its id in the corpus relation dict
            self.ignore_rel = self.dataset.r_dict.get_label_id(
                ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen(
                )  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file),
                                      howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()

        self.ner_corpus = None

        # dispatch on (language, representation type); each prepare_* call
        # reads the NER data and decodes it with the loaded model
        if self.lang == "nl" and not (self.use_wordrep_tree
                                      or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(
                self.decoding)
            # self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(
                self.decoding, lr=lr)
            # self.test_seq = self.prepare_trees_nl(self.decoding)
        elif self.lang == "en" and not (self.use_wordrep_tree
                                        or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(
                self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree
                                    or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(
                self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")

        # the corpus is no longer needed once the sequences are decoded
        self.dataset = None

    def prepare_seqs_nl(self, decoding="viterbi"):
        """
        Read the Dutch (CoNLL-2002) NER train/dev/test data and decode it
        in place with the chain HMM loaded from self.path.

        :param decoding: one of "viterbi", "max_emission", "posterior",
            "posterior_cont", "posterior_cont_type"; anything else falls
            back to Viterbi
        :return: (train_seq, dev_seq, test_seq)
        """
        # fixed model parameters: initial, transition, final, emission
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))

        h = HMM(self.n_states,
                self.n_obs,
                params=params_fixed,
                writeout=False,
                dirname=self.path)

        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict,
                                             eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path)
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

        # type-level decoding needs the corpus, hence the separate callable
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        print(
            "Decoding word representations on train. This may take a while...")
        type_decoder(
            train_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(
            dev_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(
            test_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(test_seq)

        return train_seq, dev_seq, test_seq

    def prepare_seqs_en(self, decoding="viterbi"):
        """
        Read the English (CoNLL-2003) NER train/dev/test data -- and the
        MUC test set when self.use_muc is set -- and decode it in place
        with the chain HMM loaded from self.path.

        :param decoding: one of "viterbi", "max_emission", "posterior",
            "posterior_cont", "posterior_cont_type"; anything else falls
            back to Viterbi
        :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None
            unless self.use_muc
        """
        # fixed model parameters: initial, transition, final, emission
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))

        h = HMM(self.n_states,
                self.n_obs,
                params=params_fixed,
                writeout=False,
                dirname=self.path)

        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(
            muc_test) if self.use_muc else None

        # type-level decoding needs the corpus, hence the separate callable
        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        print(
            "Decoding word representations on train. This may take a while...")
        type_decoder(
            train_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(train_seq)
        print("Decoding word representations on dev.")
        type_decoder(
            dev_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(dev_seq)
        print("Decoding word representations on test.")
        type_decoder(
            test_seq, self.dataset,
            self.logger) if type_decoder is not None else decoder(test_seq)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(
                muc_seq, self.dataset,
                self.logger) if type_decoder is not None else decoder(muc_seq)

        return train_seq, dev_seq, test_seq, muc_seq

    def prepare_trees_nl(self, decoding="max-product", lr=False):
        """
        Read the parsed Dutch NER train/dev/test data and decode it in
        place with the tree model (HMRTM when relation-conditioned, else
        HMTM) loaded from self.path.

        :param decoding: one of "max-product", "posterior",
            "posterior_cont", "posterior_cont_type"; anything else falls
            back to max-product
        :param lr: left/right relation option passed to the NER corpus
        :return: (train_seq, dev_seq, test_seq)
        """
        # fixed model parameters: initial, transition, final, emission
        # NOTE(review): no "/" before the filename here (unlike
        # prepare_seqs_*), so self.path presumably ends with a separator
        # on this code path -- confirm against callers.
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        if self.use_wordrep_rel:
            h = HMRTM(self.n_states,
                      self.n_obs,
                      R=len(self.dataset.r_dict),
                      params=params_fixed,
                      writeout=False,
                      dirname=self.path,
                      omit_class_cond=self.omit_class_cond,
                      omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states,
                     self.n_obs,
                     params=params_fixed,
                     writeout=False,
                     dirname=self.path)
        # h.dirname = self.path
        # BUG FIX: logger defaults to None (and __init__ guards it), so
        # every debug call here must be guarded too.
        if self.logger is not None:
            self.logger.debug("Creating self.ner_corpus")
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict,
                                             eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path,
                                             lr=lr,
                                             use_wordrep_tree=True)

        if self.logger is not None:
            self.logger.debug("Reading ner data from self.ner_corpus")
        train_seq = self.ner_corpus.read_sequence_list_conll(
            ned_train, ned_train_parsed, ned_train_parsed_files_path)
        dev_seq = self.ner_corpus.read_sequence_list_conll(
            ned_dev, ned_dev_parsed, ned_dev_parsed_files_path)
        test_seq = self.ner_corpus.read_sequence_list_conll(
            ned_test, ned_test_parsed, ned_test_parsed_files_path)

        # type-level decoding needs the corpus, hence the separate callable
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        # elif decoding == "max_emission":
        # decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus

        if self.logger is not None:
            self.logger.debug("Decoding.")
        print(
            "Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         test_seq, self.ignore_rel)

        return train_seq, dev_seq, test_seq

    def prepare_trees_en(self, decoding="max-product", lr=False):
        """
        Read the parsed English NER train/dev/test data -- and the MUC
        test set when self.use_muc is set -- and decode it in place with
        the tree model (HMRTM when relation-conditioned, else HMTM)
        loaded from self.path.

        :param decoding: one of "max-product", "posterior",
            "posterior_cont", "posterior_cont_type"; anything else falls
            back to max-product
        :param lr: left/right relation option passed to the NER corpus
        :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None
            unless self.use_muc
        """
        # fixed model parameters: initial, transition, final, emission
        # NOTE(review): no "/" before the filename here (unlike
        # prepare_seqs_*) -- see prepare_trees_nl.
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        if self.use_wordrep_rel:
            h = HMRTM(self.n_states,
                      self.n_obs,
                      R=len(self.dataset.r_dict),
                      params=params_fixed,
                      writeout=False,
                      dirname=self.path,
                      omit_class_cond=self.omit_class_cond,
                      omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states,
                     self.n_obs,
                     params=params_fixed,
                     writeout=False,
                     dirname=self.path)

        # BUG FIX: logger defaults to None (and __init__ guards it), so
        # the debug call must be guarded here too.
        if self.logger is not None:
            self.logger.debug("Reading ner data from self.ner_corpus")
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict,
                                             eval_spec_rel=self.eval_spec_rel,
                                             dirname=self.path,
                                             lr=lr,
                                             use_wordrep_tree=True)

        train_seq = self.ner_corpus.read_sequence_list_conll(
            eng_train, eng_train_parsed)
        dev_seq = self.ner_corpus.read_sequence_list_conll(
            eng_dev, eng_dev_parsed)
        test_seq = self.ner_corpus.read_sequence_list_conll(
            eng_test, eng_test_parsed)
        muc_seq = self.ner_corpus.read_sequence_list_conll(
            muc_test, muc_test_parsed) if self.use_muc else None

        # type-level decoding needs the corpus, hence the separate callable
        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus

        print(
            "Decoding word representations on train. This may take a while...")
        type_decoder(train_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         train_seq, self.ignore_rel)
        print("Decoding word representations on dev.")
        type_decoder(dev_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         dev_seq, self.ignore_rel)
        print("Decoding word representations on test.")
        type_decoder(test_seq, self.dataset,
                     self.logger) if type_decoder is not None else decoder(
                         test_seq, self.ignore_rel)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            type_decoder(muc_seq, self.dataset,
                         self.logger) if type_decoder is not None else decoder(
                             muc_seq, self.ignore_rel)

        return train_seq, dev_seq, test_seq, muc_seq
Example no. 6
0
        dataset = ConllCorpus(args.dataset,
                              howbig=n_sent,
                              lemmas=lemmas,
                              spec_rels=args.rel_spec_nl,
                              dirname=dirname,
                              lr=args.lr)
    else:
        dataset = ConllCorpus(args.dataset,
                              howbig=n_sent,
                              lemmas=lemmas,
                              spec_rels=None,
                              dirname=dirname,
                              lr=args.lr)
    n_rels = len(dataset.r_dict)
else:
    dataset = TextCorpus(args.dataset, howbig=n_sent)
    dataset.prepare_chains()

n_obs = len(dataset.x_dict)

writeout = args.writeout

if args.rel or args.lr:
    model = HMRTM
elif args.tree:
    model = HMTM
else:
    model = HMM
if args.params is not None:
    params_fixed_path = args.params
    if args.params_trained:
Example no. 7
0
class PrepareHmmRep():
    """
    Applying hmm-based representations to the evaluation dataset. This includes decoding.

    Depending on *lang* and the word-representation flags, the constructor loads the
    training corpus, reads the matching NER evaluation corpus, decodes every sequence
    with a fixed (pre-trained) HMM variant, and stores the decoded sequences on the
    instance (``train_seq``/``dev_seq``/``test_seq`` and, for English with
    ``use_muc``, ``muc_seq``).
    """

    def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False, eval_spec_rel=False,
                 logger=None, ignore_rel=None, lr=False, use_muc=False):
        """
        :param path: directory holding the trained model parameters (``*.npy`` files)
            and the metadata consumed by ``read_params_from_path``
        :param lang: "nl" or "en"; any other value exits the process
        :param decoding: decoding strategy name; if None, defaults to "max-product"
            for tree/relation representations and "viterbi" otherwise
        :param use_wordrep_tree: use tree-structured word representations
        :param use_wordrep_rel: use relation-conditioned word representations
        :param eval_spec_rel: evaluate with specialized relations
        :param logger: optional logger for debug output (may be None)
        :param ignore_rel: relation label to ignore during decoding (tree/rel only)
        :param lr: flag passed through to the corpus readers and NER corpus
        :param use_muc: additionally decode the MUC test set (English only)
        """
        self.path = path
        self.lang = lang
        self.decoding = decoding
        self.use_wordrep_tree = use_wordrep_tree
        self.use_wordrep_rel = use_wordrep_rel
        self.eval_spec_rel = eval_spec_rel
        self.use_muc = use_muc
        if self.decoding is None:
            print("Decoding method not specified.")
            if self.use_wordrep_tree or self.use_wordrep_rel:
                self.decoding = "max-product"
            else:
                self.decoding = "viterbi"
            # BUGFIX: this message was previously printed unconditionally, i.e. even
            # when the caller had explicitly chosen a decoding method.
            print("Using default: {}".format(self.decoding))
        self.logger = logger
        self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, self.omit_class_cond, self.omit_emis_cond = \
            read_params_from_path(self.path)
        if self.logger is not None:
            self.logger.debug("Preparing self.dataset")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            # English corpora use surface forms; other languages use lemmas.
            lemmas = self.lang != "en"
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=lemmas,
                                       eval_spec_rels=self.eval_spec_rel, dirname=self.path, lr=lr)
            self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
            if decoding == "posterior_cont_type":
                self.dataset.train = self.dataset.prepare_trees_gen()  # generator
        else:
            self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
            if decoding == "posterior_cont_type":
                self.dataset.prepare_chains()

        self.ner_corpus = None

        if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
        elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
        elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
        elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
            self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
        else:
            sys.exit("invalid option in PrepareHmmRep")

        # Release the training corpus; only the decoded sequences are kept.
        self.dataset = None

    def _run_decoder(self, seq, decoder, type_decoder, *decoder_args):
        """Apply either the type-level decoder or the token-level decoder to *seq*.

        *type_decoder* takes precedence when set (posterior_cont_type decoding);
        otherwise *decoder* is called with *seq* followed by *decoder_args*
        (e.g. ``self.ignore_rel`` for tree/relation models).
        """
        if type_decoder is not None:
            type_decoder(seq, self.dataset, self.logger)
        else:
            decoder(seq, *decoder_args)

    def prepare_seqs_nl(self, decoding="viterbi"):
        """Read and decode the Dutch (CoNLL-2002) NER sequences with a chain HMM.

        :return: (train_seq, dev_seq, test_seq) with decoded representations attached
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))

        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)

        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel, dirname=self.path)
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test)

        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        print("Decoding word representations on train. This may take a while...")
        self._run_decoder(train_seq, decoder, type_decoder)
        print("Decoding word representations on dev.")
        self._run_decoder(dev_seq, decoder, type_decoder)
        print("Decoding word representations on test.")
        self._run_decoder(test_seq, decoder, type_decoder)

        return train_seq, dev_seq, test_seq

    def prepare_seqs_en(self, decoding="viterbi"):
        """Read and decode the English (CoNLL-2003) NER sequences with a chain HMM.

        :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None unless
            ``self.use_muc`` is set
        """
        params_fixed = (np.load("{}/ip.npy".format(self.path)),
                        np.load("{}/tp.npy".format(self.path)),
                        np.load("{}/fp.npy".format(self.path)),
                        np.load("{}/ep.npy".format(self.path)))

        h = HMM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)

        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict)

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test) if self.use_muc else None

        decoder = None
        type_decoder = None
        if decoding == "viterbi":
            decoder = h.viterbi_decode_corpus
        elif decoding == "max_emission":
            decoder = h.max_emission_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined correctly, using Viterbi.")
            decoder = h.viterbi_decode_corpus

        print("Decoding word representations on train. This may take a while...")
        self._run_decoder(train_seq, decoder, type_decoder)
        print("Decoding word representations on dev.")
        self._run_decoder(dev_seq, decoder, type_decoder)
        print("Decoding word representations on test.")
        self._run_decoder(test_seq, decoder, type_decoder)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            self._run_decoder(muc_seq, decoder, type_decoder)

        return train_seq, dev_seq, test_seq, muc_seq

    def prepare_trees_nl(self, decoding="max-product", lr=False):
        """Read and decode the Dutch NER data with a tree-structured HMM (HMTM/HMRTM).

        :return: (train_seq, dev_seq, test_seq) with decoded representations attached
        """
        # NOTE(review): these paths lack the "/" separator used by the chain-model
        # methods ("{}ip.npy" vs "{}/ip.npy"); presumably self.path ends with "/"
        # in the tree case — confirm against callers before normalizing.
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)
        # BUGFIX: the debug calls below used to dereference self.logger without a
        # None check, crashing under the default logger=None.
        if self.logger is not None:
            self.logger.debug("Creating self.ner_corpus")
        self.ner_corpus = Conll2002NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel, dirname=self.path,
                                             lr=lr, use_wordrep_tree=True)

        if self.logger is not None:
            self.logger.debug("Reading ner data from self.ner_corpus")
        train_seq = self.ner_corpus.read_sequence_list_conll(ned_train, ned_train_parsed, ned_train_parsed_files_path)
        dev_seq = self.ner_corpus.read_sequence_list_conll(ned_dev, ned_dev_parsed, ned_dev_parsed_files_path)
        test_seq = self.ner_corpus.read_sequence_list_conll(ned_test, ned_test_parsed, ned_test_parsed_files_path)

        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus

        if self.logger is not None:
            self.logger.debug("Decoding.")
        print("Decoding word representations on train. This may take a while...")
        self._run_decoder(train_seq, decoder, type_decoder, self.ignore_rel)
        print("Decoding word representations on dev.")
        self._run_decoder(dev_seq, decoder, type_decoder, self.ignore_rel)
        print("Decoding word representations on test.")
        self._run_decoder(test_seq, decoder, type_decoder, self.ignore_rel)

        return train_seq, dev_seq, test_seq

    def prepare_trees_en(self, decoding="max-product", lr=False):
        """Read and decode the English NER data with a tree-structured HMM (HMTM/HMRTM).

        :return: (train_seq, dev_seq, test_seq, muc_seq); muc_seq is None unless
            ``self.use_muc`` is set
        """
        # NOTE(review): path format mirrors prepare_trees_nl (no "/" separator);
        # see the note there.
        params_fixed = (np.load("{}ip.npy".format(self.path)),
                        np.load("{}tp.npy".format(self.path)),
                        np.load("{}fp.npy".format(self.path)),
                        np.load("{}ep.npy".format(self.path)))

        if self.use_wordrep_rel:
            h = HMRTM(self.n_states, self.n_obs, R=len(self.dataset.r_dict), params=params_fixed, writeout=False,
                      dirname=self.path, omit_class_cond=self.omit_class_cond, omit_emis_cond=self.omit_emis_cond)
        else:
            h = HMTM(self.n_states, self.n_obs, params=params_fixed, writeout=False, dirname=self.path)

        # BUGFIX: guard the debug call; self.logger may legitimately be None.
        if self.logger is not None:
            self.logger.debug("Reading ner data from self.ner_corpus")
        self.ner_corpus = Conll2003NerCorpus(self.dataset.x_dict, eval_spec_rel=self.eval_spec_rel, dirname=self.path,
                                             lr=lr, use_wordrep_tree=True)

        train_seq = self.ner_corpus.read_sequence_list_conll(eng_train, eng_train_parsed)
        dev_seq = self.ner_corpus.read_sequence_list_conll(eng_dev, eng_dev_parsed)
        test_seq = self.ner_corpus.read_sequence_list_conll(eng_test, eng_test_parsed)
        muc_seq = self.ner_corpus.read_sequence_list_conll(muc_test, muc_test_parsed) if self.use_muc else None

        decoder = None
        type_decoder = None
        if decoding == "max-product":
            decoder = h.max_product_decode_corpus
        elif decoding == "posterior":
            decoder = h.posterior_decode_corpus
        elif decoding == "posterior_cont":
            decoder = h.posterior_cont_decode_corpus
        elif decoding == "posterior_cont_type":
            type_decoder = h.posterior_cont_type_decode_corpus
        else:
            print("Decoder not defined, using Max-product message passing.")
            decoder = h.max_product_decode_corpus

        print("Decoding word representations on train. This may take a while...")
        self._run_decoder(train_seq, decoder, type_decoder, self.ignore_rel)
        print("Decoding word representations on dev.")
        self._run_decoder(dev_seq, decoder, type_decoder, self.ignore_rel)
        print("Decoding word representations on test.")
        self._run_decoder(test_seq, decoder, type_decoder, self.ignore_rel)
        if self.use_muc:
            print("Decoding word representations on MUC.")
            self._run_decoder(muc_seq, decoder, type_decoder, self.ignore_rel)

        return train_seq, dev_seq, test_seq, muc_seq
Esempio n. 8
0
 def create_vocab(self, dataset):
     """Build and return the observation vocabulary (``x_dict``) for *dataset*.

     The very large ``howbig`` cap ensures the entire corpus is consumed.
     """
     return TextCorpus(dataset, howbig=1e10).x_dict