def main(self, path):
    """
    :param path: path to dir containing npy and settings files from the experiment
    """
    self.path = path
    self.ep = np.load("{}/ep.npy".format(self.path))

    # get some info from the settings file
    with open("{}/settings".format(self.path)) as infile:
        data_name = None
        n_sent = None
        for l in infile:
            if l.startswith("Name of the corpus file: "):
                data_name = l.strip().split(" ")[-1]
            elif l.startswith("Number of sentences: "):
                n_sent = l.strip().split(" ")[-1]
        if data_name is None:
            print("Not able to retrieve the dataset name.")
        if n_sent is None:
            print("Not able to retrieve the number of sentences.")
    self.data_name = data_name
    self.n_sent = int(n_sent)  # parse as int instead of eval() on file contents

    if "tree" in path or "_rel_" in path:  # membership test on both substrings
        if "_en_" in path:
            self.data = ConllCorpus(self.data_name, howbig=self.n_sent, lemmas=False)
        elif "_nl_" in path:
            self.data = ConllCorpus(self.data_name, howbig=self.n_sent)
    else:
        self.data = TextCorpus(self.data_name, howbig=self.n_sent)

    self.prob_thresh = None
    self.n = None  # max n of clusters per w
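# Note on the corrected membership test above: the original condition
# `if "tree" or "_rel_" in path` parses as `("tree") or ("_rel_" in path)`, and a
# non-empty string literal is always truthy, so the tree/relation branch ran for
# every path. A self-contained illustration (the path string is hypothetical):
example_path = "out/hmm_plain_en_30states"
print(bool("tree" or "_rel_" in example_path))            # True  -- buggy test, always taken
print("tree" in example_path or "_rel_" in example_path)  # False -- intended test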
def get_tree(n_inst):
    trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=False,
                        eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
    trees.prepare_trees()
    # extend instances with trees
    assert len(trees.train) == n_inst, \
        "Number of parses not equal to number of classification instances."
    c_append = 0
    for i in range(n_inst):
        # every instance has a parse here (enforced by the assert above)
        inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
        c_append += 1
        yield inst
def __init__(self, path, lang, decoding=None, use_wordrep_tree=False):
    self.path = path
    self.lang = lang
    self.decoding = decoding
    self.use_wordrep_tree = use_wordrep_tree

    if self.decoding is None:
        print("Decoding method not specified.")
        if self.use_wordrep_tree:
            self.decoding = "max-product"
        else:
            self.decoding = "viterbi"
        print("Using default: {}".format(self.decoding))

    self.n_states = None
    self.n_obs = None
    self.n_sent = None
    self.n_toks = None
    self.corpus_file = None
    self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file = \
        self.read_params_from_path()

    if self.use_wordrep_tree:
        if self.lang == "en":
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent, lemmas=False)
        elif self.lang == "nl":
            self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
    else:
        self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)

    self.ner_corpus = None

    if self.lang == "nl" and not self.use_wordrep_tree:
        self.dev_seq, self.test_seq = self.prepare_seqs_nl_dbg(self.decoding)
        # self.test_seq = self.prepare_seqs_nl(self.decoding)
    elif self.lang == "nl" and self.use_wordrep_tree:
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding)
        # self.test_seq = self.prepare_trees_nl(self.decoding)
    elif self.lang == "en" and not self.use_wordrep_tree:
        self.dev_seq = self.prepare_seqs_en_dbg(self.decoding)
    elif self.lang == "en" and self.use_wordrep_tree:
        self.dev_seq = self.prepare_trees_en_dbg(self.decoding)
def get_tree(n_inst):
    trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True,
                        eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
    trees.prepare_trees()
    # not every instance has a corresponding tree due to errors in parsing
    conll_idx = ConllFilesIndex(files_parsed_path)
    conll_idx.create_ids_set()
    # extend instances with trees
    c_append = 0
    for i in range(n_inst):
        if i + 1 in conll_idx.fileids:
            # we have a parse:
            inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
            c_append += 1
        else:
            # we don't have a parse:
            inst = None
        yield inst
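# A minimal, self-contained sketch of the alignment pattern used in get_tree above:
# instance i consumes the next available parse only when its 1-based id (i + 1)
# appears among the successfully parsed file ids; otherwise it yields None so the
# caller can skip it. The ids and "parses" below are made up for illustration.
def align_parses(n_inst, parsed_ids, parses):
    c_append = 0
    for i in range(n_inst):
        if i + 1 in parsed_ids:   # we have a parse
            yield parses[c_append]
            c_append += 1
        else:                     # parsing failed for this instance
            yield None

print(list(align_parses(4, {1, 3, 4}, ["t1", "t2", "t3"])))  # ['t1', None, 't2', 't3']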
def get_tree(n_inst):
    trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=False,
                        eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
    trees.prepare_trees()
    # extend instances with trees
    assert len(trees.train) == n_inst, \
        "Number of parses not equal to number of classification instances."
    c_append = 0
    for i in range(n_inst):
        # every instance has a parse here (enforced by the assert above)
        inst = self.normalize_tree(trees.train[c_append], trees.x_dict, c_append)
        c_append += 1
        yield inst
def get_tree(n_inst):
    trees = ConllCorpus(file_parsed, howbig=1000000, lemmas=True,
                        eval_spec_rels=self.eval_spec_rel, dirname=self.dirname, lr=self.lr)
    trees.prepare_trees()
    self.tree_vocab = trees.x_dict
    # not every instance has a corresponding tree due to errors in parsing
    conll_idx = ConllFilesIndex(files_parsed_path)
    conll_idx.create_ids_set()
    # extend instances with trees
    c_append = 0
    for i in range(n_inst):
        if i + 1 in conll_idx.fileids:
            # we have a parse:
            inst = trees.train[c_append]
            c_append += 1
        else:
            # we don't have a parse:
            inst = None
        yield inst
import argparse

import numpy as np

from eval.ner.PrepareHmmRep import read_params_from_path
from hmrtm import HMRTM
from readers.conll_corpus import ConllCorpus

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-rep", "--rep_path",
                        help="directory containing (hmm) word representations files")
    parser.add_argument("--use_lemmas", action='store_true', default=False, help="")
    args = parser.parse_args()

    path = args.rep_path
    posttype_f = "{}posttype_cumul.npy".format(path)
    n_states, n_obs, n_sent, n_toks, corpus_file, omit_class_cond, omit_emis_cond = \
        read_params_from_path(path)
    lemmas = args.use_lemmas
    eval_spec_rel = True
    lr = False

    params_fixed = (np.load("{}ip.npy".format(path)),
                    np.load("{}tp.npy".format(path)),
                    np.load("{}fp.npy".format(path)),
                    np.load("{}ep.npy".format(path)))

    dataset = ConllCorpus("{}".format(corpus_file), howbig=n_sent, lemmas=lemmas,
                          eval_spec_rels=eval_spec_rel, dirname=path, lr=lr)
    dataset.train = dataset.prepare_trees_gen()  # generator

    h = HMRTM(n_states, n_obs, R=len(dataset.r_dict), params=params_fixed, writeout=False,
              dirname=path, omit_class_cond=omit_class_cond, omit_emis_cond=omit_emis_cond)
    h.obtain_posttypes_cumul(posttype_f, dataset, n_types=h.M, logger=None)
# obtain model parameters
n_states, n_obs, _, _, _, omit_class_cond, omit_emis_cond = read_params_from_path(path)
lemmas = args.use_lemmas
eval_spec_rel = args.synfunc
lr = False

# load model
params_fixed = (np.load("{}ip.npy".format(path)),
                np.load("{}tp.npy".format(path)),
                np.load("{}fp.npy".format(path)),
                np.load("{}ep.npy".format(path)))

# prepare sents for decoding
sents = ConllCorpus(infile, howbig=1000000, lemmas=lemmas, eval_spec_rels=eval_spec_rel,
                    dirname=path, lr=lr)
sents.prepare_trees()

h = HMRTM(n_states, n_obs, R=len(sents.r_dict), params=params_fixed, writeout=False,
          dirname=path, omit_class_cond=omit_class_cond,
          omit_emis_cond=omit_emis_cond) if eval_spec_rel else \
    HMTM(n_states, n_obs, params=params_fixed, writeout=False, dirname=path)

with open(args.outfile, "w") as out:
    for tree in sents.train:
        # obtain posteriors for all nodes
        node_to_rep = h.posterior_decode(tree, cont=True)
        # get words
        for node in tree.get_nonroots():
            out.write("{} {}\n".format(sents.x_dict.get_label_name(node.name),
                                       nparr_to_str(node_to_rep[node.index])))
        out.write("\n")
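# A minimal reader for the file written above, assuming nparr_to_str joins the
# posterior vector with single spaces, i.e. each line is "<word> <v1> <v2> ..."
# and consecutive trees are separated by a blank line. The filename is hypothetical.
import numpy as np

def read_word_reps(fname="word_reps.txt"):
    trees, current = [], []
    with open(fname) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:              # blank line closes the current tree
                if current:
                    trees.append(current)
                    current = []
                continue
            parts = line.split(" ")
            current.append((parts[0], np.array(parts[1:], dtype=float)))
    if current:                       # file may not end with a blank line
        trees.append(current)
    return trees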
def __init__(self, path, lang, decoding=None, use_wordrep_tree=False, use_wordrep_rel=False,
             eval_spec_rel=False, logger=None, ignore_rel=None, lr=False, use_muc=False):
    self.path = path
    self.lang = lang
    self.decoding = decoding
    self.use_wordrep_tree = use_wordrep_tree
    self.use_wordrep_rel = use_wordrep_rel
    self.eval_spec_rel = eval_spec_rel
    self.use_muc = use_muc

    if self.decoding is None:
        print("Decoding method not specified.")
        if self.use_wordrep_tree or self.use_wordrep_rel:
            self.decoding = "max-product"
        else:
            self.decoding = "viterbi"
        print("Using default: {}".format(self.decoding))

    self.n_states = None
    self.n_obs = None
    self.n_sent = None
    self.n_toks = None
    self.corpus_file = None
    self.logger = logger
    self.n_states, self.n_obs, self.n_sent, self.n_toks, self.corpus_file, \
        self.omit_class_cond, self.omit_emis_cond = read_params_from_path(self.path)

    if self.logger is not None:
        self.logger.debug("Preparing self.dataset")
    if self.use_wordrep_tree or self.use_wordrep_rel:
        lemmas = False if self.lang == "en" else True
        self.dataset = ConllCorpus("{}".format(self.corpus_file), howbig=self.n_sent,
                                   lemmas=lemmas, eval_spec_rels=self.eval_spec_rel,
                                   dirname=self.path, lr=lr)
        self.ignore_rel = self.dataset.r_dict.get_label_id(ignore_rel) if ignore_rel is not None else None
        if decoding == "posterior_cont_type":
            self.dataset.train = self.dataset.prepare_trees_gen()  # generator
    else:
        self.dataset = TextCorpus("{}".format(self.corpus_file), howbig=self.n_sent)
        if decoding == "posterior_cont_type":
            self.dataset.prepare_chains()

    self.ner_corpus = None

    if self.lang == "nl" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_seqs_nl(self.decoding)
        # self.test_seq = self.prepare_seqs_nl(self.decoding)
    elif self.lang == "nl" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq = self.prepare_trees_nl(self.decoding, lr=lr)
        # self.test_seq = self.prepare_trees_nl(self.decoding)
    elif self.lang == "en" and not (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_seqs_en(self.decoding)
    elif self.lang == "en" and (self.use_wordrep_tree or self.use_wordrep_rel):
        self.train_seq, self.dev_seq, self.test_seq, self.muc_seq = self.prepare_trees_en(self.decoding, lr=lr)
    else:
        sys.exit("invalid option in PrepareHmmRep")

    self.dataset = None
n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type, append_string=append_string, lang=args.lang,
                          max_iter=max_iter, N=start_n_states, n_sent=n_sent, alpha=alpha,
                          minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    if args.lang == "en":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=args.rel_spec_en, dirname=dirname, lr=args.lr)
    elif args.lang == "nl":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=args.rel_spec_nl, dirname=dirname, lr=args.lr)
    else:
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=None, dirname=dirname, lr=args.lr)
if args.tree or args.rel or args.lr:
    # count sentences in the parsed (CoNLL) corpus
    reader = Conll07Reader(args.dataset)
    sent = reader.getNext()
    while sent:
        n_sent += 1
        sent = reader.getNext()
else:
    # count sentences in the plain-text corpus (one sentence per line)
    for l in line_reader(args.dataset):
        n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type, append_string=append_string, lang=args.lang,
                          max_iter=max_iter, N=start_n_states, n_sent=n_sent, alpha=alpha,
                          minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    if args.lang == "en":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=args.rel_spec_en, dirname=dirname, lr=args.lr)
    elif args.lang == "nl":
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=args.rel_spec_nl, dirname=dirname, lr=args.lr)
    else:
        dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas,
                              spec_rels=None, dirname=dirname, lr=args.lr)
    n_rels = len(dataset.r_dict)
else:
    dataset = TextCorpus(args.dataset, howbig=n_sent)
    dataset.prepare_chains()
n_obs = len(dataset.x_dict)
writeout = args.writeout
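# A self-contained sketch of the two counting strategies above, without the
# project-specific Conll07Reader/line_reader helpers: CoNLL sentences are
# separated by blank lines, while plain-text corpora hold one sentence per line.
# The function name and file-layout assumptions are illustrative only.
def count_sentences(fname, conll=False):
    n = 0
    if conll:
        in_sent = False
        with open(fname) as f:
            for line in f:
                if line.strip():
                    in_sent = True
                elif in_sent:         # blank line terminates a sentence
                    n += 1
                    in_sent = False
        if in_sent:                   # last sentence may lack a trailing blank line
            n += 1
    else:
        with open(fname) as f:
            for _ in f:
                n += 1
    return n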
path = args.rep_path
posttype_f = "{}posttype_cumul.npy".format(path)
n_states, n_obs, n_sent, n_toks, corpus_file, omit_class_cond, omit_emis_cond = \
    read_params_from_path(path)
lemmas = args.use_lemmas
eval_spec_rel = True
lr = False

params_fixed = (np.load("{}ip.npy".format(path)),
                np.load("{}tp.npy".format(path)),
                np.load("{}fp.npy".format(path)),
                np.load("{}ep.npy".format(path)))

dataset = ConllCorpus("{}".format(corpus_file), howbig=n_sent, lemmas=lemmas,
                      eval_spec_rels=eval_spec_rel, dirname=path, lr=lr)
dataset.train = dataset.prepare_trees_gen()  # generator

h = HMRTM(n_states, n_obs, R=len(dataset.r_dict), params=params_fixed, writeout=False,
          dirname=path, omit_class_cond=omit_class_cond, omit_emis_cond=omit_emis_cond)
h.obtain_posttypes_cumul(posttype_f, dataset, n_types=h.M, logger=None)
def create_vocab(self, dataset, lemmas):
    d = ConllCorpus(dataset, howbig=1e10, lemmas=lemmas)
    return d.x_dict