def read_vocab(vocab_f, minfreq=0):
    """Read a tab-separated ``word<TAB>frequency`` vocabulary file.

    Parameters
    ----------
    vocab_f : str
        Path to the vocabulary file, one ``word\\tfrequency`` pair per line.
    minfreq : int, optional
        Keep only words whose frequency is strictly greater than this.

    Returns
    -------
    list of str
        Words exceeding *minfreq*, in file order.
    """
    vocab = []
    for l in line_reader(vocab_f):
        # Split once per line (the original split twice) and parse the
        # count with int() instead of eval() — eval on file contents is
        # unsafe and slow.  Assumes frequencies are integer counts, as
        # the companion read_vocab_freq suggests — TODO confirm.
        fields = l.rstrip().split("\t")
        if int(fields[1]) > minfreq:
            vocab.append(fields[0])
    return vocab
def prepare_cluster_map(brown_cluster_file):
    """
    Build a word-to-clusterid map.

    Each line of the Brown cluster file is ``cluster_id<TAB>word<TAB>freq``;
    the frequency column is ignored and cluster ids are kept as strings.
    """
    word_to_cluster = {}
    for line in line_reader(brown_cluster_file):
        cluster_id, word, _freq = line.strip().split("\t")
        word_to_cluster[word] = cluster_id  # keep string cluster ids
    return word_to_cluster
def prepare_cluster_to_word_map(brown_cluster_file):
    """
    Build a clusterid-to-word map.

    Inverse of :func:`prepare_cluster_map`: maps each string cluster id to
    the set of words assigned to it.  The frequency column is ignored.
    """
    cluster_to_words = defaultdict(set)
    for line in line_reader(brown_cluster_file):
        cluster_id, word, _freq = line.strip().split("\t")
        # keep string cluster ids; a set de-duplicates repeated words
        cluster_to_words[cluster_id].add(word)
    return cluster_to_words
def read_embed(embed_f):
    """
    Read a word2vec-style text embedding file into a dict.

    The first line is a header ``<vocab_size> <dimension>``; every other
    line is ``<word> <v1> ... <v_dim>``.

    Parameters
    ----------
    embed_f : str
        Path to the embedding file.

    Returns
    -------
    dict
        Maps word -> list of vector components (kept as strings, not
        converted to float — callers convert as needed).
    """
    w_to_emb = {}
    for c, l in enumerate(line_reader(embed_f)):
        if c == 0:
            # Header: vocabulary size and embedding dimensionality.
            # int() replaces eval() — eval on file contents is unsafe.
            m, n = map(int, l.strip().split())
            continue
        w, *e = l.strip().split()
        # Sanity-check that the row matches the declared dimensionality.
        assert len(e) == n
        w_to_emb[w] = e
    return w_to_emb
def load_embed(embed_f, vocab_f):
    """
    Read the embedding file and return a numpy matrix whose row ids
    correspond to vocabulary ids.

    Parameters
    ----------
    embed_f : str
        word2vec-style text file: header line ``<m> <n>`` followed by
        ``<word> <v1> ... <vn>`` rows.
    vocab_f : str
        Vocabulary file accepted by :func:`read_vocab`; its words define
        the row ordering via ``LabelDictionary``.

    Returns
    -------
    numpy.ndarray
        Matrix of shape ``(m - 1, n)`` — one row per embedding-file word
        minus the ``</s>`` sentinel, which is deliberately left out.
        Rows for words absent from the vocabulary stay zero.
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        # Header: number of vectors and dimensionality.
        # int() replaces eval() — eval on file contents is unsafe.
        m, n = map(int, in_f.readline().strip().split())
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n
        # Skip embedding-file words that are not in the vocabulary.
        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = e
    return e_m
def read_vocab_freq(vocab_f):
    """Read a tab-separated ``word<TAB>frequency`` file into a dict.

    Parameters
    ----------
    vocab_f : str
        Path to the vocabulary file, one ``word\\tfrequency`` pair per line.

    Returns
    -------
    dict
        Maps word -> frequency (int).
    """
    vocab = {}
    for l in line_reader(vocab_f):
        # Split once per line (the original split twice) and parse with
        # int() instead of eval() — eval on file contents is unsafe.
        # Assumes frequencies are integer counts — TODO confirm.
        fields = l.rstrip().split("\t")
        vocab[fields[0]] = int(fields[1])
    return vocab
sm = True # split-merge procedure if start_n_states is None: # no split-merge sm = False start_n_states = desired_n n_sent = 0 if args.tree or args.rel or args.lr: reader = Conll07Reader(args.dataset) sent = reader.getNext() while sent: n_sent += 1 sent = reader.getNext() else: for l in line_reader(args.dataset): n_sent += 1 dirname = prepare_dirname(hmm_type=hmm_type, append_string=append_string, lang=args.lang, max_iter=max_iter, N=start_n_states, n_sent=n_sent, alpha=alpha, minibatch_size=minibatch_size) if args.tree or args.rel or args.lr: if args.lang == "en": dataset = ConllCorpus(args.dataset, howbig=n_sent,
# Decide whether the split-merge procedure is active: with no explicit
# starting state count we train directly at the desired size.
sm = True  # split-merge procedure
if start_n_states is None:  # no split-merge
    sm = False
    start_n_states = desired_n

# Count sentences in the dataset.  Tree/relation/left-right modes read a
# CoNLL-07 file sentence by sentence; otherwise one sentence per line.
n_sent = 0
if args.tree or args.rel or args.lr:
    reader = Conll07Reader(args.dataset)
    sent = reader.getNext()
    while sent:
        n_sent += 1
        sent = reader.getNext()
else:
    for _line in line_reader(args.dataset):
        n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type,
                          append_string=append_string,
                          lang=args.lang,
                          max_iter=max_iter,
                          N=start_n_states,
                          n_sent=n_sent,
                          alpha=alpha,
                          minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    # Language-specific relation inventory; the three original branches
    # differed only in this argument, so select it first and build the
    # corpus once.
    if args.lang == "en":
        spec_rels = args.rel_spec_en
    elif args.lang == "nl":
        spec_rels = args.rel_spec_nl
    else:
        spec_rels = None
    dataset = ConllCorpus(args.dataset,
                          howbig=n_sent,
                          lemmas=lemmas,
                          spec_rels=spec_rels,
                          dirname=dirname,
                          lr=args.lr)