Example No. 1
def read_vocab(vocab_f, minfreq=0):
    """Read a tab-separated word<TAB>frequency file; keep words seen more than minfreq times."""
    vocab = []
    for l in line_reader(vocab_f):
        w, freq = l.rstrip().split("\t")  # split once instead of twice per line
        if int(freq) > minfreq:  # int() is safer and faster than eval() on untrusted input
            vocab.append(w)

    return vocab
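All of these snippets lean on a line_reader helper that the listing never shows. A minimal sketch consistent with how it is called here (including the skip keyword used in Example No. 5) could be:

def line_reader(path, skip=0):
    # Hypothetical helper: yield the lines of a text file, optionally
    # skipping the first `skip` lines (e.g. a header).
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= skip:
                yield line

With that in place, reading the words seen more than five times is a one-liner: vocab = read_vocab("vocab.txt", minfreq=5) (file name hypothetical).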
Example No. 2
def prepare_cluster_map(brown_cluster_file):
    """
    Build a word-to-clusterid map.
    """
    mapping = {}
    for l in line_reader(brown_cluster_file):
        c_id, w, _fq = l.strip().split("\t")  # frequency column is unused here
        mapping[w] = c_id  # keep string cluster ids

    return mapping
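Brown cluster files (e.g. the paths output of Percy Liang's brown-cluster tool) are tab-separated lines of bit-string cluster id, word, and frequency. A hypothetical round trip, with an invented file name:

# paths file, one line per word, e.g.:
# 0010<TAB>dog<TAB>152
mapping = prepare_cluster_map("paths")  # "paths" is a hypothetical file name
print(mapping["dog"])                   # -> "0010"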
Example No. 3
from collections import defaultdict


def prepare_cluster_to_word_map(brown_cluster_file):
    """
    Build a clusterid-to-word map.
    """
    mapping = defaultdict(set)
    for l in line_reader(brown_cluster_file):
        c_id, w, _fq = l.strip().split("\t")  # frequency column is unused here
        mapping[c_id].add(w)  # keep string cluster ids

    return mapping
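The same file drives the inverse map; since several words share a cluster, the values are sets. A hypothetical usage, with the same invented paths file as above:

c2w = prepare_cluster_to_word_map("paths")  # hypothetical file name
print(sorted(c2w["0010"]))                  # every word assigned to cluster 0010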
Example No. 4
def read_embed(embed_f):
    """
    Read a word2vec-style text embedding file into a dict mapping each word
    to its embedding vector. The first line holds the vocabulary size and
    the dimensionality.
    """
    w_to_emb = {}
    for c, l in enumerate(line_reader(embed_f)):
        if c == 0:
            m, n = map(int, l.strip().split())  # int() instead of eval(); m is unused
            continue
        w, *e = l.strip().split()
        assert len(e) == n
        w_to_emb[w] = [float(x) for x in e]  # store floats rather than raw strings

    return w_to_emb
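The expected input is the word2vec text format: a header line with vocabulary size and dimensionality, then one word and its vector per line. A tiny hypothetical file and call:

# embeddings.txt:
# 3 2
# </s> 0.0 0.0
# the 0.1 0.2
# dog 0.3 0.4
emb = read_embed("embeddings.txt")  # hypothetical file name
print(emb["dog"])                   # -> [0.3, 0.4]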
Example No. 5
import numpy as np


def load_embed(embed_f, vocab_f):
    """
    Read the embedding file and return a numpy matrix whose row ids
    correspond to vocabulary ids.
    """
    w_dict = LabelDictionary(read_vocab(vocab_f))
    with open(embed_f) as in_f:
        m, n = map(int, in_f.readline().strip().split())  # int() instead of eval()
    e_m = np.zeros((m - 1, n))  # embedding matrix; m-1 to leave out </s>
    for l in line_reader(embed_f, skip=1):
        w, *e = l.strip().split()
        assert len(e) == n

        if w not in w_dict:
            continue
        e_m[w_dict.get_label_id(w)] = [float(x) for x in e]  # explicit cast to float
    return e_m
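LabelDictionary is assumed here to assign contiguous integer ids to the words returned by read_vocab, so a word's vector is a single row lookup. A hypothetical usage with the same invented file names as above:

e_m = load_embed("embeddings.txt", "vocab.txt")
w_dict = LabelDictionary(read_vocab("vocab.txt"))
vec = e_m[w_dict.get_label_id("dog")]  # the row for "dog"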
Example No. 6
def read_vocab_freq(vocab_f):
    """Read a tab-separated word<TAB>frequency file into a word-to-count dict."""
    vocab = {}
    for l in line_reader(vocab_f):
        w, freq = l.rstrip().split("\t")  # split once instead of twice per line
        vocab[w] = int(freq)  # int() is safer than eval()

    return vocab
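This is the counting counterpart of read_vocab in Example No. 1; filtering its output reproduces read_vocab's threshold behaviour (file name hypothetical):

freqs = read_vocab_freq("vocab.txt")
vocab = [w for w, f in freqs.items() if f > 5]  # same words as read_vocab(vocab_f, minfreq=5)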
Example No. 7
sm = True  # split-merge procedure

if start_n_states is None:
    # no split-merge
    sm = False
    start_n_states = desired_n

n_sent = 0
if args.tree or args.rel or args.lr:
    # CoNLL input: count sentences with the dedicated reader
    reader = Conll07Reader(args.dataset)
    sent = reader.getNext()
    while sent:
        n_sent += 1
        sent = reader.getNext()
else:
    # plain-text input: one sentence per line
    for _ in line_reader(args.dataset):
        n_sent += 1

dirname = prepare_dirname(hmm_type=hmm_type, append_string=append_string, lang=args.lang, max_iter=max_iter,
                          N=start_n_states, n_sent=n_sent, alpha=alpha, minibatch_size=minibatch_size)

if args.tree or args.rel or args.lr:
    # pick the language-specific relation set; any other language gets None
    spec_rels = {"en": args.rel_spec_en, "nl": args.rel_spec_nl}.get(args.lang)
    dataset = ConllCorpus(args.dataset, howbig=n_sent, lemmas=lemmas, spec_rels=spec_rels,
                          dirname=dirname, lr=args.lr)