Example 1
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 elu_dim=64,
                 dep_dim=100,
                 dropout_ratio=0.5):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            # inference: hyperparameters are restored from tagger_defs.txt
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            # training: hyperparameters are recorded in tagger_defs.txt
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(LSTMParser, self).__init__(
            emb_word=L.EmbedID(self.n_words, self.word_dim),
            emb_suf=L.EmbedID(self.n_suffixes, self.afix_dim,
                              ignore_label=IGNORE),
            emb_prf=L.EmbedID(self.n_prefixes, self.afix_dim,
                              ignore_label=IGNORE),
            lstm_f=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim,
                               self.dropout_ratio),
            lstm_b=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim,
                               self.dropout_ratio),
            linear_cat1=L.Linear(2 * self.hidden_dim, self.elu_dim),
            linear_cat2=L.Linear(self.elu_dim, len(self.targets)),
            linear_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
            linear_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
            biaffine=Biaffine(self.dep_dim))
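
A minimal usage sketch of the constructor above (the directory path and dimensions are placeholders, not from the source): passing word_dim and afix_dim selects the training branch, which dumps the hyperparameters; omitting them selects the inference branch, which restores them from tagger_defs.txt.

# Hypothetical usage; "models/parser" is a placeholder model directory.
parser = LSTMParser("models/parser", word_dim=100, afix_dim=30)  # training mode
parser = LSTMParser("models/parser")                             # inference mode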
Example 2
    def __init__(self, model_path):
        self.model_path = model_path
        self.words = read_model_defs(model_path / 'words.txt')
        self.chars = read_model_defs(model_path / 'chars.txt')
        self.unk_word = self.words[UNK]
        self.start_word = self.words[START]
        self.end_word = self.words[END]
        self.unk_char = self.chars[UNK]
        self.start_char = self.chars[START]
        self.end_char = self.chars[END]
Example 3
    def __init__(self, model_path):
        self.words = read_model_defs(model_path + "/words.txt")
        self.suffixes = read_model_defs(model_path + "/suffixes.txt")
        self.prefixes = read_model_defs(model_path + "/prefixes.txt")
        self.unk_word = self.words[UNK]
        self.start_word = self.words[START]
        self.end_word = self.words[END]
        self.unk_suf = self.suffixes[UNK]
        self.unk_prf = self.prefixes[UNK]
        # pad the remaining 3 affix slots with -1 (cf. IGNORE in Example 4)
        self.start_pre = [[self.prefixes[START]] + [-1] * 3]
        self.start_suf = [[self.suffixes[START]] + [-1] * 3]
        self.end_pre = [[self.prefixes[END]] + [-1] * 3]
        self.end_suf = [[self.suffixes[END]] + [-1] * 3]
Example 4
    def __init__(self, model_path, length=False):
        self.words = read_model_defs(model_path / 'words.txt')
        self.suffixes = read_model_defs(model_path / 'suffixes.txt')
        self.prefixes = read_model_defs(model_path / 'prefixes.txt')
        self.unk_word = self.words[UNK]
        self.start_word = self.words[START]
        self.end_word = self.words[END]
        self.unk_suf = self.suffixes[UNK]
        self.unk_prf = self.prefixes[UNK]
        self.start_pre = [[self.prefixes[START]] + [IGNORE] * 3]
        self.start_suf = [[self.suffixes[START]] + [IGNORE] * 3]
        self.end_pre = [[self.prefixes[END]] + [IGNORE] * 3]
        self.end_suf = [[self.suffixes[END]] + [IGNORE] * 3]
        self.length = length
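
None of the examples define read_model_defs. A minimal sketch consistent with its call sites, assuming each line holds a "token value" pair (the format Example 5 writes out), where the integer value serves as an embedding index in the vocabulary files and as a raw frequency in files such as new_words:

from collections import OrderedDict

def read_model_defs(path):
    # Assumed format: one "token value" pair per line.
    defs = OrderedDict()
    with open(path) as f:
        for line in f:
            token, value = line.rstrip("\n").rsplit(" ", 1)
            defs[token] = int(value)
    return defs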
Example 5
import os
import sys
from collections import OrderedDict

import numpy as np


def augment_pretrained_with_random_initialization(args):
    words = OrderedDict()
    # words in pretrained word embedding
    for word in open(args.pretrained_vocab):
        words[word.strip()] = 1

    # words in specials e.g. PAD, START, END
    for word in args.specials:
        words[word] = 1

    # words found in training data
    for word, freq in read_model_defs(args.new_words).items():
        if freq >= args.freq_cut:
            words[word] = freq

    new_pretrained_vocab = os.path.join(args.out, "new_words.txt")
    print("writing to", new_pretrained_vocab, file=sys.stderr)
    with open(new_pretrained_vocab, "w") as f:
        for word, freq in words.items():
            f.write("{} {}\n".format(word, freq))

    embeddings = read_pretrained_embeddings(args.pretrained)
    assert embeddings.shape[0] <= len(words), \
        "pretrained size: {}, read words: {}".format(embeddings.shape[0], len(words))
    # uniform random init in [-0.01, 0.01) for words without a pretrained vector
    new_embeddings = 0.02 * np.random.random_sample(
            (len(words), embeddings.shape[1])).astype('f') - 0.01
    for i in range(embeddings.shape[0]):
        new_embeddings[i] = embeddings[i]

    new_pretrained = os.path.join(args.out, "new_embeddings.txt")
    print("writing to", new_pretrained, file=sys.stderr)
    np.savetxt(new_pretrained, new_embeddings)
    print("vocabulary size", len(embeddings), "-->", len(new_embeddings), file=sys.stderr)
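
read_pretrained_embeddings is likewise not shown. Since Example 5 writes the augmented matrix back with np.savetxt, a matching sketch reads one whitespace-separated float vector per line, with row i aligned to line i of the vocabulary file:

import numpy as np

def read_pretrained_embeddings(path):
    # Assumed format: one whitespace-separated vector per line (np.savetxt-compatible).
    return np.loadtxt(path, dtype='f')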
Example 6
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 relu_dim=64,
                 dropout_ratio=0.5):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.relu_dim = relu_dim
            p.nlayers = nlayers
            p.dump(defs_file)

        self.targets = read_model_defs(model_path + "/target.txt")
        self.words = read_model_defs(model_path + "/words.txt")
        self.suffixes = read_model_defs(model_path + "/suffixes.txt")
        self.prefixes = read_model_defs(model_path + "/prefixes.txt")
        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(LSTMTagger, self).__init__(
            emb_word=L.EmbedID(len(self.words), self.word_dim),
            emb_suf=L.EmbedID(len(self.suffixes),
                              self.afix_dim,
                              ignore_label=IGNORE),
            emb_prf=L.EmbedID(len(self.prefixes),
                              self.afix_dim,
                              ignore_label=IGNORE),
            lstm_f=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
            lstm_b=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
            linear1=L.Linear(2 * self.hidden_dim, self.relu_dim),
            linear2=L.Linear(self.relu_dim, len(self.targets)),
        )
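
Param is another helper the examples rely on without defining it. From its three call sites (Param(self) followed by attribute assignments, p.dump(defs_file), and Param.load(self, defs_file)), a minimal sketch might mirror attribute writes onto the wrapped object and round-trip them through the JSON defs file; the details below are an assumption:

import json

class Param(object):
    # Sketch only: mirrors attribute writes onto the wrapped object and
    # serializes that object's attributes to/from the defs file as JSON.
    def __init__(self, obj):
        object.__setattr__(self, "obj", obj)

    def __setattr__(self, name, value):
        setattr(self.obj, name, value)

    def dump(self, path):
        with open(path, "w") as f:
            json.dump(vars(self.obj), f)

    @staticmethod
    def load(obj, path):
        with open(path) as f:
            for name, value in json.load(f).items():
                setattr(obj, name, value)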
Example 7
import os
import sys


def extract_subset_of_pretrained_embeddings(args):
    embeddings = read_pretrained_embeddings(args.pretrained)
    emb_words = [word.strip() for word in open(args.pretrained_vocab)]
    # keep only the pretrained words that also occur in the new vocabulary
    subset = set(read_model_defs(args.new_words))

    new_pretrained = os.path.join(args.out, "extracted_embeddings.vector")
    new_vocab = os.path.join(args.out, "extracted_embeddings.words")
    print("writing to", new_pretrained, file=sys.stderr)
    with open(new_vocab, "w") as v, open(new_pretrained, "w") as f:
        for i, word in enumerate(emb_words):
            if word in subset:
                f.write(" ".join(str(u) for u in embeddings[i]) + "\n")
                v.write(word + "\n")
Example 8
    def __init__(self, model_path, ccgbank_path, tritrain_path, weight):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        self.weight = weight
        # CCGbank samples are oversampled this many times against the tri-training data
        self.ncopies = 15
        with open(ccgbank_path) as f:
            self.ccgbank_samples = json.load(f)
            self.ccgbank_size = len(self.ccgbank_samples)
        with open(tritrain_path) as f:
            self.tritrain_samples = json.load(f)
            self.tritrain_size = len(self.tritrain_samples)

        print("len(ccgbank):", self.ccgbank_size, file=sys.stderr)
        print("len(ccgbank) * # copies:", self.ccgbank_size * self.ncopies, file=sys.stderr)
        print("len(tritrain):", self.tritrain_size, file=sys.stderr)
Example 9
    def __init__(self, model_path, samples_path):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        with open(samples_path) as f:
            # materialize the items view so the samples can be indexed and shuffled
            self.samples = list(json.load(f).items())
Example 10
    def __init__(self,
                 model_path,
                 word_dim=None,
                 afix_dim=None,
                 nlayers=2,
                 hidden_dim=128,
                 elu_dim=64,
                 dep_dim=100,
                 dropout_ratio=0.5,
                 use_cudnn=False):
        self.model_path = model_path
        defs_file = model_path + "/tagger_defs.txt"
        if word_dim is None:
            self.train = False
            Param.load(self, defs_file)
            self.extractor = FeatureExtractor(model_path)
        else:
            self.train = True
            p = Param(self)
            p.dep_dim = dep_dim
            p.word_dim = word_dim
            p.afix_dim = afix_dim
            p.hidden_dim = hidden_dim
            p.elu_dim = elu_dim
            p.nlayers = nlayers
            p.n_words = len(read_model_defs(model_path + "/words.txt"))
            p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
            p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
            p.targets = read_model_defs(model_path + "/target.txt")
            p.dump(defs_file)

        self.in_dim = self.word_dim + 8 * self.afix_dim
        self.dropout_ratio = dropout_ratio
        super(QRNNParser, self).__init__(
            emb_word=L.EmbedID(self.n_words, self.word_dim,
                               ignore_label=IGNORE),
            emb_suf=L.EmbedID(self.n_suffixes, self.afix_dim,
                              ignore_label=IGNORE),
            emb_prf=L.EmbedID(self.n_prefixes, self.afix_dim,
                              ignore_label=IGNORE),
            qrnn_fs=ChainList(),
            qrnn_bs=ChainList(),
            arc_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
            arc_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
            rel_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
            rel_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
            biaffine_arc=Biaffine(self.dep_dim),
            biaffine_tag=Bilinear(self.dep_dim, self.dep_dim,
                                  len(self.targets)))
        # stack nlayers QRNN layers per direction: the first layer reads the
        # embedding concatenation, deeper layers read the previous hidden states
        in_dim = self.in_dim
        for _ in range(self.nlayers):
            self.qrnn_fs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            self.qrnn_bs.add_link(QRNNLayer(in_dim, self.hidden_dim))
            in_dim = self.hidden_dim
Example 11
    def __init__(self, model_path, samples_path):
        self.model_path = model_path
        self.targets = read_model_defs(model_path + "/target.txt")
        self.extractor = FeatureExtractor(model_path)
        with open(samples_path) as f:
            # sort samples by input length so minibatches group similar lengths
            self.samples = sorted(json.load(f), key=lambda x: len(x[1][0]))