def __init__(self, model_path, word_dim=None, afix_dim=None, nlayers=2,
             hidden_dim=128, elu_dim=64, dep_dim=100, dropout_ratio=0.5):
    self.model_path = model_path
    defs_file = model_path + "/tagger_defs.txt"
    if word_dim is None:
        # inference: reload the hyperparameters saved at training time
        self.train = False
        Param.load(self, defs_file)
        self.extractor = FeatureExtractor(model_path)
    else:
        # training: record hyperparameters and vocabulary sizes, then dump them
        self.train = True
        p = Param(self)
        p.dep_dim = dep_dim
        p.word_dim = word_dim
        p.afix_dim = afix_dim
        p.hidden_dim = hidden_dim
        p.elu_dim = elu_dim
        p.nlayers = nlayers
        p.n_words = len(read_model_defs(model_path + "/words.txt"))
        p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
        p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
        p.targets = read_model_defs(model_path + "/target.txt")
        p.dump(defs_file)

    self.in_dim = self.word_dim + 8 * self.afix_dim
    self.dropout_ratio = dropout_ratio
    super(LSTMParser, self).__init__(
        emb_word=L.EmbedID(self.n_words, self.word_dim),
        emb_suf=L.EmbedID(self.n_suffixes, self.afix_dim, ignore_label=IGNORE),
        emb_prf=L.EmbedID(self.n_prefixes, self.afix_dim, ignore_label=IGNORE),
        lstm_f=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, self.dropout_ratio),
        lstm_b=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, self.dropout_ratio),
        linear_cat1=L.Linear(2 * self.hidden_dim, self.elu_dim),
        linear_cat2=L.Linear(self.elu_dim, len(self.targets)),
        linear_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
        linear_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
        biaffine=Biaffine(self.dep_dim))
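# A minimal usage sketch of the two constructor modes above. The model
# directory name and the hyperparameter values are placeholders, not taken
# from the repository.

# Training mode: passing word_dim (and the other dimensions) writes the
# hyperparameters and vocabulary sizes to model_path/tagger_defs.txt.
parser = LSTMParser("models/lstm_parser", word_dim=100, afix_dim=30,
                    nlayers=2, hidden_dim=128, elu_dim=64, dep_dim=100,
                    dropout_ratio=0.5)

# Inference mode: leaving word_dim as None reloads tagger_defs.txt and
# attaches a FeatureExtractor built from the saved vocabulary files.
parser = LSTMParser("models/lstm_parser")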
def __init__(self, model_path):
    self.model_path = model_path
    self.words = read_model_defs(model_path / 'words.txt')
    self.chars = read_model_defs(model_path / 'chars.txt')
    self.unk_word = self.words[UNK]
    self.start_word = self.words[START]
    self.end_word = self.words[END]
    self.unk_char = self.chars[UNK]
    self.start_char = self.chars[START]
    self.end_char = self.chars[END]
def __init__(self, model_path):
    self.words = read_model_defs(model_path + "/words.txt")
    self.suffixes = read_model_defs(model_path + "/suffixes.txt")
    self.prefixes = read_model_defs(model_path + "/prefixes.txt")
    self.unk_word = self.words[UNK]
    self.start_word = self.words[START]
    self.end_word = self.words[END]
    self.unk_suf = self.suffixes[UNK]
    self.unk_prf = self.prefixes[UNK]
    # affix features for the START/END tokens are padded to length 4;
    # -1 fills the unused slots
    self.start_pre = [[self.prefixes[START]] + [-1] * 3]
    self.start_suf = [[self.suffixes[START]] + [-1] * 3]
    self.end_pre = [[self.prefixes[END]] + [-1] * 3]
    self.end_suf = [[self.suffixes[END]] + [-1] * 3]
def __init__(self, model_path, length=False):
    self.words = read_model_defs(model_path / 'words.txt')
    self.suffixes = read_model_defs(model_path / 'suffixes.txt')
    self.prefixes = read_model_defs(model_path / 'prefixes.txt')
    self.unk_word = self.words[UNK]
    self.start_word = self.words[START]
    self.end_word = self.words[END]
    self.unk_suf = self.suffixes[UNK]
    self.unk_prf = self.prefixes[UNK]
    # affix features for the START/END tokens are padded to length 4;
    # IGNORE fills the unused slots (matching ignore_label in the embeddings)
    self.start_pre = [[self.prefixes[START]] + [IGNORE] * 3]
    self.start_suf = [[self.suffixes[START]] + [IGNORE] * 3]
    self.end_pre = [[self.prefixes[END]] + [IGNORE] * 3]
    self.end_suf = [[self.suffixes[END]] + [IGNORE] * 3]
    self.length = length
def augment_pretrained_with_random_initialization(args):
    words = OrderedDict()
    # words in the pretrained word embedding
    for word in open(args.pretrained_vocab):
        words[word.strip()] = 1
    # special symbols, e.g. PAD, START, END
    for word in args.specials:
        words[word] = 1
    # words found in the training data, filtered by frequency
    for word, freq in read_model_defs(args.new_words).items():
        if freq >= args.freq_cut:
            words[word.encode("utf-8")] = freq

    new_pretrained_vocab = os.path.join(args.out, "new_words.txt")
    print >> sys.stderr, "writing to", new_pretrained_vocab
    with open(new_pretrained_vocab, "w") as f:
        for word, freq in words.items():
            f.write("{} {}\n".format(word, freq))

    embeddings = read_pretrained_embeddings(args.pretrained)
    assert embeddings.shape[0] <= len(words), \
        "pretrained size: {}, read words: {}".format(embeddings.shape[0], len(words))
    # initialize the whole matrix uniformly in [-0.01, 0.01), then copy the
    # pretrained vectors into the rows that have them
    new_embeddings = 0.02 * np.random.random_sample(
        (len(words), embeddings.shape[1])).astype('f') - 0.01
    for i in range(embeddings.shape[0]):
        new_embeddings[i] = embeddings[i]

    new_pretrained = os.path.join(args.out, "new_embeddings.txt")
    print >> sys.stderr, "writing to", new_pretrained
    np.savetxt(new_pretrained, new_embeddings)
    print >> sys.stderr, "vocabulary size", len(embeddings), "-->", len(new_embeddings)
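# A minimal invocation sketch for the function above. The attribute names are
# exactly those the function reads from `args`; the file names, the specials
# list, and the frequency cutoff are illustrative assumptions only.
from argparse import Namespace

augment_pretrained_with_random_initialization(Namespace(
    pretrained="embeddings.vector",       # pretrained embedding matrix, one vector per line
    pretrained_vocab="embeddings.words",  # its vocabulary, one word per line, row-aligned
    specials=[UNK, START, END],           # symbols that always get a row
    new_words="words.txt",                # word-frequency defs collected from training data
    freq_cut=2,                           # keep new words seen at least this many times
    out="model"))                         # output dir for new_words.txt / new_embeddings.txt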
def __init__(self, model_path, word_dim=None, afix_dim=None, nlayers=2,
             hidden_dim=128, relu_dim=64, dropout_ratio=0.5):
    self.model_path = model_path
    defs_file = model_path + "/tagger_defs.txt"
    if word_dim is None:
        # inference: reload the hyperparameters saved at training time
        self.train = False
        Param.load(self, defs_file)
        self.extractor = FeatureExtractor(model_path)
    else:
        # training: record hyperparameters, then dump them
        self.train = True
        p = Param(self)
        p.word_dim = word_dim
        p.afix_dim = afix_dim
        p.hidden_dim = hidden_dim
        p.relu_dim = relu_dim
        p.nlayers = nlayers
        p.dump(defs_file)

    self.targets = read_model_defs(model_path + "/target.txt")
    self.words = read_model_defs(model_path + "/words.txt")
    self.suffixes = read_model_defs(model_path + "/suffixes.txt")
    self.prefixes = read_model_defs(model_path + "/prefixes.txt")
    self.in_dim = self.word_dim + 8 * self.afix_dim
    self.dropout_ratio = dropout_ratio
    super(LSTMTagger, self).__init__(
        emb_word=L.EmbedID(len(self.words), self.word_dim),
        emb_suf=L.EmbedID(len(self.suffixes), self.afix_dim, ignore_label=IGNORE),
        emb_prf=L.EmbedID(len(self.prefixes), self.afix_dim, ignore_label=IGNORE),
        lstm_f=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
        lstm_b=L.NStepLSTM(nlayers, self.in_dim, self.hidden_dim, 0.),
        linear1=L.Linear(2 * self.hidden_dim, self.relu_dim),
        linear2=L.Linear(self.relu_dim, len(self.targets)))
def extract_subset_of_pretrained_embeddings(args):
    embeddings = read_pretrained_embeddings(args.pretrained)
    emb_words = [word.strip().decode("utf-8") for word in open(args.pretrained_vocab)]
    subset = read_model_defs(args.new_words).keys()
    new_pretrained = os.path.join(args.out, "extracted_embeddings.vector")
    new_vocab = os.path.join(args.out, "extracted_embeddings.words")
    print >> sys.stderr, "writing to", new_pretrained
    with open(new_vocab, "w") as v:
        with open(new_pretrained, "w") as f:
            # keep only the embedding rows whose word appears in the model's vocabulary,
            # writing the vector and the word to row-aligned files
            for i, word in enumerate(emb_words):
                if word in subset:
                    f.write(" ".join([str(u) for u in embeddings[i]]) + "\n")
                    v.write(word.encode("utf-8") + "\n")
def __init__(self, model_path, ccgbank_path, tritrain_path, weight):
    self.model_path = model_path
    self.targets = read_model_defs(model_path + "/target.txt")
    self.extractor = FeatureExtractor(model_path)
    self.weight = weight
    # number of times the CCGbank portion is counted when mixed with the
    # much larger tri-training data
    self.ncopies = 15
    with open(ccgbank_path) as f:
        self.ccgbank_samples = json.load(f)
    self.ccgbank_size = len(self.ccgbank_samples)
    with open(tritrain_path) as f:
        self.tritrain_samples = json.load(f)
    self.tritrain_size = len(self.tritrain_samples)
    print >> sys.stderr, "len(ccgbank):", self.ccgbank_size
    print >> sys.stderr, "len(ccgbank) * # copies:", self.ccgbank_size * self.ncopies
    print >> sys.stderr, "len(tritrain):", self.tritrain_size
def __init__(self, model_path, samples_path): self.model_path = model_path self.targets = read_model_defs(model_path + "/target.txt") self.extractor = FeatureExtractor(model_path) with open(samples_path) as f: self.samples = json.load(f).items()
def __init__(self, model_path, word_dim=None, afix_dim=None, nlayers=2,
             hidden_dim=128, elu_dim=64, dep_dim=100, dropout_ratio=0.5,
             use_cudnn=False):
    self.model_path = model_path
    defs_file = model_path + "/tagger_defs.txt"
    if word_dim is None:
        # inference: reload the hyperparameters saved at training time
        self.train = False
        Param.load(self, defs_file)
        self.extractor = FeatureExtractor(model_path)
    else:
        # training: record hyperparameters and vocabulary sizes, then dump them
        self.train = True
        p = Param(self)
        p.dep_dim = dep_dim
        p.word_dim = word_dim
        p.afix_dim = afix_dim
        p.hidden_dim = hidden_dim
        p.elu_dim = elu_dim
        p.nlayers = nlayers
        p.n_words = len(read_model_defs(model_path + "/words.txt"))
        p.n_suffixes = len(read_model_defs(model_path + "/suffixes.txt"))
        p.n_prefixes = len(read_model_defs(model_path + "/prefixes.txt"))
        p.targets = read_model_defs(model_path + "/target.txt")
        p.dump(defs_file)

    self.in_dim = self.word_dim + 8 * self.afix_dim
    self.dropout_ratio = dropout_ratio
    super(QRNNParser, self).__init__(
        emb_word=L.EmbedID(self.n_words, self.word_dim, ignore_label=IGNORE),
        emb_suf=L.EmbedID(self.n_suffixes, self.afix_dim, ignore_label=IGNORE),
        emb_prf=L.EmbedID(self.n_prefixes, self.afix_dim, ignore_label=IGNORE),
        qrnn_fs=ChainList(),
        qrnn_bs=ChainList(),
        arc_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
        arc_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
        rel_dep=L.Linear(2 * self.hidden_dim, self.dep_dim),
        rel_head=L.Linear(2 * self.hidden_dim, self.dep_dim),
        biaffine_arc=Biaffine(self.dep_dim),
        biaffine_tag=Bilinear(self.dep_dim, self.dep_dim, len(self.targets)))
    # stack forward/backward QRNN layers: the first layer reads the embedding
    # concatenation, subsequent layers read the previous layer's hidden states
    in_dim = self.in_dim
    for _ in range(self.nlayers):
        self.qrnn_fs.add_link(QRNNLayer(in_dim, self.hidden_dim))
        self.qrnn_bs.add_link(QRNNLayer(in_dim, self.hidden_dim))
        in_dim = self.hidden_dim
def __init__(self, model_path, samples_path): self.model_path = model_path self.targets = read_model_defs(model_path + "/target.txt") self.extractor = FeatureExtractor(model_path) with open(samples_path) as f: self.samples = sorted(json.load(f), key=lambda x: len(x[1][0]))