# Imports for the snippets below. Project helpers referenced throughout
# (get_rng, Embeddings, MapCorporas, MergeSliceCorporas, CachedFitTransform,
# and the host classes these methods belong to) are assumed to be in scope
# from the surrounding package.
import logging
from collections import Counter
from copy import deepcopy
from types import SimpleNamespace

import joblib
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import wordnet as wn
from sklearn.pipeline import Pipeline


def gen_batches(self, docs, y=None):
    # Pair each document with its label so shuffling keeps them aligned.
    docs = list(zip(docs, y)) if y is not None else list(docs)
    if y is not None:
        get_rng().shuffle(docs)
    for i in range(0, len(docs), self.batch_size):
        cur_docs = docs[i:i + self.batch_size]
        # Pad a short final batch with documents re-sampled (without
        # replacement) from the full set so every batch has batch_size items.
        if len(cur_docs) < self.batch_size:
            cur_docs.extend(docs[j] for j in get_rng().choice(
                len(docs), self.batch_size - len(cur_docs), False
            ))
        yield self.gen_batch(*zip(*cur_docs)) if y is not None else self.gen_batch(cur_docs)
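# --- Usage sketch (not part of the original code). BatchDemo and the stand-in
# get_rng below are hypothetical; the real project supplies its own shared RNG
# and the host class defining batch_size and gen_batch. ---
_rng = np.random.RandomState(0)

def get_rng():  # stand-in for the project's RNG helper
    return _rng

class BatchDemo:
    batch_size = 4

    def gen_batch(self, docs, y=None):
        return list(docs), (list(y) if y is not None else None)

BatchDemo.gen_batches = gen_batches  # attach the method defined above
batches = list(BatchDemo().gen_batches(list('abcdef'), y=[0, 1, 0, 1, 0, 1]))
# every batch has exactly batch_size documents; the short final batch was
# padded with documents re-sampled from the full set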
def fit(self, docs, y):
    # Build a shared vocabulary once, then train one copy of the model per class.
    self.joint_model.build_vocab(docs)
    freqs = Counter(y)
    classes = sorted(freqs.keys())
    # Log class frequencies serve as the class prior scores.
    self.class_scores = np.log([freqs[c] for c in classes])
    self.models = [deepcopy(self.joint_model) for _ in classes]
    for class_, model in zip(classes, self.models):
        cur_docs = [doc for doc, c in zip(docs, y) if c == class_]
        for epoch in range(20):
            logging.info('epoch {}'.format(epoch + 1))
            get_rng().shuffle(cur_docs)
            model.train(cur_docs)  # gensim's pre-1.0 train() signature
            # Anneal the learning rate geometrically, pinning min_alpha to it
            # so gensim does not decay it further within the epoch.
            model.alpha *= 0.9
            model.min_alpha = model.alpha
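# A quick arithmetic check of the learning-rate schedule in fit above. The
# 0.025 starting value is gensim's default alpha, an assumption about how
# joint_model was constructed.
alpha = 0.025
schedule = [alpha * 0.9 ** epoch for epoch in range(20)]
# schedule[-1] ~= 0.0034: the final epoch trains at ~13.5% of the initial rate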
def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
    if embedding_type == 'google':
        # Pre-trained GoogleNews vectors, pickled ahead of time.
        embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
        embeddings_ = SimpleNamespace(
            X=embeddings_.syn0,
            vocab={w: v.index for w, v in embeddings_.vocab.items()}
        )
    elif embedding_type == 'twitter':
        # Train skip-gram word2vec on all available corpora, caching the fit.
        estimator = Pipeline([
            ('tokenize', MapCorporas(tokenize_)),
            ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3,
                min_count=1, iter=20, workers=16
            ), self.memory))),
        ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
        embeddings_ = estimator.named_steps['word2vec'].estimator
        embeddings_ = SimpleNamespace(
            X=embeddings_.syn0,
            vocab={w: v.index for w, v in embeddings_.vocab.items()}
        )
    else:
        # No pre-trained vectors: every word gets a random vector below.
        embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})
    estimator = Pipeline([
        ('tokenize', MapCorporas(tokenize_)),
        # 0.25 is chosen so the unknown vectors have approximately the same
        # variance as the Google pre-trained ones.
        ('embeddings', MapCorporas(Embeddings(
            embeddings_,
            rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
            include_zero=True
        ))),
    ])
    estimator.fit(construct_docs)
    return estimator.named_steps['embeddings'].estimator
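# Quick numeric check of the 0.25 comment in _fit_embedding_word: a
# uniform(-0.25, 0.25) draw has variance 0.5 ** 2 / 12 ~= 0.0208 per component,
# which the code's comment says roughly matches the Google pre-trained vectors.
sample = np.random.RandomState(0).uniform(-0.25, 0.25, (1000, 300))
print(sample.var())  # ~0.0208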
def transform(self, docs):
    # Apply random character-level edits (delete/insert/substitute) to each doc.
    for doc in docs:
        res = '' if isinstance(doc, str) else []
        # The number of edits follows a geometric distribution; edit positions
        # are sampled without replacement and processed left to right.
        is_ = sorted(get_rng().choice(
            len(doc), min(len(doc), get_rng().geometric(self.p) - 1), replace=False
        ))
        prev_i = -1
        for i in is_:
            # delete, insert, substitute
            op = get_rng().choice(3) if self.alphabet else 0
            if op == 0:
                res += doc[prev_i + 1:i]
            elif op == 1:
                res += doc[prev_i + 1:i + 1] + self.alphabet[get_rng().choice(len(self.alphabet))]
            else:
                res += doc[prev_i + 1:i] + self.alphabet[get_rng().choice(len(self.alphabet))]
            prev_i = i
        res += doc[prev_i + 1:]
        yield res
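# --- Usage sketch for the character-noise transform above; NoiseDemo is a
# hypothetical host exposing the p and alphabet attributes the method expects. ---
class NoiseDemo:
    p = 0.5
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

NoiseDemo.transform = transform  # attach the method defined above
print(list(NoiseDemo().transform(['hello world'])))
# e.g. ['hejlo world'] -- a geometric number of random edits per document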
def transform(self, docs):
    # Replace a geometric number of words with WordNet synonyms; docs are
    # sequences of (word, tag, confidence) triples.
    for doc in docs:
        res = [word for word, tag, confidence in doc]
        is_ = get_rng().choice(len(doc), min(len(doc), get_rng().geometric(0.5)), replace=False)
        for i in is_:
            word, tag, confidence = doc[i]
            # Only nouns, verbs, adjectives and adverbs have WordNet synsets.
            if tag not in 'nvar':
                continue
            words = []
            for synset in wn.synsets(word, pos=tag):
                for lemma in synset.lemma_names():
                    replace_word = lemma.replace('_', ' ')
                    if replace_word.lower() != word.lower():
                        words.append(replace_word)
            # Pick a synonym index geometrically; subtract 1 since numpy's
            # geometric draws start at 1 but the list is 0-indexed.
            word_i = get_rng().geometric(0.5) - 1
            if word_i < len(words):
                res[i] = words[word_i]
        yield res
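# --- Usage sketch for the WordNet synonym transform above; SynonymDemo and the
# tagged triples are illustrative, and the nltk 'wordnet' corpus must be
# available (nltk.download('wordnet')). ---
class SynonymDemo:
    pass

SynonymDemo.transform = transform  # attach the method defined above
doc = [('quick', 'a', 1.0), ('brown', 'a', 1.0), ('fox', 'n', 1.0)]
print(next(SynonymDemo().transform([doc])))
# e.g. ['speedy', 'brown', 'fox'] -- chosen words swapped for WordNet lemmas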
def _fit_embedding_char(embedding_type, alphabet, d=None):
    if embedding_type == 'onehot':
        # One-hot character vectors: the identity matrix, one row per character.
        X = np.identity(len(alphabet), dtype='float32')
    else:
        # Randomly initialised d-dimensional character vectors.
        X = get_rng().uniform(-0.25, 0.25, (len(alphabet), d)).astype('float32')
    return Embeddings(
        SimpleNamespace(vocab=dict(zip(alphabet, range(len(alphabet)))), X=X),
        include_zero=True
    )
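# --- Usage sketch for _fit_embedding_char; Embeddings is a project helper, so
# this only runs where it is importable. ---
emb = _fit_embedding_char('onehot', 'abc')
# emb wraps a 3x3 float32 identity matrix with vocab {'a': 0, 'b': 1, 'c': 2};
# include_zero presumably reserves an extra zero row for padding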