def create_embeddings(model, examps):
    """Embed textual examples

    :param examps: A list of text to embed
    :return: A len(examps) by embedding size numpy matrix of embeddings
    """
    # Preprocess examples
    print(f'Preprocessing {len(examps)} examples (.={BATCH_SIZE} examples)', file=sys.stderr)
    data = []
    for i, line in enumerate(examps):
        p1 = " ".join(entok.tokenize(line, escape=False)).lower()
        if model.sp is not None:
            p1 = model.sp.EncodeAsPieces(p1)
            p1 = " ".join(p1)
        wp1 = Example(p1)
        wp1.populate_embeddings(model.vocab, model.zero_unk, model.args.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        data.append(wp1)
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)

    # Create embeddings
    print(f'Embedding {len(examps)} examples (.={BATCH_SIZE} examples)', file=sys.stderr)
    embeddings = np.zeros((len(examps), model.args.dim))
    for i in range(0, len(data), BATCH_SIZE):
        max_idx = min(i + BATCH_SIZE, len(data))
        curr_batch = data[i:max_idx]
        wx1, wl1 = model.torchify_batch(curr_batch)
        vecs = model.encode(wx1, wl1)
        vecs = vecs.detach().cpu().numpy()
        vecs = vecs / np.sqrt((vecs * vecs).sum(axis=1))[:, None]  # normalize for NN search
        embeddings[i:max_idx] = vecs
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)
    return embeddings
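
# Illustrative sketch, not part of the original module: the rows returned by
# create_embeddings are L2-normalized ("normalize for NN search" above), so the
# dot product between rows equals cosine similarity. The helper below shows one
# way such a matrix could be queried; its name and the k parameter are
# assumptions introduced here for illustration only.
def nearest_neighbors_sketch(query_vecs, index_vecs, k=5):
    """Return, for each row of query_vecs, the indices of the k most similar
    rows of index_vecs. Both matrices are assumed to be row-normalized, so the
    dot product is the cosine similarity."""
    sims = query_vecs @ index_vecs.T                     # (num_queries, num_index)
    # argsort ascending, keep the last k columns, reverse for descending similarity
    return np.argsort(sims, axis=1)[:, -k:][:, ::-1]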
def pair_up_data(self):
    """Create pseudo-paraphrase pairs by cutting each sentence into segments,
    shuffling the segments, and pairing the two halves of the shuffled list.
    A concrete walk-through is sketched after this function."""
    # Offset of the first cut; later cuts are seg_length tokens apart.
    idx = random.randint(0, self.seg_length)
    pairs = []
    for ex in self.raw_data:
        sent = ex.sentence.split()
        idx = min(idx, len(sent) - 2)
        splits = []
        start = 0
        while idx < len(sent):
            splits.append(sent[start:idx])
            start = idx
            idx += self.seg_length
            idx = min(idx, len(sent))
        # Keep the trailing segment: the loop stops once idx reaches the end of
        # the sentence, leaving the final chunk unappended.
        if start < len(sent):
            splits.append(sent[start:])
        splits = [" ".join(s) for s in splits]
        random.shuffle(splits)
        mid = len(splits) // 2
        pairs.append((Example(" ".join(splits[:mid])), Example(" ".join(splits[mid:]))))
    return pairs
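
# Illustrative walk-through of pair_up_data (not part of the original module):
# with seg_length == 3 and an initial offset of 2, the sentence
#     "the quick brown fox jumps over the lazy dog"
# is cut into the segments
#     ["the quick", "brown fox jumps", "over the lazy", "dog"],
# which are then shuffled and split in half; each half is joined back into a
# string and wrapped in an Example, giving one pseudo-paraphrase pair.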
def get_data(params):
    """Read params.data_file, skipping blank lines and duplicates, and return a
    list of lowercased Examples."""
    examples = []
    finished = set()  # check for duplicates
    with io.open(params.data_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line in finished:
                continue
            finished.add(line)
            line = line.strip()
            if len(line) == 0:
                continue
            examples.append(Example(line.lower()))
    return examples
def get_sequences(p1, p2, model, params, fr0=0, fr1=0):
    """Wrap a pair of sentences in Examples and populate their embeddings.

    fr0 and fr1 flag whether p1 and p2 come from the foreign language; when the
    model keeps separate vocabularies, the foreign vocabulary is used for the
    flagged side. Note that when only fr0 is set, both sides fall back to the
    source vocabulary."""
    wp1 = Example(p1)
    wp2 = Example(p2)
    # Choose the vocabulary for each side.
    if fr0 == 1 and fr1 == 1 and not model.share_vocab:
        vocab1, vocab2 = model.vocab_fr, model.vocab_fr
    elif fr0 == 0 and fr1 == 1 and not model.share_vocab:
        vocab1, vocab2 = model.vocab, model.vocab_fr
    else:
        vocab1, vocab2 = model.vocab, model.vocab
    for wp, vocab in ((wp1, vocab1), (wp2, vocab2)):
        wp.populate_embeddings(vocab, model.zero_unk, params.ngrams)
        # Fall back to the unknown token if nothing in the sentence is in the vocabulary.
        if len(wp.embeddings) == 0:
            wp.embeddings.append(vocab[unk_string])
    return wp1, wp2
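
# Illustrative sketch, not part of the original module: a minimal way to score
# one sentence pair using the Examples returned by get_sequences, reusing the
# model.torchify_batch / model.encode API that create_embeddings above relies
# on. The function name is an assumption introduced here for illustration only.
def score_pair_sketch(p1, p2, model, params):
    """Cosine similarity between the embeddings of two sentences."""
    wp1, wp2 = get_sequences(p1, p2, model, params)
    wx1, wl1 = model.torchify_batch([wp1])
    wx2, wl2 = model.torchify_batch([wp2])
    v1 = model.encode(wx1, wl1).detach().cpu().numpy()[0]
    v2 = model.encode(wx2, wl2).detach().cpu().numpy()[0]
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))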