Example 1
def create_embeddings(model, examps):
    """Embed textual examples

    :param examps: A list of text to embed
    :return: A len(examps) by embedding size numpy matrix of embeddings
    """
    # Preprocess examples
    print(f'Preprocessing {len(examps)} examples (.={BATCH_SIZE} examples)', file=sys.stderr)
    data = []
    for i, line in enumerate(examps):
        p1 = " ".join(entok.tokenize(line, escape=False)).lower()
        if model.sp is not None:
            p1 = model.sp.EncodeAsPieces(p1)
            p1 = " ".join(p1)
        wp1 = Example(p1)
        wp1.populate_embeddings(model.vocab, model.zero_unk, model.args.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        data.append(wp1)
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)
    # Create embeddings
    print(f'Embedding {len(examps)} examples (.={BATCH_SIZE} examples)', file=sys.stderr)
    embeddings = np.zeros((len(examps), model.args.dim))
    for i in range(0, len(data), BATCH_SIZE):
        max_idx = min(i+BATCH_SIZE,len(data))
        curr_batch = data[i:max_idx]
        wx1, wl1 = model.torchify_batch(curr_batch)
        vecs = model.encode(wx1, wl1)
        vecs = vecs.detach().cpu().numpy()
        vecs = vecs / np.sqrt((vecs * vecs).sum(axis=1))[:, None]  # normalize for nearest-neighbor search
        embeddings[i:max_idx] = vecs
        print_progress(i, BATCH_SIZE)
    print("", file=sys.stderr)
    return embeddings
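A minimal usage sketch, assuming a loaded model object as used above; load_model, the file name, and the sample sentences are placeholders, and create_embeddings is taken from the surrounding module (which also supplies BATCH_SIZE, entok, Example, unk_string, and print_progress):

import numpy as np

# Hypothetical usage: load_model and "sim.pt" stand in for the repo-specific
# loading code; they are not defined in the snippet above.
model = load_model("sim.pt")
sentences = ["a man is playing a guitar", "someone plays the guitar"]

emb = create_embeddings(model, sentences)

# Rows are L2-normalized, so cosine similarity is just a dot product.
sim = emb @ emb.T
print(np.round(sim, 3))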
Example 2
def pair_up_data(self):
    # Split each sentence into segments of roughly seg_length tokens,
    # starting at a random offset, shuffle the segments, and pair the
    # first half of them against the second half.
    idx = random.randint(0, self.seg_length)
    pairs = []
    for i in self.raw_data:
        sent = i.sentence
        sent = sent.split()
        idx = min(idx, len(sent) - 2)
        splits = []
        start = 0
        while idx < len(sent):
            seg1 = sent[start:idx]
            splits.append(seg1)
            start = idx
            idx += self.seg_length
            idx = min(idx, len(sent))
        if start < len(sent):  # keep the trailing segment
            seg = sent[start:len(sent)]
            splits.append(seg)
        splits = [" ".join(i) for i in splits]
        random.shuffle(splits)
        mid = len(splits) // 2
        pairs.append((Example(splits[0:mid]), Example(splits[mid:])))
    return pairs
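# --- Illustration (not part of the original) --------------------------------
# The splitting loop above can be exercised on its own; this standalone sketch
# mirrors it for a single sentence string. seg_length and the sample text are
# made up, and the offset is drawn from 1..seg_length here so the first chunk
# is never empty.
import random

def split_into_segments(sentence, seg_length):
    sent = sentence.split()
    idx = min(random.randint(1, seg_length), len(sent) - 2)
    splits, start = [], 0
    while idx < len(sent):
        splits.append(sent[start:idx])
        start = idx
        idx = min(idx + seg_length, len(sent))
    if start < len(sent):  # keep the trailing words
        splits.append(sent[start:])
    return [" ".join(s) for s in splits]

print(split_into_segments("the quick brown fox jumps over the lazy dog", 3))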
def get_data(params):
    examples = []

    finished = set()  # track lines already seen, to skip duplicates
    with io.open(params.data_file, 'r', encoding='utf-8') as f:
        for i in f:
            if i in finished:
                continue
            else:
                finished.add(i)

            i = i.strip()
            if len(i) == 0:
                continue

            examples.append(Example(i.lower()))

    return examples
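# --- Illustration (not part of the original) --------------------------------
# Hypothetical call to get_data: the file name and sentences are made up, and
# a simple namespace stands in for the repo's params object; Example comes
# from the surrounding module. The repeated line is caught by the duplicate
# check above, so two examples remain.
import io
from argparse import Namespace

with io.open("toy_data.txt", "w", encoding="utf-8") as f:
    f.write("A man plays guitar.\nA man plays guitar.\nSomeone is cooking.\n")

examples = get_data(Namespace(data_file="toy_data.txt"))
print(len(examples))  # 2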
def get_sequences(p1, p2, model, params, fr0=0, fr1=0):
    """Wrap a sentence pair as Example objects and populate their embeddings.

    fr0/fr1 mark whether p1/p2 are on the foreign-language side; when the
    vocabularies are not shared, each side is looked up in its own vocabulary,
    falling back to the unknown token if a sentence yields no embeddings.
    """
    wp1 = Example(p1)
    wp2 = Example(p2)

    if fr0 == 1 and fr1 == 1 and not model.share_vocab:
        wp1.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab_fr[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab_fr[unk_string])
    elif fr0 == 0 and fr1 == 1 and not model.share_vocab:
        wp1.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab_fr, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab_fr[unk_string])
    else:
        wp1.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        wp2.populate_embeddings(model.vocab, model.zero_unk, params.ngrams)
        if len(wp1.embeddings) == 0:
            wp1.embeddings.append(model.vocab[unk_string])
        if len(wp2.embeddings) == 0:
            wp2.embeddings.append(model.vocab[unk_string])

    return wp1, wp2
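A hypothetical call sketch: model and params are assumed to be the repo's trained model and hyperparameter objects (providing vocab, vocab_fr, share_vocab, zero_unk, and ngrams); the sentences are made up.

# Hypothetical usage: model and params are not constructed here.
p1 = "a man is playing a guitar"
p2 = "un hombre toca la guitarra"

# English source, foreign target: with separate vocabularies each side is
# looked up in its own vocab (fr0=0, fr1=1).
wp1, wp2 = get_sequences(p1, p2, model, params, fr0=0, fr1=1)
print(len(wp1.embeddings), len(wp2.embeddings))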