Example #1
def gen_batch(data):
    # config and device are free variables expected from the enclosing scope;
    # each yielded pair is (model input dict, label tensor).
    for batch in iter_to_batches(data, config.sys.batch_size):
        yield (
            {
                "x": torch.stack([
                    torch.FloatTensor(b.dense_data) for b in batch
                ]).to(device)
            },
            torch.LongTensor([b.label for b in batch]).to(device),
        )
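All of the examples on this page consume iter_to_batches, whose definition is not shown here. The sketch below is an assumption inferred from how the examples use it, not the actual implementation: it groups an iterable into successive lists of at most batch_size items.

def iter_to_batches(iterable, batch_size):
  # Assumed behavior only: yield successive lists of up to batch_size items.
  batch = []
  for item in iterable:
    batch.append(item)
    if len(batch) == batch_size:
      yield batch
      batch = []
  if batch:
    yield batch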
Example #2
def apply_faiss_to_edges(
    hash_and_embedding: Iterable[Record],
) -> Iterable[nx.Graph]:

  # The only reason we need parts_written_to_db is to make sure that the
  # writing happens before this point.
  # Note: faiss_index_name, batch_size, num_neighbors, weight, and
  # inverted_index_collection are free variables expected from the
  # enclosing scope.
  index = dpg.get(f"knn_util:faiss_{faiss_index_name}")
  inverted_index = {}

  graph = nx.Graph()
  for batch in iter_to_batches(hash_and_embedding, batch_size):
    hashes, embeddings = records_to_ids_and_embeddings(
        records=batch,
    )
    _, neighs_per_root = index.search(embeddings, num_neighbors)

    # Collect every hash (roots and neighbors) that has not been resolved yet.
    # Keep the result in a separate variable so the root hashes stay aligned
    # with neighs_per_root below.
    new_hashes = hashes.tolist() + flatten_list(neighs_per_root.tolist())
    new_hashes = list(set(new_hashes) - set(inverted_index.keys()))

    graph_keys = database_util.get(
        values=new_hashes,
        collection=inverted_index_collection,
        field_name="hash",
        desired_fields=["strid"],
    )
    for k, v in zip(new_hashes, graph_keys):
      inverted_index[k] = v["strid"]

    # Add an edge between each root and each of its nearest neighbors.
    for root_idx, neigh_indices in zip(hashes, neighs_per_root):
      root = inverted_index[root_idx]
      if root is None:
        continue
      for neigh_idx in neigh_indices:
        if neigh_idx == root_idx:
          continue
        neigh = inverted_index[neigh_idx]
        if neigh is None:
          continue
        graph.add_edge(root, neigh, weight=weight)
        graph.add_edge(neigh, root, weight=weight)
  return [graph]
Example #3
def add_points_to_index(
    records: Iterable[Record],
    init_index_path: Path,
    batch_size: int,
    output_path: Path,
    embedding_field: str = "embedding",
    id_field: str = "id",
) -> Path:
  "Loads an initial index, adds this partition's points to it, and writes the result."
  index = faiss.read_index(str(init_index_path))
  assert index.is_trained

  for batch in iter_to_batches(records, batch_size):
    ids, embeddings = records_to_ids_and_embeddings(
        records=batch,
        id_field=id_field,
        embedding_field=embedding_field
    )
    index.add_with_ids(embeddings, ids)
  faiss.write_index(index, str(output_path))
  return output_path
Example #4
def apply_sentence_classifier_to_part(
    records: Iterable[Record],
    batch_size: int,
    sentence_classifier_name="sentence_classifier",
    predicted_type_suffix=":pred",
    sentence_type_field="sent_type",
) -> Iterable[Record]:
    device = dpg.get("embedding_util:device")
    model = dpg.get(f"embedding_util:{sentence_classifier_name}")

    res = []
    for rec_batch in iter_to_batches(records, batch_size):
        model_input = torch.stack([
            record_to_sentence_classifier_input(r) for r in rec_batch
        ]).to(device)
        predicted_labels = sentence_classifier_output_to_labels(
            model(model_input))
        for r, lbl in zip(rec_batch, predicted_labels):
            r[sentence_type_field] = lbl + predicted_type_suffix
            res.append(r)
    return res
Example #5
def embed_records(
    records: Iterable[Record],
    batch_size: int,
    text_field: str,
    max_sequence_length: int,
    out_embedding_field: str = "embedding",
    show_pbar: bool = False,
) -> Iterable[Record]:
    """
  Introduces an embedding field to each record, indicated the bert embedding
  of the supplied text field.
  """

    dev = dpg.get("embedding_util:device")
    tok, model = dpg.get("embedding_util:tok,model")

    res = []
    # PBar is necessary when using embed_records in helper scripts.
    pbar = tqdm(
        iter_to_batches(records, batch_size),
        total=int(len(records) / batch_size),
        disable=not show_pbar,
    )
    for batch in pbar:
        texts = list(map(lambda x: x[text_field], batch))
        sequs = pad_sequence(
            sequences=[
                torch.tensor(tok.encode(t)[:max_sequence_length])
                for t in texts
            ],
            batch_first=True,
        ).to(dev)
        with torch.no_grad():
            embs = (model(sequs)[-2].mean(axis=1).cpu().detach().numpy())
        for record, emb in zip(batch, embs):
            record[out_embedding_field] = emb
            res.append(record)
    return res