def gen_batch(data):
  for batch in iter_to_batches(data, config.sys.batch_size):
    yield (
        {
            "x": torch.stack([
                torch.FloatTensor(b.dense_data) for b in batch
            ]).to(device)
        },
        torch.LongTensor([b.label for b in batch]).to(device),
    )
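
# Hedged usage sketch for gen_batch. Everything defined here (the toy record
# type, the linear model, the optimizer, and the loss) is illustrative and not
# part of this module; it only assumes that items in `data` expose the
# .dense_data and .label attributes accessed above, and that `device` is the
# module-level torch device gen_batch already uses.
def _gen_batch_usage_sketch():
  from collections import namedtuple

  ToyRecord = namedtuple("ToyRecord", ["dense_data", "label"])
  toy_data = [
      ToyRecord(dense_data=[float(i % 3)] * 16, label=i % 2)
      for i in range(100)
  ]

  toy_model = torch.nn.Linear(16, 2).to(device)
  optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.01)
  loss_fn = torch.nn.CrossEntropyLoss()

  # gen_batch yields (model_kwargs, labels); the dict unpacks into the model.
  for model_kwargs, labels in gen_batch(toy_data):
    optimizer.zero_grad()
    loss = loss_fn(toy_model(model_kwargs["x"]), labels)
    loss.backward()
    optimizer.step()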
def apply_faiss_to_edges(
    hash_and_embedding: Iterable[Record],
) -> Iterable[nx.Graph]:
  # The only reason we need parts_written_to_db is to make sure that the
  # writing happens before this point.
  # Note: faiss_index_name, batch_size, num_neighbors, weight, and
  # inverted_index_collection are expected from the enclosing scope.
  index = dpg.get(f"knn_util:faiss_{faiss_index_name}")
  inverted_index = {}
  graph = nx.Graph()
  for batch in iter_to_batches(hash_and_embedding, batch_size):
    root_hashes, embeddings = records_to_ids_and_embeddings(
        records=batch,
    )
    _, neighs_per_root = index.search(embeddings, num_neighbors)
    root_hashes = root_hashes.tolist()
    # Resolve graph keys for any hash we have not yet seen, roots and
    # neighbors alike.
    unknown_hashes = set(root_hashes + flatten_list(neighs_per_root.tolist()))
    unknown_hashes = list(unknown_hashes - set(inverted_index.keys()))
    graph_keys = database_util.get(
        values=unknown_hashes,
        collection=inverted_index_collection,
        field_name="hash",
        desired_fields=["strid"],
    )
    for k, v in zip(unknown_hashes, graph_keys):
      inverted_index[k] = v["strid"]
    # Add an edge from each root to each of its approximate nearest neighbors.
    for root_idx, neigh_indices in zip(root_hashes, neighs_per_root):
      root = inverted_index[root_idx]
      if root is None:
        continue
      for neigh_idx in neigh_indices:
        if neigh_idx == root_idx:
          continue
        neigh = inverted_index[neigh_idx]
        if neigh is None:
          continue
        graph.add_edge(root, neigh, weight=weight)
        graph.add_edge(neigh, root, weight=weight)
  return [graph]
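
# Hedged sketch: apply_faiss_to_edges returns a single-element list of graphs
# for one partition. Below is one way to union per-partition results into a
# single undirected graph; `partition_results` is a hypothetical list of those
# outputs, not something this module produces by itself.
def _merge_partition_graphs_sketch(partition_results):
  graphs = [g for part in partition_results for g in part]
  # compose_all unions nodes and edges; attribute conflicts resolve in favor
  # of graphs appearing later in the list.
  return nx.compose_all(graphs) if graphs else nx.Graph()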
def add_points_to_index(
    records: Iterable[Record],
    init_index_path: Path,
    batch_size: int,
    output_path: Path,
    embedding_field: str = "embedding",
    id_field: str = "id",
) -> Path:
  "Loads an initial index, adds the partition to the index, and writes result"
  index = faiss.read_index(str(init_index_path))
  assert index.is_trained
  for batch in iter_to_batches(records, batch_size):
    ids, embeddings = records_to_ids_and_embeddings(
        records=batch,
        id_field=id_field,
        embedding_field=embedding_field,
    )
    index.add_with_ids(embeddings, ids)
  faiss.write_index(index, str(output_path))
  return output_path
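
# Hedged, standalone sketch of the read / add_with_ids / write pattern that
# add_points_to_index relies on. The dimensions, index type, and file names
# below are illustrative only; the real pipeline supplies a pre-trained index
# at init_index_path.
def _faiss_add_with_ids_sketch():
  import numpy as np

  dim = 64
  embeddings = np.random.rand(1000, dim).astype(np.float32)
  ids = np.arange(1000, dtype=np.int64)

  # A flat L2 index wrapped in an ID map; flat indexes need no training,
  # so is_trained is already True.
  index = faiss.IndexIDMap(faiss.IndexFlatL2(dim))
  faiss.write_index(index, "init.index")

  index = faiss.read_index("init.index")
  assert index.is_trained
  index.add_with_ids(embeddings, ids)
  faiss.write_index(index, "with_points.index")

  # Neighbor queries now return the custom ids supplied above.
  _, neighbor_ids = index.search(embeddings[:5], 10)
  return neighbor_ids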
def apply_sentence_classifier_to_part(
    records: Iterable[Record],
    batch_size: int,
    sentence_classifier_name="sentence_classifier",
    predicted_type_suffix=":pred",
    sentence_type_field="sent_type",
) -> Iterable[Record]:
  device = dpg.get("embedding_util:device")
  model = dpg.get(f"embedding_util:{sentence_classifier_name}")
  res = []
  for rec_batch in iter_to_batches(records, batch_size):
    model_input = torch.stack([
        record_to_sentence_classifier_input(r) for r in rec_batch
    ]).to(device)
    # Inference only; no gradients are needed here.
    with torch.no_grad():
      predicted_labels = sentence_classifier_output_to_labels(
          model(model_input))
    for r, lbl in zip(rec_batch, predicted_labels):
      r[sentence_type_field] = lbl + predicted_type_suffix
      res.append(r)
  return res
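
# Hedged sketch, not this project's actual helper: one plausible shape for
# sentence_classifier_output_to_labels, assuming the classifier emits one row
# of logits per sentence. The label vocabulary below is hypothetical and only
# illustrates the logits-to-string mapping the function above relies on.
_SKETCH_LABELS = [
    "abstract:background",
    "abstract:methods",
    "abstract:results",
    "abstract:conclusions",
]

def _sentence_classifier_output_to_labels_sketch(logits):
  # Pick the highest-scoring class per row and map it to its label string.
  return [_SKETCH_LABELS[i] for i in logits.argmax(dim=1).tolist()]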
def embed_records(
    records: Iterable[Record],
    batch_size: int,
    text_field: str,
    max_sequence_length: int,
    out_embedding_field: str = "embedding",
    show_pbar: bool = False,
) -> Iterable[Record]:
  """
  Introduces an embedding field to each record, indicating the BERT embedding
  of the supplied text field.
  """
  dev = dpg.get("embedding_util:device")
  tok, model = dpg.get("embedding_util:tok,model")
  res = []
  # PBar is necessary when using embed_records in helper scripts.
  pbar = tqdm(
      iter_to_batches(records, batch_size),
      total=int(len(records) / batch_size),
      disable=not show_pbar,
  )
  for batch in pbar:
    texts = [r[text_field] for r in batch]
    sequs = pad_sequence(
        sequences=[
            torch.tensor(tok.encode(t)[:max_sequence_length])
            for t in texts
        ],
        batch_first=True,
    ).to(dev)
    with torch.no_grad():
      embs = model(sequs)[-2].mean(axis=1).cpu().detach().numpy()
    for record, emb in zip(batch, embs):
      record[out_embedding_field] = emb
      res.append(record)
  return res
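
# Hedged, standalone sketch of the mean-pooled BERT embedding pattern used in
# embed_records, written against the Hugging Face `transformers` API instead
# of the tokenizer/model pair stored in dask-process-global. The model name
# and pooling choice are assumptions for illustration; like the code above,
# it averages over all token positions, padding included.
def _embed_texts_sketch(texts, max_sequence_length=128):
  from transformers import AutoModel, AutoTokenizer

  tok = AutoTokenizer.from_pretrained("bert-base-uncased")
  model = AutoModel.from_pretrained("bert-base-uncased")
  model.eval()
  batch = tok(
      texts,
      padding=True,
      truncation=True,
      max_length=max_sequence_length,
      return_tensors="pt",
  )
  with torch.no_grad():
    hidden = model(**batch).last_hidden_state  # [batch, seq_len, hidden_dim]
  # Average the token vectors, mirroring the .mean(axis=1) pooling above.
  return hidden.mean(dim=1).numpy()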