async def update_masked_attention(
    payload: api.MaskUpdatePayload,
):  # -> api.AttentionResponse:
    """Return attention information from tokens and mask indices.

    Object: {"a" : {"sentence":__, "mask_inds"}, "b" : {...}}
    """
    details = aconf.from_pretrained(payload.model)
    mask_token = details.aligner.mask_token
    masked_indices = set(payload.mask)

    # Substitute the aligner's mask token at each requested index; keep the
    # original token if there is no mask token (ifnone falls back).
    token_inputs = [
        ifnone(mask_token, tok) if idx in masked_indices else tok
        for idx, tok in enumerate(payload.tokens)
    ]

    deets = details.att_from_tokens(token_inputs, payload.sentence)

    return {
        "status": 200,
        "payload": deets.to_json(payload.layer),
    }
def update_masked_attention(**request):
    """From tokens and indices of what should be masked, get the attentions
    and predictions.

    payload = request['payload']

    Args:
        payload['model'] (str): Model name
        payload['tokens'] (List[str]): Tokens to pass through the model
        payload['sentence'] (str): Original sentence the tokens came from
        payload['mask'] (List[int]): Which indices to mask
        payload['layer'] (int): Which layer to extract information from

    Returns:
        {
            status: 200
            payload: {
                aa: {
                    att: Array((nheads, ntoks, ntoks))
                    left: [{
                        text (str),
                        topk_words (List[str]),
                        topk_probs (List[float])
                    }, ...]
                    right: [{
                        text (str),
                        topk_words (List[str]),
                        topk_probs (List[float])
                    }, ...]
                }
            }
        }
    """
    payload = request["payload"]

    details = get_details(payload["model"])
    mask_token = details.tok.mask_token
    to_mask = set(payload["mask"])

    # Replace every token at a masked index with the tokenizer's mask token,
    # falling back to the original token when no mask token exists.
    token_inputs = [
        ifnone(mask_token, tok) if i in to_mask else tok
        for i, tok in enumerate(payload["tokens"])
    ]

    deets = details.from_tokens(token_inputs, payload["sentence"])

    return {
        "status": 200,
        "payload": deets.to_json(int(payload["layer"])),
    }
def search_nearest(payload: api.QueryNearestPayload, kind: str):
    """Search annotated corpus by `kind` (either 'embeddings' or 'contexts').

    Args:
        payload: Query with model, corpus, embedding vector, layer, heads, and
            `k` (number of nearest neighbors to return).
        kind: Which index to search — 'embeddings' or 'contexts'.

    Returns:
        {"status": 200, "payload": [...]} on success;
        {"status": 405, "payload": None} when the model cannot be loaded;
        {"status": 406, "payload": None} when the corpus files are missing.
    """
    assert (
        kind == "embeddings" or kind == "contexts"
    ), f"Expected `kind` to be 'embeddings' or 'contexts'. Received {kind}"

    model = payload.model
    corpus = payload.corpus
    embedding = payload.embedding
    layer = payload.layer
    heads = payload.heads
    k = payload.k

    try:
        details = aconf.from_pretrained(model)
    # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit still
    # propagate instead of being reported as a 405.
    except Exception:
        return {"status": 405, "payload": None}

    # NOTE(review): these branches read `aconf.*` attributes even though
    # `details` was just obtained from `aconf.from_pretrained(model)` —
    # confirm whether they should read from `details` instead.
    try:
        if aconf.has_corpus:
            if '/' in aconf.model:
                # If transformer model is in format `user/model`
                cc = from_base_dir(aconf.corpus, aconf.model)
            else:
                cc = from_base_dir(aconf.corpus)
        else:
            model_name = ifnone(aconf.model_name, model)
            cc = from_model(model_name, corpus)
    except FileNotFoundError:
        return {"status": 406, "payload": None}

    # faiss expects a float32 row vector as the query.
    q = np.array(embedding).reshape((1, -1)).astype(np.float32)
    heads = list(set(heads))  # deduplicate requested heads

    if kind == "embeddings":
        print("\n\nSEARCHING EMBEDDINGS\n\n")
        out = cc.search_embeddings(layer, q, k)
    elif kind == "contexts":
        print("\n\nSEARCHING CONTEXTS\n\n")
        out = cc.search_contexts(layer, heads, q, k)

    payload_out = [o.to_json(layer, heads) for o in out]

    return {"status": 200, "payload": payload_out}
def __init__(self, fname, name=None):
    """Open an hdf5 file of the format designed and provide easy access to its contents"""
    # Cursor used when iterating through the dataset.
    self.__curr = 0
    self.__name = ifnone(name, "CorpusData")

    self.fname = fname
    self.data = h5py.File(fname, 'r')

    self.__len = len(self.data.keys())
    assert self.__len > 0, "Cannot process an empty file"

    # Probe the first record to discover embedding geometry.
    first_embeds = self[0].embeddings
    self.embedding_dim = first_embeds.shape[-1]
    self.n_layers = first_embeds.shape[0] - 1  # 1 was added for the input layer

    self.refmap, self.total_vectors = self._init_vector_map()
import argparse
from pathlib import Path


def parse_args():
    """Define and parse the command-line options for corpus annotation."""
    ap = argparse.ArgumentParser()
    ap.add_argument('-f', '--file',
                    help="Path to .txt file to analyze and annotate")
    ap.add_argument("-o", "--outdir",
                    help="Path of output directory inside of which to place <model>/<corpus>/ directory containing hdf5 and faiss files")
    ap.add_argument("-n", "--name", default=None,
                    help="Name the corpus with a code name. If not given, default to the name of the provided .txt file")
    ap.add_argument("--force", action="store_true",
                    help="If given, overwrite existing hdf5 and faiss files.")
    ap.add_argument("-m", "--model",
                    help="Specify the huggingface model to use for attentions")
    ap.add_argument("--nomask", action='store_false',
                    help="INCLUDE attentions from special tokens like [CLS] and [SEP]. By default, ignore these attentions")
    return ap.parse_args()


if __name__ == "__main__":
    from utils.f import ifnone

    import create_hdf5
    import create_faiss

    args = parse_args()

    # Corpus name defaults to the stem of the input .txt file.
    source_file = Path(args.file)
    corpus_name = ifnone(args.name, source_file.stem)

    # Output layout: <outdir>/<model>/<corpus>/
    target_dir = Path(args.outdir) / args.model / corpus_name
    target_dir.mkdir(parents=True, exist_ok=True)

    create_hdf5.main(args.file, target_dir, args.force, args.model, args.nomask)
    create_faiss.main(target_dir)