Code Example #1
async def update_masked_attention(
    payload: api.MaskUpdatePayload,
):  # -> api.AttentionResponse:
    """
    Return attention information from tokens and mask indices.

    Object: {"a" : {"sentence":__, "mask_inds"}, "b" : {...}}
    """
    model = payload.model
    details = aconf.from_pretrained(model)

    tokens = payload.tokens
    sentence = payload.sentence
    mask = payload.mask
    layer = payload.layer

    MASK = details.aligner.mask_token
    def mask_tokens(toks, maskinds):
        # Replace each token at a masked index with the model's mask token,
        # falling back to the original token when no mask token is defined.
        return [t if i not in maskinds else ifnone(MASK, t) for (i, t) in enumerate(toks)]

    token_inputs = mask_tokens(tokens, mask)

    deets = details.att_from_tokens(token_inputs, sentence)
    payload_out = deets.to_json(layer)

    return {
        "status": 200,
        "payload": payload_out,
    }
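All of these examples lean on an `ifnone` helper (Example #5 imports it from `utils.f`). Its definition is not shown in the source; a minimal sketch of the behavior implied by its name and usage:

def ifnone(a, b):
    # Assumed semantics: return the fallback `b` only when `a` is None.
    return b if a is None else a

Under this reading, `ifnone(MASK, t)` above substitutes the mask token when the tokenizer defines one and otherwise leaves the original token in place.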
Code Example #2
File: main.py Project: julien-c/exformer
def update_masked_attention(**request):
    """From tokens and indices of what should be masked, get the attentions and predictions
    
    payload = request['payload']

    Args:
        payload['model'] (str): Model name
        payload['tokens'] (List[str]): Tokens to pass through the model
        payload['sentence'] (str): Original sentence the tokens came from
        payload['mask'] (List[int]): Which indices to mask
        payload['layer'] (int): Which layer to extract information from

    Returns:
        {
            status: 200
            payload: {
                aa: {
                    att: Array((nheads, ntoks, ntoks))
                    left: [{
                        text (str), 
                        topk_words (List[str]),
                        topk_probs (List[float])
                    }, ...]
                    right: [{
                        text (str), 
                        topk_words (List[str]),
                        topk_probs (List[float])
                    }, ...]
                }
            }
        }
    """
    payload = request["payload"]

    model = payload["model"]
    details = get_details(model)

    tokens = payload["tokens"]
    sentence = payload["sentence"]
    mask = payload["mask"]
    layer = int(payload["layer"])

    MASK = details.tok.mask_token
    def mask_tokens(toks, maskinds):
        # Replace each token at a masked index with the model's mask token,
        # falling back to the original token when no mask token is defined.
        return [t if i not in maskinds else ifnone(MASK, t) for (i, t) in enumerate(toks)]

    token_inputs = mask_tokens(tokens, mask)

    deets = details.from_tokens(token_inputs, sentence)
    payload_out = deets.to_json(layer)

    return {
        "status": 200,
        "payload": payload_out,
    }
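A usage sketch for Example #2, following the payload schema documented in its docstring (the model name, tokens, and indices below are illustrative):

request = {
    "payload": {
        "model": "bert-base-uncased",     # illustrative model name
        "tokens": ["the", "cat", "sat"],  # tokens to pass through the model
        "sentence": "the cat sat",        # original sentence the tokens came from
        "mask": [1],                      # mask the token at index 1
        "layer": 0,                       # layer to extract information from
    }
}

response = update_masked_attention(**request)
assert response["status"] == 200
attentions = response["payload"]  # per the docstring: attention arrays plus top-k predictions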
Code Example #3
def search_nearest(payload: api.QueryNearestPayload, kind: str):
    """Search annotated corpus by `kind` (either 'embeddings' or 'contexts')"""

    assert (
        kind == "embeddings" or kind == "contexts"
    ), f"Expected `kind` to be 'embeddings' or 'contexts'. Received {kind}"

    model = payload.model
    corpus = payload.corpus
    embedding = payload.embedding
    layer = payload.layer
    heads = payload.heads
    k = payload.k

    try:
        # Validate that the requested model can be loaded before searching.
        details = aconf.from_pretrained(model)
    except Exception:
        return {"status": 405, "payload": None}

    try:
        # Locate the annotated corpus for this model/corpus configuration.
        if aconf.has_corpus:
            if '/' in aconf.model:
                # If transformer model is in format `user/model`
                cc = from_base_dir(aconf.corpus, aconf.model)
            else:
                cc = from_base_dir(aconf.corpus)

        else:
            model_name = ifnone(aconf.model_name, model)
            cc = from_model(model_name, corpus)
    except FileNotFoundError:
        return {"status": 406, "payload": None}

    q = np.array(embedding).reshape((1, -1)).astype(np.float32)
    heads = list(set(heads))

    if kind == "embeddings":
        print("\n\nSEARCHING EMBEDDINGS\n\n")
        out = cc.search_embeddings(layer, q, k)
    elif kind == "contexts":
        print("\n\nSEARCHING CONTEXTS\n\n")
        out = cc.search_contexts(layer, heads, q, k)

    payload_out = [o.to_json(layer, heads) for o in out]

    return {"status": 200, "payload": payload_out}
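A hedged call sketch for Example #3. The real api.QueryNearestPayload is presumably a typed model, but any object exposing these attributes satisfies the function; all values below are illustrative:

from types import SimpleNamespace

query = SimpleNamespace(
    model="bert-base-uncased",  # illustrative model name
    corpus="wiki",              # illustrative corpus name
    embedding=[0.0] * 768,      # query vector; the dimension is model-dependent
    layer=5,
    heads=[0, 3, 7],
    k=10,
)

response = search_nearest(query, kind="embeddings")
if response["status"] == 200:
    matches = response["payload"]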
Code Example #4
    def __init__(self, fname, name=None):
        """Open an hdf5 file of the format designed and provide easy access to its contents"""

        # For iterating through the dataset
        self.__curr = 0

        self.__name = ifnone(name, "CorpusData")
        self.fname = fname
        self.data = h5py.File(fname, 'r')

        main_keys = self.data.keys()
        self.__len = len(main_keys)

        assert self.__len > 0, "Cannot process an empty file"

        embeds = self[0].embeddings
        self.embedding_dim = embeds.shape[-1]
        self.n_layers = embeds.shape[0] - 1  # 1 was added for the input layer
        self.refmap, self.total_vectors = self._init_vector_map()
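Example #4 only shows __init__; the class name and file name below are hypothetical, and item access is inferred from the `self[0]` call above:

corpus = CorpusData("corpus.hdf5")   # hypothetical class and file names
print(corpus.embedding_dim)          # last axis of the embeddings array
print(corpus.n_layers)               # embeddings carry n_layers + 1 slices (input layer included)
first = corpus[0]                    # __getitem__ implied by `self[0]` in __init__
print(first.embeddings.shape)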
Code Example #5
import argparse
from pathlib import Path

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', help="Path to .txt file to analyze and annotate")
    parser.add_argument("-o", "--outdir", help="Path of output directory inside of which to place <model>/<corpus>/ directory containing hdf5 and faiss files")
    parser.add_argument("-n", "--name", default=None, help="Name the corpus with a code name. If not given, default to the name of the provided .txt file")
    parser.add_argument("--force", action="store_true", help="If given, overwrite existing hdf5 and faiss files.")
    parser.add_argument("-m", "--model", help="Specify the huggingface model to use for attentions")
    parser.add_argument("--nomask", action='store_false', help="INCLUDE attentions from special tokens like [CLS] and [SEP]. By default, ignore these attentions")

    return parser.parse_args()

if __name__ == "__main__":
    from utils.f import ifnone
    import create_hdf5
    import create_faiss

    args = parse_args()

    f = Path(args.file)
    corpus_name = ifnone(args.name, f.stem)
    output_dir = Path(args.outdir) / args.model / corpus_name
    output_dir.mkdir(parents=True, exist_ok=True)

    create_hdf5.main(args.file, output_dir, args.force, args.model, args.nomask)
    create_faiss.main(output_dir)
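
Example #5 is meant to be run from the command line, e.g. `python main.py -f data/wiki.txt -o corpora -m bert-base-uncased` (script name, paths, and model name all illustrative). A sketch of the equivalent programmatic call, mirroring the __main__ block:

from pathlib import Path
import create_hdf5
import create_faiss

output_dir = Path("corpora") / "bert-base-uncased" / "wiki"
output_dir.mkdir(parents=True, exist_ok=True)

# The final argument mirrors args.nomask, which defaults to True
# (attentions from special tokens like [CLS] and [SEP] are ignored
# unless --nomask is passed on the command line).
create_hdf5.main("data/wiki.txt", output_dir, False, "bert-base-uncased", True)
create_faiss.main(output_dir)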