Example #1
0
 def __init__(self,
              hierarchy: Hierarchy,
              delimiter: str = "/",
              model: str = "elmo-original"):
     """Set up a lazy reader that only needs the contextualizer's tokenizer.

     Args:
         hierarchy: Stored unchanged on ``self.hierarchy``.
         delimiter: Stored unchanged on ``self.delimiter``.
         model: Name handed to ``get_contextualizer``.
     """
     # The base initializer is always invoked with lazy=True.
     super(CachedMentionReader, self).__init__(lazy=True)

     self.hierarchy, self.delimiter = hierarchy, delimiter

     # The contextualizer is requested on CPU with tokenizer_only=True.
     tokenizer_options = dict(device="cpu", tokenizer_only=True)
     self.contextualizer: Contextualizer = get_contextualizer(model, **tokenizer_options)
Example #2
0
def main(*,
         input_fp: str,
         output: str,
         model: str = "elmo-original",
         unit: str = "subword",
         batch_size: int = 64,
         layers: List[int],
         gpuid: int = 0,
         ):
    """Encode the spans read from ``input_fp`` and store one HDF5 dataset per embedding.

    Every embedding yielded by ``select_embeddings`` is written to ``output``
    as a float32 dataset whose name is a running integer index (as a string).
    The fourth element of the corresponding span tuple is attached to the
    dataset as the ``'str'`` attribute.

    Args:
        input_fp: Path of the text file to read (decoded as UTF-8).
        output: Path of the HDF5 file to create; opened in ``'w'`` mode.
        model: Name handed to ``get_contextualizer``.
        unit: Forwarded to ``select_embeddings``.
        batch_size: Number of spans encoded per call to ``contextualizer.encode``.
        layers: Forwarded to ``select_embeddings``.
        gpuid: CUDA device index; a negative value selects the CPU.
    """
    # This loop must stay first: `locals()` has to contain the arguments only.
    for k, v in reversed(list(locals().items())):  # seems that `locals()` stores the args in reverse order
        logger.info(f"{blue('--' + k)} \"{v}\"")

    # NOTE(review): the result is never used. It was presumably meant as the
    # `total=` of the progress bar below; confirm what unit get_file_length
    # returns before wiring it in. The call itself is kept unchanged.
    file_length = get_file_length(input_fp)

    if gpuid >= 0:
        torch.cuda.set_device(gpuid)

    contextualizer: Contextualizer = get_contextualizer(
        model,
        device="cpu" if gpuid < 0 else f"cuda:{gpuid}",
        tokenizer_only=False
    )

    # Explicit UTF-8 keeps decoding independent of the platform's default locale.
    with h5py.File(output, 'w') as f, \
         open(input_fp, 'r', encoding='utf8') as inp:

        lines = tqdm(inp)

        # Each item is unpacked into four fields below; the last one is
        # presumably the original input line (TODO confirm against get_spans).
        spans: Iterator[Tuple[List[str], int, int, str]] = get_spans(lines)

        i = 0  # running dataset index across all batches
        for batch in batched(spans, batch_size=batch_size):
            sentences, _ls, _rs, orig_lines = zip(*batch)

            tokenized_sentences, mappings = zip(*[
                contextualizer.tokenize_with_mapping(sentence)
                for sentence in sentences
            ])
            encoded = contextualizer.encode(tokenized_sentences, frozen=True)

            for j, emb in enumerate(select_embeddings(encoded, mappings, layers, unit)):
                x_data: np.ndarray = emb.detach().cpu().numpy().astype(np.float32)

                dset = f.create_dataset(str(i), data=x_data)
                dset.attrs['str'] = orig_lines[j]

                i += 1

    logger.info("Data preparation complete.")
def main(*,
         input: str,
         output: str,
         model: str = "elmo-original",
         unit: str = "subword",
         batch_size: int = 64,
         layers: List[int],
         gpuid: int = 0
         ):
    """Encode the spans read from ``input`` and dump the embeddings to a Berkeley DB store.

    Every embedding yielded by ``select_embeddings`` is stored as a float32
    array under a running integer index (as a string key).

    Args:
        input: Path of the UTF-8 text file to read. The name shadows the
            builtin but is part of the keyword-only interface, so it is kept.
        output: Path handed to ``StringNdArrayBerkeleyDBStorage.open`` in ``'w'`` mode.
        model: Name handed to ``get_contextualizer``.
        unit: Forwarded to ``select_embeddings``.
        batch_size: Number of spans encoded per call to ``contextualizer.encode``.
        layers: Forwarded to ``select_embeddings``.
        gpuid: CUDA device index; a negative value selects the CPU.
    """
    # This loop must stay first: `locals()` has to contain the arguments only.
    for k, v in reversed(list(locals().items())):  # seems that `locals()` stores the args in reverse order
        print(f"{blue('--' + k)} \"{v}\"", file=sys.stderr)

    if gpuid >= 0:
        torch.cuda.set_device(gpuid)

    contextualizer: Contextualizer = get_contextualizer(
        model,
        device="cpu" if gpuid < 0 else f"cuda:{gpuid}",
        tokenizer_only=False
    )
    dump = StringNdArrayBerkeleyDBStorage.open(output, mode='w')
    try:
        print(input)

        # The context manager closes the input file even if encoding fails midway.
        with open(input, mode='r', encoding='utf8') as inp:
            lines: Iterator[str] = tqdm(inp)
            spans: Iterator[Tuple[List[str], int, int]] = get_spans(lines)

            i = 0  # running key across all batches
            for batch in batched(spans, batch_size=batch_size):
                sentences, _ls, _rs = zip(*batch)

                tokenized_sentences, mappings = zip(*[
                    contextualizer.tokenize_with_mapping(sentence)
                    for sentence in sentences
                ])

                encoded = contextualizer.encode(tokenized_sentences, frozen=True)

                for emb in select_embeddings(encoded, mappings, layers, unit):
                    x: np.ndarray = emb.detach().cpu().numpy()
                    dump[str(i)] = x.astype(np.float32)
                    i += 1
    finally:
        # Always close the store so a failure does not leave it open.
        dump.close()

    print("Job complete.", file=sys.stderr)
 def __init__(
     self,
     hierarchy: Hierarchy,
     delimiter: str = "/",
     model: str = "elmo-original",
     layers=None,
     gpuid=-1,
 ):
     """Set up a lazy reader that carries a full (non tokenizer-only) contextualizer.

     Args:
         hierarchy: Stored unchanged on ``self.hierarchy``.
         delimiter: Stored unchanged on ``self.delimiter``.
         model: Name handed to ``get_contextualizer``.
         layers: Stored on ``self.layers``; ``None`` is replaced by
             ``[0, 10, 11, 12]``.
         gpuid: CUDA device index; a negative value selects the CPU.
     """
     # The base initializer is always invoked with lazy=True.
     super(UncachedMentionReader, self).__init__(lazy=True)

     self.hierarchy, self.delimiter = hierarchy, delimiter

     if gpuid < 0:
         device = "cpu"
     else:
         device = f"cuda:{gpuid}"
     self.contextualizer: Contextualizer = get_contextualizer(
         model, device=device, tokenizer_only=False)

     # A fresh list is built per instance when the caller passes no layers.
     self.layers = [0, 10, 11, 12] if layers is None else layers
Example #5
0
from typing import *
from hiertype.contextualizers import get_contextualizer

# Scratch check: push one sentence, split on single spaces, through the
# "xlm-roberta-base" contextualizer on device 'cuda:0'.
s = "He found a leprechaun in his walnut shell .".split(' ')

contextualizer = get_contextualizer(
    "xlm-roberta-base",
    device='cuda:0',
)

# tokenize_with_mapping returns a pair, unpacked here as (t, m).
tokenized_pair = contextualizer.tokenize_with_mapping(s)
t, m = tokenized_pair

# encode() receives a list, so the single tokenized sentence is wrapped in one.
single_sentence_batch = [t]
encoded = contextualizer.encode(single_sentence_batch, frozen=True)
pass  # no-op; presumably left as a breakpoint anchor