def __init__(self,
             hierarchy: Hierarchy,
             delimiter: str = "/",
             model: str = "elmo-original"):
    super(CachedMentionReader, self).__init__(lazy=True)
    self.hierarchy = hierarchy
    self.delimiter = delimiter
    # Tokenizer only: no model weights are loaded, since embeddings
    # for this reader are expected to come from a precomputed cache.
    self.contextualizer: Contextualizer = get_contextualizer(
        model, device="cpu", tokenizer_only=True)
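# A hypothetical instantiation sketch for the cached reader above.
# `hierarchy` is a stand-in: how a Hierarchy is constructed is not shown
# in this excerpt, so no particular loader is assumed. With
# tokenizer_only=True this stays cheap and CPU-only.
hierarchy: Hierarchy = ...  # built by whatever loader this repo provides
reader = CachedMentionReader(hierarchy, delimiter="/", model="elmo-original")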
def main(*,
         input_fp: str,
         output: str,
         model: str = "elmo-original",
         unit: str = "subword",
         batch_size: int = 64,
         layers: List[int],
         gpuid: int = 0,
         ):
    # `locals()` seems to store the keyword-only args in reverse order,
    # so reverse again to echo them in declaration order
    for k, v in reversed(list(locals().items())):
        logger.info(f"{blue('--' + k)} \"{v}\"")

    file_length = get_file_length(input_fp)
    if gpuid >= 0:
        torch.cuda.set_device(gpuid)
    contextualizer: Contextualizer = get_contextualizer(
        model,
        device="cpu" if gpuid < 0 else f"cuda:{gpuid}",
        tokenizer_only=False
    )

    with h5py.File(output, 'w') as f, \
            open(input_fp, 'r') as inp:
        lines = tqdm(inp, total=file_length)  # file length drives the progress bar
        # each span carries its tokens, mention boundaries, and the raw line
        spans: Iterator[Tuple[List[str], int, int, str]] = get_spans(lines)

        i = 0
        for batch in batched(spans, batch_size=batch_size):
            sentences, ls, rs, orig_lines = zip(*batch)
            tokenized_sentences, mappings = zip(*[
                contextualizer.tokenize_with_mapping(sentence)
                for sentence in sentences
            ])
            encoded = contextualizer.encode(tokenized_sentences, frozen=True)
            for j, emb in enumerate(select_embeddings(encoded, mappings, layers, unit)):
                x: np.ndarray = emb.detach().cpu().numpy()
                dset = f.create_dataset(str(i), data=x.astype(np.float32))
                dset.attrs['str'] = orig_lines[j]  # keep the raw line alongside its embedding
                i += 1

    logger.info("Data preparation complete.")
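# A minimal sketch of the `batched` helper used above, assuming it simply
# groups an iterator into lists of at most `batch_size` items; the actual
# implementation in this repo may differ.
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar('T')


def batched(items: Iterable[T], batch_size: int = 64) -> Iterator[List[T]]:
    batch: List[T] = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # flush the final, possibly smaller, batch
        yield batch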
def main(*,
         input: str,
         output: str,
         model: str = "elmo-original",
         unit: str = "subword",
         batch_size: int = 64,
         layers: List[int],
         gpuid: int = 0
         ):
    # `locals()` seems to store the keyword-only args in reverse order,
    # so reverse again to echo them in declaration order
    for k, v in reversed(list(locals().items())):
        print(f"{blue('--' + k)} \"{v}\"", file=sys.stderr)

    if gpuid >= 0:
        torch.cuda.set_device(gpuid)
    contextualizer: Contextualizer = get_contextualizer(
        model,
        device="cpu" if gpuid < 0 else f"cuda:{gpuid}",
        tokenizer_only=False
    )

    dump = StringNdArrayBerkeleyDBStorage.open(output, mode='w')

    with open(input, mode='r', encoding='utf8') as inp:
        lines: Iterator[str] = tqdm(inp)
        spans: Iterator[Tuple[List[str], int, int]] = get_spans(lines)

        i = 0
        for batch in batched(spans, batch_size=batch_size):
            sentences, ls, rs = zip(*batch)
            tokenized_sentences, mappings = zip(*[
                contextualizer.tokenize_with_mapping(sentence)
                for sentence in sentences
            ])
            encoded = contextualizer.encode(tokenized_sentences, frozen=True)
            for emb in select_embeddings(encoded, mappings, layers, unit):
                x: np.ndarray = emb.detach().cpu().numpy()
                dump[str(i)] = x.astype(np.float32)
                i += 1

    dump.close()
    print("Job complete.", file=sys.stderr)
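# A hedged sketch of reading the dump back after `main` above has run,
# assuming StringNdArrayBerkeleyDBStorage also supports mode='r' with key
# lookup mirroring the `dump[str(i)] = ...` writes (import it from the same
# module as the writer); the path here is purely illustrative.
import numpy as np

dump = StringNdArrayBerkeleyDBStorage.open("mentions.db", mode='r')
x: np.ndarray = dump["0"]  # embedding for the first mention span
print(x.shape, x.dtype)    # dtype should be float32, per the writer above
dump.close()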
def __init__(
        self,
        hierarchy: Hierarchy,
        delimiter: str = "/",
        model: str = "elmo-original",
        layers: Optional[List[int]] = None,
        gpuid: int = -1,
):
    super(UncachedMentionReader, self).__init__(lazy=True)
    if layers is None:  # default kept out of the signature to avoid a mutable default
        layers = [0, 10, 11, 12]
    self.hierarchy = hierarchy
    self.delimiter = delimiter
    # full model load: this reader computes embeddings on the fly
    self.contextualizer: Contextualizer = get_contextualizer(
        model,
        device="cpu" if gpuid < 0 else f"cuda:{gpuid}",
        tokenizer_only=False)
    self.layers = layers
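# A hypothetical counterpart to the cached example: the uncached reader
# loads the full contextualizer, so the device choice (gpuid) matters here.
# `hierarchy` is again a stand-in for a Hierarchy built elsewhere.
hierarchy: Hierarchy = ...  # built by whatever loader this repo provides
reader = UncachedMentionReader(
    hierarchy, model="elmo-original", layers=[0, 10, 11, 12], gpuid=-1)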
from hiertype.contextualizers import get_contextualizer

s = "He found a leprechaun in his walnut shell .".split(' ')

contextualizer = get_contextualizer("xlm-roberta-base", device='cuda:0')
# subword tokens plus the mapping from subwords back to the original tokens
t, m = contextualizer.tokenize_with_mapping(s)
encoded = contextualizer.encode([t], frozen=True)
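# A hedged continuation to inspect the outputs. The exact types are an
# assumption (subword strings in `t`, token alignment in `m`, a layered
# tensor from `encode`); check the Contextualizer implementation before
# relying on these shapes.
print(t)  # e.g. SentencePiece pieces for the 9 whitespace tokens in `s`
print(m)  # alignment between original token positions and subword spans
print(getattr(encoded, "shape", type(encoded)))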