def __init__(self, lang):
    """
    :param lang: language code of precomputed fasttext encodings, see
        https://fasttext.cc/docs/en/crawl-vectors.html
    """
    import fasttext
    import fasttext.util

    super().__init__()
    self._lang = lang

    download_path = make_cache_path() / 'models'
    download_path.mkdir(exist_ok=True, parents=True)

    filename = "cc.%s.300.bin" % self._lang
    if not (download_path / filename).exists():
        # fasttext.util.download_model writes into the current working
        # directory, hence the chdir; it returns the downloaded filename.
        os.chdir(download_path)

        with tqdm(
                desc="Downloading " + self.name,
                total=10000,
                bar_format='{desc:<30}{percentage:3.2f}%|{bar:40}') as pbar:
            with contextlib.redirect_stdout(ProgressParser(pbar)):
                filename = fasttext.util.download_model(
                    self._lang, if_exists='ignore')

    with tqdm(desc="Opening " + self.name,
              total=1, bar_format='{l_bar}{bar}') as pbar:
        ft = fasttext.load_model(str(download_path / filename))
        pbar.update(1)

    self._ft = ft
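
# Illustrative sketch of the plain fasttext download/load flow wrapped by the
# constructor above, without the progress-bar plumbing (not part of the
# original module; the target directory is a placeholder). Assumes the
# `fasttext` package with its `util` module is installed; note that the
# cc.*.300.bin models are several GB in size.
def _example_fasttext_download(lang="en", target="/tmp/fasttext-models"):
    import fasttext
    import fasttext.util

    os.makedirs(target, exist_ok=True)
    os.chdir(target)  # download_model() writes into the current directory
    filename = fasttext.util.download_model(lang, if_exists='ignore')  # e.g. "cc.en.300.bin"
    ft = fasttext.load_model(filename)
    print(ft.get_dimension())                # 300
    print(ft.get_word_vector("tree").shape)  # (300,)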
def extract_numberbatch(path, languages):
    # e.g. extract_numberbatch("/path/to/numberbatch-19.08.txt", ["en", "de"])
    # then use KeyedVectors.load()

    path = Path(path)
    languages = set(languages)

    # ConceptNet Numberbatch keys look like "/c/en/tree"; the pattern
    # captures the language code so we can filter by it.
    pattern = re.compile(r"^/c/([a-z]+)/")

    with open(path, "r") as f:
        num_lines, num_dimensions = [int(x) for x in f.readline().split()]

        vectors = collections.defaultdict(lambda: {
            "keys": [],
            "vectors": []
        })

        for _ in tqdm(range(num_lines)):
            line = f.readline()
            m = pattern.match(line)
            if m:
                lang = m.group(1)
                if lang in languages:
                    line = line[len(m.group(0)):]
                    cols = line.split()
                    key = cols[0]
                    # keep only single, purely alphabetic terms
                    if key.isalpha():
                        record = vectors[lang]
                        record["keys"].append(key)
                        record["vectors"].append(
                            np.array([float(x) for x in cols[1:]]))

    # write one KeyedVectors file per language next to the input file
    for lang, record in vectors.items():
        wv = gensim.models.KeyedVectors(num_dimensions)
        wv.add_vectors(record["keys"], record["vectors"])
        wv.save(str(path.parent / f"{path.stem}-{lang}.kv"))
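
# Illustrative usage sketch for extract_numberbatch (not part of the original
# module; the paths are placeholders). One "<stem>-<lang>.kv" file is written
# per requested language next to the input file and can then be loaded with
# gensim's KeyedVectors.load().
def _example_extract_numberbatch():
    extract_numberbatch("/path/to/numberbatch-19.08.txt", ["en", "de"])
    wv = gensim.models.KeyedVectors.load("/path/to/numberbatch-19.08-en.kv")
    print(wv.most_similar("tree", topn=3))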
def gen_spans():
    with tqdm(desc="Encoding", total=i_spans[-1],
              disable=not pbar) as pbar_instance:
        for doc in docs:
            spans = list(doc.spans(partition))
            yield doc, spans
            pbar_instance.update(len(spans))
def create_encoder(self, session):
    normalizers = session.normalizers
    normalizer = normalizers['text'].to_callable()

    # the cache key captures everything that changes the computed vectors:
    # embedding, normalizer, sampling strategy and transforms
    key = json.dumps({
        'emb': self.unique_name,
        'nrm': normalizer.ident,
        'sampling': self._embedding_sampling,
        'tfm': [t.name for t in self._transforms]
    })

    loaded = self._loaded.get(key)
    if loaded is None:
        name = self.unique_name

        dat_path = self._cache.get(key)
        #dat_path = normalized_cache_path / f"{name}-{normalizer.name}-{self._embedding_sampling}.dat"

        if dat_path and dat_path.exists():
            # reuse the memory-mapped vectors written on a previous run
            with tqdm(
                    desc="Opening " + self.name,
                    total=1, bar_format='{l_bar}{bar}',
                    disable=not CachedWordEmbedding.pbar_on_open) as pbar:

                with open(dat_path.with_suffix('.json'), 'r') as f:
                    data = json.loads(f.read())

                tokens = data['tokens']
                vectors_mmap = np.memmap(
                    dat_path, dtype=np.float32, mode='r',
                    shape=tuple(data['shape']))

                pbar.update(1)
        else:
            # compute, transform and persist the vectors, then memory-map them
            tokens, vectors = self._load()

            tokens, vectors = normalize_word2vec(
                self.name, tokens, vectors,
                normalizer.unpack(), self._embedding_sampling)

            for t in self._transforms:
                vectors = t.apply(Vectors(vectors)).unmodified

            dat_path = self._cache.create_new_data_path()

            vectors_mmap = np.memmap(
                dat_path, dtype=np.float32, mode='w+', shape=vectors.shape)
            vectors_mmap[:, :] = vectors[:, :]
            vectors = None

            with open(dat_path.with_suffix('.json'), 'w') as f:
                f.write(json.dumps({
                    'tokens': tokens,
                    'shape': tuple(vectors_mmap.shape)
                }))

            self._cache.put(key, dat_path)

        loaded = CachedWordEmbedding.Encoder(
            self, name, tokens, vectors_mmap)
        self._loaded[key] = loaded

    return loaded
def make(root, normalization, docs):
    builder = FlavorBuilder(root, normalization.normalizers)

    with tqdm(total=2 * len(docs),
              desc=f"Adding Normalization '{normalization.name}'") as pbar:
        for stage in (FlavorBuilder.Stage.PREFLIGHT, FlavorBuilder.Stage.ADD):
            builder.set_stage(stage)
            for unique_id, doc in docs.items():
                builder.add(unique_id, doc)
                pbar.update(1)
def normalize_word2vec(name, tokens, embeddings, normalizer, sampling='nearest'):
    if sampling not in ('nearest', 'average'):
        raise ValueError(f'Expected "nearest" or "average", got "{sampling}"')

    embeddings = embeddings.astype(np.float32)

    # f_mask marks, for each normalized token, the first row that is kept
    f_mask = np.zeros((embeddings.shape[0],), dtype=bool)
    f_tokens = []
    token_to_ids = dict()

    for i, t in enumerate(tqdm(tokens, desc=f"Normalizing tokens in {name}")):
        nt = normalizer(t)
        if nt is None:
            continue
        if sampling != 'average' and nt != t:
            continue
        indices = token_to_ids.get(nt)
        if indices is None:
            token_to_ids[nt] = [i]
            f_tokens.append(nt)
            f_mask[i] = True
        else:
            indices.append(i)

    if sampling == 'average':
        # merge all rows that collapse to the same normalized token
        for indices in tqdm(token_to_ids.values(),
                            desc=f"Merging tokens in {name}",
                            total=len(token_to_ids)):
            if len(indices) > 1:
                i = indices[0]
                embeddings[i] = np.mean(embeddings[indices], axis=0)

    f_embeddings = embeddings[f_mask]
    embeddings = None

    assert f_embeddings.shape[0] == len(f_tokens)

    return f_tokens, f_embeddings
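
# Illustrative sketch for normalize_word2vec (not part of the original module):
# the normalizer is assumed to be a callable mapping a raw token to its
# normalized form, or to None to drop it. With sampling='average', tokens that
# collapse to the same normalized form are merged by averaging their vectors.
def _example_normalize_word2vec():
    tokens = ["Tree", "tree", "tréé", "##"]
    embeddings = np.random.rand(4, 300).astype(np.float32)

    def lowercase_alpha(t):
        t = t.lower()
        return t if t.isalpha() else None

    f_tokens, f_embeddings = normalize_word2vec(
        "demo", tokens, embeddings, lowercase_alpha, sampling='average')
    print(f_tokens)            # ['tree', 'tréé']
    print(f_embeddings.shape)  # (2, 300)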
def __init__(self, session, vocab, corpus):
    self._vocab = vocab
    self._docs = []

    with corpus.flavor_cache(session.normalization.name) as flavor_cache:
        with tqdm(desc="Preparing Documents", total=len(corpus)) as pbar:
            def prepare_doc(doc):
                pbar.update(1)
                return doc.prepare(corpus, flavor_cache, session)

            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=2) as executor:
                self._docs = list(executor.map(prepare_doc, corpus.docs))
def download(url, path, force_download=False):
    path = Path(path)
    download_path = path / urllib.parse.urlparse(url).path.split("/")[-1]

    is_zip = download_path.suffix == ".zip"
    if is_zip:
        result_path = path / download_path.stem
    else:
        result_path = download_path

    if result_path.exists() and not force_download:
        return result_path

    with tqdm(desc="Downloading " + url, unit='iB', unit_scale=True) as pbar:
        response = requests.get(url, stream=True)
        total_length = int(response.headers.get('content-length', 0))
        pbar.reset(total=total_length)

        try:
            # stream the download in 4 KiB chunks; remove partial files on failure
            with open(download_path, "wb") as f:
                for data in response.iter_content(chunk_size=4096):
                    pbar.update(len(data))
                    f.write(data)
        except:
            download_path.unlink(missing_ok=True)
            raise

    if download_path != result_path:
        extracted = []

        # flatten the archive: extract members without their directory prefix
        with zipfile.ZipFile(download_path, 'r') as zf:
            for zi in zf.infolist():
                if zi.filename[-1] == '/':
                    continue
                zi.filename = os.path.basename(zi.filename)
                p = zf.extract(zi, result_path.parent)
                extracted.append(Path(p))

        if len(extracted) == 1:
            extracted[0].rename(result_path)

        download_path.unlink()

    return result_path if result_path.exists() else None
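
# Illustrative usage sketch for download() (not part of the original module;
# URL and target directory are placeholders). For a ".zip" URL the archive is
# downloaded, extracted flat into the target directory and removed; if it
# contains exactly one member, that member is renamed to "<archive stem>" and
# returned, otherwise the function returns None.
def _example_download():
    model_path = download(
        "https://example.com/models/my-vectors.zip",
        "/tmp/vectorian-data")
    print(model_path)  # /tmp/vectorian-data/my-vectors (or None)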
def save(self, path):
    path = Path(path)
    path.mkdir(exist_ok=True)

    offset = 0
    session = self._partition.session
    corpus_vec = self.corpus_vec.unmodified

    # write each document's slice of the corpus-level matrix to its own file
    for doc in tqdm(session.documents, desc="Saving"):
        size = doc.n_spans(self._partition)
        np.save(
            str(path / (doc.caching_name + ".npy")),
            corpus_vec[offset:offset + size],
            allow_pickle=False)
        offset += size

    with open(path / "index.json", "w") as f:
        f.write(
            json.dumps({
                'class': self.__class__.__name__,
                'partition': self._partition.to_args()
            }))
def load_glove_txt(csv_path):
    tokens = []

    with open(csv_path, "r") as f:
        text = f.read()

    lines = text.split("\n")
    n_rows = len(lines)
    n_cols = len(lines[0].strip().split()) - 1

    embeddings = np.empty(
        shape=(n_rows, n_cols), dtype=np.float32)

    for line in tqdm(lines, desc="Importing " + str(csv_path)):
        values = line.strip().split()
        if values:
            t = values[0]
            if t:
                embeddings[len(tokens), :] = values[1:]
                tokens.append(t)

    embeddings = embeddings[:len(tokens), :]

    return tokens, embeddings
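
# Illustrative usage sketch for load_glove_txt (not part of the original
# module; the path is a placeholder for any GloVe ".txt" file in the usual
# "token v1 v2 ... vN" one-line-per-token format).
def _example_load_glove_txt():
    tokens, embeddings = load_glove_txt("/path/to/glove.6B.50d.txt")
    print(len(tokens), embeddings.shape)   # e.g. 400000 (400000, 50)
    index = {t: i for i, t in enumerate(tokens)}
    print(embeddings[index["tree"]][:5])   # first 5 components of "tree"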
def extraction_tqdm(tokens, name):
    return tqdm(
        tokens,
        desc=f"Extracting {name}",
        disable=len(tokens) < 5000)
def __init__(self, path, mutable=False):
    path = Path(path)

    if not path.exists():
        path.mkdir()
    elif not path.is_dir():
        raise ValueError(f"expected directory path, got '{path}'")

    self._path = path
    self._documents_path = path / "documents"
    self._flavors_path = path / "flavors"
    self._flavors_path.mkdir(exist_ok=True)

    # a corpus that does not exist yet must be created in mutable mode
    if not (path / "corpus.h5").exists():
        mutable = True

    self._corpus_h5 = h5py.File(
        path / "corpus.h5", "a" if mutable else "r")
    self._documents_group = self._corpus_h5.require_group("documents")
    #self._flavors_group = self._corpus_h5.require_group("flavors")

    self._corpus_sql = sqlite3.connect(path / "corpus.db")
    self._mutable = mutable

    with self._corpus_sql:
        self._corpus_sql.execute('''
            CREATE TABLE IF NOT EXISTS text(
                unique_id TEXT PRIMARY KEY,
                content TEXT,
                content_hash TEXT)''')
        self._corpus_sql.execute('''
            CREATE INDEX IF NOT EXISTS text_hash ON text(content_hash);
            ''')

    data = threading.local()

    def create_catalog():
        return EmbeddingCatalog(path / "embeddings.db")

    def load_doc(unique_id):
        # each worker thread gets its own embedding catalog instance
        if 'catalog' in data.__dict__:
            embedding_catalog = data.catalog
        else:
            embedding_catalog = create_catalog()
            data.catalog = embedding_catalog

        p = self._documents_path / unique_id
        doc = Document.load_from_corpus(
            unique_id, p, self._corpus_sql,
            self._documents_group[unique_id], embedding_catalog)
        return unique_id, doc

    unique_ids = list(self._documents_group.keys())

    self._docs = {}
    self._doc_to_unique_id = {}
    self._unique_id_to_index = {}
    self._ordered_docs = []

    self._catalog = create_catalog()

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        for unique_id, doc in tqdm(
                executor.map(load_doc, unique_ids),
                total=len(unique_ids),
                desc="Opening Corpus",
                disable=len(unique_ids) < 1):
            self._add_doc(unique_id, doc)
def cache_contextual_embeddings(self):
    for doc in tqdm(self.documents, desc="Loading Vectors"):
        doc.cache_contextual_embeddings()
def _make_doc(self, md, partitions, loc_ax, locations,
              show_progress=True, extra_metadata=None):
    pipe = self._nlp.pipe(
        partitions,
        batch_size=self._batch_size,
        disable=['ner', 'lemmatizer'])  # check nlp.pipe_names

    contextual_vectors = dict((e.name, []) for e in self._embeddings)

    texts = []
    tokens = []
    sents = {'start': [], 'end': [], 'loc': []}
    text_len = 0

    encoders = [e.create_encoder(None) for e in self._embeddings]

    for location, doc in tqdm(
            zip(locations, pipe),
            total=len(locations),
            desc=f'Importing {md.origin}',
            disable=not show_progress):

        if len(doc) < 1:
            continue

        doc_json = doc.to_json()

        # shift token and sentence offsets into the concatenated corpus text
        partition_tokens = doc_json['tokens']
        for token in partition_tokens:
            token['start'] += text_len
            token['end'] += text_len
        tokens.extend(partition_tokens)

        for sent in doc_json['sents']:
            sents['start'].append(text_len + sent['start'])
            sents['end'].append(text_len + sent['end'])
            sents['loc'].append(location)

        texts.append(doc_json['text'])
        text_len += len(doc_json['text'])

        # each encoder must return one (1, len(doc), dims) array per document
        for e in encoders:
            v = e.encode([doc])

            if len(v.shape) != 3:
                raise ValueError(
                    f'expected shape (a, b, c), got {v.shape}')
            if v.shape[0] != 1:
                raise ValueError(f"expected one vector, got {v.shape[0]}")
            v = v[0]

            if v.shape[0] != len(doc):
                raise ValueError(
                    f'expected ({len(doc)}, ...), got {v.shape}')
            if v.shape[1] == 0:
                raise ValueError(f'doc "{doc}" got illegal encoding: {v}')

            contextual_vectors[e.name].append(v)

    if not tokens:
        return None

    spans = {
        'sentence': compile_spans(sents, tokens, loc_ax),
        'document': compile_doc_spans(tokens)
    }

    if extra_metadata is None:
        extra_metadata = {}

    extended_metadata = dict(
        **md._asdict(), **extra_metadata, loc_ax=loc_ax)

    from vectorian.corpus.document import Document, InternalMemoryDocumentStorage

    emb_by_name = dict((e.name, e) for e in self._embeddings)

    def transformed(k, v):
        v = Vectors(np.vstack(v))
        embedding = emb_by_name[k]
        if embedding.transform:
            v = embedding.transform.apply(v)
        return ProxyVectorsRef(v)

    contextual_embeddings = dict(
        (k, transformed(k, v)) for k, v in contextual_vectors.items())

    return Document(
        InternalMemoryDocumentStorage(
            extended_metadata, ''.join(texts),
            make_tokens_dict(tokens), spans),
        contextual_embeddings)