import attr
import nltk
from tqdm import tqdm as _tqdm

import re
import os
import sys
import enum
import functools
import multiprocessing as mp

from typing import Tuple
from typing import Generator

from ungol.common import logger

log = logger.get('index.setup')
tqdm = functools.partial(_tqdm, ncols=80)

# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---

# multiprocessing infrastructure:
#
# Topology:
#   rq   wq
from ungol.common import logger
from ungol.common import util as ucu

from tqdm import tqdm as _tqdm
from nltk.tokenize import sent_tokenize as tok_sent
from nltk.tokenize import word_tokenize as tok_wrd

import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp
from collections import defaultdict

dumpr = ucu.load_module('dumpr.common')

log = logger.get('sentemb.prep')
tqdm = functools.partial(_tqdm, ncols=80)

#
# use dumpr to produce sentence files
#


class Writer:

    def __init__(self, f_out: str):
        self._f_out = f_out

    def __enter__(self):
        self._fd_out = open(self._f_out, mode='w')
# -*- coding: utf-8 -*-

from ungol.common import logger
from ungol.index import index as uii
from ungol.similarity import measures as usm

import numpy as np
from tabulate import tabulate

import enum
from typing import Tuple

log = logger.get('similarity.rhwmd')


class Strategy(enum.Enum):

    # selecting max(score(d1, d2), score(d2, d1))
    MAX = enum.auto()

    # selecting min(score(d1, d2), score(d2, d1))
    MIN = enum.auto()

    # only use score(ds, dl), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_SMALL = enum.auto()

    # only use score(dl, ds), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_BIG = enum.auto()
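# Illustrative sketch (not part of the original source): how a Strategy
# value could select the argument order for an asymmetric score function.
# The helper name `pick_order` and the score signature are hypothetical;
# only the selection semantics follow the comments above, where |d| is
# taken to be the document length.
def pick_order(strategy: Strategy, d1, d2, score):
    if strategy is Strategy.MAX:
        return max(score(d1, d2), score(d2, d1))

    if strategy is Strategy.MIN:
        return min(score(d1, d2), score(d2, d1))

    # the adaptive strategies order the documents by length
    ds, dl = sorted((d1, d2), key=len)
    if strategy is Strategy.ADAPTIVE_SMALL:
        return score(ds, dl)

    if strategy is Strategy.ADAPTIVE_BIG:
        return score(dl, ds)

    raise ValueError(f'unknown strategy: {strategy}')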
import attr
import torch
from tqdm import tqdm as _tqdm

import sys
import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp

from typing import List
from typing import Dict
from typing import Union
from typing import Tuple
from typing import Generator

from ungol.common import logger

# ---

log = logger.get('models.analyze')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

RETAINED = 2000  # the k in k-NN
BUF_SIZE = 4000  # file chunk size: BUF_SIZE * RETAINED * 4 bytes = 32 MB

# ---


@attr.s(frozen=True)
class Neighbour:
import attr
import h5py
import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import argparse
import functools

from typing import Any
from typing import Dict
from typing import Tuple

from ungol.common import logger
from ungol.models import embcompr as ume  # assumed source of Compressor

log = logger.get('models.embcodr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)


# --- external interface


def create_codes(compr: ume.Compressor, batch: torch.Tensor,
                 components: int) -> np.ndarray:

    onehot = compr.encoder(batch)
    codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
    codes = codemap.view(-1, components).numpy()

    assert codes.shape[1] == components
    return codes
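# Self-contained sketch of the nonzero/view trick used by create_codes,
# on a toy one-hot tensor (2 samples, 3 components, 4 codewords) -- no
# trained Compressor required; shapes and values are illustrative only.
def _demo_codes() -> np.ndarray:
    onehot = torch.zeros(2, 3, 4)
    onehot[0, 0, 1] = onehot[0, 1, 3] = onehot[0, 2, 0] = 1
    onehot[1, 0, 2] = onehot[1, 1, 2] = onehot[1, 2, 3] = 1

    # nonzero() yields (sample, component, code) index triples in
    # row-major order; column 2 holds the selected codeword
    codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
    codes = codemap.view(-1, 3).numpy()

    assert (codes == np.array([[1, 3, 0], [2, 2, 3]], dtype=np.uint8)).all()
    return codes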
import argparse
import attr
import h5py
import torch
import configobj
import numpy as np

from typing import Dict
from typing import Tuple
from typing import Generator

from ungol.common import logger
from ungol.common import util as ucu

log = logger.get('common.embed')


class Embed:
    """
    The embedding provider interface to be implemented.
    """

    CHUNK_SIZE = 8192

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
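# Illustrative sketch (not part of the original source): a minimal Embed
# implementation backed by an in-memory numpy matrix. Only `name` and
# CHUNK_SIZE come from the interface above; the constructor, `vocab`
# mapping and `__getitem__` are assumptions about what implementors add.
class ArrayEmbed(Embed):

    def __init__(self, name: str, vocab: Dict[str, int], mat: np.ndarray):
        self._name = name
        self._vocab = vocab  # word -> row index
        self._mat = mat      # shape: (len(vocab), dimensions)

    @property
    def name(self) -> str:
        return self._name

    def __getitem__(self, word: str) -> np.ndarray:
        return self._mat[self._vocab[word]]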
import attr
import torch

import os
import math
import enum
import pathlib
import multiprocessing as mp

from typing import Any
from typing import List
from typing import Tuple
from typing import Union
from typing import Callable

from ungol.common import logger

# ---

log = logger.get('models.stats')

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


class Kind(enum.Enum):
    train = enum.auto()
    valid = enum.auto()
    flush = enum.auto()


@attr.s(frozen=True)
class Update:
import sys
import functools
import multiprocessing as mp

from bs4 import BeautifulSoup as bs
from tqdm import tqdm as _tqdm

from typing import Dict
from typing import Tuple
from typing import Generator
from typing import Collection

from ungol.common import logger
from ungol.retrieval import common  # assumed source of Topic

# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---

log = logger.get('retrieval.setup')
tqdm = functools.partial(_tqdm, ncols=80)

# ---


def _parse_topics(
        files: Collection[str]) -> Generator[common.Topic, None, None]:

    for xml in files:
        with open(xml, mode='r', encoding='utf-8') as f:
            topics_raw = f.read()

        soup = bs(topics_raw, 'xml')
        print('')
import requests
import numpy as np
from tabulate import tabulate
from tqdm import tqdm as _tqdm

import time
import json
from pprint import pformat
from collections import defaultdict

from typing import Callable
from typing import Collection

from ungol.common import logger

# ---

log = logger.get('retrieval.experiment')


def tqdm(*args, **kwargs):
    yield from _tqdm(*args, ncols=80, **kwargs)


# --- experiments


class Task(dict):
    """
    Maps doc_id -> bool
    Named: task -> flag

    Currently depends on elasticsearch - could also
import random
import pickle
import pathlib
import argparse
import functools
import collections

from tabulate import tabulate

from typing import List
from typing import Tuple
from typing import Union
from typing import Generator

from ungol.common import logger
from ungol.similarity import rhwmd as usr   # Strategy (see similarity.rhwmd)
from ungol.similarity import scorer as uss  # assumed module with the scorers

log = logger.get('retrieval.evaluate')

Stat = collections.namedtuple('Stat', ('name', 'f_dataset', 'f_name', 'stat'))

# ---

UNGOL_STRATS = {
    'min': usr.Strategy.MIN,
    'max': usr.Strategy.MAX,
    'adaptive-small': usr.Strategy.ADAPTIVE_SMALL,
    'adaptive-big': usr.Strategy.ADAPTIVE_BIG,
    'sum': usr.Strategy.SUM,
}

UNGOL_SCORERS = {
    'rhwmd': uss.rhwmd,
    'bm25': uss.bm25,
""" A collection of different similarity and distance measure implementations. Sometimes batched or gpu accelerated variants exist. """ from ungol.common import logger import numpy as np log = logger.get('similarity.measures') # def m_cosine(train_data, test_data, tqdm=lambda x: x, max_k=100): # dists, train, test = None, None, None # try: # train = torch.from_numpy(train_data).to(device=DEV) # test = torch.from_numpy(test_data).to(device=DEV) # train /= train.norm(dim=1).unsqueeze(1) # test /= test.norm(dim=1).unsqueeze(1) # dists = torch.stack([ # (1-train.matmul(t).squeeze()) # for t in tqdm(test)]) # topkek = dists.topk(k=max_k, largest=False, dim=1) # sortdists, sortindices = map(lambda t: t.cpu().numpy(), topkek)
import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import functools

from typing import Any
from typing import Dict
from typing import List
from typing import Generator

from ungol.common import logger

# ---

log = logger.get('models.training')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---

#
# TRAINING INFRASTRUCTURE
#
import numpy as np

from typing import Any
from typing import Set
from typing import Dict
from typing import Tuple
from typing import Collection

from ungol.common import logger
from ungol.common import util as ucu

# conditional imports
skdecomp = ucu.load_module('sklearn.decomposition')
sent2vec = ucu.load_module('sent2vec')
infersent = ucu.load_module('infersent.models')

log = logger.get('sentemb.redux')


class Redux:

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
    def dimensions(self) -> int:
        """
        Output dimensionality.
        """
        raise NotImplementedError()
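# Illustrative sketch (not part of the original source): a Redux backed
# by sklearn's PCA via the conditionally imported skdecomp module above
# (assumes sklearn is installed). The fit/__call__ methods are
# assumptions about the rest of the interface.
class PCARedux(Redux):

    def __init__(self, dimensions: int):
        self._dimensions = dimensions
        self._pca = skdecomp.PCA(n_components=dimensions)

    @property
    def name(self) -> str:
        return f'pca-{self._dimensions}'

    @property
    def dimensions(self) -> int:
        return self._dimensions

    def fit(self, arr: np.ndarray) -> 'PCARedux':
        self._pca.fit(arr)
        return self

    def __call__(self, arr: np.ndarray) -> np.ndarray:
        return self._pca.transform(arr)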
import torch
import numpy as np

import os
import json
import queue
import pathlib
import functools
import collections
import multiprocessing as mp
from datetime import datetime

from typing import Tuple

from ungol.common import logger

# ---

log = logger.get('models.models')

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

log.warning('enabling cudnn benchmark')
torch.backends.cudnn.benchmark = True

# ---


class ModelException(Exception):
    """
import pickle
import random
import multiprocessing as mp
from collections import defaultdict

import attr
import elasticsearch as es
from wmd import WMD as WMDR
from gensim.models.fasttext import FastText

from typing import Any
from typing import List
from typing import Dict
from typing import Tuple
from typing import Callable

from ungol.common import logger

log = logger.get('retrieval.clients')

# ---


@attr.s
class Result:
    doc_id: str = attr.ib()
    score: float = attr.ib()


class Client:

    @property
    def time(self) -> float:
        return sum(self._times) / len(self._times)
from ungol.common import logger

from tqdm import tqdm as _tqdm

import os
import pickle
import pathlib
import functools
import itertools
import multiprocessing as mp

log = logger.get('sentemb.common')
tqdm = functools.partial(_tqdm, ncols=80)

SENT_MIN_LEN = 2
SENT_MAX_LEN = 40

F_ARRS = 'sentences.arrs'
F_TOKS = 'tokens.txt'
F_VOCAB = 'vocab.pickle'
F_COUNTS = 'counts.pickle'

# ---


def get_vocabs(f_in: str, prefix: str = None):
    log.info(f'loading vocabulary files from "{f_in}" (prefix={prefix})')

    if prefix is None:
        prefix = ''
    else:
from ungol.common import logger
from ungol.sentemb import common as usc
from ungol.sentemb import redux as usr

import attr
import h5py
from tqdm import tqdm as _tqdm

import pickle
import pathlib
import argparse
import functools

from typing import Set
from typing import Dict

log = logger.get('sentemb.training')
tqdm = functools.partial(_tqdm, ncols=80)

# FIXME: make an option
BATCH_SIZE = 2048


def btqdm(iterable, *args, step: int = 1, **kwargs):
    # advance the bar by <step> per yielded element (e.g. per batch);
    # iterating the raw iterable avoids tqdm's implicit update(1),
    # which would otherwise double-count each element
    bar = tqdm(*args, **kwargs)
    for x in iterable:
        yield x
        bar.update(step)
    bar.close()


@attr.s
class Stats:
# Reference Code: https://github.com/zomux/neuralcompressor
#

from ungol.common import logger
from ungol.common import embed as uce
from ungol.models import models as umm
from ungol.models import training as umt

import re
import argparse
import functools

import torch
from tqdm import tqdm as _tqdm

log = logger.get('models.embcompr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


def _print_examples(compressor, training):
    vocab = list(training.ember.vocab.keys())[1000:1010]
    dist = torch.nn.PairwiseDistance()

    for word in vocab: