import attr
import nltk
from tqdm import tqdm as _tqdm

import re
import os
import sys
import enum
import functools
import multiprocessing as mp

from typing import Tuple
from typing import Generator

from ungol.common import logger

log = logger.get('index.setup')
tqdm = functools.partial(_tqdm, ncols=80)

# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---

# multiprocessing infrastructure:
#
# Topology:
#   rq   wq
from ungol.common import logger
from ungol.common import util as ucu

from tqdm import tqdm as _tqdm
from nltk.tokenize import sent_tokenize as tok_sent
from nltk.tokenize import word_tokenize as tok_wrd

import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp
from collections import defaultdict

dumpr = ucu.load_module('dumpr.common')

log = logger.get('sentemb.prep')
tqdm = functools.partial(_tqdm, ncols=80)

#
# use dumpr to produce sentence files
#


class Writer:

    def __init__(self, f_out: str):
        self._f_out = f_out

    def __enter__(self):
        self._fd_out = open(self._f_out, mode='w')
# -*- coding: utf-8 -*-

from ungol.common import logger
from ungol.index import index as uii
from ungol.similarity import measures as usm

import numpy as np
from tabulate import tabulate

import enum
from typing import Tuple

log = logger.get('similarity.rhwmd')


class Strategy(enum.Enum):

    # selecting max(score(d1, d2), score(d2, d1))
    MAX = enum.auto()

    # selecting min(score(d1, d2), score(d2, d1))
    MIN = enum.auto()

    # only use score(ds, dl), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_SMALL = enum.auto()

    # only use score(dl, ds), where ds = argmin(|d1|, |d2|)
    # and dl = argmax(|d1|, |d2|)
    ADAPTIVE_BIG = enum.auto()
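# Illustrative sketch (not part of the original source): how a Strategy
# value could select the argument order for an asymmetric score function.
# The helper name `pick_order` and the score signature are hypothetical;
# only the selection semantics follow the comments above, where |d| is
# taken to be the document length.
def pick_order(strategy: Strategy, d1, d2, score):
    if strategy is Strategy.MAX:
        return max(score(d1, d2), score(d2, d1))

    if strategy is Strategy.MIN:
        return min(score(d1, d2), score(d2, d1))

    # the adaptive strategies order the documents by length
    ds, dl = sorted((d1, d2), key=len)
    if strategy is Strategy.ADAPTIVE_SMALL:
        return score(ds, dl)

    if strategy is Strategy.ADAPTIVE_BIG:
        return score(dl, ds)

    raise ValueError(f'unknown strategy: {strategy}')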
import attr
import torch
from tqdm import tqdm as _tqdm

import sys
import pickle
import pathlib
import argparse
import functools
import multiprocessing as mp

from typing import List
from typing import Dict
from typing import Union
from typing import Tuple
from typing import Generator

from ungol.common import logger

# ---

log = logger.get('models.analyze')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

RETAINED = 2000  # the k in k-NN
BUF_SIZE = 4000  # file chunk size: BUF_SIZE * RETAINED * 4 bytes = 32 MB

# ---


@attr.s(frozen=True)
class Neighbour:
import attr
import h5py
import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import argparse
import functools

from typing import Any
from typing import Dict
from typing import Tuple

from ungol.common import logger
from ungol.models import embcompr as ume  # assumed source of Compressor

log = logger.get('models.embcodr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)


# --- external interface


def create_codes(compr: ume.Compressor, batch: torch.Tensor,
                 components: int) -> np.ndarray:

    onehot = compr.encoder(batch)
    codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
    codes = codemap.view(-1, components).numpy()

    assert codes.shape[1] == components
    return codes
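# Self-contained sketch of the nonzero/view trick used by create_codes,
# on a toy one-hot tensor (2 samples, 3 components, 4 codewords) -- no
# trained Compressor required; shapes and values are illustrative only.
def _demo_codes() -> np.ndarray:
    onehot = torch.zeros(2, 3, 4)
    onehot[0, 0, 1] = onehot[0, 1, 3] = onehot[0, 2, 0] = 1
    onehot[1, 0, 2] = onehot[1, 1, 2] = onehot[1, 2, 3] = 1

    # nonzero() yields (sample, component, code) index triples in
    # row-major order; column 2 holds the selected codeword
    codemap = onehot.nonzero()[:, 2].to(dtype=torch.uint8)
    codes = codemap.view(-1, 3).numpy()

    assert (codes == np.array([[1, 3, 0], [2, 2, 3]], dtype=np.uint8)).all()
    return codes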
import argparse
import attr
import h5py
import torch
import configobj
import numpy as np

from typing import Dict
from typing import Tuple
from typing import Generator

from ungol.common import logger
from ungol.common import util as ucu

log = logger.get('common.embed')


class Embed:
    """
    The embedding provider interface to be implemented.
    """

    CHUNK_SIZE = 8192

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
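# Illustrative sketch (not part of the original source): a minimal Embed
# implementation backed by an in-memory numpy matrix. Only `name` and
# CHUNK_SIZE come from the interface above; the constructor, `vocab`
# mapping and `__getitem__` are assumptions about what implementors add.
class ArrayEmbed(Embed):

    def __init__(self, name: str, vocab: Dict[str, int], mat: np.ndarray):
        self._name = name
        self._vocab = vocab  # word -> row index
        self._mat = mat      # shape: (len(vocab), dimensions)

    @property
    def name(self) -> str:
        return self._name

    def __getitem__(self, word: str) -> np.ndarray:
        return self._mat[self._vocab[word]]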
import attr
import torch

import os
import math
import enum
import pathlib
import multiprocessing as mp

from typing import Any
from typing import List
from typing import Tuple
from typing import Union
from typing import Callable

from ungol.common import logger

# ---

log = logger.get('models.stats')

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


class Kind(enum.Enum):
    train = enum.auto()
    valid = enum.auto()
    flush = enum.auto()


@attr.s(frozen=True)
class Update:
import sys
import functools
import multiprocessing as mp

from bs4 import BeautifulSoup as bs
from tqdm import tqdm as _tqdm

from typing import Dict
from typing import Tuple
from typing import Generator
from typing import Collection

from ungol.common import logger
from ungol.retrieval import common  # assumed source of Topic

# ---

sys.path.append('lib/CharSplit')
import char_split  # noqa

# ---

log = logger.get('retrieval.setup')
tqdm = functools.partial(_tqdm, ncols=80)

# ---


def _parse_topics(
        files: Collection[str]) -> Generator[common.Topic, None, None]:

    for xml in files:
        with open(xml, mode='r', encoding='utf-8') as f:
            topics_raw = f.read()

        soup = bs(topics_raw, 'xml')
        print('')
import requests
import numpy as np
from tabulate import tabulate
from tqdm import tqdm as _tqdm

import time
import json
from pprint import pformat
from collections import defaultdict

from typing import Callable
from typing import Collection

from ungol.common import logger

# ---

log = logger.get('retrieval.experiment')


def tqdm(*args, **kwargs):
    yield from _tqdm(*args, ncols=80, **kwargs)


# --- experiments


class Task(dict):
    """
    Maps doc_id -> bool
    Named: task -> flag

    Currently depends on elasticsearch - could also
import random
import pickle
import pathlib
import argparse
import functools
import collections

from tabulate import tabulate

from typing import List
from typing import Tuple
from typing import Union
from typing import Generator

from ungol.common import logger
from ungol.similarity import rhwmd as usr   # Strategy (see similarity.rhwmd)
from ungol.similarity import scorer as uss  # assumed module with the scorers

log = logger.get('retrieval.evaluate')

Stat = collections.namedtuple('Stat', ('name', 'f_dataset', 'f_name', 'stat'))

# ---

UNGOL_STRATS = {
    'min': usr.Strategy.MIN,
    'max': usr.Strategy.MAX,
    'adaptive-small': usr.Strategy.ADAPTIVE_SMALL,
    'adaptive-big': usr.Strategy.ADAPTIVE_BIG,
    'sum': usr.Strategy.SUM,
}

UNGOL_SCORERS = {
    'rhwmd': uss.rhwmd,
    'bm25': uss.bm25,
""" A collection of different similarity and distance measure implementations. Sometimes batched or gpu accelerated variants exist. """ from ungol.common import logger import numpy as np log = logger.get('similarity.measures') # def m_cosine(train_data, test_data, tqdm=lambda x: x, max_k=100): # dists, train, test = None, None, None # try: # train = torch.from_numpy(train_data).to(device=DEV) # test = torch.from_numpy(test_data).to(device=DEV) # train /= train.norm(dim=1).unsqueeze(1) # test /= test.norm(dim=1).unsqueeze(1) # dists = torch.stack([ # (1-train.matmul(t).squeeze()) # for t in tqdm(test)]) # topkek = dists.topk(k=max_k, largest=False, dim=1) # sortdists, sortindices = map(lambda t: t.cpu().numpy(), topkek)
import torch
import numpy as np
from tqdm import tqdm as _tqdm

import math
import pathlib
import functools

from typing import Any
from typing import Dict
from typing import List
from typing import Generator

from ungol.common import logger

# ---

log = logger.get('models.training')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---

#
# TRAINING INFRASTRUCTURE
#
import numpy as np

from typing import Any
from typing import Set
from typing import Dict
from typing import Tuple
from typing import Collection

from ungol.common import logger
from ungol.common import util as ucu

# conditional imports
skdecomp = ucu.load_module('sklearn.decomposition')
sent2vec = ucu.load_module('sent2vec')
infersent = ucu.load_module('infersent.models')

log = logger.get('sentemb.redux')


class Redux:

    @property
    def name(self) -> str:
        raise NotImplementedError()

    @property
    def dimensions(self) -> int:
        """
        Output dimensionality.
        """
        raise NotImplementedError()
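# Illustrative sketch (not part of the original source): a Redux backed
# by sklearn's PCA via the conditionally imported skdecomp module above
# (assumes sklearn is installed). The fit/__call__ methods are
# assumptions about the rest of the interface.
class PCARedux(Redux):

    def __init__(self, dimensions: int):
        self._dimensions = dimensions
        self._pca = skdecomp.PCA(n_components=dimensions)

    @property
    def name(self) -> str:
        return f'pca-{self._dimensions}'

    @property
    def dimensions(self) -> int:
        return self._dimensions

    def fit(self, arr: np.ndarray) -> 'PCARedux':
        self._pca.fit(arr)
        return self

    def __call__(self, arr: np.ndarray) -> np.ndarray:
        return self._pca.transform(arr)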
import torch
import numpy as np

import os
import json
import queue
import pathlib
import functools
import collections
import multiprocessing as mp
from datetime import datetime

from typing import Tuple

from ungol.common import logger

# ---

log = logger.get('models.models')

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

log.warning('enabling cudnn benchmark')
torch.backends.cudnn.benchmark = True

# ---


class ModelException(Exception):
    """
import pickle
import random
import multiprocessing as mp
from collections import defaultdict

import attr
import elasticsearch as es
from wmd import WMD as WMDR
from gensim.models.fasttext import FastText

from typing import Any
from typing import List
from typing import Dict
from typing import Tuple
from typing import Callable

from ungol.common import logger

log = logger.get('retrieval.clients')

# ---


@attr.s
class Result:
    doc_id: str = attr.ib()
    score: float = attr.ib()


class Client:

    @property
    def time(self) -> float:
        return sum(self._times) / len(self._times)
from ungol.common import logger

from tqdm import tqdm as _tqdm

import os
import pickle
import pathlib
import functools
import itertools
import multiprocessing as mp

log = logger.get('sentemb.common')
tqdm = functools.partial(_tqdm, ncols=80)

SENT_MIN_LEN = 2
SENT_MAX_LEN = 40

F_ARRS = 'sentences.arrs'
F_TOKS = 'tokens.txt'
F_VOCAB = 'vocab.pickle'
F_COUNTS = 'counts.pickle'

# ---


def get_vocabs(f_in: str, prefix: str = None):
    log.info(f'loading vocabulary files from "{f_in}" (prefix={prefix})')

    if prefix is None:
        prefix = ''
    else:
from ungol.common import logger
from ungol.sentemb import common as usc
from ungol.sentemb import redux as usr

import attr
import h5py
from tqdm import tqdm as _tqdm

import pickle
import pathlib
import argparse
import functools

from typing import Set
from typing import Dict

log = logger.get('sentemb.training')
tqdm = functools.partial(_tqdm, ncols=80)

# FIXME: make an option
BATCH_SIZE = 2048


def btqdm(iterable, *args, step: int = 1, **kwargs):
    # advance the bar by <step> per yielded element (e.g. per batch);
    # iterating the raw iterable avoids tqdm's implicit update(1),
    # which would otherwise double-count each element
    bar = tqdm(*args, **kwargs)
    for x in iterable:
        yield x
        bar.update(step)
    bar.close()


@attr.s
class Stats:
# Reference Code: https://github.com/zomux/neuralcompressor
#

from ungol.common import logger
from ungol.common import embed as uce
from ungol.models import models as umm
from ungol.models import training as umt

import re
import argparse
import functools

import torch
from tqdm import tqdm as _tqdm

log = logger.get('models.embcompr')
tqdm = functools.partial(_tqdm, ncols=80, disable=False)

# ---

DEV_CPU = torch.device('cpu')
DEV_GPU = torch.device('cuda')

# ---


def _print_examples(compressor, training):
    vocab = list(training.ember.vocab.keys())[1000:1010]
    dist = torch.nn.PairwiseDistance()

    for word in vocab: