def __init__(self):
     self.logger = get_logger(self.__class__.__name__)
     self.tokenizer = GeneralTokenizer()
     self.url_pattern = re.compile(
         r'(?:http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
     )
     self.dictionary = DictionaryES()
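All of these snippets lean on a project-specific `get_logger` helper whose definition is not shown (and whose signature varies: some examples pass a class name, others pass file_name/logger_name keywords). A minimal sketch of such a helper over the standard logging module; the handler and format choices are assumptions:

import logging

def get_logger(name, level=logging.INFO):
    # Stand-in for the project helper used throughout these examples.
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid stacking handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(level)
    return logger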
Example #2
 def _init_logger(self):
     log_dir = '../logs/' + self.model_name + '/test' + '/{}'.format(self.cfg['data']['dataset']) \
               + '/{}'.format(time.strftime('%Y%m%d-%H%M'))
     self.logger = get_logger(log_dir)
     print('RUNDIR: {}'.format(log_dir))
     self.logger.info('{}-Test'.format(self.model_name))
     self.save_path = log_dir
     self.save_image_path = os.path.join(self.save_path, 'saved_val_images')
Example #3
 def __init__(
     self,
     user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
     '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'):
     self.logger = get_logger(self.__class__.__name__)
     self.user_agent = user_agent
     self.redis = None
     self.expire_time = None
Example #4
 def __init__(self, *args, **kwargs):
     super(MyTestCase, self).__init__(*args, **kwargs)
     self.logger = get_logger(self.__class__.__name__)
     self.urls = [
         'http://vnexpress.net/tin-tuc/the-gioi/viet-nam-yeu-cau-trung-quoc-rut-cac-may-bay-chien-dau-khoi-hoang-sa-3387175.html',
         'http://vnexpress.net/tin-tuc/thoi-su/hai-quan-viet-nam-co-them-cap-tau-ten-lua-tan-cong-hien-dai-3387260.html',
         'http://kinhdoanh.vnexpress.net/tin-tuc/vi-mo/ong-bui-quang-vinh-40-nam-ngon-lua-luc-nao-cung-chay-trong-toi-3387136.html',
         'http://thethao.vnexpress.net/photo/hinh-bong-da/co-may-msn-vo-vun-barca-thanh-cuu-vuong-champions-league-3386815.html'
     ]
Example #5
    def __init__(self,
                 model,
                 optimizer,
                 lr_scheduler,
                 loss_criterion,
                 eval_criterion,
                 device,
                 loaders,
                 checkpoint_dir,
                 max_num_epochs=100,
                 max_num_iterations=1e5,
                 validate_after_iters=100,
                 log_after_iters=100,
                 validate_iters=None,
                 num_iterations=1,
                 num_epoch=0,
                 eval_score_higher_is_better=True,
                 best_eval_score=None,
                 logger=None):
        if logger is None:
            self.logger = utils.get_logger('UNet3DTrainer',
                                           level=logging.DEBUG)
        else:
            self.logger = logger

        self.logger.info(model)
        self.model = model
        self.optimizer = optimizer
        self.scheduler = lr_scheduler
        self.loss_criterion = loss_criterion
        self.eval_criterion = eval_criterion
        self.device = device
        self.loaders = loaders
        self.checkpoint_dir = checkpoint_dir
        self.max_num_epochs = max_num_epochs
        self.max_num_iterations = max_num_iterations
        self.validate_after_iters = validate_after_iters
        self.log_after_iters = log_after_iters
        self.validate_iters = validate_iters
        self.eval_score_higher_is_better = eval_score_higher_is_better
        self.logger.info(
            f'eval_score_higher_is_better: {eval_score_higher_is_better}')

        if best_eval_score is not None:
            self.best_eval_score = best_eval_score
        else:
            # initialize the best_eval_score
            if eval_score_higher_is_better:
                self.best_eval_score = float('-inf')
            else:
                self.best_eval_score = float('+inf')

        self.writer = SummaryWriter(
            log_dir=os.path.join(checkpoint_dir, 'logs'))

        self.num_iterations = num_iterations
        self.num_epoch = num_epoch
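The best_eval_score bookkeeping above typically pays off during validation. A sketch of a hypothetical companion method on the same trainer class (_is_best_eval_score is not part of the example shown):

def _is_best_eval_score(self, eval_score):
    # Respect the metric direction: accuracy-like scores are
    # higher-is-better, losses are lower-is-better.
    if self.eval_score_higher_is_better:
        is_best = eval_score > self.best_eval_score
    else:
        is_best = eval_score < self.best_eval_score
    if is_best:
        self.logger.info(f'Saving new best evaluation metric: {eval_score}')
        self.best_eval_score = eval_score
    return is_best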
Example #6
    def setUp(self) -> None:
        dim = 10
        self.cs = CS.ConfigurationSpace()
        for d in range(dim):
            self.cs.add_hyperparameter(
                CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))

        self.hp_names = list(self.cs._hyperparameters.keys())
        self.logger = get_logger(file_name='test', logger_name='test')
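For context, a ConfigurationSpace built this way can be sampled directly; a standalone sketch, independent of the test class:

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH

cs = CS.ConfigurationSpace()
for d in range(10):
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))

config = cs.sample_configuration()  # random point in [-5, 5]^10
print(config.get_dictionary())      # {'x0': ..., 'x1': ..., ...}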
Example #7
 def __init__(self, urls, content_getter, model_file_path, tokenizer,
              min_ngram, max_ngram):
     self.logger = get_logger(self.__class__.__name__)
     self.urls = urls
     self.content_getter = content_getter
     self.model_file_path = model_file_path
     self.tokenizer = tokenizer
     self.min_ngram = min_ngram
     self.max_ngram = max_ngram
Example #8
 def _init_logger(self):
     log_dir = '../logs/nasunet/search' + '/{}'.format(self.cfg['data']['dataset']) + \
               '/search-{}'.format(time.strftime('%Y%m%d-%H%M%S'))
     self.logger = get_logger(log_dir)
     self.logger.info('RUNDIR: {}'.format(log_dir))
     shutil.copy(self.args.config, log_dir)
     self.logger.info('Nas-Search')
     self.save_path = log_dir
     self.save_tbx_log = self.save_path + '/tbx_log'
     self.writer = SummaryWriter(self.save_tbx_log)
Example #9
 def _init_logger(self):
     log_dir = '../logs/' + self.model_name + '/train' + '/{}'.format(self.cfg['data']['dataset']) \
               + '/{}'.format(time.strftime('%Y%m%d-%H%M%S'))
     self.logger = get_logger(log_dir)
     print('RUNDIR: {}'.format(log_dir))
     self.logger.info('{}-Train'.format(self.model_name))
     self.save_path = log_dir
     self.save_tbx_log = self.save_path + '/tbx_log'
     self.save_image_path = os.path.join(self.save_path, 'saved_val_images')
     self.writer = SummaryWriter(self.save_tbx_log)
     shutil.copy(self.args.config, self.save_path)
Example #10
 def __init__(self):
     self.logger = get_logger(self.__class__.__name__)
     self.es = get_es_client()
     self.prefix_index_name = 'dic'
     self.doc_type = 'vocab'
     self.support_languages = {
         'arabic', 'armenian', 'basque', 'brazilian', 'bulgarian', 'catalan',
         'cjk', 'czech', 'danish', 'dutch', 'english', 'finnish', 'french',
         'galician', 'german', 'greek', 'hindi', 'hungarian', 'indonesian',
         'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', 'persian',
         'portuguese', 'romanian', 'russian', 'sorani', 'spanish', 'swedish',
         'thai', 'turkish'
     }
     logging.getLogger('elasticsearch').setLevel(logging.CRITICAL)
Example #11
 def __init__(self,
              user_agent='ClarityBot',
              page_load_timeout=15,
              wait_after_last_request=0.5):
     self.logger = get_logger(self.__class__.__name__)
     self.user_agent = user_agent
     self.redis = None
     self.expire_time = None
     self.cluster = os.environ.get('CRAWLER_URL',
                                   'http://174.138.126.116:3000/execute')
     self.access_key = os.environ.get('CRAWLER_ACCESS_KEY',
                                      'cHVwcmVuZGVyX3Nlb2NsYXJpdHk=')
     self.page_load_timeout = page_load_timeout
     self.wait_after_last_request = wait_after_last_request
Example #12
 def __init__(self,
              model_loc_dir,
              model_name,
              content_getter,
              evaluate_mode=False):
     self.logger = get_logger(self.__class__.__name__)
     self.content_getter = content_getter
     self.web_page_type_classifier = None
     self.labels = None
     self.model_name = model_name
     self.model_name_key = 'current_page_type_classifier_model'
     self.model_loc_dir = model_loc_dir
     self.kv_storage = get_redis_conn()
     self.evaluate_mode = evaluate_mode
Example #13
def main():
    # Create main logger
    logger = get_logger('UNet3DTrainer')

    # Load and log experiment configuration
    config = load_config()
    logger.info(config)

    manual_seed = config.get('manual_seed', None)
    if manual_seed is not None:
        logger.info(f'Seed the RNG for all devices with {manual_seed}')
        torch.manual_seed(manual_seed)
        # see https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Create the model
    model = get_model(config)
    # put the model on the configured device
    logger.info(f"Sending the model to '{config['device']}'")
    model = model.to(config['device'])
    # log the number of learnable parameters
    logger.info(f'Number of learnable params {get_number_of_learnable_parameters(model)}')

    # Create loss criterion
    loss_criterion = torch.nn.BCELoss(reduction='mean')
    # Create evaluation metric
    eval_criterion = loss_criterion

    # Create data loaders
    loaders = get_train_loaders(config)

    # Create the optimizer
    optimizer = _create_optimizer(config, model)

    # Create learning rate adjustment strategy
    lr_scheduler = _create_lr_scheduler(config, optimizer)

    # Create model trainer
    trainer = _create_trainer(config, model=model, optimizer=optimizer, lr_scheduler=lr_scheduler,
                              loss_criterion=loss_criterion, eval_criterion=eval_criterion, loaders=loaders,
                              logger=logger)
    # Start training
    trainer.fit()
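_create_optimizer and _create_lr_scheduler are referenced but not defined in this example. A minimal sketch of plausible implementations; the config keys learning_rate and weight_decay, and the choice of Adam with ReduceLROnPlateau, are assumptions rather than the original code:

import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

def _create_optimizer(config, model):
    # hypothetical config keys; the real schema is not shown here
    lr = config.get('learning_rate', 1e-4)
    weight_decay = config.get('weight_decay', 0)
    return optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

def _create_lr_scheduler(config, optimizer):
    # shrink the learning rate when the validation loss plateaus
    return ReduceLROnPlateau(optimizer, mode='min', factor=0.2, patience=10)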
Example #14
 def __init__(self,
              content_getter,
              similarity,
              unit='word',
              min_ngram=1,
              max_ngram=1,
              main_page_selector=None,
              sub_page_selector=None,
              url_1_selector=None,
              url_2_selector=None,
              url_3_selector=None):
     self.similarity = similarity
     self.content_getter = content_getter
     self.main_page_selector = main_page_selector
     self.sub_page_selector = sub_page_selector
     self.url_1_selector = url_1_selector
     self.url_2_selector = url_2_selector
     self.url_3_selector = url_3_selector
     self.unit = unit
     self.min_ngram = min_ngram
     self.max_ngram = max_ngram
     self.logger = get_logger(self.__class__.__name__)
Example #15
def get_test_loaders(config):
    """
    Returns a list of DataLoader, one per each test file.

    :param config: a top level configuration object containing the 'datasets' key
    :return: generator of DataLoader objects
    """

    logger = get_logger('HDF5Dataset')

    assert 'datasets' in config, 'Could not find data sets configuration'
    loaders_config = config['loaders']

    # get train and validation files
    test_path = loaders_config['test_path']
    assert isinstance(test_path, str)
    # get h5 internal path
    raw_internal_path = loaders_config['raw_internal_path']
    # get train/validation patch size and stride

    logger.info(f'Loading test set from: {test_path}...')
    dataset = HDF5Dataset(test_path,
                          phase='test',
                          raw_internal_path=raw_internal_path,
                          transformer_config=loaders_config['transformer'])

    num_workers = loaders_config.get('num_workers', 1)
    batch_size = loaders_config.get('batch_size', 1)
    logger.info(f'Number of workers for test datasets: {num_workers}')
    logger.info(f'Batch size for test datasets: {batch_size}')
    return {
        'test':
        DataLoader(dataset,
                   batch_size=batch_size,
                   shuffle=False,  # keep the test-set order deterministic
                   num_workers=num_workers)
    }
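Assuming a config of the shape asserted above, the returned loader is consumed like any other DataLoader; a sketch in which config and model come from the surrounding project:

import torch

loaders = get_test_loaders(config)
for batch in loaders['test']:
    with torch.no_grad():
        prediction = model(batch)  # batch contents depend on HDF5Dataset's test phase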
Example #16
    def setUp(self) -> None:
        dim = 10
        self.cs_cat = CS.ConfigurationSpace()
        self.cs = CS.ConfigurationSpace()
        self.cs_cat.add_hyperparameter(
            CSH.CategoricalHyperparameter('func', choices=['sine', 'cosine']))
        for d in range(dim):
            self.cs.add_hyperparameter(
                CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))
            if d < dim - 1:
                self.cs_cat.add_hyperparameter(
                    CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))
            else:
                self.cs_cat.add_hyperparameter(
                    CSH.OrdinalHyperparameter(f'x{d}',
                                              sequence=list(range(-5, 6)),
                                              meta={
                                                  'lower': -5,
                                                  'upper': 5
                                              }))

        self.hp_names = list(self.cs._hyperparameters.keys())
        self.hp_names_cat = list(self.cs_cat._hyperparameters.keys())
        self.logger = get_logger(file_name='test', logger_name='test')
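The ordinal dimension above stands in for a discretized float; inspecting it shows the ordered sequence of discrete values (standalone sketch):

import ConfigSpace.hyperparameters as CSH

ordinal = CSH.OrdinalHyperparameter('x9',
                                    sequence=list(range(-5, 6)),
                                    meta={'lower': -5, 'upper': 5})
print(ordinal.sequence)  # (-5, -4, ..., 4, 5): discrete but ordered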
Example #17
 def __init__(self, content_getter, es_client):
     self.content_getter = content_getter
     self.es_client = es_client
     self.logger = get_logger(self.__class__.__name__)
     self.index_name = 'web'
     self.doc_type = 'page'
Example #18
 def __init__(self, urls, storage, classifier):
     self.logger = get_logger(self.__class__.__name__)
     self.urls = urls
     self.storage = storage
     self.classifier = classifier
Example #19
 def __init__(self):
     self.logger = get_logger(self.__class__.__name__)
Example #20
from typing import Dict

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from util.utils import get_logger
import numpy as np

from optimizer.tpe import TPEOptimizer


def sphere(eval_config: Dict[str, float]) -> float:
    vals = np.array(list(eval_config.values()))
    vals *= vals
    return np.sum(vals)


if __name__ == '__main__':
    dim = 10
    cs = CS.ConfigurationSpace()
    for d in range(dim):
        cs.add_hyperparameter(
            CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))

    logger = get_logger(file_name='sphere', logger_name='sphere')
    opt = TPEOptimizer(obj_func=sphere,
                       config_space=cs,
                       mutation_prob=0.0,
                       resultfile='sphere')
    opt.optimize(logger)
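The sphere function attains its minimum of 0 at the origin, which gives a quick sanity check on the objective before optimizing:

assert sphere({f'x{d}': 0.0 for d in range(10)}) == 0.0
assert sphere({'x0': 3.0, 'x1': 4.0}) == 25.0  # 3**2 + 4**2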
Example #21
def get_train_loaders(config):
    """
    Returns dictionary containing the training and validation loaders
    (torch.utils.data.DataLoader) backed by the datasets.hdf5.HDF5Dataset.

    :param config: a top level configuration object containing the 'loaders' key
    :return: dict {
        'train': <train_loader>
        'val': <val_loader>
    }
    """
    assert 'loaders' in config, 'Could not find data loaders configuration'
    loaders_config = config['loaders']

    logger = get_logger('HDF5Dataset')
    logger.info('Creating training and validation set loaders...')

    # get train and validation files
    train_path = loaders_config['train_path']
    val_path = loaders_config['val_path']
    assert isinstance(train_path, str)
    assert isinstance(val_path, str)
    # get h5 internal paths for raw and label
    raw_internal_path = loaders_config['raw_internal_path']
    label_internal_path = loaders_config['label_internal_path']
    weight_internal_path = loaders_config.get('weight_internal_path', None)

    logger.info(f'Loading training set from: {train_path}...')
    # create H5 backed training and validation dataset with data augmentation
    train_dataset = HDF5Dataset(
        train_path,
        phase='train',
        transformer_config=loaders_config['transformer'],
        raw_internal_path=raw_internal_path,
        label_internal_path=label_internal_path,
        weight_internal_path=weight_internal_path)

    logger.info(f'Loading validation set from: {val_path}...')
    val_dataset = HDF5Dataset(val_path,
                              phase='val',
                              transformer_config=loaders_config['transformer'],
                              raw_internal_path=raw_internal_path,
                              label_internal_path=label_internal_path,
                              weight_internal_path=weight_internal_path)

    num_workers = loaders_config.get('num_workers', 1)
    batch_size = loaders_config.get('batch_size', 1)
    logger.info(f'Number of workers for train/val datasets: {num_workers}')
    logger.info(f'Batch size for train/val datasets: {batch_size}')
    # when training with volumetric data use batch_size of 1 due to GPU memory constraints
    return {
        'train':
        DataLoader(train_dataset,
                   batch_size=batch_size,
                   shuffle=True,
                   num_workers=num_workers),
        'val':
        DataLoader(val_dataset,
                   batch_size=batch_size,
                   shuffle=True,
                   num_workers=num_workers)
    }
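A minimal example of the 'loaders' configuration this function expects; the paths and the empty transformer section are placeholders:

config = {
    'loaders': {
        'train_path': 'data/train.h5',  # placeholder paths
        'val_path': 'data/val.h5',
        'raw_internal_path': 'raw',
        'label_internal_path': 'label',
        # 'weight_internal_path' is optional and defaults to None
        'transformer': {},              # project-specific augmentation config
        'num_workers': 4,
        'batch_size': 1,                # volumetric data: keep the batch small
    }
}
loaders = get_train_loaders(config)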
Example #22
 def __init__(self, crawler, extractor):
     self.crawler = crawler
     self.extractor = extractor
     self.logger = get_logger(self.__class__.__name__)
Example #23
import json
import os
from typing import Dict

from util.utils import get_config_space, get_logger, ParameterSettings
from optimizer.tpe import TPEOptimizer

from hpolib.tabular_benchmark import (FCNetNavalPropulsionBenchmark,
                                      FCNetParkinsonsTelemonitoringBenchmark,
                                      FCNetProteinStructureBenchmark,
                                      FCNetSliceLocalizationBenchmark)

if __name__ == '__main__':
    js = open('hpolib/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='hpolib')

    logger = get_logger(file_name='hpolib', logger_name='hpolib')
    benchmark = [
        FCNetNavalPropulsionBenchmark, FCNetParkinsonsTelemonitoringBenchmark,
        FCNetProteinStructureBenchmark, FCNetSliceLocalizationBenchmark
    ][0]

    # You need to change the path according to your path to the data
    data_dir = f'{os.environ["HOME"]}/research/nas_benchmarks/fcnet_tabular_benchmarks/'

    bm = benchmark(data_dir=data_dir)
    obj_func = bm.objective_func

    opt = TPEOptimizer(obj_func=obj_func,
                       config_space=config_space,
                       mutation_prob=0.05,
                       resultfile='hpolib')
    opt.optimize(logger)
Example #24
 def __init__(self):
     self.logger = get_logger(__name__)
Example #25
 def __init__(self, *args, **kwargs):
     super(MyTestCase, self).__init__(*args, **kwargs)
     self.logger = get_logger(self.__class__.__name__)
Example #26
from typing import Dict

import json

from util.utils import get_config_space, get_logger, ParameterSettings
from cnn.train import get_objective_func
from optimizer.tpe import TPEOptimizer

if __name__ == '__main__':
    js = open('cnn/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='cnn')

    logger = get_logger(file_name='cnn', logger_name='cnn')
    obj_func = get_objective_func(logger=logger,
                                  searching_space=searching_space,
                                  config_space=config_space)

    opt = TPEOptimizer(obj_func=obj_func,
                       config_space=config_space,
                       mutation_prob=0.05,
                       resultfile='cnn')
    opt.optimize(logger)
Example #27
        logger.info('Val Acc.: {:.2f}%'.format(val_acc * 100))

        return 1.0 - val_acc

    return objective_func


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--filename', type=str, default='example')
    parser.add_argument('--saved_config', type=str, default=None)
    args = parser.parse_args()

    # Logger setting
    logger = get_logger(file_name=args.filename, logger_name=args.filename)

    js = open('cnn/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='cnn')
    objective_func = get_objective_func(
        logger=logger,
        searching_space=searching_space,
        config_space=config_space
    )

    results = objective_func(
        config=Hyperparameters().__dict__,
        budget=BudgetConfig().__dict__
    )
Example #28
import os

from redis import StrictRedis
from werkzeug.utils import secure_filename, redirect

from api import get_similarity_checker
from app import app
from similarity_checker import tokenize_and_normalize_content
from util.utils import get_logger, get_unicode

# This is the path to the upload directory
app.config['UPLOAD_FOLDER'] = 'web/upload'
# These are the extension that we are accepting to be uploaded
excel_extensions = {'xls', 'xlsx'}
sup_file_type = {'csv', 'txt'} | excel_extensions
app.config['ALLOWED_EXTENSIONS'] = sup_file_type
app.config['MAX_CONTENT_LENGTH'] = 1024 * 1024 * 1024  # Accept max 1GB file

logger = get_logger(__name__)

redis = StrictRedis(db=1,
                    host=os.environ.get('REDIS_HOST', 'localhost'),
                    port=os.environ.get('REDIS_PORT', 6379))


def convert_to_utf8(source_file_path):
    result_file_path = source_file_path + '.utf8'
    import codecs
    block_size = 1048576  # or some other, desired size in bytes
    with codecs.open(source_file_path, "r", "utf-8") as source:
        with codecs.open(result_file_path, "w", "utf-8") as target_file:
            while True:
                contents = source.read(block_size)
                if not contents:
                    break
                target_file.write(contents)
    return result_file_path
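A hypothetical call; the path is a placeholder:

# writes 'web/upload/report.csv.utf8' alongside the source file
utf8_path = convert_to_utf8('web/upload/report.csv')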
Example #29
 def __init__(self, storage):
     self.logger = get_logger(self.__class__.__name__)
     self.storage = storage