def __init__(self):
    self.logger = get_logger(self.__class__.__name__)
    self.tokenizer = GeneralTokenizer()
    self.url_pattern = re.compile(
        r'(?:http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    self.dictionary = DictionaryES()
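# A minimal standalone sketch (separate from the class above) showing what the
# compiled url_pattern matches; the sample text is illustrative only.
import re

url_pattern = re.compile(
    r'(?:http[s]?://|www)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
text = 'See https://example.com/page?id=1 for details.'
print(url_pattern.findall(text))  # ['https://example.com/page?id=1']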
def _init_logger(self):
    log_dir = '../logs/' + self.model_name + '/test' \
        + '/{}'.format(self.cfg['data']['dataset']) \
        + '/{}'.format(time.strftime('%Y%m%d-%H%M'))
    self.logger = get_logger(log_dir)
    print('RUNDIR: {}'.format(log_dir))
    # Log '-Test' here: this is the test-run logger ('-Train' in the original
    # looked like a copy-paste slip from the train variant).
    self.logger.info('{}-Test'.format(self.model_name))
    self.save_path = log_dir
    self.save_image_path = os.path.join(self.save_path, 'saved_val_images')
def __init__(
        self,
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'):
    self.logger = get_logger(self.__class__.__name__)
    self.user_agent = user_agent
    self.redis = None
    self.expire_time = None
def __init__(self, *args, **kwargs):
    super(MyTestCase, self).__init__(*args, **kwargs)
    self.logger = get_logger(self.__class__.__name__)
    self.urls = [
        'http://vnexpress.net/tin-tuc/the-gioi/viet-nam-yeu-cau-trung-quoc-rut-cac-may-bay-chien-dau-khoi-hoang-sa-3387175.html',
        'http://vnexpress.net/tin-tuc/thoi-su/hai-quan-viet-nam-co-them-cap-tau-ten-lua-tan-cong-hien-dai-3387260.html',
        'http://kinhdoanh.vnexpress.net/tin-tuc/vi-mo/ong-bui-quang-vinh-40-nam-ngon-lua-luc-nao-cung-chay-trong-toi-3387136.html',
        'http://thethao.vnexpress.net/photo/hinh-bong-da/co-may-msn-vo-vun-barca-thanh-cuu-vuong-champions-league-3386815.html'
    ]
def __init__(self, model, optimizer, lr_scheduler, loss_criterion,
             eval_criterion, device, loaders, checkpoint_dir,
             max_num_epochs=100, max_num_iterations=1e5,
             validate_after_iters=100, log_after_iters=100,
             validate_iters=None, num_iterations=1, num_epoch=0,
             eval_score_higher_is_better=True, best_eval_score=None,
             logger=None):
    if logger is None:
        self.logger = utils.get_logger('UNet3DTrainer', level=logging.DEBUG)
    else:
        self.logger = logger

    self.logger.info(model)
    self.model = model
    self.optimizer = optimizer
    self.scheduler = lr_scheduler
    self.loss_criterion = loss_criterion
    self.eval_criterion = eval_criterion
    self.device = device
    self.loaders = loaders
    self.checkpoint_dir = checkpoint_dir
    self.max_num_epochs = max_num_epochs
    self.max_num_iterations = max_num_iterations
    self.validate_after_iters = validate_after_iters
    self.log_after_iters = log_after_iters
    self.validate_iters = validate_iters
    self.eval_score_higher_is_better = eval_score_higher_is_better
    # Use self.logger here: the `logger` argument may be None.
    self.logger.info(
        f'eval_score_higher_is_better: {eval_score_higher_is_better}')

    if best_eval_score is not None:
        self.best_eval_score = best_eval_score
    else:
        # initialize the best_eval_score
        if eval_score_higher_is_better:
            self.best_eval_score = float('-inf')
        else:
            self.best_eval_score = float('+inf')

    self.writer = SummaryWriter(
        log_dir=os.path.join(checkpoint_dir, 'logs'))

    self.num_iterations = num_iterations
    self.num_epoch = num_epoch
def setUp(self) -> None:
    dim = 10
    self.cs = CS.ConfigurationSpace()
    for d in range(dim):
        self.cs.add_hyperparameter(
            CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))

    self.hp_names = list(self.cs._hyperparameters.keys())
    self.logger = get_logger(file_name='test', logger_name='test')
def __init__(self, urls, content_getter, model_file_path, tokenizer,
             min_ngram, max_ngram):
    self.logger = get_logger(self.__class__.__name__)
    self.urls = urls
    self.content_getter = content_getter
    self.model_file_path = model_file_path
    self.tokenizer = tokenizer
    self.min_ngram = min_ngram
    self.max_ngram = max_ngram
def _init_logger(self):
    log_dir = '../logs/nasunet/search' \
        + '/{}'.format(self.cfg['data']['dataset']) \
        + '/search-{}'.format(time.strftime('%Y%m%d-%H%M%S'))
    self.logger = get_logger(log_dir)
    self.logger.info('RUNDIR: {}'.format(log_dir))
    shutil.copy(self.args.config, log_dir)
    self.logger.info('Nas-Search')
    self.save_path = log_dir
    self.save_tbx_log = self.save_path + '/tbx_log'
    self.writer = SummaryWriter(self.save_tbx_log)
def _init_logger(self):
    log_dir = '../logs/' + self.model_name + '/train' \
        + '/{}'.format(self.cfg['data']['dataset']) \
        + '/{}'.format(time.strftime('%Y%m%d-%H%M%S'))
    self.logger = get_logger(log_dir)
    print('RUNDIR: {}'.format(log_dir))
    self.logger.info('{}-Train'.format(self.model_name))
    self.save_path = log_dir
    self.save_tbx_log = self.save_path + '/tbx_log'
    self.save_image_path = os.path.join(self.save_path, 'saved_val_images')
    self.writer = SummaryWriter(self.save_tbx_log)
    shutil.copy(self.args.config, self.save_path)
def __init__(self):
    self.logger = get_logger(self.__class__.__name__)
    self.es = get_es_client()
    self.prefix_index_name = 'dic'
    self.doc_type = 'vocab'
    self.support_languages = {
        'arabic', 'armenian', 'basque', 'brazilian', 'bulgarian', 'catalan',
        'cjk', 'czech', 'danish', 'dutch', 'english', 'finnish', 'french',
        'galician', 'german', 'greek', 'hindi', 'hungarian', 'indonesian',
        'irish', 'italian', 'latvian', 'lithuanian', 'norwegian', 'persian',
        'portuguese', 'romanian', 'russian', 'sorani', 'spanish', 'swedish',
        'turkish', 'thai'
    }
    # Silence the noisy elasticsearch client logger
    logging.getLogger('elasticsearch').setLevel(logging.CRITICAL)
def __init__(self, user_agent='ClarityBot', page_load_timeout=15,
             wait_after_last_request=0.5):
    self.logger = get_logger(self.__class__.__name__)
    self.user_agent = user_agent
    self.redis = None
    self.expire_time = None
    self.cluster = os.environ.get('CRAWLER_URL',
                                  'http://174.138.126.116:3000/execute')
    self.access_key = os.environ.get('CRAWLER_ACCESS_KEY',
                                     'cHVwcmVuZGVyX3Nlb2NsYXJpdHk=')
    self.page_load_timeout = page_load_timeout
    self.wait_after_last_request = wait_after_last_request
def __init__(self, model_loc_dir, model_name, content_getter,
             evaluate_mode=False):
    self.logger = get_logger(self.__class__.__name__)
    self.content_getter = content_getter
    self.web_page_type_classifier = None
    self.labels = None
    self.model_name = model_name
    self.model_name_key = 'current_page_type_classifier_model'
    self.model_loc_dir = model_loc_dir
    self.kv_storage = get_redis_conn()
    self.evaluate_mode = evaluate_mode
def main():
    # Create main logger
    logger = get_logger('UNet3DTrainer')

    # Load and log experiment configuration
    config = load_config()
    logger.info(config)

    manual_seed = config.get('manual_seed', None)
    if manual_seed is not None:
        logger.info(f'Seed the RNG for all devices with {manual_seed}')
        torch.manual_seed(manual_seed)
        # see https://pytorch.org/docs/stable/notes/randomness.html
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # Create the model
    model = get_model(config)
    # put the model on GPUs
    logger.info(f"Sending the model to '{config['device']}'")
    # model = model.to(config['device'])

    # Log the number of learnable parameters
    logger.info(f'Number of learnable params {get_number_of_learnable_parameters(model)}')

    # Create loss criterion
    loss_criterion = torch.nn.BCELoss(reduction='mean')
    # Create evaluation metric
    eval_criterion = loss_criterion

    # Create data loaders
    loaders = get_train_loaders(config)

    # Create the optimizer
    optimizer = _create_optimizer(config, model)

    # Create learning rate adjustment strategy
    lr_scheduler = _create_lr_scheduler(config, optimizer)

    # Create model trainer
    trainer = _create_trainer(config, model=model, optimizer=optimizer,
                              lr_scheduler=lr_scheduler,
                              loss_criterion=loss_criterion,
                              eval_criterion=eval_criterion,
                              loaders=loaders, logger=logger)
    # Start training
    trainer.fit()
def __init__(self, content_getter, similarity, unit='word', min_ngram=1,
             max_ngram=1, main_page_selector=None, sub_page_selector=None,
             url_1_selector=None, url_2_selector=None, url_3_selector=None):
    self.similarity = similarity
    self.content_getter = content_getter
    self.main_page_selector = main_page_selector
    self.sub_page_selector = sub_page_selector
    self.url_1_selector = url_1_selector
    self.url_2_selector = url_2_selector
    self.url_3_selector = url_3_selector
    self.unit = unit
    self.min_ngram = min_ngram
    self.max_ngram = max_ngram
    self.logger = get_logger(self.__class__.__name__)
def get_test_loaders(config):
    """
    Returns a dict with a single test DataLoader backed by the test file.

    :param config: a top level configuration object containing the 'loaders' key
    :return: dict {'test': <test_loader>}
    """
    logger = get_logger('HDF5Dataset')

    # check the key that is actually read below
    assert 'loaders' in config, 'Could not find data loaders configuration'
    loaders_config = config['loaders']

    # get the test file
    test_path = loaders_config['test_path']
    assert isinstance(test_path, str)
    # get h5 internal path
    raw_internal_path = loaders_config['raw_internal_path']

    logger.info(f'Loading test set from: {test_path}...')
    dataset = HDF5Dataset(test_path, phase='test',
                          raw_internal_path=raw_internal_path,
                          transformer_config=loaders_config['transformer'])

    num_workers = loaders_config.get('num_workers', 1)
    batch_size = loaders_config.get('batch_size', 1)
    logger.info(f'Number of workers for test datasets: {num_workers}')
    logger.info(f'Batch size for test datasets: {batch_size}')
    return {
        'test': DataLoader(dataset, batch_size=batch_size, shuffle=True,
                           num_workers=num_workers)
    }
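# A minimal configuration sketch for get_test_loaders. The h5 path is a
# hypothetical placeholder and the 'transformer' sub-config follows whatever
# schema the project's transformer builder expects (not shown here).
config = {
    'loaders': {
        'test_path': 'resources/test_volume.h5',  # hypothetical path
        'raw_internal_path': 'raw',
        'transformer': {},                         # project-specific schema
        'num_workers': 2,
        'batch_size': 1,
    }
}
loaders = get_test_loaders(config)
for batch in loaders['test']:
    ...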
def setUp(self) -> None:
    dim = 10
    self.cs_cat = CS.ConfigurationSpace()
    self.cs = CS.ConfigurationSpace()
    self.cs_cat.add_hyperparameter(
        CSH.CategoricalHyperparameter('func', choices=['sine', 'cosine']))

    for d in range(dim):
        self.cs.add_hyperparameter(
            CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))
        if d < dim - 1:
            self.cs_cat.add_hyperparameter(
                CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))
        else:
            self.cs_cat.add_hyperparameter(
                CSH.OrdinalHyperparameter(f'x{d}',
                                          sequence=list(range(-5, 6)),
                                          meta={'lower': -5, 'upper': 5}))

    self.hp_names = list(self.cs._hyperparameters.keys())
    self.hp_names_cat = list(self.cs_cat._hyperparameters.keys())
    self.logger = get_logger(file_name='test', logger_name='test')
def __init__(self, content_getter, es_client):
    self.content_getter = content_getter
    self.es_client = es_client
    self.logger = get_logger(self.__class__.__name__)
    self.index_name = 'web'
    self.doc_type = 'page'
def __init__(self, urls, storage, classifier):
    self.logger = get_logger(self.__class__.__name__)
    self.urls = urls
    self.storage = storage
    self.classifier = classifier
def __init__(self):
    self.logger = get_logger(self.__class__.__name__)
from typing import Dict

import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import numpy as np

from optimizer.tpe import TPEOptimizer
from util.utils import get_logger


def sphere(eval_config: Dict[str, float]) -> float:
    vals = np.array(list(eval_config.values()))
    vals *= vals
    return np.sum(vals)


if __name__ == '__main__':
    dim = 10
    cs = CS.ConfigurationSpace()
    for d in range(dim):
        cs.add_hyperparameter(
            CSH.UniformFloatHyperparameter(f'x{d}', lower=-5, upper=5))

    logger = get_logger(file_name='sphere', logger_name='sphere')
    opt = TPEOptimizer(obj_func=sphere, config_space=cs,
                       mutation_prob=0.0, resultfile='sphere')
    opt.optimize(logger)
def get_train_loaders(config):
    """
    Returns a dictionary containing the training and validation loaders
    (torch.utils.data.DataLoader) backed by the datasets.hdf5.HDF5Dataset.

    :param config: a top level configuration object containing the 'loaders' key
    :return: dict {
        'train': <train_loader>,
        'val': <val_loader>
    }
    """
    assert 'loaders' in config, 'Could not find data loaders configuration'
    loaders_config = config['loaders']

    logger = get_logger('HDF5Dataset')
    logger.info('Creating training and validation set loaders...')

    # get train and validation files
    train_path = loaders_config['train_path']
    val_path = loaders_config['val_path']
    assert isinstance(train_path, str)
    assert isinstance(val_path, str)
    # get h5 internal paths for raw and label
    raw_internal_path = loaders_config['raw_internal_path']
    label_internal_path = loaders_config['label_internal_path']
    weight_internal_path = loaders_config.get('weight_internal_path', None)

    logger.info(f'Loading training set from: {train_path}...')
    # create H5 backed training and validation dataset with data augmentation
    train_dataset = HDF5Dataset(
        train_path, phase='train',
        transformer_config=loaders_config['transformer'],
        raw_internal_path=raw_internal_path,
        label_internal_path=label_internal_path,
        weight_internal_path=weight_internal_path)

    logger.info(f'Loading validation set from: {val_path}...')
    val_dataset = HDF5Dataset(
        val_path, phase='val',
        transformer_config=loaders_config['transformer'],
        raw_internal_path=raw_internal_path,
        label_internal_path=label_internal_path,
        weight_internal_path=weight_internal_path)

    num_workers = loaders_config.get('num_workers', 1)
    batch_size = loaders_config.get('batch_size', 1)
    logger.info(f'Number of workers for train/val datasets: {num_workers}')
    logger.info(f'Batch size for train/val datasets: {batch_size}')
    # when training with volumetric data use batch_size of 1 due to GPU memory constraints
    return {
        'train': DataLoader(train_dataset, batch_size=batch_size,
                            shuffle=True, num_workers=num_workers),
        'val': DataLoader(val_dataset, batch_size=batch_size,
                          shuffle=True, num_workers=num_workers)
    }
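# A minimal configuration sketch for get_train_loaders. The h5 paths are
# hypothetical placeholders; 'weight_internal_path' is optional (defaults to
# None), and the 'transformer' sub-config follows the project's own schema.
config = {
    'loaders': {
        'train_path': 'resources/train_volume.h5',  # hypothetical path
        'val_path': 'resources/val_volume.h5',      # hypothetical path
        'raw_internal_path': 'raw',
        'label_internal_path': 'label',
        'transformer': {},                           # project-specific schema
        'num_workers': 4,
        'batch_size': 1,  # volumetric data: keep batches small
    }
}
loaders = get_train_loaders(config)
train_loader, val_loader = loaders['train'], loaders['val']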
def __init__(self, crawler, extractor):
    self.crawler = crawler
    self.extractor = extractor
    self.logger = get_logger(self.__class__.__name__)
import json
import os  # used for os.environ below; missing from the original snippet
from typing import Dict  # used for the annotation below; missing from the original snippet

from util.utils import get_config_space, get_logger, ParameterSettings
from optimizer.tpe import TPEOptimizer
from hpolib.tabular_benchmark import (FCNetNavalPropulsionBenchmark,
                                      FCNetParkinsonsTelemonitoringBenchmark,
                                      FCNetProteinStructureBenchmark,
                                      FCNetSliceLocalizationBenchmark)


if __name__ == '__main__':
    js = open('hpolib/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='hpolib')

    logger = get_logger(file_name='hpolib', logger_name='hpolib')
    benchmark = [
        FCNetNavalPropulsionBenchmark,
        FCNetParkinsonsTelemonitoringBenchmark,
        FCNetProteinStructureBenchmark,
        FCNetSliceLocalizationBenchmark
    ][0]

    # You need to change the path according to your path to the data
    data_dir = f'{os.environ["HOME"]}/research/nas_benchmarks/fcnet_tabular_benchmarks/'
    bm = benchmark(data_dir=data_dir)
    obj_func = bm.objective_func

    opt = TPEOptimizer(obj_func=obj_func, config_space=config_space,
                       mutation_prob=0.05, resultfile='hpolib')
    # run the optimization; this call mirrors the other entry-point scripts
    opt.optimize(logger)
def __init__(self):
    self.logger = get_logger(__name__)
def __init__(self, *args, **kwargs):
    super(MyTestCase, self).__init__(*args, **kwargs)
    self.logger = get_logger(self.__class__.__name__)
from typing import Dict
import json

from util.utils import get_config_space, get_logger, ParameterSettings
from cnn.train import get_objective_func
from optimizer.tpe import TPEOptimizer


if __name__ == '__main__':
    js = open('cnn/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='cnn')

    logger = get_logger(file_name='cnn', logger_name='cnn')
    obj_func = get_objective_func(logger=logger,
                                  searching_space=searching_space,
                                  config_space=config_space)

    opt = TPEOptimizer(obj_func=obj_func, config_space=config_space,
                       mutation_prob=0.05, resultfile='cnn')
    opt.optimize(logger)
        logger.info('Val Acc.: {:.2f}%'.format(val_acc * 100))
        return 1.0 - val_acc

    return objective_func


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--filename', type=str, default='example')
    parser.add_argument('--saved_config', type=str, default=None)
    args = parser.parse_args()

    # Logger setting
    logger = get_logger(file_name=args.filename, logger_name=args.filename)

    js = open('cnn/params.json')
    searching_space: Dict[str, ParameterSettings] = json.load(js)
    config_space = get_config_space(searching_space, hp_module_path='cnn')
    objective_func = get_objective_func(
        logger=logger,
        searching_space=searching_space,
        config_space=config_space
    )
    results = objective_func(
        config=Hyperparameters().__dict__,
        budget=BudgetConfig().__dict__
    )
import codecs
import os  # used for os.environ below; missing from the original snippet
from redis import StrictRedis  # assumed import; StrictRedis is used below but not imported in the snippet

from werkzeug.utils import secure_filename, redirect

from api import get_similarity_checker
from app import app
from similarity_checker import tokenize_and_normalize_content
from util.utils import get_logger, get_unicode

# This is the path to the upload directory
app.config['UPLOAD_FOLDER'] = 'web/upload'
# These are the extensions that we accept for upload
excel_extensions = {'xls', 'xlsx'}
sup_file_type = {'csv', 'txt'} | excel_extensions
app.config['ALLOWED_EXTENSIONS'] = sup_file_type
app.config['MAX_CONTENT_LENGTH'] = 1024 * 1024 * 1024  # Accept max 1GB file

logger = get_logger(__name__)
redis = StrictRedis(db=1, host=os.environ.get('REDIS_HOST', 'localhost'),
                    port=os.environ.get('REDIS_PORT', 6379))


def convert_to_utf8(source_file_path):
    result_file_path = source_file_path + '.utf8'
    block_size = 1048576  # or some other, desired size in bytes
    with codecs.open(source_file_path, "r", "utf-8") as source:
        with codecs.open(result_file_path, "w", "utf-8") as target_file:
            while True:
                contents = source.read(block_size)
                if not contents:
                    # assumed completion of the truncated copy loop: stop at
                    # EOF, otherwise write the block to the target file
                    break
                target_file.write(contents)
    return result_file_path
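# Hypothetical usage of convert_to_utf8: copies an uploaded file into a
# UTF-8 encoded sibling file and (per the completion above) returns its path.
utf8_path = convert_to_utf8('web/upload/keywords.csv')
# utf8_path == 'web/upload/keywords.csv.utf8'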
def __init__(self, storage):
    self.logger = get_logger(self.__class__.__name__)
    self.storage = storage