Example #1
def load_model(model_filename, config_filename, use_cuda):
    state_dict = torch.load(model_filename)
    config = Config()
    config.load(config_filename)
    model = RemotionRNN(config.model_config)
    model.load_state_dict(state_dict['model'])
    if config.model_config.use_word_embeddings:
        model.embedding.weight.requires_grad = config.train_embeddings
    model = model.cuda() if use_cuda else model

    if config.optimizer == "adam":
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=config.opt_lr)
    elif config.optimizer == "adadelta":
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad,
                                          model.parameters()),
                                   lr=config.opt_lr,
                                   rho=config.opt_rho,
                                   eps=config.opt_eps)
    else:
        assert False

    optimizer.load_state_dict(state_dict['optimizer'])

    return model, optimizer
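
The load_model example above assumes a checkpoint saved as a dict with 'model' and 'optimizer' keys. A minimal sketch of the matching save_model (the function is called in Example #14 below, but its body is not shown in these examples, so the implementation here is an assumption):

def save_model(model, optimizer, model_filename):
    # Hypothetical counterpart to load_model: persist both states under the keys it reads back.
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict()
    }, model_filename)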
Example #2
def main(config_filename):
    config = Config()
    config.load(config_filename)

    train_data, test_data = get_data(config.data_config, config.competition)
    vocabulary = train_data.get_vocabulary(config.lower_vocabulary).merge(
        test_data.get_vocabulary(config.lower_vocabulary))
    if not os.path.exists(config.embeddings_filename) and \
            config.data_config.language == "ru" and \
            'fasttext' in config.embeddings_filename:
        shrink_w2v("/media/yallen/My Passport/Models/Vectors/FastText/wiki.ru.vec", vocabulary,
                   100000, config.embeddings_filename)
    if not os.path.exists(config.embeddings_filename) and \
            config.data_config.language == "ru" and \
            'w2v' in config.embeddings_filename:
        shrink_w2v("/media/yallen/My Passport/Models/Vectors/RDT/russian-big-w2v.txt", vocabulary,
                   100000, config.embeddings_filename)
    if not os.path.exists(config.embeddings_filename) and \
            config.data_config.language == "en" and \
            'w2v' in config.embeddings_filename:
        shrink_w2v("/media/yallen/My Passport/Models/Vectors/W2V/GoogleNews-vectors-negative300.vec", vocabulary,
                   150000, config.embeddings_filename)
    if not os.path.exists(config.embeddings_filename) and \
            config.data_config.language == "en" and \
            'fasttext' in config.embeddings_filename:
        shrink_w2v("/media/yallen/My Passport/Models/Vectors/FastText/wiki.en.vec", vocabulary,
                   150000, config.embeddings_filename)
    char_set = train_data.get_char_set()
    print(vocabulary.size())
    print(char_set)

    targets, additionals, rev_categories, output_sizes = get_targets_additionals(train_data)
    train_model(config_filename, train_data, vocabulary, char_set, targets, additionals, output_sizes)
    predict(config_filename, test_data, vocabulary, char_set,
            targets, additionals, rev_categories)
Example #3
def main():
    process = None

    try:
        config = {}
        Config.load(config)

        LoggingHelper.init(config)

        process = Process()
        process.open(config)
        process.run()

        return 0

    except KeyboardInterrupt:
        return 0

    except Exception as ex:
        _logger.exception(ex)
        return 1

    finally:
        if process is not None:
            process.close()
Example #4
def loaded_config(tmp_path):
    p = tmp_path / "test_config.json"
    p.write_text(json.dumps(MOCK_CONFIG))

    config = Config()
    config.load(p)

    return config
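
The loaded_config fixture above round-trips MOCK_CONFIG through a temporary JSON file. A hypothetical pytest test consuming it could look like the sketch below (attribute-style access on Config is an assumption, not something the example confirms):

def test_loaded_config_matches_mock(loaded_config):
    # Every value written to the temp file should be readable back from the Config object.
    for key, value in MOCK_CONFIG.items():
        assert getattr(loaded_config, key) == value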
Example #5
def predict(config_filename, test_data, vocabulary, char_set, targets,
            additionals, rev_categories):
    config = Config()
    config.load(config_filename)

    use_cuda = torch.cuda.is_available()
    model, _ = load_model(config.model_filename, config_filename, use_cuda)
    model.eval()

    competition = config.competition
    task_type = config.task_type
    task_key = competition + "-" + task_type
    test_batches = get_batches(test_data.reviews, vocabulary, char_set, 1,
                               config.model_config.word_max_count,
                               config.model_config.char_max_word_length,
                               targets[task_key], additionals[task_key],
                               config.model_config.use_pos)
    new_reviews = []
    for review, batch in zip(test_data.reviews, test_batches):
        predictions = model.predict(batch)

        if config.model_config.is_sequence_predictor:
            length = sum(
                [int(elem != 0) for elem in batch.word_indices[0].data])
            if model.config.use_crf:
                review_pred = predictions[0][:length]
            else:
                review_pred = predictions[0, :length].cpu()
        else:
            review_pred = predictions[0].cpu().item()
        new_review = get_new_review(review, review_pred, competition,
                                    task_type, rev_categories,
                                    config.model_config.is_sequence_predictor)
        new_reviews.append(new_review)

    if competition == "imdb":
        csv = "id,sentiment\n"
        for review in new_reviews:
            csv += str(review.rid) + "," + str(review.sentiment) + "\n"
        with open(config.output_filename, "w", encoding='utf-8') as f:
            f.write(csv)

    if competition == "sst2" or competition == "sst1":
        text = ""
        for review in new_reviews:
            text += str(review.sentiment) + " " + review.text + "\n"
        with open(config.output_filename, "w", encoding='utf-8') as f:
            f.write(text)

    if competition == "semeval" or competition == "sentirueval":
        xml = '<?xml version="1.0" ?>\n'
        xml += '<Reviews>\n' if competition == "semeval" else "<reviews>\n"
        for review in new_reviews:
            xml += review.to_xml()
        xml += '</Reviews>\n' if competition == "semeval" else "</reviews>\n"
        with open(config.output_filename, "w", encoding='utf-8') as f:
            f.write(xml)
Example #6
    def __init__(self, config: Config = None):
        if not config:
            config = Config.load()

        self.config = config

        self.already_checked: dict[Site, set[BaseVideo]] = {
            site: set()
            for site in list(Site.__members__.values())
        }
        # Dict of dicts in the form {site: {video_id: db_id}}
        self.all_vids: dict[Site, dict[str, int]] = {
            site: {}
            for site in list(Site.__members__.values())
        }
        self.all_vid_ids: dict[Site, set[str]] = {
            site: set()
            for site in list(Site.__members__.values())
        }
        self.channel_cache: dict[Site, set[BaseChannel]] = {
            site: set()
            for site in list(Site.__members__.values())
        }
        self.db_channel_cache: dict[Site, set[str]] = {
            site: set()
            for site in list(Site.__members__.values())
        }

        self._yt_api = YTApi(self.config.yt_token)
        self.all_tags: dict[str, int] = {}
        self.threads: list[threading.Thread] = []

        self._conn = psycopg.connect(self.config.db_conn_string,
                                     row_factory=dict_row)
        self.db = DbUtils(self._conn)
Example #7
def main():
    runner = None

    try:
        config = Config.load()

        init_logging(config)

        runner = Runner()
        runner.open(config)
        runner.run()

        return 0

    except KeyboardInterrupt:
        # if runner is not None:
        #     runner.close()
        return 0

    except Exception as ex:
        _logger.exception(ex)
        # no runner.close() to signal abnormal termination!
        return 1

    finally:
        if runner is not None:
            runner.close()
Example #8
def main():

    try:
        config = Config.load()

        init_logging(config)

        runner = Runner(config)
        runner.run()

        return 0

    except KeyboardInterrupt:
        _logger.info("aborted.")
        return 0

    except MessageException as ex:
        _logger.error(ex)
        _logger.error("aborted!")
        return 1

    except Exception as ex:
        _logger.exception(ex)
        _logger.error("aborted!")
        # no runner.close() to signal abnormal termination!
        return 1
Example #9
def main():
    reload(sys)
    sys.setdefaultencoding('utf8')
    parser = argparse.ArgumentParser(description="Processing of messages protobuf")
    parser.add_argument('CONFIG_DAEMON', type=str)
    config_file = ""
    try:
        args = parser.parse_args()
        config_file = args.CONFIG_DAEMON
    except argparse.ArgumentTypeError:
        print("Bad usage, learn how to use me with %s -h" % sys.argv[0])
        sys.exit(1)
    config_data = Config()
    config_data.load(config_file)
    daemon = Daemon(config_data)
    daemon.run()

    sys.exit(0)
Example #10
def main():
    # remember time
    start_time = time.time()

    p = argparse.ArgumentParser(description=f"Generating from *.proto files. Enabled")
    p.add_argument('--workdir', default=os.path.dirname(os.path.realpath(__file__)))
    parse_args = p.parse_args()

    working_directory = parse_args.workdir
    config_path = os.path.join(working_directory, Config.TypicalName)

    # load config
    config = Config.load(config_path)

    replaceable_options = config.get_replaceable_options()
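    # Override boolean options from same-named environment variables, if set.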
    for k, v in replaceable_options.items():
        environ_val = os.environ.get(k, str(v).lower())

        if environ_val and replaceable_options[k] != Misc.str_to_bool(environ_val):
            replaceable_options[k] = Misc.str_to_bool(environ_val)

    config.update(replaceable_options)
    config_changed = config.is_changed()

    # display config file
    print(f'Working directory: {working_directory}')
    print(f'Config: {config_path}\n{str(config)}')
    proto_root = config['proto_root']

    # then compile our matcher
    pattern = "(^.+)\\.{}$".format('|'.join(config['extensions']))
    matcher = re.compile(pattern)

    if os.path.isabs(proto_root):
        abs_proto_folder = proto_root
    else:
        abs_proto_folder = os.path.join(working_directory, proto_root)

    if not os.path.isdir(abs_proto_folder):
        raise Exception(f"proto_root: \"{abs_proto_folder}\" is not a valid path")

    dh = DirHashCalculator(config['force'] or config_changed)

    changed, new_digest = dh.get_changed(abs_proto_folder, matcher)
    matching = dh.get_matching(abs_proto_folder, matcher)

    code_gen_args = (changed, matching, matcher)
    CodeGenerator(working_directory, config).gen_all(*code_gen_args)

    # Downside: if any files failed to build, the other files will be re-compiled as well
    dh.save_digest(abs_proto_folder, new_digest)

    elapsed_time = round(time.time() - start_time, 3)
    print(colorama.Fore.WHITE + f"Build done in {elapsed_time} s")
Example #11
    logs_dir = os.getenv('LOGS_DIR', dir_path)
    handler = logging.FileHandler(filename=os.path.join(logs_dir, 'debug.log'), encoding='utf-8', mode='a')
    handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s:[%(module)s] %(message)s'))
    logger.addHandler(handler)

    parser = ArgumentParser()
    parser.add_argument('-d', '--dry-run', action='store_true', default=False)
    parser.add_argument('-a', '--upload-all', action='store_true', default=False,
                        help='''If this parameter is given, upload all files,
                        even if those files could not be linked to a video id.
                        However, those uploads cannot be tracked back to the database.''')

    args = parser.parse_args(sys.argv[1:])

    config = Config.load()
    data_dir = os.path.join(dir_path, 'data')
    video_dir = os.path.join(data_dir, 'videos')
    counter = Counter()

    video_files = {}

    for site in os.listdir(video_dir):
        site_path = os.path.join(video_dir, site)
        for file in os.listdir(site_path):
            s3_type = get_s3_type(file)
            counter.update([s3_type])

            filename, ext = os.path.splitext(file)
            filename = re.sub(r'(\.info|\.live_chat|\.\w{2})$', '', filename)
            if filename not in video_files:
Example #12
import tensorflow as tf
import os
from build_data import generate_model_data, CoNLLDataset, generate_model_data_elmo
from src.data_utils import transform_data, transform_data_all
from src.sent_data_utils import save_predicted
from src.config import Config
from models.ner_model import NERModel

if __name__ == "__main__":
    config = Config()
    config.load()

    model = NERModel(config)
    tf.reset_default_graph()
    model.build()
    print('Loading model...')
    model.restore_session(config.dir_model)
    print('Model load completed...')

    files = os.listdir(config.input_save_dir)
    for file in files:
        input_file = os.path.join(config.input_save_dir, file)
        extra = CoNLLDataset(input_file, config.processing_word_elmo,
                             config.processing_tag, config.max_iter)
        extra, _, _ = transform_data(extra, config.vocab_chars, config.vocab_tags, \
                                     config.max_length_words, config.max_length_chars, config.use_elmo)

        predicted_labels = model.predict_abstract(extra)
        output_file = os.path.join(config.output_save_dir, file)

        idx2tag = dict([(v, k) for (k, v) in config.vocab_tags.items()])
Example #13
def startup():
    Config.load(encoding="utf8")
    global _db, _logger
    _db = get_mongo_database()
    _logger = get_logger("nmdm-fetcher-logger")
    _monitor.start()
Example #14
def train_model(config_filename, train_data, vocabulary, char_set, targets,
                additionals, output_sizes):
    config = Config()
    config.load(config_filename)

    use_cuda = torch.cuda.is_available()
    print("Use cuda: ", use_cuda)

    random.seed(config.seed)
    torch.manual_seed(config.seed)
    if use_cuda:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True

    task_key = config.competition + "-" + config.task_type

    config.model_config.word_vocabulary_size = vocabulary.size()
    config.model_config.char_count = len(char_set)

    if config.model_config.use_pos:
        gram_vector_size = len(train_data.reviews[0].sentences[0][0].vector)
        config.model_config.gram_vector_size = gram_vector_size

    config.model_config.output_size = output_sizes[task_key]
    config.save(config_filename)

    model = RemotionRNN(config.model_config)
    if config.model_config.use_word_embeddings and config.use_pretrained_embeddings:
        embeddings = get_embeddings(vocabulary, config.embeddings_filename,
                                    config.model_config.word_embedding_dim)
        model.embedding.weight = torch.nn.Parameter(embeddings, requires_grad=config.train_embeddings)
    model = model.cuda() if use_cuda else model
    print(model)

    if config.optimizer == "adam":
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.opt_lr)
    elif config.optimizer == "adadelta":
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()),
                                   lr=config.opt_lr, rho=config.opt_rho, eps=config.opt_eps)
    else:
        assert False

    n = len(train_data.reviews)
    train_size = 1 - config.val_size
    border = int(n*train_size)
    reviews = random.Random(config.seed).sample(train_data.reviews, n)
    train_reviews, val_reviews = reviews[:border], reviews[border:]

    target_function = targets[task_key]
    additional_function = additionals[task_key]

    prev_val_loss = float("inf")
    bad_count = 0
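    # Early stopping: save the model whenever validation loss does not increase;
    # stop after config.patience + 1 consecutive worse epochs.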
    for epoch in range(config.epochs):
        train_loss = 0
        train_count = 0
        train_batches = get_batches(train_reviews, vocabulary, char_set, config.batch_size,
                                    config.model_config.word_max_count, config.model_config.char_max_word_length,
                                    target_function, additional_function, config.model_config.use_pos)
        for batch in train_batches:
            model.train()
            loss = process_batch(model, batch, optimizer)
            train_loss += loss
            train_count += 1

        val_loss = 0
        val_count = 0
        val_batches = get_batches(val_reviews, vocabulary, char_set, config.batch_size,
                                  config.model_config.word_max_count, config.model_config.char_max_word_length,
                                  target_function, additional_function, config.model_config.use_pos)
        for batch in val_batches:
            model.eval()
            loss = process_batch(model, batch, None)
            val_loss += loss
            val_count += 1

        print("Epoch: {epoch}, train loss: {train_loss}, val loss: {val_loss}".format(
            epoch=epoch,
            train_loss=train_loss/train_count,
            val_loss=val_loss/val_count
        ))
        if prev_val_loss < val_loss:
            bad_count += 1
        else:
            save_model(model, optimizer, config.model_filename)
            config.save(config_filename)
            bad_count = 0
            prev_val_loss = val_loss
        if bad_count == config.patience+1:
            break
    return model
Example #15
    def __init__(self, config: Config = Config.load()):
        self.config = config
        self.population = []
        self.proving_grounds = ProvingGrounds()
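
Note that Python evaluates default arguments once, at definition time, so Config.load() in the signature above runs on import and every instance shares that single Config object. A sketch of the deferred-load idiom used in Example #6, under the same assumptions about Config and ProvingGrounds:

    def __init__(self, config: Config = None):
        # Defer loading until instantiation; callers may still inject their own Config.
        self.config = config if config is not None else Config.load()
        self.population = []
        self.proving_grounds = ProvingGrounds()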
Example #16
class Scrappy(object):
    '''Scrappy is an object that represents the spider itself. The main run 
    method will block the current thread. The work is done by UrlMiners and the
    RecordingMiner.
    '''

    def __init__(self, 
                 config_file, 
                 thread_count=1, 
                 timeout=60, 
                 seed=''):
        '''Initializer,
        
            Arguments:
            
                config_file             the file containing configuration data.
                thread_count            the number of worker threads to use.
                timeout                 the time workers should wait on request.
                seed                    a website for Scrappy to crawl.
                
            Throws:
            
                Currently passes all exceptions, but can catch a
                KeyboardInterrupt.
        '''
                 
        self.config_file = config_file
        self.config = Config()
        self.thread_count = thread_count
        self.timeout = timeout
        self.visited_urls = SetWrapper(set())
        self.unvisited_urls = SetWrapper(set())
        self.targets = SetWrapper(set(), format="{},\n")
        self.verbose = False
        self.saverate = 6
        self.seed = seed
        
    def load(self):
        '''Load all configuration tokens and assign them.
        
            Throws:
            
                Shouldn't throw any exceptions, but may fail to load a file if
                the file doesn't exist. 
        '''
		
        print('\nLoading =>\n')
            
        self.config.load(self.config_file)
        self.target_file = self.config['target_file']
        self.visited_file = self.config['visited_file']
        self.unvisited_file = self.config['unvisited_file']
        
        print('\t\tConfig File: {}'.format(self.config_file))
        print('\t\tTarget File: {}'.format(self.target_file))
        print('\t\tUnvisited File: {}'.format(self.unvisited_file))
        print('\t\tVisited File: {}'.format(self.visited_file))
        print('\t\tTimeout: {}'.format(self.timeout))
        print('\t\tSave rate: {}'.format(self.saverate))
        print('\t\tThread count: {}'.format(self.thread_count))
                    
        if self.seed:
            self.unvisited_urls.add(self.seed)
                    
        if isfile(self.unvisited_file):
            with open(self.unvisited_file, 'r') as infile:
                for url in infile:
                    if url:
                        self.unvisited_urls.add(url)
            print('\n\t\tLoaded {} '
                  'unvisited sites.'.format(len(self.unvisited_urls.data)))
        
        if isfile(self.visited_file):
            with open(self.visited_file, 'r') as infile:
                for url in infile:
                    if url:
                        self.visited_urls.add(url)
            print('\t\tLoaded {} '
                  'visited sites.'.format(len(self.visited_urls.data)))
        
        if isfile(self.target_file):
            with open(self.target_file, 'r') as infile:
                for target in infile:
                    # -2: cut off the trailing ',' and newline
                    self.targets.add(target[:len(target) - 2])
            print('\t\tLoaded {} '
                  'targets.'.format(len(self.targets.data)))
        
    def run(self):
        '''This is the main method which blocks the current thread's execution.
        The thread will block until the queue is empty. If the queue is never
        empty then the thread will never unblock. You can use a keyboard 
        interrupt to save and end. 
        
            Throws:
            
                Shouldn't throw anything... we hope.
        '''

        start_time = time()
    
        print('\nRunning =>\n')
        print('\t\tStarted at {}\n'.format(TimeStamp().now()))
            
        try:
            url_queue = Queue()
                
            for url in self.unvisited_urls.data:
                url_queue.put(url)
            
            for i in range(0, self.thread_count):
                thread = UrlMiner(url_queue, 
                                  self.unvisited_urls, 
                                  self.visited_urls, 
                                  self.targets, 
                                  self.timeout)
                thread.setDaemon(True)
                thread.verbose = self.verbose
                thread.start()
                    
            recorder = RecordingMiner([[self.unvisited_urls,self.unvisited_file],
                                       [self.visited_urls, self.visited_file], 
                                       [self.targets, self.target_file]], 
                                       interval=self.saverate)        
                                                  
            recorder.setDaemon(True)
            recorder.start()

            while not url_queue.empty():
                sleep(10)
                
            recorder.running = False
            
        except KeyboardInterrupt:
            recorder.running = False
            recorder.save()
            
        print('\nEnding =>\n'
              '\t\tElapsed time {0:.2f} '
              'minutes.'.format((time() - start_time) / 60))
              
        print('\t\tScraped {} sites.'.format(len(self.visited_urls.data)))
                                       
        print('\t\tUnvisited: '
              '{}\n\t\tVisited: {}'.format(len(self.unvisited_urls.data),
                                            len(self.visited_urls.data)))
                                                           
        print('\t\tTargets: {}'.format(len(self.targets.data)))
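
A hypothetical driver for the Scrappy class above, following its docstrings; the config file name, seed URL, and thread count are placeholders, not taken from the source:

if __name__ == '__main__':
    scrappy = Scrappy('scrappy.conf', thread_count=4, timeout=60,
                      seed='http://example.com')
    scrappy.load()  # read config and any previously saved URL/target files
    scrappy.run()   # blocks until the URL queue is empty or a KeyboardInterrupt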
Example #17
def main():
    parser = ArgumentParser()
    parser.add_argument('--config_file', type=str, required=True)
    args = parser.parse_args()

    # settings
    config_path = Path(args.config_file)
    config = Config.load(config_path)

    warnings.filterwarnings('ignore')
    set_seed(config.seed)
    start_time = time.time()

    with timer('load data'):
        DATA_DIR = './input/riiid-test-answer-prediction/'
        usecols = [
            'row_id',
            'timestamp',
            'user_id',
            'content_id',
            'content_type_id',
            'answered_correctly',
            'prior_question_elapsed_time',
        ]
        dtype = {
            'row_id': 'int64',
            'timestamp': 'int64',
            'user_id': 'int32',
            'content_id': 'int16',
            'content_type_id': 'int8',
            'answered_correctly': 'int8',
            'prior_question_elapsed_time': 'float32'
        }

        train_df = pd.read_csv(DATA_DIR + 'train.csv',
                               usecols=usecols,
                               dtype=dtype)
        question_df = pd.read_csv(DATA_DIR + 'questions.csv',
                                  usecols=['question_id', 'part'])

    train_df = train_df[train_df['content_type_id'] == 0].reset_index(
        drop=True)

    question_df['part'] += 1  # 0: padding id, 1: start id
    train_df['content_id'] += 2  # 0: padding id, 1: start id
    question_df['question_id'] += 2
    train_df = train_df.merge(question_df,
                              how='left',
                              left_on='content_id',
                              right_on='question_id')

    with timer('validation split'):
        train_idx, valid_idx, epoch_valid_idx = virtual_time_split(
            train_df,
            valid_size=config.valid_size,
            epoch_valid_size=config.epoch_valid_size)
        valid_y = train_df.iloc[valid_idx]['answered_correctly'].values
        epoch_valid_y = train_df.iloc[epoch_valid_idx][
            'answered_correctly'].values

    print('-' * 20)
    print(f'train size: {len(train_idx)}')
    print(f'valid size: {len(valid_idx)}')

    with timer('prepare data loader'):
        train_user_seqs = get_user_sequences(train_df.iloc[train_idx])
        valid_user_seqs = get_user_sequences(train_df.iloc[valid_idx])

        train_dataset = TrainDataset(train_user_seqs,
                                     window_size=config.window_size,
                                     stride_size=config.stride_size)
        valid_dataset = ValidDataset(train_df,
                                     train_user_seqs,
                                     valid_user_seqs,
                                     valid_idx,
                                     window_size=config.window_size)

        train_loader = DataLoader(train_dataset, **config.train_loader_params)
        valid_loader = DataLoader(valid_dataset, **config.valid_loader_params)

        # valid loader for epoch validation
        epoch_valid_user_seqs = get_user_sequences(
            train_df.iloc[epoch_valid_idx])
        epoch_valid_dataset = ValidDataset(train_df,
                                           train_user_seqs,
                                           epoch_valid_user_seqs,
                                           epoch_valid_idx,
                                           window_size=config.window_size)
        epoch_valid_loader = DataLoader(epoch_valid_dataset,
                                        **config.valid_loader_params)

    with timer('train'):
        if config.model == 'akt':
            content_encoder_config = BertConfig(
                **config.content_encoder_config)
            knowledge_encoder_config = BertConfig(
                **config.knowledge_encoder_config)
            decoder_config = BertConfig(**config.decoder_config)

            content_encoder_config.max_position_embeddings = config.window_size + 1
            knowledge_encoder_config.max_position_embeddings = config.window_size
            decoder_config.max_position_embeddings = config.window_size + 1

            model = AktEncoderDecoderModel(content_encoder_config,
                                           knowledge_encoder_config,
                                           decoder_config)

        elif config.model == 'saint':
            encoder_config = BertConfig(**config.encoder_config)
            decoder_config = BertConfig(**config.decoder_config)

            encoder_config.max_position_embeddings = config.window_size
            decoder_config.max_position_embeddings = config.window_size

            model = SaintEncoderDecoderModel(encoder_config, decoder_config)

        else:
            raise ValueError(f'Unknown model: {config.model}')

        model.to(config.device)
        model.zero_grad()

        optimizer = optim.Adam(model.parameters(), **config.optimizer_params)
        scheduler = NoamLR(optimizer, warmup_steps=config.warmup_steps)
        loss_ema = None

        for epoch in range(config.n_epochs):
            epoch_start_time = time.time()
            model.train()

            progress = tqdm(train_loader,
                            desc=f'epoch {epoch + 1}',
                            leave=False)
            for i, (x_batch, w_batch, y_batch) in enumerate(progress):
                y_pred = model(**x_batch.to(config.device).to_dict())
                loss = nn.BCEWithLogitsLoss(weight=w_batch.to(config.device))(
                    y_pred, y_batch.to(config.device))
                loss.backward()

                if (config.gradient_accumulation_steps is None
                        or (i + 1) % config.gradient_accumulation_steps == 0):
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()

                loss_ema = (loss_ema * 0.9 + loss.item() * 0.1
                            if loss_ema is not None else loss.item())
                progress.set_postfix(loss=loss_ema)

            valid_preds = predict(model,
                                  epoch_valid_loader,
                                  device=config.device)
            valid_score = roc_auc_score(epoch_valid_y, valid_preds)

            elapsed_time = time.time() - epoch_start_time
            print(
                f'Epoch {epoch + 1}/{config.n_epochs} \t valid score: {valid_score:.5f} \t time: {elapsed_time / 60:.1f} min'
            )

    with timer('predict'):
        valid_preds = predict(model, valid_loader, device=config.device)
        valid_score = roc_auc_score(valid_y, valid_preds)

    print(f'valid score: {valid_score:.5f}')

    output_dir = Path(f'./output/{config_path.stem}/')
    output_dir.mkdir(parents=True, exist_ok=True)

    torch.save(model.state_dict(), output_dir / 'model.pt')
    torch.save(optimizer.state_dict(), output_dir / 'optimizer.pt')

    elapsed_time = time.time() - start_time
    print(f'all processes done in {elapsed_time / 60:.1f} min.')
Example #18
#!/usr/bin/python3

import logging

from src.config import Config
from src.main import Main

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    config_path = 'data/config.yaml'
    config = Config.load(config_path)

    if config.version_changed():
        logging.info('Version changed, migrating data.')
        config.migrate()
        config.save(config_path)
        logging.info('Migration complete.')

    Main(config).run()
    config.save(config_path)