Example #1
 def __init__(self, parser):
     opt = parser.parse_args()
     # Possibly build a dictionary (not all models do this).
     if opt['dict_build_first'] and 'dict_file' in opt:
         if opt['dict_file'] is None and opt.get('model_file'):
             opt['dict_file'] = opt['model_file'] + '.dict'
         print("[ building dictionary first... ]")
         build_dict.build_dict(opt)
     # Create model and assign it to the specified task
     self.agent = create_agent(opt)
     self.world = create_task(opt, self.agent)
     self.train_time = Timer()
     self.validate_time = Timer()
     self.log_time = Timer()
     print('[ training... ]')
     self.parleys = 0
     self.total_exs = 0
     self.total_episodes = 0
     self.total_epochs = 0
     self.max_exs = None
     self.max_parleys = None
     self.world_num_exs = self.world.num_examples()
     if self.world_num_exs is not None:
         self.max_exs = opt['num_epochs'] * self.world_num_exs
         self.max_parleys = math.ceil(self.max_exs / opt['batchsize'])
     self.best_valid = 0
     self.impatience = 0
     self.saved = False
     self.valid_world = None
     self.opt = opt
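
Note: each of these training loops relies on a Timer object with only two methods, time() and reset(). The real class comes from the surrounding framework (ParlAI's utilities); the snippet below is only a minimal stand-in sketching the interface these examples assume, not the framework's actual implementation.

import time

class Timer:
    """Minimal stand-in: tracks elapsed wall-clock seconds."""

    def __init__(self):
        self.start = time.time()

    def time(self):
        # seconds elapsed since construction or the last reset()
        return time.time() - self.start

    def reset(self):
        self.start = time.time()
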
Example #2
 def __init__(self, parser):
     opt = parser.parse_args()
     # Possibly build a dictionary (not all models do this).
     if opt['dict_build_first'] and 'dict_file' in opt:
         if opt['dict_file'] is None and opt.get('model_file'):
             opt['dict_file'] = opt['model_file'] + '.dict'
         print("[ building dictionary first... ]")
         build_dict.build_dict(opt)
     # Create model and assign it to the specified task
     self.agent = create_agent(opt)
     self.world = create_task(opt, self.agent)
     self.train_time = Timer()
     self.validate_time = Timer()
     self.log_time = Timer()
     self.save_time = Timer()
     print('[ training... ]')
     self.parleys = 0
     self.total_episodes = 0
     self.total_epochs = 0
     self.max_num_epochs = opt[
         'num_epochs'] if opt['num_epochs'] > 0 else float('inf')
     self.max_train_time = opt[
         'max_train_time'] if opt['max_train_time'] > 0 else float('inf')
     self.log_every_n_secs = opt['log_every_n_secs'] if opt[
         'log_every_n_secs'] > 0 else float('inf')
     self.val_every_n_secs = opt['validation_every_n_secs'] if opt[
         'validation_every_n_secs'] > 0 else float('inf')
     self.save_every_n_secs = opt['save_every_n_secs'] if opt[
         'save_every_n_secs'] > 0 else float('inf')
     self.best_valid = 0
     self.impatience = 0
     self.saved = False
     self.valid_world = None
     self.opt = opt
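
Note: Example #2 repeats the pattern "opt[key] if opt[key] > 0 else float('inf')" for every time/epoch limit, treating non-positive settings as "no limit". A small helper can express this once; the helper name below is made up for illustration and assumes opt behaves like a plain dict.

def pos_or_inf(value):
    # interpret non-positive limits as "unlimited"
    return value if value > 0 else float('inf')

# e.g. self.max_train_time = pos_or_inf(opt['max_train_time'])
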
Example #3
def main():
    warnings.warn = warn
    mk_dir()
    print('# split\n')
    split()
    print('# building dictionary\n')
    for file in tqdm(os.listdir('TEMP')):
        statinfo = os.stat('TEMP/'+file)

        # skip files that are effectively empty shells (< ~1 KB)
        if statinfo.st_size > 1000:
            try:
                df = build_dict(file)
                df = agg(df)
                
                # can't use "if df:" on a DataFrame (its truth value is ambiguous)
                if len(df.index) != 0:
                    df['size'] = statinfo.st_size
                    df['raw_label'] = file.replace('.pcap','')
                    df.to_csv('TEMP_CSV/'+file.replace('.pcap','.csv'),index=False)
                    
            except Exception as e:
                print(e)

    print('# collecting dataframe\n')
    r,c = join()
    print('\nGOT ',r,'nonzero records out of ',len(os.listdir('TEMP')),'files\n')
    print('df shape: ',r,'x',c)
    mk_dir()
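
Note: the comment about not being able to use "if df" in Example #3 refers to pandas raising "ValueError: The truth value of a DataFrame is ambiguous" when a DataFrame is used in a boolean context. The idiomatic emptiness test is DataFrame.empty, which is equivalent to the len(df.index) != 0 check used above:

import pandas as pd

df = pd.DataFrame({'size': []})
if not df.empty:  # same meaning as len(df.index) != 0
    print('dataframe has rows')
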
Example #4
def build_dict_from_nltk(output_file, corpus=None, stopwords=None,
                         stemmer=Stemmer(), measure='IDF', verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be
                        saved
    @param corpus:      the NLTK corpus to use (defaults to nltk.corpus.reuters)
    @param stopwords:   a list of (not stemmed) stopwords (defaults to
                        nltk.corpus.reuters.words('stopwords'))
    @param stemmer:     the L{Stemmer} object to be used
    @param measure:     the measure used to compute the weights ('IDF'
                        i.e. 'inverse document frequency' or 'ICF' i.e.
                        'inverse collection frequency'; defaults to 'IDF')
    @param verbose:     whether information on the progress should be printed
                        on screen
    '''
    
    from build_dict import build_dict
    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')
        
    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []
    
    if verbose: print 'Processing corpus...'
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    corpus = nltk.corpus.reuters
    stopwords += list(nltk.corpus.reuters.words('stopwords'))

    if verbose: print 'Processing Reuters corpus...'
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    if verbose: print 'Processing stopwords...'
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus_list, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1) 
Example #5
def build_dict_from_nltk(output_file,
                         corpus=None,
                         stopwords=None,
                         stemmer=Stemmer(),
                         measure='IDF',
                         verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be
                        saved
    @param corpus:      the NLTK corpus to use (defaults to nltk.corpus.reuters)
    @param stopwords:   a list of (not stemmed) stopwords (defaults to
                        nltk.corpus.reuters.words('stopwords'))
    @param stemmer:     the L{Stemmer} object to be used
    @param measure:     the measure used to compute the weights ('IDF'
                        i.e. 'inverse document frequency' or 'ICF' i.e.
                        'inverse collection frequency'; defaults to 'IDF')
    @param verbose:     whether information on the progress should be printed
                        on screen
    '''

    from build_dict import build_dict
    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')

    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []

    if verbose: print 'Processing corpus...'
    for file in corpus.fileids():
        doc = [
            stemmer(Tag(w.lower())).stem for w in corpus.words(file)
            if w[0].isalpha()
        ]
        corpus_list.append(doc)

    if verbose: print 'Processing stopwords...'
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus_list, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
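
Note: a possible call to the function above, assuming the module's Stemmer, Tag and build_dict helpers are importable; the output filename is arbitrary. The resulting dictionary can be read back with pickle.

import pickle

# build a weighted dictionary from the default Reuters corpus
build_dict_from_nltk('reuters_idf.pkl', measure='IDF', verbose=True)

with open('reuters_idf.pkl', 'rb') as f:
    dictionary = pickle.load(f)
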
Example #6
def build_dict_from_files(output_file,
                          corpus_files,
                          stopwords_file=None,
                          reader=SimpleReader(),
                          stemmer=Stemmer(),
                          measure='IDF',
                          verbose=False):
    '''
    @param output_file:    the name of the file where the dictionary should be
                           saved
    @param corpus_files:   a list of files with words to process
    @param stopwords_file: a file containing a list of stopwords
    @param reader:         the L{Reader} object to be used
    @param stemmer:        the L{Stemmer} object to be used
    @param measure:        the measure used to compute the weights ('IDF'
                           i.e. 'inverse document frequency' or 'ICF' i.e.
                           'inverse collection frequency'; defaults to 'IDF')
    @param verbose:        whether information on the progress should be
                           printed on screen
    '''

    import pickle

    if verbose: print 'Processing corpus...'
    corpus = []
    for filename in corpus_files:
        with open(filename, 'r') as doc:
            corpus.append(reader(doc.read()))
    corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]

    stopwords = None
    if stopwords_file:
        if verbose: print 'Processing stopwords...'
        with open(stopwords_file, 'r') as sw:
            stopwords = reader(sw.read())
        stopwords = [w.stem for w in map(stemmer, stopwords)]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
Example #7
def build_dict_from_files(output_file, corpus_files, stopwords_file=None,
                          reader=SimpleReader(), stemmer=Stemmer(),
                          measure='IDF', verbose=False):
    '''
    @param output_file:    the name of the file where the dictionary should be
                           saved
    @param corpus_files:   a list of files with words to process
    @param stopwords_file: a file containing a list of stopwords
    @param reader:         the L{Reader} object to be used
    @param stemmer:        the L{Stemmer} object to be used
    @param measure:        the measure used to compute the weights ('IDF'
                           i.e. 'inverse document frequency' or 'ICF' i.e.
                           'inverse collection frequency'; defaults to 'IDF')
    @param verbose:        whether information on the progress should be
                           printed on screen
    '''

    import pickle

    if verbose: print 'Processing corpus...'
    corpus = []
    for filename in corpus_files:
        with open(filename, 'r') as doc:
            corpus.append(reader(doc.read()))
    corpus = [[w.stem for w in map(stemmer, doc)] for doc in corpus]

    stopwords = None
    if stopwords_file:
        if verbose: print 'Processing stopwords...'
        with open(stopwords_file, 'r') as sw:
            stopwords = reader(sw.read())
        stopwords = [w.stem for w in map(stemmer, stopwords)]

    if verbose: print 'Building dictionary... '
    dictionary = build_dict(corpus, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1) 
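
Note: a sketch of how the file-based variant might be invoked; the paths are made up, and SimpleReader, Stemmer and build_dict are assumed to be importable from the same package as above.

import glob

build_dict_from_files(
    'corpus_idf.pkl',
    corpus_files=glob.glob('corpus/*.txt'),
    stopwords_file='stopwords.txt',
    measure='IDF',
    verbose=True,
)
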
Example #8
def main():
    # Get command line arguments
    parser = ParlaiParser(True, True)
    train = parser.add_argument_group('Training Loop Arguments')
    train.add_argument('-et',
                       '--evaltask',
                       help=('task to use for valid/test (defaults to the ' +
                             'one used for training if not set)'))
    train.add_argument('-d', '--display-examples', type='bool', default=False)
    train.add_argument('-e', '--num-epochs', type=float, default=-1)
    train.add_argument('-ttim', '--max-train-time', type=float, default=-1)
    train.add_argument('-ltim', '--log-every-n-secs', type=float, default=2)
    train.add_argument('-lparl',
                       '--log-every-n-parleys',
                       type=int,
                       default=100)
    train.add_argument('-vtim',
                       '--validation-every-n-secs',
                       type=float,
                       default=-1)
    train.add_argument('-vparl',
                       '--validation-every-n-parleys',
                       type=int,
                       default=-1)
    train.add_argument('-vme',
                       '--validation-max-exs',
                       type=int,
                       default=-1,
                       help='max examples to use during validation (default ' +
                       '-1 uses all)')
    train.add_argument(
        '-vp',
        '--validation-patience',
        type=int,
        default=5,
        help=('number of iterations of validation where result ' +
              'does not improve before we stop training'))
    train.add_argument(
        '-vmt',
        '--validation-metric',
        default='accuracy',
        help='key into report table for selecting best validation')
    train.add_argument('-dbf',
                       '--dict-build-first',
                       type='bool',
                       default=True,
                       help='build dictionary first before training agent')
    opt = parser.parse_args()

    # Set logging
    logger = logging.getLogger('DrQA')
    logger.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s: %(message)s', '%m/%d/%Y %I:%M:%S %p')
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.addHandler(console)
    if 'log_file' in opt:
        logfile = logging.FileHandler(opt['log_file'], 'w')
        logfile.setFormatter(fmt)
        logger.addHandler(logfile)
    logger.info('[ COMMAND: %s ]' % ' '.join(sys.argv))

    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        logger.info("[ building dictionary first... ]")
        build_dict.build_dict(opt)

    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    logger.info('[ training... ]')
    parleys = 0
    total_exs = 0
    max_exs = opt['num_epochs'] * len(world)
    max_parleys = math.ceil(max_exs / opt['batchsize'])
    best_valid = 0
    impatience = 0
    saved = False
    valid_world = None
    best_accuracy = 0

    while True:
        world.parley()
        parleys += 1

        if opt['num_epochs'] > 0 and parleys >= max_parleys:
            logger.info('[ num_epochs completed: {} ]'.format(
                opt['num_epochs']))
            break
        if opt['max_train_time'] > 0 and train_time.time(
        ) > opt['max_train_time']:
            logger.info('[ max_train_time elapsed: {} ]'.format(
                train_time.time()))
            break

        # instead of every_n_secs, use n_parleys
        # if opt['log_every_n_secs'] > 0 and log_time.time() > opt['log_every_n_secs']:
        if opt['log_every_n_parleys'] > 0 and parleys % opt[
                'log_every_n_parleys'] == 0:
            if opt['display_examples']:
                logger.info(world.display() + '\n~~')

            logs = []
            # time elapsed
            logs.append('time:{}s'.format(math.floor(train_time.time())))
            logs.append('parleys:{}'.format(parleys))

            # get report and update total examples seen so far
            if hasattr(agent, 'report'):
                train_report = agent.report()
                agent.reset_metrics()
            else:
                train_report = world.report()
                world.reset_metrics()

            if hasattr(train_report, 'get') and train_report.get('total'):
                total_exs += train_report['total']
                logs.append('total_exs:{}'.format(total_exs))

            # check if we should log amount of time remaining
            time_left = None
            if opt['num_epochs'] > 0:
                exs_per_sec = train_time.time() / total_exs
                time_left = (max_exs - total_exs) * exs_per_sec
            if opt['max_train_time'] > 0:
                other_time_left = opt['max_train_time'] - train_time.time()
                if time_left is not None:
                    time_left = min(time_left, other_time_left)
                else:
                    time_left = other_time_left
            if time_left is not None:
                logs.append('time_left:{}s'.format(math.floor(time_left)))

            # join log string and add full metrics report to end of log
            log = '[ {} ] {}'.format(' '.join(logs), train_report)

            logger.info(log)
            log_time.reset()


        # instead of every_n_secs, use n_parleys
        # if (opt['validation_every_n_secs'] > 0 and
        #         validate_time.time() > opt['validation_every_n_secs']):
        if (opt['validation_every_n_parleys'] > 0
                and parleys % opt['validation_every_n_parleys'] == 0):
            valid_report, valid_world = run_eval(agent,
                                                 opt,
                                                 'valid',
                                                 opt['validation_max_exs'],
                                                 logger=logger)
            if valid_report[opt['validation_metric']] > best_accuracy:
                best_accuracy = valid_report[opt['validation_metric']]
                impatience = 0
                logger.info('[ new best accuracy: ' + str(best_accuracy) +
                            ' ]')
                world.save_agents()
                saved = True
                if best_accuracy == 1:
                    logger.info('[ task solved! stopping. ]')
                    break
            else:
                opt['learning_rate'] *= 0.5
                agent.model.set_lrate(opt['learning_rate'])
                logger.info('[ Decrease learning_rate %.2e]' %
                            opt['learning_rate'])
                impatience += 1
                logger.info(
                    '[ did not beat best accuracy: {} impatience: {} ]'.format(
                        round(best_accuracy, 4), impatience))

            validate_time.reset()
            if opt['validation_patience'] > 0 and impatience >= opt[
                    'validation_patience']:
                logger.info('[ ran out of patience! stopping training. ]')
                break
            if opt['learning_rate'] < pow(10, -6):
                logger.info(
                    '[ learning_rate < pow(10,-6) ! stopping training. ]')
                break

    world.shutdown()
    if not saved:
        world.save_agents()
    else:
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid', write_log=True, logger=logger)
    run_eval(agent, opt, 'test', write_log=True, logger=logger)
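
Note: Example #8 additionally halves the learning rate whenever validation fails to improve and stops once it falls below 1e-6, i.e. a simple reduce-on-plateau schedule. A standalone sketch of that rule (names are illustrative, not the framework's API):

def decay_on_plateau(lr, improved, factor=0.5, min_lr=1e-6):
    # halve the learning rate when the validation metric did not improve;
    # the second return value tells the caller to stop training
    if not improved:
        lr *= factor
    return lr, lr < min_lr
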
Example #9
def main():
    # Get command line arguments
    parser = ParlaiParser(True, True)
    train = parser.add_argument_group('Training Loop Arguments')
    train.add_argument('-et', '--evaltask',
                       help=('task to use for valid/test (defaults to the '
                             'one used for training if not set)'))
    train.add_argument('-d', '--display-examples',
                       type='bool', default=False)
    train.add_argument('-e', '--num-epochs', type=float, default=-1)
    train.add_argument('-ttim', '--max-train-time',
                       type=float, default=-1)
    train.add_argument('-ltim', '--log-every-n-secs',
                       type=float, default=2)
    train.add_argument('-vtim', '--validation-every-n-secs',
                       type=float, default=-1)
    train.add_argument('-vme', '--validation-max-exs',
                       type=int, default=-1,
                       help='max examples to use during validation (default '
                            '-1 uses all)')
    train.add_argument('-vp', '--validation-patience',
                       type=int, default=5,
                       help=('number of iterations of validation where result'
                             ' does not improve before we stop training'))
    train.add_argument('-vmt', '--validation-metric', default='accuracy',
                       help='key into report table for selecting best '
                            'validation')
    train.add_argument('-dbf', '--dict-build-first',
                       type='bool', default=True,
                       help='build dictionary first before training agent')
    opt = parser.parse_args()
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        print("[ building dictionary first... ]")
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    print('[ training... ]')
    parleys = 0
    total_exs = 0
    max_exs = opt['num_epochs'] * len(world)
    max_parleys = math.ceil(max_exs / opt['batchsize'])
    best_valid = 0
    impatience = 0
    saved = False
    valid_world = None
    while True:
        world.parley()
        parleys += 1

        if opt['num_epochs'] > 0 and parleys >= max_parleys:
            print('[ num_epochs completed: {} ]'.format(opt['num_epochs']))
            break
        if opt['max_train_time'] > 0 and train_time.time() > opt['max_train_time']:
            print('[ max_train_time elapsed: {} ]'.format(train_time.time()))
            break
        if opt['log_every_n_secs'] > 0 and log_time.time() > opt['log_every_n_secs']:
            if opt['display_examples']:
                print(world.display() + '\n~~')

            logs = []
            # time elapsed
            logs.append('time:{}s'.format(math.floor(train_time.time())))
            logs.append('parleys:{}'.format(parleys))

            # get report and update total examples seen so far
            if hasattr(agent, 'report'):
                train_report = agent.report()
                agent.reset_metrics()
            else:
                train_report = world.report()
                world.reset_metrics()

            if hasattr(train_report, 'get') and train_report.get('total'):
                total_exs += train_report['total']
                logs.append('total_exs:{}'.format(total_exs))

            # check if we should log amount of time remaining
            time_left = None
            if opt['num_epochs'] > 0:
                exs_per_sec = train_time.time() / total_exs
                time_left = (max_exs - total_exs) * exs_per_sec
            if opt['max_train_time'] > 0:
                other_time_left = opt['max_train_time'] - train_time.time()
                if time_left is not None:
                    time_left = min(time_left, other_time_left)
                else:
                    time_left = other_time_left
            if time_left is not None:
                logs.append('time_left:{}s'.format(math.floor(time_left)))

            # join log string and add full metrics report to end of log
            log = '[ {} ] {}'.format(' '.join(logs), train_report)

            print(log)
            log_time.reset()

        if (opt['validation_every_n_secs'] > 0 and
                validate_time.time() > opt['validation_every_n_secs']):
            valid_report, valid_world = run_eval(
                agent, opt, 'valid', opt['validation_max_exs'],
                valid_world=valid_world)
            if valid_report[opt['validation_metric']] > best_valid:
                best_valid = valid_report[opt['validation_metric']]
                impatience = 0
                print('[ new best {}: {} ]'.format(
                    opt['validation_metric'], best_valid))
                world.save_agents()
                saved = True
                if opt['validation_metric'] == 'accuracy' and best_valid == 1:
                    print('[ task solved! stopping. ]')
                    break
            else:
                impatience += 1
                print('[ did not beat best {}: {} impatience: {} ]'.format(
                        opt['validation_metric'], round(best_valid, 4),
                        impatience))
            validate_time.reset()
            if opt['validation_patience'] > 0 and impatience >= opt['validation_patience']:
                print('[ ran out of patience! stopping training. ]')
                break
    world.shutdown()
    if not saved:
        world.save_agents()
    else:
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid', write_log=True)
    run_eval(agent, opt, 'test', write_log=True)
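
Note: Examples #8, #9, #12 and #13 bound training by converting an epoch budget into a number of parleys (batched steps): max_exs = num_epochs * len(world), then max_parleys = ceil(max_exs / batchsize). A worked instance with made-up numbers:

import math

num_epochs, num_examples, batchsize = 3, 10000, 32
max_exs = num_epochs * num_examples            # 30000 examples
max_parleys = math.ceil(max_exs / batchsize)   # 938 batched steps
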
Example #10
def main():
    # Get command line arguments
    parser = ParlaiParser(True, True)
    train = parser.add_argument_group('Training Loop Arguments')
    train.add_argument('-et', '--evaltask',
                        help=('task to use for valid/test (defaults to the ' +
                              'one used for training if not set)'))
    train.add_argument('-d', '--display-examples',
                        type='bool', default=False)
    train.add_argument('-e', '--num-epochs', type=int, default=-1)
    train.add_argument('-ttim', '--max-train-time',
                        type=float, default=-1)
    train.add_argument('-ltim', '--log-every-n-secs',
                        type=float, default=2)
    train.add_argument('-vtim', '--validation-every-n-secs',
                        type=float, default=-1)
    train.add_argument('-vme', '--validation-max-exs',
                        type=int, default=-1,
                        help='max examples to use during validation (default ' +
                             '-1 uses all)')
    train.add_argument('-vp', '--validation-patience',
                        type=int, default=5,
                        help=('number of iterations of validation where result '
                              + 'does not improve before we stop training'))
    train.add_argument('-dbf', '--dict-build-first',
                        type='bool', default=True,
                        help='build dictionary first before training agent')
    opt = parser.parse_args()
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    print('[ training... ]')
    total_exs = 0
    max_exs = opt['num_epochs'] * len(world)
    best_accuracy = 0
    impatience = 0
    saved = False
    while True:
        world.parley()
        if opt['num_epochs'] > 0 and total_exs >= max_exs:
            print('[ num_epochs completed: {} ]'.format(opt['num_epochs']))
            break
        if opt['max_train_time'] > 0 and train_time.time() > opt['max_train_time']:
            print('[ max_train_time elapsed: {} ]'.format(train_time.time()))
            break
        if opt['log_every_n_secs'] > 0 and log_time.time() > opt['log_every_n_secs']:
            if opt['display_examples']:
                print(world.display() + '\n~~')

            logs = []
            # time elapsed
            logs.append('time:{}s'.format(math.floor(train_time.time())))

            # get report and update total examples seen so far
            if hasattr(agent, 'report'):
                train_report = agent.report()
                agent.reset_metrics()
            else:
                train_report = world.report()
                world.reset_metrics()
            total_exs += train_report['total']
            logs.append('total_exs:{}'.format(total_exs))

            # check if we should log amount of time remaining
            time_left = None
            if opt['num_epochs'] > 0:
                exs_per_sec =  train_time.time() / total_exs
                time_left = (max_exs - total_exs) * exs_per_sec
            if opt['max_train_time'] > 0:
                other_time_left = opt['max_train_time'] - train_time.time()
                if time_left is not None:
                    time_left = min(time_left, other_time_left)
                else:
                    time_left = other_time_left
            if time_left is not None:
                logs.append('time_left:{}s'.format(math.floor(time_left)))

            # join log string and add full metrics report to end of log
            log = '[ {} ] {}'.format(' '.join(logs), train_report)

            print(log)
            log_time.reset()

        if (opt['validation_every_n_secs'] > 0 and
                validate_time.time() > opt['validation_every_n_secs']):
            valid_report = run_eval(agent, opt, 'valid', True, opt['validation_max_exs'])
            if valid_report['accuracy'] > best_accuracy:
                best_accuracy = valid_report['accuracy']
                impatience = 0
                print('[ new best accuracy: ' + str(best_accuracy) +  ' ]')
                if opt['model_file']:
                    agent.save(opt['model_file'])
                    saved = True
                if best_accuracy == 1:
                    print('[ task solved! stopping. ]')
                    break
            else:
                impatience += 1
                print('[ did not beat best accuracy: {} impatience: {} ]'.format(
                        round(best_accuracy, 4), impatience))
            validate_time.reset()
            if opt['validation_patience'] > 0 and impatience >= opt['validation_patience']:
                print('[ ran out of patience! stopping training. ]')
                break
    world.shutdown()
    if not saved:
        if opt['model_file']:
            agent.save(opt['model_file'])
    else:
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid')
    run_eval(agent, opt, 'test')
Example #11
 parser = argparse.ArgumentParser()
 # parser.add_argument("--gpu", nargs='+', type=int, default=[4,5,6,7], help="gpu id to use")
 parser.add_argument("--gpu",
                     nargs='+',
                     type=int,
                     default=[0, 1, 2, 3],
                     help="gpu id to use")
 parser.add_argument("--thread_num",
                     type=int,
                     default=4,
                     help="process on each GPU")
 args = parser.parse_args()
 gpus = [str(x) for x in args.gpu]
 print(gpus)
 gpu_num = len(gpus)
 build_dict()
 tasks = [[] for i in range(gpu_num)]
 pools = [mp.Pool(processes=args.thread_num) for i in range(gpu_num)]
 i = 0
 # without pre-train
 for bi_directional in args.bi_directional:
     for labels in [[0, 1], [0, 2], [0, 3], [0, 4],
                    [0, 5]]:  # , [0, 1, 2, 3, 4, 5]]:
         for data_type in ['movie', 'news', 'tweet']:
             unlabeled_data_num = 0
             labeled_data_num = 0.8
             test_data_num = 0.2
             tasks[i].append(
                 make_task(gpus[i], 'ACL', data_type, labeled_data_num,
                           test_data_num, labels, bi_directional))
             i = (i + 1) % gpu_num
Example #12
def main(parser):
    opt = parser.parse_args()
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        print("[ building dictionary first... ]")
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    print('[ training... ]')
    parleys = 0
    total_exs = 0
    max_exs = opt['num_epochs'] * len(world)
    max_parleys = math.ceil(max_exs / opt['batchsize'])
    best_valid = 0
    impatience = 0
    saved = False
    valid_world = None
    with world:
        while True:
            world.parley()
            parleys += 1

            if opt['num_epochs'] > 0 and parleys >= max_parleys:
                print('[ num_epochs completed: {} ]'.format(opt['num_epochs']))
                break
            if opt['max_train_time'] > 0 and train_time.time(
            ) > opt['max_train_time']:
                print('[ max_train_time elapsed: {} ]'.format(
                    train_time.time()))
                break
            if opt['log_every_n_secs'] > 0 and log_time.time(
            ) > opt['log_every_n_secs']:
                if opt['display_examples']:
                    print(world.display() + '\n~~')

                logs = []
                # time elapsed
                logs.append('time:{}s'.format(math.floor(train_time.time())))
                logs.append('parleys:{}'.format(parleys))

                # get report and update total examples seen so far
                if hasattr(agent, 'report'):
                    train_report = agent.report()
                    agent.reset_metrics()
                else:
                    train_report = world.report()
                    world.reset_metrics()

                if hasattr(train_report, 'get') and train_report.get('total'):
                    total_exs += train_report['total']
                    logs.append('total_exs:{}'.format(total_exs))

                # check if we should log amount of time remaining
                time_left = None
                if opt['num_epochs'] > 0:
                    exs_per_sec = train_time.time() / total_exs
                    time_left = (max_exs - total_exs) * exs_per_sec
                if opt['max_train_time'] > 0:
                    other_time_left = opt['max_train_time'] - train_time.time()
                    if time_left is not None:
                        time_left = min(time_left, other_time_left)
                    else:
                        time_left = other_time_left
                if time_left is not None:
                    logs.append('time_left:{}s'.format(math.floor(time_left)))

                # join log string and add full metrics report to end of log
                log = '[ {} ] {}'.format(' '.join(logs), train_report)

                print(log)
                log_time.reset()

            if (opt['validation_every_n_secs'] > 0
                    and validate_time.time() > opt['validation_every_n_secs']):
                valid_report, valid_world = run_eval(agent,
                                                     opt,
                                                     'valid',
                                                     opt['validation_max_exs'],
                                                     valid_world=valid_world)
                if valid_report[opt['validation_metric']] > best_valid:
                    best_valid = valid_report[opt['validation_metric']]
                    impatience = 0
                    print('[ new best {}: {} ]'.format(
                        opt['validation_metric'], best_valid))
                    world.save_agents()
                    saved = True
                    if opt['validation_metric'] == 'accuracy' and best_valid > 99.5:
                        print('[ task solved! stopping. ]')
                        break
                else:
                    impatience += 1
                    print('[ did not beat best {}: {} impatience: {} ]'.format(
                        opt['validation_metric'], round(best_valid, 4),
                        impatience))
                validate_time.reset()
                if opt['validation_patience'] > 0 and impatience >= opt[
                        'validation_patience']:
                    print('[ ran out of patience! stopping training. ]')
                    break
    if not saved:
        # save agent
        world.save_agents()
    elif opt.get('model_file'):
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid', write_log=True)
    run_eval(agent, opt, 'test', write_log=True)
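
Note: Example #12 (and #13 below) runs the loop inside "with world:", which assumes the world object implements the context-manager protocol so that cleanup happens even if the loop raises. A minimal sketch of that protocol (not the framework's actual code):

class WorldSketch:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.shutdown()   # always release agents/resources on exit
        return False      # do not swallow exceptions

    def shutdown(self):
        pass
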
Example #13
def main(parser):
    opt = parser.parse_args()
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        print("[ building dictionary first... ]")
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    print('[ training... ]')
    parleys = 0
    total_exs = 0
    max_exs = opt['num_epochs'] * len(world)
    max_parleys = math.ceil(max_exs / opt['batchsize'])
    best_valid = 0
    impatience = 0
    saved = False
    valid_world = None
    with world:
        while True:
            world.parley()
            parleys += 1

            if opt['num_epochs'] > 0 and parleys >= max_parleys:
                print('[ num_epochs completed: {} ]'.format(opt['num_epochs']))
                break
            if opt['max_train_time'] > 0 and train_time.time() > opt['max_train_time']:
                print('[ max_train_time elapsed: {} ]'.format(train_time.time()))
                break
            if opt['log_every_n_secs'] > 0 and log_time.time() > opt['log_every_n_secs']:
                if opt['display_examples']:
                    print(world.display() + '\n~~')

                logs = []
                # time elapsed
                logs.append('time:{}s'.format(math.floor(train_time.time())))
                logs.append('parleys:{}'.format(parleys))

                # get report and update total examples seen so far
                if hasattr(agent, 'report'):
                    train_report = agent.report()
                    agent.reset_metrics()
                else:
                    train_report = world.report()
                    world.reset_metrics()

                if hasattr(train_report, 'get') and train_report.get('total'):
                    total_exs += train_report['total']
                    logs.append('total_exs:{}'.format(total_exs))

                # check if we should log amount of time remaining
                time_left = None
                if opt['num_epochs'] > 0:
                    exs_per_sec = train_time.time() / total_exs
                    time_left = (max_exs - total_exs) * exs_per_sec
                if opt['max_train_time'] > 0:
                    other_time_left = opt['max_train_time'] - train_time.time()
                    if time_left is not None:
                        time_left = min(time_left, other_time_left)
                    else:
                        time_left = other_time_left
                if time_left is not None:
                    logs.append('time_left:{}s'.format(math.floor(time_left)))

                # join log string and add full metrics report to end of log
                log = '[ {} ] {}'.format(' '.join(logs), train_report)

                print(log)
                log_time.reset()

            if (opt['validation_every_n_secs'] > 0 and
                    validate_time.time() > opt['validation_every_n_secs']):
                valid_report, valid_world = run_eval(
                    agent, opt, 'valid', opt['validation_max_exs'],
                    valid_world=valid_world)
                if valid_report[opt['validation_metric']] > best_valid:
                    best_valid = valid_report[opt['validation_metric']]
                    impatience = 0
                    print('[ new best {}: {} ]'.format(
                        opt['validation_metric'], best_valid))
                    world.save_agents()
                    saved = True
                    if opt['validation_metric'] == 'accuracy' and best_valid > 99.5:
                        print('[ task solved! stopping. ]')
                        break
                else:
                    impatience += 1
                    print('[ did not beat best {}: {} impatience: {} ]'.format(
                            opt['validation_metric'], round(best_valid, 4),
                            impatience))
                validate_time.reset()
                if opt['validation_patience'] > 0 and impatience >= opt['validation_patience']:
                    print('[ ran out of patience! stopping training. ]')
                    break
    if not saved:
        # save agent
        world.save_agents()
    elif opt.get('model_file'):
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid', write_log=True)
    run_eval(agent, opt, 'test', write_log=True)
Example #14
def main():
    # Get command line arguments
    parser = ParlaiParser(True, True)
    parser.add_argument('-d', '--display-examples',
                        type='bool', default=False)
    parser.add_argument('-e', '--num-epochs', type=int, default=1)
    parser.add_argument('-ttim', '--max-train-time',
                        type=float, default=float('inf'))
    parser.add_argument('-ltim', '--log-every-n-secs',
                        type=float, default=1)
    parser.add_argument('-vtim', '--validation-every-n-secs',
                        type=float, default=False)
    parser.add_argument('-vp', '--validation-patience',
                        type=int, default=5,
                        help=('number of iterations of validation where result '
                              + 'does not improve before we stop training'))
    parser.add_argument('-dbf', '--dict_build_first',
                        type='bool', default=True,
                        help='build dictionary first before training agent')
    opt = parser.parse_args()
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first']:
        if 'dict_file' not in opt and 'model_file' in opt:
            opt['dict_file'] = opt['model_file'] + '.dict'
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    world = create_task(opt, agent)

    train_time = Timer()
    validate_time = Timer()
    log_time = Timer()
    print("[ training... ]")
    parleys = 0
    num_parleys = opt['num_epochs'] * int(len(world) / opt['batchsize'])
    best_accuracy = 0
    impatience = 0
    saved = False
    for i in range(num_parleys):
        world.parley()
        parleys = parleys + 1
        if train_time.time() > opt['max_train_time']:
            print("[ max_train_time elapsed: " + str(train_time.time()) + " ]")
            break
        if log_time.time() > opt['log_every_n_secs']:
            if opt['display_examples']:
                print(world.display() + "\n~~")
            parleys_per_sec =  train_time.time() / parleys
            time_left = (num_parleys - parleys) * parleys_per_sec
            log = ("[ time:" + str(math.floor(train_time.time()))
                  + "s parleys:" + str(parleys)
                  + " time_left:"
                  + str(math.floor(time_left))  + "s ]")
            if hasattr(agent, 'report'):
                log = log + str(agent.report())
            else:
                log = log + str(world.report())
                # TODO: world.reset_metrics()
            print(log)
            log_time.reset()
        if (opt['validation_every_n_secs'] and
            validate_time.time() > opt['validation_every_n_secs']):
            valid_report = run_eval(agent, opt, 'valid', True)
            if valid_report['accuracy'] > best_accuracy:
                best_accuracy = valid_report['accuracy']
                impatience = 0
                print("[ new best accuracy: " + str(best_accuracy) +  " ]")
                if opt['model_file']:
                    agent.save(opt['model_file'])
                    saved = True
                if best_accuracy == 1:
                    print('[ task solved! stopping. ]')
                    break
            else:
                impatience += 1
                print("[ did not beat best accuracy: " + str(best_accuracy) +
                      " impatience: " + str(impatience)  + " ]")
            validate_time.reset()
            if impatience >= opt['validation_patience']:
                print('[ ran out of patience! stopping. ]')
                break
    world.shutdown()
    if not saved:
        if opt['model_file']:
            agent.save(opt['model_file'])
    else:
        # reload best validation model
        agent = create_agent(opt)

    run_eval(agent, opt, 'valid')
    run_eval(agent, opt, 'test')
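
Note: the impatience counter used throughout these loops is a plain early-stopping rule. Factored out as a sketch, assuming larger metric values are better:

class EarlyStopper:
    """Sketch of the validation-patience logic in the loops above."""

    def __init__(self, patience):
        self.patience = patience
        self.best = float('-inf')
        self.impatience = 0

    def step(self, value):
        # returns True when training should stop
        if value > self.best:
            self.best = value
            self.impatience = 0
            return False
        self.impatience += 1
        return self.patience > 0 and self.impatience >= self.patience
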
Example #15
def train_model(opt):
    # Possibly build a dictionary (not all models do this).
    if opt['dict_build_first'] and 'dict_file' in opt:
        if opt['dict_file'] is None and opt.get('pretrained_model'):
            opt['dict_file'] = opt['pretrained_model'] + '.dict'
        if opt['dict_file'] is None and opt.get('model_file'):
            opt['dict_file'] = opt['model_file'] + '.dict'
        print("[ building dictionary first... ]")
        build_dict.build_dict(opt)
    # Create model and assign it to the specified task
    agent = create_agent(opt)
    if opt['datatype'].split(':')[0] == 'train':
        world = create_task(opt, agent)

        train_time = Timer()
        validate_time = Timer()
        log_time = Timer()
        print('[ training... ]')
        parleys = 0
        total_exs = 0
        max_exs = opt['num_epochs'] * len(world)
        epochs_done = 0
        max_parleys = math.ceil(max_exs / opt['batchsize'])
        best_metric_name = opt['chosen_metric']
        best_metric = 0
        impatience = 0
        saved = False
        valid_world = None
        try:
            while True:
                world.parley()
                parleys += 1
                new_epoch = world.epoch_done()
                if new_epoch:
                    world.reset()
                    epochs_done += 1

                if opt['num_epochs'] > 0 and parleys >= max_parleys:
                    print('[ num_epochs completed: {} ]'.format(opt['num_epochs']))
                    break
                if 0 < opt['max_train_time'] < train_time.time():
                    print('[ max_train_time elapsed: {} ]'.format(train_time.time()))
                    break
                if (0 < opt['log_every_n_secs'] < log_time.time()) or \
                        (opt['log_every_n_epochs'] > 0 and new_epoch and
                                 (epochs_done % opt['log_every_n_epochs']) == 0):
                    if opt['display_examples']:
                        print(world.display() + '\n~~')

                    logs = list()
                    # time elapsed
                    logs.append('time:{}s'.format(math.floor(train_time.time())))
                    logs.append('parleys:{}'.format(parleys))
                    if epochs_done > 0:
                        logs.append('epochs done:{}'.format(epochs_done))

                    # get report and update total examples seen so far
                    if hasattr(agent, 'report'):
                        train_report = agent.report()
                        agent.reset_metrics()
                    else:
                        train_report = world.report()
                        world.reset_metrics()

                    if hasattr(train_report, 'get') and train_report.get('total'):
                        total_exs += train_report['total']
                        logs.append('total_exs:{}'.format(total_exs))

                    # check if we should log amount of time remaining
                    time_left = None
                    if opt['num_epochs'] > 0 and total_exs > 0:
                        exs_per_sec = train_time.time() / total_exs
                        time_left = (max_exs - total_exs) * exs_per_sec
                    if opt['max_train_time'] > 0:
                        other_time_left = opt['max_train_time'] - train_time.time()
                        if time_left is not None:
                            time_left = min(time_left, other_time_left)
                        else:
                            time_left = other_time_left
                    if time_left is not None:
                        logs.append('time_left:{}s'.format(math.floor(time_left)))

                    # join log string and add full metrics report to end of log
                    log = '[ {} ] {}'.format(' '.join(logs), train_report)

                    print(log)
                    log_time.reset()

                if 0 < opt['validation_every_n_secs'] < validate_time.time() or \
                        (opt['validation_every_n_epochs'] > 0 and new_epoch and (
                                    epochs_done % opt['validation_every_n_epochs']) == 0):

                    valid_report, valid_world = run_eval(agent, opt, 'valid',
                                                         opt['validation_max_exs'],
                                                         valid_world=valid_world)
                    if best_metric_name not in valid_report and 'accuracy' in valid_report:
                        best_metric_name = 'accuracy'
                    if valid_report[best_metric_name] > best_metric:
                        best_metric = valid_report[best_metric_name]
                        impatience = 0
                        print('[ new best ' + best_metric_name + ': ' + str(best_metric) + ' ]')
                        world.save_agents()
                        saved = True
                        if best_metric == 1:
                            print('[ task solved! stopping. ]')
                            break
                    else:
                        impatience += 1
                        print('[ did not beat best ' + best_metric_name + ': {} impatience: {} ]'.format(
                                round(best_metric, 4), impatience))
                    validate_time.reset()
                    if 0 < opt['validation_patience'] <= impatience:
                        print('[ ran out of patience! stopping training. ]')
                        break
        except KeyboardInterrupt:
            print('Stopped training, starting testing')

        if not saved:
            world.save_agents()
        # else:
        world.shutdown()

        # reload best validation model
        opt['pretrained_model'] = opt['model_file']
        agent = create_agent(opt)

        run_eval(agent, opt, 'valid', write_log=True)
        run_eval(agent, opt, 'test', write_log=True)
    else:
        run_eval(agent, opt, opt['datatype'], write_log=True)
    agent.shutdown()
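
Note: Examples #12, #13 and #15 expect a parser or pre-parsed opt to be built elsewhere. A hedged sketch of how such an entry point might be wired up, assuming a ParlaiParser configured with the training flags shown in Examples #8-#10:

from parlai.core.params import ParlaiParser

if __name__ == '__main__':
    parser = ParlaiParser(True, True)
    # register the Training Loop Arguments shown in the earlier examples here
    main(parser)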