Code example #1
def all_launcher(rts, logger):
    '''
    Launch the entire data processing chain; this will take at least a
    couple of hours to complete.
    '''

    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'all', stopwatch, event='start')
    print 'Start of building %s %s dataset.' % (rts.language.name, rts.project)

    functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'),
                                          (extract_launcher, 'extract'),
                                          #(sort_launcher, 'sort'),
                                          (store_launcher, 'store'),
                                          (transformer_launcher, 'transform')))

    for function, callname in functions.iteritems():
        if callname not in rts.ignore:
            print 'Launching %s' % function.func_name
            res = function(rts, logger)
            if res == False:
                sys.exit(False)
            elif res == None:
                pass
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'all', stopwatch, event='finish')
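The `timer` module these snippets import is never shown on this page. A minimal stopwatch-style sketch compatible with the `timer.Timer()` / `elapsed()` calls in the launcher examples (this one and #5, #6, #8, #9, #13, #15) might look like the following; the class body is an assumption for illustration, not the projects' actual implementation.

import time


class Timer(object):
    """Hypothetical stopwatch: record a start time, report elapsed seconds."""

    def __init__(self):
        self.t0 = time.time()
        self.t1 = None

    def elapsed(self):
        # Stop the watch and report how long the processing step took.
        self.t1 = time.time()
        seconds = self.t1 - self.t0
        print('Processing time: %.2f seconds' % seconds)
        return seconds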
Code example #2
    def __init__(self, process_type):
        if process_type == 'train':
            self.anno_path = cfg.YOLOv2.TRAIN_DATA
            self.batch_size = cfg.TRAIN.BATCH_SIZE
            self.is_training = True
            self.data_type = loader_cfg.TRAINING_LOADER_FLAGS
        else:
            self.anno_path = cfg.YOLOv2.TEST_DATA
            self.batch_size = cfg.EVAL.BATCH_SIZE
            self.is_training = False
        self.dataset_loader = simone_loader.SimoneDatasetLoader(
            loader_cfg.DATASET_DIR, loader_cfg.TRAINING_LOADER_FLAGS, True)
        self.num_samples = self.dataset_loader.get_total_num()
        self.num_batchs = int(np.ceil(self.num_samples / self.batch_size) - 2)
        self.batch_count = 0
        self.is_use_thread = cfg.YOLOv2.IS_USE_THREAD
        self.img_anchors = self.load_anchors(cfg.IMG.ANCHORS)

        self.loader_need_exit = 0
        self.timer = timer.Timer()
        self.per_step_ano = []
        if self.is_use_thread:
            self.prepr_data = []
            self.max_cache_size = 10
            self.lodaer_processing = threading.Thread(target=self.loader)
            self.lodaer_processing.start()
Code example #3
    def __init__(self, preprocessor, dataset_type):
        if dataset_type == 'train':
            self.anno_path = cfg.CONTFUSE.TRAIN_DATA
            self.batch_size = cfg.TRAIN.BATCH_SIZE
            self.is_data_aug = cfg.TRAIN.IS_DATA_AUG
        if dataset_type == 'val':
            self.anno_path = cfg.CONTFUSE.VAL_DATA
            self.batch_size = cfg.EVAL.BATCH_SIZE
            self.is_data_aug = False
        if dataset_type == 'test':
            self.anno_path = cfg.CONTFUSE.TEST_DATA
            self.batch_size = cfg.EVAL.BATCH_SIZE
            self.is_data_aug = False

        self.img_anchors = loader.load_anchors(cfg.IMAGE.ANCHORS)
        self.bev_anchors = loader.load_anchors(cfg.BEV.ANCHORS)
        self.annotations = loader.load_annotations(self.anno_path)
        self.num_samples = len(self.annotations)
        self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
        self.batch_count = 0
        self.is_use_thread = cfg.CONTFUSE.IS_USE_THREAD

        self.cuda_preprocessor = preprocessor.preprocessor

        self.loader_need_exit = 0
        self.timer = timer.Timer()

        if self.is_use_thread:
            self.prepr_data = []
            self.max_cache_size = 10
            self.lodaer_processing = threading.Thread(target=self.loader)
            self.lodaer_processing.start()
Code example #4
def run():
    with timer.Timer("reading dataset"):
        dataset_train = util.read_float_dataset(train_filename)
        header_train = util.read_header(train_filename)
        dataset_test = util.read_float_dataset(test_filename)
        header_test = util.read_header(test_filename)

    with timer.Timer("reading trees"):
        tree_states = []
        for filename in sorted(glob.glob(tree_basename.replace('%d',
                                                               '*[0-9]'))):
            tree = pygv.AGraph(filename)
            tree_state = TreeState(tree, filename)
            tree_states.append(tree_state)
            #tree.layout(prog='dot')
            #tree.draw(filename+".png")
        #num_trees = len(tree_states)

    with timer.Timer("extracting binary variables"):
        domain, pairs_dict = binarize(tree_states, header_test)
        for tree_state in tree_states:
            tree_state.domain = domain
            tree_state.pairs_dict = pairs_dict
        binarize_dataset(dataset_train, domain, pairs_dict, header_train,
                         binarized_train_filename)
        binarize_dataset(dataset_test, domain, pairs_dict, header_test,
                         binarized_test_filename)
        discretize_dataset(dataset_train, domain, pairs_dict, header_train,
                           discretized_train_filename)
        discretize_dataset(dataset_test, domain, pairs_dict, header_test,
                           discretized_test_filename)

    with timer.Timer("binarizing trees"):
        binarize_tree_states(tree_states, domain, pairs_dict,
                             binarized_tree_basename)

    with timer.Timer("writing constraints"):
        write_constraints(
            domain, pairs_dict,
            [constraint_filename_working, constraint_filename_output],
            constraint_sdd_filename, constraint_vtree_filename)

    print "\tdiscretization: "
    for k, v in pairs_dict.iteritems():
        print "\t", k, v
Code example #5
def diff_launcher(rts, logger):
    print 'Start creating diff dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'diff', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Diff', diff_launcher)
    differ.launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
Code example #6
def downloader_launcher(rts, logger):
    '''
    This launcher calls the dump downloader to download a Wikimedia dump file.
    '''
    print 'Start downloading'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'download', stopwatch, event='start')
    downloader.launcher(rts, logger)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'download', stopwatch, event='finish')
Code example #7
    def _run(self, cmd, soft_limit, *args, **kwargs):
        result = ExecutorResult(cmd)

        self._open_streams()
        kw = self.kwargs.copy()
        kw['stdin'] = self.stdin_fp
        kw['stdout'] = self.stdout_fp
        kw['stderr'] = self.stderr_fp
        kw.update(**kwargs)

        with timer.Timer() as t:
            # try to execute the command
            try:
                logger.debug('running cmd {}, {}, {}', cmd, args, kw)
                process = subprocess.Popen(cmd, *args, **kw)
            except FileNotFoundError as ex:
                duration = t.duration
                self.decrease_timepool(duration)
                result.message = 'File not found'
                self._close_streams()
                return result(status=ExecutorStatus.FILE_NOT_FOUND,
                              error=ex,
                              duration=duration)

            # try to wait for the command to finish
            try:
                rc = process.wait(self._time_left)
            except subprocess.TimeoutExpired as ex:
                duration = t.duration
                self.decrease_timepool(duration)
                process.kill()
                result.message = 'Terminated: global timeout was reached'
                self._close_streams()
                return result(status=ExecutorStatus.GLOBAL_TIMEOUT,
                              error=ex,
                              duration=duration)

        # decrease limit
        duration = t.duration
        self.decrease_timepool(duration)
        result.stdin = self.stdin_path
        result.stdout = self.stdout_path
        result.stderr = self.stderr_path

        # determine result
        if rc == 0:
            status = ExecutorStatus.OK
            if soft_limit and t.duration > soft_limit:
                status = ExecutorStatus.SOFT_TIMEOUT
        else:
            status = ExecutorStatus.ERROR_WHILE_RUNNING

        return result(status=status, returncode=rc, duration=duration)
Code example #8
def transformer_launcher(rts, logger):
    '''
    This function derives a number of variables from the editors_raw
    collection; this will significantly improve processing speed.
    '''
    print 'Start transforming dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
    #transformer.transform_editors_multi_launcher(rts)
    transformer.transform_editors_single_launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
Code example #9
def store_launcher(rts, logger):
    '''
    The data is ready to be stored once the sort step has completed. This
    function starts storing data in MongoDB.
    '''
    print 'Start storing data in %s' % rts.storage
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'store', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Store', store_launcher)
    store.launcher(rts)
    store.launcher_articles(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'store', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Store', store_launcher)
Code example #10
File: srilm.py Project: wanlinxie/dissertation
    def __init__(self, n,
            lm_path='/path/to/project/resources/LMs/sample.lm'):
        #'/proj/fluke/resources/LMs/en.giga.noUN.5gram.lm.bin'
        """Initialize language models.
        """
        # Record the maximum size of ngrams in the stored LM
        self.n = n

        with timer.Timer():
            print "Initializing language models",

            self.lm = srilm.initLM(n)
            srilm.readLM(self.lm, lm_path)
            print
Code example #11
File: local.py Project: janhybs/automate
    def _run(self, cmd, soft_limit=0, *args, **kwargs):
        cp = self.kwargs.copy()
        cp.update(
            dict(
                stdin=self.stdin_fp,
                stdout=self.stdout_fp,
                stderr=self.stderr_fp,
            ))
        cp.update(kwargs)
        result = ExecutorResult(cmd)

        with timer.Timer() as t:
            # try to execute the command
            try:
                print(cmd, args, cp)
                process = sp.Popen(cmd, *args, **cp)
            except FileNotFoundError as ex:
                duration = t.duration
                self._time_left -= duration
                self.message = 'File not found'
                return result(status=ExecutorStatus.FILE_NOT_FOUND,
                              error=ex,
                              duration=duration)

            # try to wait for the command to finish
            try:
                rc = process.wait(self._time_left)
            except sp.TimeoutExpired as ex:
                duration = t.duration
                self._time_left -= duration
                process.kill()
                self.message = 'Terminated: global timeout was reached'
                return result(status=ExecutorStatus.GLOBAL_TIMEOUT,
                              error=ex,
                              duration=duration)

        # decrease limit
        duration = t.duration
        self._time_left -= duration

        # determine result
        if rc == 0:
            status = ExecutorStatus.OK
            if soft_limit and t.duration > soft_limit:
                status = ExecutorStatus.SOFT_TIMEOUT
        else:
            status = ExecutorStatus.ERROR_WHILE_RUNNING

        return result(status=status, returncode=rc, duration=duration)
Code example #12
File: analyzer.py Project: IcanCheung/mediawiki-svn
def generate_chart_data(rts, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating 
    charts.
    '''

    stopwatch = timer.Timer()
    plugin = retrieve_plugin(func)

    if not plugin:
        available_plugins = inventory.available_analyses()
        raise exceptions.UnknownPluginError(plugin, available_plugins)
    plugin = getattr(plugin, func)

    feedback(func, rts)

    tasks = JoinableQueue()
    result = JoinableQueue()

    mgr = Manager()
    lock = mgr.RLock()
    obs = dict()
    obs_proxy = mgr.dict(obs)

    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
    editors = db.retrieve_distinct_keys('editor')
    #editors = editors[:500]
    if rts.collection.find('editors_dataset') > -1:
        min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
        kwargs['min_year'] = min_year
        kwargs['max_year'] = max_year

    fmt = kwargs.pop('format', 'long')
    time_unit = kwargs.pop('time_unit', 'year')

    var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)

    try:
        print 'Determining whether plugin requires preloaded data...'
        preloader = getattr(plugin, 'preload')
        print 'Preloading data...'
        data = preloader(rts)
    except Exception, error:
        data = None
Code example #13
def extract_launcher(rts, logger):
    '''
    The extract launcher is used to extract the required variables from a dump
    file. If the zip file is a known archive, it will first launch the
    unzip launcher.
    '''
    print 'Extracting data from XML'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'extract', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Extract', extract_launcher)

    #remove output from previous run.
    file_utils.delete_file(rts.txt, None, directory=True)
    file_utils.create_directory(rts.txt)

    extracter.launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'extract', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Extract', extract_launcher)
Code example #14
def test(args, dataset, scaffold, logger):
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_cfg)

    fetches = scaffold["fetches"]
    metrics = scaffold["metrics"]

    # Load checkpoint if possible
    saver = scaffold["saver"]
    if args.ckpt and Path(args.ckpt + ".index").exists():
        checkpoint_path = args.ckpt
    else:
        latest_filename = "best_checkpoint" if args.load_best_ckpt else None
        ckpt = tf.train.get_checkpoint_state(args.model_dir, latest_filename)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint_path = ckpt.model_checkpoint_path
        else:
            raise ValueError("Missing checkpoint for restoring.")
    saver.restore(sess, checkpoint_path)
    logger.info("Restoring parameters from %s", checkpoint_path)

    test_loss_acc = tools.Accumulator()
    ti = timer.Timer()
    logger.info("Start testing ...")

    # Test
    sess.run(
        [dataset["parent_iter"].initializer,
         tf.local_variables_initializer()])
    while True:
        try:
            ti.tic()
            total_loss_val, _ = sess.run(
                [fetches["total_loss"], metrics["acc_up"]])
            ti.toc()
            test_loss_acc.update(total_loss_val)
        except tf.errors.OutOfRangeError:
            break
    acc_val = sess.run(metrics["acc"])
    logger.info("Test loss: %.4f, Test acc: %.4f, %.2f step/s",
                test_loss_acc.avg, acc_val, ti.speed)
    sess.close()
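This test loop, together with the train loop in example #22, relies on a tic/toc-style interface: `tic()` and `toc()` around each step plus `calls`, `speed` and `reset()`. A minimal sketch of such an accumulator, with names matching the calls above but behaviour otherwise assumed, might be:

import time


class Timer(object):
    """Hypothetical tic/toc timer that accumulates per-step durations."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.calls = 0          # completed tic/toc pairs
        self.total_time = 0.0   # accumulated seconds over all pairs
        self._start = None

    def tic(self):
        self._start = time.time()

    def toc(self):
        self.total_time += time.time() - self._start
        self.calls += 1

    @property
    def speed(self):
        # Average completed steps per second; 0.0 before the first toc().
        return self.calls / self.total_time if self.total_time > 0 else 0.0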
Code example #15
def dataset_launcher(rts, logger):
    '''
    Dataset launcher is the entry point to generate datasets from the command
    line. 
    '''
    print 'Start generating dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'export', stopwatch, event='start')

    for plugin in rts.plugins:
        #cProfile.runctx('analyzer.generate_chart_data(rts, plugin, **rts.keywords)', globals(), locals(), filename="analyzer.cprof")
        analyzer.generate_chart_data(rts, plugin, **rts.keywords)
        log.to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher,
                       plugin=plugin,
                       dbname=rts.dbname,
                       collection=rts.editors_dataset)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'export', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Dataset', dataset_launcher)
Code example #16
File: __init__.py Project: wanlinxie/dissertation
def annotate(sents, *annotator_names):
    """Annotate one or more sentences with the given annotators.
    """
    if len(annotator_names) == 0:
        print "WARNING: no annotator specified"
        return
    if len(annotator_names) == 1 and \
            (isinstance(annotator_names[0], tuple) or
                isinstance(annotator_names[0], list)):
        # Annotators may be provided in a list or tuple
        annotator_names = annotator_names[0]

    if not isinstance(sents, tuple) and not isinstance(sents, list):
        # One or more sentences may be provided
        sents = [sents]

    for annotator_name in annotator_names:
        annotator = load_annotator(annotator_name)
        with timer.Timer():
            annotator.run_on_corpus(sents)
Code example #17
    def __init__(self):
        self.initial_weight = cfg.EVAL.WEIGHT
        self.time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                  time.localtime(time.time()))
        self.moving_ave_decay = cfg.YOLOv2.MOVING_AVE_DECAY
        self.eval_logdir = "./data/logs/eval"
        self.evalset = dataset.Dataset('test')
        self.output_dir = cfg.EVAL.OUTPUT_PRED_PATH
        self.img_anchors = loader.load_anchors(cfg.IMG.ANCHORS)

        with tf.name_scope('model'):
            self.model = yolov2_network.YOLOv2Network()
            self.net = self.model.load()
            self.img_pred = self.net['img_pred']

        config = ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = InteractiveSession(config=config)
        self.saver = tf.train.Saver()  #ema_obj.variables_to_restore())
        self.saver.restore(self.sess, self.initial_weight)
        self.timer = timer.Timer()
Code example #18
    def __init__(self, args):
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        self.args = args
        self.device = hp.assign_device(args.device)
        self.run_id = str(round(time.time() % 1e7))
        print(f"Run id: {self.run_id}")
        self.log_dir, self.checkpoint_dir, self.samples_dir = hp.init_logs(
            args, self.run_id, self.log_dir_formatter(args))

        print(
            f"Process id: {str(os.getpid())} | hostname: {socket.gethostname()}"
        )
        print(f"Run id: {self.run_id}")
        print(f"Time: {datetime.now()}")
        self.pp = pprint.PrettyPrinter(indent=4)
        self.pp.pprint(vars(args))
        print('==> Building model..')
        self.timer = timer.Timer()
        self.mode = args.mode
        self.build_model()
Code example #19
    def __init__(self, sess, config):
        if config.learning_rate_D < 0:
            config.learning_rate_D = config.learning_rate
        """
        Args:
            sess: TensorFlow session
            config: The configuration; see main.py for entries
        """

        self.format = 'NCHW'
        self.timer = timer.Timer()
        self.dataset = config.dataset
        if config.architecture == 'dc128':
            config.output_size = 128
        elif config.architecture in ['dc64', 'dcgan64']:
            config.output_size = 64
        output_size = config.output_size

        self.sess = sess
        if config.real_batch_size == -1:
            config.real_batch_size = config.batch_size
        self.config = config
        self.is_grayscale = (config.c_dim == 1)
        self.batch_size = config.batch_size
        self.real_batch_size = config.real_batch_size
        self.sample_size = 64 if self.config.is_train else config.batch_size
        #self.sample_size = batch_size

        self.output_size = output_size
        self.data_dir = config.data_dir
        self.z_dim = self.config.z_dim

        self.gf_dim = config.gf_dim
        self.df_dim = config.df_dim
        self.dof_dim = self.config.dof_dim

        self.c_dim = config.c_dim
        self.input_dim = self.output_size * self.output_size * self.c_dim

        discriminator_desc = '_dc'
        if self.config.learning_rate_D == self.config.learning_rate:
            lr = 'lr%.8f' % self.config.learning_rate
        else:
            lr = 'lr%.8fG%fD' % (self.config.learning_rate,
                                 self.config.learning_rate_D)
        arch = '%dx%d' % (self.config.gf_dim, self.config.df_dim)

        self.description = (
            "%s%s_%s%s_%sd%d-%d-%d_%s_%s_%s" %
            (self.dataset, arch, self.config.architecture, discriminator_desc,
             self.config.model + '-' + self.config.kernel, self.config.dsteps,
             self.config.start_dsteps, self.config.gsteps, self.batch_size,
             self.output_size, lr))
        if self.config.dof_dim > 1:
            self.description += '_dof{}'.format(self.config.dof_dim)
        if self.config.batch_norm:
            self.description += '_bn'

        self.max_to_keep = 5
        self._ensure_dirs()
        self.with_labels = config.with_labels
        if self.with_labels:
            self.num_classes = 1000

        stdout = sys.stdout
        if self.config.log:
            self.old_stdout = sys.stdout
            self.old_stderr = sys.stderr
            self.log_file = open(os.path.join(self.sample_dir, 'log.txt'),
                                 'w',
                                 buffering=1)
            print('Execution start time: %s' % time.ctime())
            print('Log file: %s' % self.log_file)
            stdout = self.log_file
            sys.stdout = self.log_file
            sys.stderr = self.log_file
        if config.compute_scores:
            self.scorer = scorer.Scorer(self.sess,
                                        self.dataset,
                                        config.MMD_lr_scheduler,
                                        stdout=stdout)
        print('Execution start time: %s' % time.ctime())
        pprint.PrettyPrinter().pprint(vars(self.config))
        #if self.config.multi_gpu:
        #    self.build_model_multi_gpu()
        #else:
        self.build_model()
        self.initialized_for_sampling = config.is_train
Code example #20
    def load_treebank(self, treebank_path):
        """Load dependencies from a file containing Stanford-style dependency
        parses.
        """
        with timer.Timer():
            num_sents = 0
            with open(treebank_path) as f:
                started_sent = False
                for line in f:
                    if line == '\n':
                        # Ignore line but note if a sentence was just
                        # completed
                        if started_sent:
                            num_sents += 1
                            sys.stdout.write("Loading treebank sentences: " +
                                             str(num_sents) + "\r")
                        started_sent = False
                        continue

                    started_sent = True
                    match = re.match(self.dep_re, line)
                    if match is None:
                        print "ERROR: Unexpected Stanford dependency format"
                        print line
                        continue
                    label, token0, t0, token1, t1 = match.groups()

                    # In the Stanford typed dependency format, token0 is
                    # the governor/head and token1 is the dependent
                    direction = None
                    if t0 > t1:
                        # Head follows dependent: left attachment
                        direction = -1
                    elif t0 < t1:
                        # Head precedes dependent: right attachment
                        direction = 1
                    else:
                        print "ERROR: Unexpected token indices"
                        print line
                        continue

                    # Note counts of words
                    token0 = token0.lower() if token0 != 'ROOT' else token0
                    token1 = token1.lower()
                    self.add_to_counter(self.word_counts, label, token0,
                                        token1, direction)

                    # Note counts of stems
                    stem0 = porter2.stem(token0)
                    stem1 = porter2.stem(token1)
                    self.add_to_counter(self.stem_counts, label, stem0, stem1,
                                        direction)

                    # Note total number of unique labels, words and stems
                    self.all_labels.add(label)
                    self.all_words.update((token0, token1))
                    self.all_stems.update((stem0, stem1))

        print
        self.num_labels = len(self.all_labels)
        self.num_words = len(self.all_words)
        self.num_stems = len(self.all_stems)
Code example #21
    def chunk_paragraphs(self, tokenizer, model_name, preprocess_step,
                         data_set_range):
        c_unknown = 0
        c_known = 0
        dis = 0

        timer1 = timer.Timer()
        for i, ex in tqdm(enumerate(self.examples[::])):
            total = min([len(self.examples[start_idx::]), number_of_part1])

            # if i >total:
            #     break
            if (i + 1) % 5000 == 0:
                self.save_tokenizer(tokenizer, data_set_range)
                self.save_coqa_dataset(data_set_range)  #chunked_examples
                print(timer1.remains(total, i))

            question_length = len(ex['annotated_question']['word'])
            if question_length > 350:
                continue
            doc_length_available = 512 - question_length - 3
            if model_name == 'RoBERTa':
                doc_length_available = doc_length_available - 3

            paragraph = self.paragraphs[
                ex['paragraph_id']]['annotated_context']['word']
            paragraph = preprocess(paragraph)
            if model_name != 'RoBERTa' and model_name != 'SpanBERT':
                paragraph = [p.lower() for p in paragraph]
            paragraph_length = len(paragraph)
            start_offset = 0
            doc_spans = []
            while start_offset < paragraph_length:
                length = paragraph_length - start_offset
                if length > doc_length_available:
                    length = doc_length_available - 1
                    doc_spans.append([start_offset, length, 1])
                else:
                    doc_spans.append([start_offset, length, 0])
                if start_offset + length == paragraph_length:
                    break
                start_offset += length
            for spans in doc_spans:
                segment_ids = []
                tokens = []
                if model_name == 'RoBERTa':
                    tokens.append('<s>')
                for q in ex['annotated_question']['word']:
                    segment_ids.append(0)
                    if model_name == 'RoBERTa' or model_name == 'SpanBERT':
                        tokens.append(q)
                        tokenizer.add_tokens([q])
                    else:
                        tokens.append(q.lower())
                        tokenizer.add_tokens([q.lower()])
                        # save_object([q.lower()], filename)

                if model_name == 'RoBERTa':
                    tokens.extend(['</s>', '</s>'])
                else:
                    tokens.append('[SEP]')
                    segment_ids.append(0)

                tokenizer.add_tokens(paragraph[spans[0]:spans[0] + spans[1]])
                # save_object(paragraph[spans[0]:spans[0] + spans[1]], filename)
                tokens.extend(paragraph[spans[0]:spans[0] + spans[1]])
                segment_ids.extend([1] * spans[1])
                yes_index = len(tokens)
                tokens.append('yes')
                segment_ids.append(1)
                no_index = len(tokens)
                tokens.append('no')
                segment_ids.append(1)

                if spans[2] == 1:
                    tokens.append('<unknown>')
                    tokenizer.add_tokens(['<unknown>'])

                    # save_object(['<unknown>'], filename)

                    segment_ids.append(1)
                if model_name == 'RoBERTa':
                    tokens.append('</s>')
                input_mask = [1] * len(tokens)
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                converted_to_string = tokenizer.convert_ids_to_tokens(
                    input_ids)
                input_ids.extend([0] * (512 - len(tokens)))
                input_mask.extend([0] * (512 - len(tokens)))
                segment_ids.extend([0] * (512 - len(tokens)))

                start = ex['answer_span'][0]
                end = ex['answer_span'][1]

                if start >= spans[0] and end <= spans[1]:
                    c_known += 1
                    start = question_length + 1 + start
                    end = question_length + 1 + end

                else:
                    c_unknown += 1
                    start = len(tokens) - 1
                    end = len(tokens) - 1
                if ex['answer'] == 'yes' and tokens[start] != 'yes':
                    start = yes_index
                    end = yes_index
                if ex['answer'] == 'no' and tokens[start] != 'no':
                    start = no_index
                    end = no_index

                _example = {
                    'tokens': tokens,
                    'answer': tokens[start:end + 1],
                    'actual_answer': ex['answer'],
                    'input_tokens': input_ids,
                    'input_mask': input_mask,
                    'segment_ids': segment_ids,
                    'start': start,
                    'end': end,
                    'turn_id': ex['turn_id'],
                    'paragraph_id': self.paragraphs[ex['paragraph_id']]['id']
                }
                self.chunked_examples.append(_example)
                #save_object(_example, sname)

        print("Chunk paragrapsh end.      tokenizer number: {} ".format(
            len(tokenizer)))
        # if preprocess_step==PREPROCESS_STEP.SPLIT_DATA_AND_SAVE:
        self.save_tokenizer(tokenizer, data_set_range)
        self.save_coqa_dataset(data_set_range)  #chunked_examples
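This example additionally calls `timer1.remains(total, i)` to report an estimate of the time left. Assuming this simply extrapolates from the average time per processed item, a compatible sketch could be:

import time


class Timer(object):
    """Hypothetical progress timer that estimates remaining wall-clock time."""

    def __init__(self):
        self.start = time.time()

    def remains(self, total, done):
        # Extrapolate remaining time from the average time per finished item.
        if done <= 0:
            return 'remaining time: unknown'
        elapsed = time.time() - self.start
        remaining = elapsed / done * (total - done)
        return 'remaining time: about %d seconds' % int(remaining)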
Code example #22
def train(args, dataset, scaffold, logger):
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_cfg)

    fetches = scaffold["fetches"]
    optimizer = scaffold["optimizer"]
    saver = scaffold["saver"]
    writer = scaffold["writer"]
    metrics = scaffold["metrics"]
    summaries = scaffold["summaries"]
    require_val = not args.no_val

    logger.info(optimizer)

    # After create session
    sess.run(tf.global_variables_initializer())
    logger.info("Global variable initialized")
    local_var_init = tf.local_variables_initializer()
    sess.run(local_var_init)
    logger.info("Local variable initialized")
    # Load checkpoint if possible
    ckpt = tf.train.get_checkpoint_state(args.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        logger.info("Restoring parameters from %s", ckpt.model_checkpoint_path)

    tr_feed_dict = {K.backend.learning_phase(): 1}
    val_feed_dict = {K.backend.learning_phase(): 0}

    if require_val:
        # Get train/val string handler
        train_handler, val_handler = sess.run([
            dataset["train"]["iter"].string_handle(),
            dataset["val"]["iter"].string_handle()
        ])
        tr_feed_dict[dataset["handler"]] = train_handler
        val_feed_dict[dataset["handler"]] = val_handler

    finished_epoch = sess.run(
        optimizer.global_step) // dataset["train"]["steps"]
    total_epochs = args.epochs or args.total_epochs - finished_epoch

    log_loss_acc = tools.Accumulator()
    total_loss_acc = tools.Accumulator()
    regu_loss_acc = tools.Accumulator()
    val_loss_acc = tools.Accumulator()
    best_acc = 0.
    best_epoch = 0.
    ti = timer.Timer()
    logger.info("Start training ...")

    for i in range(total_epochs):
        # Train
        sess.run(dataset["train"]["iter"].initializer)
        logger.info("Epoch %d/%d - Learning rate: %.4g", i + 1, total_epochs,
                    sess.run(optimizer.lr))
        while True:
            try:
                ti.tic()
                if ti.calls == 0:
                    summary_val, fetches_val = sess.run([summaries, fetches],
                                                        tr_feed_dict)
                    writer.add_summary(
                        summary_val,
                        global_step=i)  # Here we refer global step to #epoch
                else:
                    fetches_val = sess.run(fetches, tr_feed_dict)
                ti.toc()
                total_loss_acc.update(fetches_val["total_loss"])
                log_loss_acc.update(fetches_val["total_loss"])
                regu_loss_acc.update(fetches_val["regu_loss"])
                if ti.calls % args.log_step == 0:
                    logger.info(
                        "Epoch %d/%d Step %d/%d - Train loss: %.4f - %.2f step/s",
                        i + 1,
                        total_epochs, ti.calls, dataset["train"]["steps"],
                        log_loss_acc.pop(), ti.speed)
            except tf.errors.OutOfRangeError:
                break

        # At epoch end
        val_summ = collections.OrderedDict()
        if require_val:
            sess.run(dataset["val"]["iter"].initializer)
            while True:
                try:
                    ti.tic()
                    total_loss_val, _ = sess.run(
                        [fetches["total_loss"], metrics["acc_up"]],
                        val_feed_dict)
                    ti.toc()
                    val_loss_acc.update(total_loss_val)
                except tf.errors.OutOfRangeError:
                    break
            acc_val = sess.run(metrics["acc"])
            if acc_val > best_acc:
                best_acc = acc_val
                best_epoch = i + 1
                if args.save_best_ckpt:
                    save_path = scaffold["best_saver"].save(
                        sess,
                        args.model_dir + "/best_" + args.tag,
                        i,
                        write_meta_graph=False,
                        latest_filename="best_checkpoint")
                    logger.info("Save (best) checkpoint to %s", save_path)
            val_summ["acc"] = acc_val
            val_summ["val_loss"] = val_loss_acc.pop()
            sess.run(local_var_init
                     )  # Reset accuracy local variables 'count' and 'total'
            logger.info(
                "Epoch %d/%d - Train loss: %.4f, Val loss: %.4f, Val acc: %.4f, %.2f step/s",
                i + 1, total_epochs, total_loss_acc.avg, val_summ["val_loss"],
                val_summ["acc"], ti.speed)
        else:
            logger.info("Epoch %d/%d - Train loss: %.4f, %.2f step/s", i + 1,
                        total_epochs, total_loss_acc.avg, ti.speed)
        summary_kits.summary_scalar(
            writer, i, ["train_loss", "regu_loss"] + list(val_summ.keys()),
            [total_loss_acc.pop(), regu_loss_acc.pop()] +
            list(val_summ.values()))
        save_path = saver.save(sess,
                               args.model_dir + "/" + args.tag,
                               i,
                               write_meta_graph=False)
        logger.info("Save checkpoint to %s", save_path)
        ti.reset()
    logger.info("Best val acc: %.4f in epoch %d.", best_acc, best_epoch)
    sess.close()
    writer.close()
Code example #23
    def __init__(self):
        self.learn_rate_init = cfg.TRAIN.LEARN_RATE_INIT
        self.learn_rate_end = cfg.TRAIN.LEARN_RATE_END
        self.first_stage_epochs = cfg.TRAIN.FRIST_STAGE_EPOCHS
        self.second_stage_epochs = cfg.TRAIN.SECOND_STAGE_EPOCHS
        self.warmup_periods = cfg.TRAIN.WARMUP_EPOCHS
        self.initial_weight = cfg.TRAIN.PRETRAIN_WEIGHT
        self.time = time.strftime('%Y-%m-%d-%H-%M-%S',
                                  time.localtime(time.time()))
        self.moving_ave_decay = cfg.YOLOv2.MOVING_AVE_DECAY
        self.train_logdir = "./data/log/train"
        self.trainset = dataset.Dataset('train')
        self.valset = dataset.Dataset('val')
        self.steps_per_period = len(self.trainset)
        config = ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = InteractiveSession(config=config)
        self.timer = timer.Timer()
        # self.sess                = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        with tf.name_scope('model'):
            self.model = yolov2_network.YOLOv2Network()
            self.net = self.model.load()
            self.net_var = tf.global_variables()
            self.loss = self.net["yolov2_loss"]

        with tf.name_scope('learn_rate'):
            self.global_step = tf.Variable(1.0,
                                           dtype=tf.float64,
                                           trainable=False,
                                           name='global_step')
            warmup_steps = tf.constant(self.warmup_periods *
                                       self.steps_per_period,
                                       dtype=tf.float64,
                                       name='warmup_steps')
            train_steps = tf.constant(
                (self.first_stage_epochs + self.second_stage_epochs) *
                self.steps_per_period,
                dtype=tf.float64,
                name='train_steps')
            self.learn_rate = tf.cond(
                pred=self.global_step < warmup_steps,
                true_fn=lambda: self.global_step / warmup_steps * self.
                learn_rate_init,
                false_fn=lambda: self.learn_rate_end + 0.5 *
                (self.learn_rate_init - self.learn_rate_end) * (1 + tf.cos(
                    (self.global_step - warmup_steps) /
                    (train_steps - warmup_steps) * np.pi)))
            global_step_update = tf.assign_add(self.global_step, 1.0)

        with tf.name_scope("define_weight_decay"):
            moving_ave = tf.train.ExponentialMovingAverage(
                self.moving_ave_decay).apply(tf.trainable_variables())

        with tf.name_scope("define_first_stage_train"):
            self.first_stage_trainable_var_list = []
            for var in tf.trainable_variables():
                var_name = var.op.name
                var_name_mess = str(var_name).split('/')
                if var_name_mess[0] in ["yolov2_headnet"]:
                    self.first_stage_trainable_var_list.append(var)
            first_stage_optimizer = tf.train.AdamOptimizer(
                self.learn_rate).minimize(
                    self.loss, var_list=self.first_stage_trainable_var_list)
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                with tf.control_dependencies(
                    [first_stage_optimizer, global_step_update]):
                    with tf.control_dependencies([moving_ave]):
                        self.train_op_with_frozen_variables = tf.no_op()

        with tf.name_scope("define_second_stage_train"):
            second_stage_trainable_var_list = tf.trainable_variables()
            second_stage_optimizer = tf.train.AdamOptimizer(
                self.learn_rate).minimize(
                    self.loss, var_list=second_stage_trainable_var_list)
            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                with tf.control_dependencies(
                    [second_stage_optimizer, global_step_update]):
                    with tf.control_dependencies([moving_ave]):
                        self.train_op_with_all_variables = tf.no_op()

        with tf.name_scope('loader_and_saver'):
            self.loader = tf.train.Saver(self.net_var)
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

        with tf.name_scope('summary'):
            tf.summary.scalar("learn_rate", self.learn_rate)
            tf.summary.scalar("yolov2_loss", self.net["yolov2_loss"])
            tf.summary.scalar("img_obj_loss", self.net["img_obj_loss"])
            tf.summary.scalar("img_cls_loss", self.net["img_cls_loss"])
            tf.summary.scalar("img_bbox_loss", self.net["img_bbox_loss"])
            logdir = "../logs/tensorboard"
            if os.path.exists(logdir):
                shutil.rmtree(logdir)
            os.mkdir(logdir)
            self.write_op = tf.summary.merge_all()
            self.summary_writer = tf.summary.FileWriter(logdir,
                                                        graph=self.sess.graph)
        img_pred_dir = cfg.YOLOv2.LOG_DIR + "/pred/img_pred/"
        if os.path.exists(img_pred_dir):
            shutil.rmtree(img_pred_dir)
        os.mkdir(img_pred_dir)
Code example #24
File: compile_rf.py Project: AndyShih12/RF_SDD
def run():
    with timer.Timer("reading dataset"):
        dataset = util.read_binary_dataset(test_filename)
        domain = util.read_header(test_filename)
        '''
        if OPTIONS.majority_circuit_opt:
            l = len(domain)
            for k in xrange(num_trees):
                domain["Tree_%d" % k] = l+k
        '''

    with timer.Timer("initializing manager"):
        # start sdd manager
        var_count = len(domain) - 1
        vtree = sdd.sdd_vtree_new(var_count, "balanced")
        manager = sdd.sdd_manager_new(vtree)
        #sdd.sdd_manager_auto_gc_and_minimize_on(manager)
        #sdd.sdd_manager_auto_gc_and_minimize_off(manager)
        sdd_state = SddState(vtree, manager)

    with timer.Timer("reading constraints"):
        constraint_sdd, constraint_info = encode_logical_constraints(
            constraint_filename, manager, domain)
        sdd.sdd_ref(constraint_sdd, manager)

    with timer.Timer("reading trees"):
        tree_states = []
        for filename in sorted(glob.glob(tree_basename.replace('%d', '*'))):
            tree = pygv.AGraph(filename)
            tree_state = TreeState(tree, domain, constraint_info)
            tree_states.append(tree_state)
            #tree.layout(prog='dot')
            #tree.draw(filename+".png")
        #num_trees = len(tree_states)

    with timer.Timer("compiling trees"):
        forest_sdds, _ = izip(*forest_sdds_iter(tree_states, sdd_state))
        #forest_sdds = list(forest_sdds_iter(tree_states,sdd_state))

        forest_sdds = [
            (tree_state, tree_sdd)
            for tree_state, tree_sdd in zip(tree_states, forest_sdds)
        ]
        cmpf = lambda x, y: cmp(sdd.sdd_size(x[1]), sdd.sdd_size(y[1]))
        forest_sdds.sort(cmp=cmpf)
        tree_states = [tree_state for tree_state, tree_sdd in forest_sdds]

        #ACACAC
        sdd.sdd_manager_auto_gc_and_minimize_off(manager)
        sdd.sdd_manager_minimize_limited(manager)
        stats = SddSizeStats()
        for tree_state, tree_sdd in forest_sdds:
            stats.update(tree_sdd)
            sdd.sdd_deref(tree_sdd, manager)
        sdd.sdd_manager_garbage_collect(manager)
        forest_sdds, used_vars_list = izip(
            *forest_sdds_iter(tree_states, sdd_state))
    print stats

    with timer.Timer("compiling all", prefix="| "):
        alpha = compile_all(forest_sdds, used_vars_list, num_trees, domain,
                            manager, constraint_sdd)

    with timer.Timer("evaluating"):
        msg = util.evaluate_dataset_all_sdd(dataset, alpha, manager)
    print "|     trees : %d" % num_trees
    print "--- evaluating majority vote on random forest (compiled):"
    print msg
    print "|  all size :", sdd.sdd_size(alpha)
    print "|  all count:", sdd.sdd_count(alpha)
    print " model count:", sdd.sdd_global_model_count(alpha, manager)

    with timer.Timer("checking monotonicity"):
        result = is_monotone(alpha, manager)
    print "Is monotone?", result

    #for tree_sdd in forest_sdds: sdd.sdd_deref(tree_sdd,manager)
    print "===================="
    print "before garbage collecting..."
    print "live size:", sdd.sdd_manager_live_count(manager)
    print "dead size:", sdd.sdd_manager_dead_count(manager)
    print "garbage collecting..."
    sdd.sdd_manager_garbage_collect(manager)
    print "live size:", sdd.sdd_manager_live_count(manager)
    print "dead size:", sdd.sdd_manager_dead_count(manager)

    vtree = sdd.sdd_manager_vtree(manager)
    print "Writing sdd file %s and vtree file %s" % (sdd_filename,
                                                     vtree_filename)
    sdd.sdd_save(sdd_filename, alpha)
    sdd.sdd_vtree_save(vtree_filename, vtree)

    print "Writing constraint sdd file %s and constraint vtree file %s" % (
        constraint_sdd_filename, constraint_vtree_filename)
    sdd.sdd_save(constraint_sdd_filename, constraint_sdd)
    sdd.sdd_vtree_save(constraint_vtree_filename, vtree)
Code example #25
    def __init__(self,
                 sess,
                 config,
                 batch_size=64,
                 output_size=64,
                 z_dim=100,
                 c_dim=3,
                 data_dir='./data'):
        if config.learning_rate_D < 0:
            config.learning_rate_D = config.learning_rate
        """
        Args:
            sess: TensorFlow session
            batch_size: The size of batch. Should be specified before training.
            output_size: (optional) The resolution in pixels of the images. [64]
            z_dim: (optional) Dimension of dim for Z. [100]
            gf_dim: (optional) Dimension of gen filters in first conv layer. [64]
            df_dim: (optional) Dimension of discrim filters in first conv layer. [64]
            gfc_dim: (optional) Dimension of gen units for for fully connected layer. [1024]
            dfc_dim: (optional) Dimension of discrim units for fully connected layer. [1024]
            c_dim: (optional) Dimension of image color. For grayscale input, set to 1. [3]
        """
        self.timer = timer.Timer()
        self.dataset = config.dataset
        if config.architecture == 'dc128':
            output_size = 128
        if config.architecture in ['dc64', 'dcgan64']:
            output_size = 64

        self.sess = sess
        if config.real_batch_size == -1:
            config.real_batch_size = config.batch_size
        self.config = config
        self.is_grayscale = (c_dim == 1)
        self.batch_size = batch_size
        self.real_batch_size = config.real_batch_size
        self.sample_size = 64 if self.config.is_train else batch_size
        self.output_size = output_size
        self.data_dir = data_dir
        self.z_dim = z_dim

        self.gf_dim = config.gf_dim
        self.df_dim = config.df_dim
        self.dof_dim = self.config.dof_dim

        self.c_dim = c_dim

        discriminator_desc = '_dc'
        if self.config.learning_rate_D == self.config.learning_rate:
            lr = 'lr%.8f' % self.config.learning_rate
        else:
            lr = 'lr%.8fG%fD' % (self.config.learning_rate,
                                 self.config.learning_rate_D)
        arch = '%dx%d' % (self.config.gf_dim, self.config.df_dim)

        self.description = (
            "%s%s_%s%s_%sd%d-%d-%d_%s_%s_%s" %
            (self.dataset, arch, self.config.architecture, discriminator_desc,
             self.config.kernel, self.config.dsteps, self.config.start_dsteps,
             self.config.gsteps, self.batch_size, self.output_size, lr))

        if self.config.batch_norm:
            self.description += '_bn'

        self._ensure_dirs()

        stdout = sys.stdout
        if self.config.log:
            self.old_stdout = sys.stdout
            self.old_stderr = sys.stderr
            self.log_file = open(os.path.join(self.sample_dir, 'log.txt'),
                                 'w',
                                 buffering=1)
            print('Execution start time: %s' % time.ctime())
            print('Log file: %s' % self.log_file)
            stdout = self.log_file
            sys.stdout = self.log_file
            sys.stderr = self.log_file
        if config.compute_scores:
            self.scorer = scorer.Scorer(self.dataset,
                                        config.MMD_lr_scheduler,
                                        stdout=stdout)
        print('Execution start time: %s' % time.ctime())
        pprint.PrettyPrinter().pprint(self.config.__dict__['__flags'])
        self.build_model()

        self.initialized_for_sampling = config.is_train
Code example #26
def run(basename,train_filename,test_filename,
        num_trees=100,tree_depth=0,class_index=0):

    with timer.Timer("loading data"):
        training = read_dataset(train_filename,class_index=class_index)
        testing = read_dataset(test_filename,class_index=class_index)

    """
    print "====== naive Bayes ====="
    with timer.Timer("training"):
        nb = NaiveBayes()
        nb.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(nb,training)
        eval_testing = evaluate_dataset(nb,testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()
    """

    print "====== random forest ====="
    with timer.Timer("training"):
        rf = RandomForest()
        #rf.setOptions([
        #  u'-P', u'100', u'-I', u'100', u'-num-slots', u'1', u'-K', u'0', u'-M', u'1.0', u'-V', u'0.001', u'-S', u'1',
        #  u'-num-decimal-places', u'6'
        #])
        rf.setNumIterations(num_trees)
        if tree_depth:
            rf.setMaxDepth(tree_depth)
        rf.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(rf,training)
        eval_testing = evaluate_dataset(rf,testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()

    #print rf.getmembers()

    num_classifiers = len(rf.m_Classifiers)
    for i,tree in enumerate(rf.m_Classifiers):
        options_arr = tree.getOptions()
        options_arr_python = [x for x in options_arr]
        options_arr_python += [u'-num-decimal-places',u'6']
        tree.setOptions(options_arr_python)
        #print tree.toString()
        #binarize(tree)
        filename = basename % i
        with open(filename,"w") as f:
            f.writelines(tree.graph())

    correct,incorrect = 0,0
    for instance in testing:
        pos,neg = 0,0
        for tree in rf.m_Classifiers:
            #print tree.classifyInstance(instance)
            if tree.classifyInstance(instance) >= 0.5:
                pos += 1
            else:
                neg += 1
        my_label = 1.0 if pos >= neg else 0.0
        if my_label == instance.classValue():
            correct += 1
        else:
            incorrect += 1
    print "    trees : %d" % num_trees
    print "--- evaluating majority vote on random forest:"
    print "  correct : %d" % correct
    print "incorrect : %d" % incorrect