Example #1
  def parse_generator(self):
    """ This is a (hacky) way to maintain everything loaded. Every time you call __next__() on this generator, it will parse data
        found in self.current_input which should be an open file or StringIO"""

    with tf.Graph().as_default():
      config_proto = tf.compat.v1.ConfigProto()
      # if self.per_process_gpu_memory_fraction == -1:
      config_proto.gpu_options.allow_growth = True
      # else:
      #   config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
      with tf.compat.v1.Session(config=config_proto) as sess:
        # load the model and prep the parse set

        print("SELF.TRAIN_FILES",self.train_files,file=sys.stderr)
        self.add_file_vocabs(self.train_files)
        self.setup_vocabs()
        trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
        with tf.compat.v1.variable_scope(self.name.title()):
          train_tensors = trainset()
        train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]

        saver = tf.compat.v1.train.Saver(self.save_vars, max_to_keep=1)
        for var in self.non_save_vars:
          sess.run(var.initializer)
        saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))

        # create parseset outside of the while loop
        parseset = Parseset.from_configurable(self, self.vocabs, parse_files=self.current_input, nlp_model=self.nlp_model)
        with tf.compat.v1.variable_scope(self.name.title(), reuse=True):
          parse_tensors = parseset(moving_params=self.optimizer)
        parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

        while True:

          self.prune_vocabs()
          self.add_file_vocabs([self.current_input]) # add new vocabulary items from the current data
          parseset.reinit(self.vocabs, self.current_input) # this creates new buckets for current data


          probs = []
          sents = []
          for feed_dict, tokens in parseset.iterbatches(shuffle=False):
            probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
            sents.append(tokens)
          outp = io.StringIO()
          parseset.write_probs(sents, outp, probs, parseset._metadata)
          yield outp.getvalue()
          
      del trainset
      if self.verbose:
        try:
          print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time() - start_time), 'bright_green'), file=sys.stderr)
        except:
          print(ctext('Parsing took {} seconds'.format(time.time() - start_time), 'bright_green'), file=sys.stderr)
    return
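As the docstring notes, the generator keeps the graph, session and vocabularies loaded between calls. A minimal usage sketch, assuming a driver object `network` that exposes parse_generator() and a writable current_input attribute (the names `network`, `documents` and `conllu_text` are hypothetical, not taken from the snippet):

    import io

    # Hypothetical driver: `network` is assumed to expose parse_generator()
    # and a writable `current_input` attribute, as described in the docstring.
    gen = network.parse_generator()
    for conllu_text in documents:                  # documents: iterable of CoNLL-U strings (assumption)
        network.current_input = io.StringIO(conllu_text)
        parsed = next(gen)                         # parses current_input, yields the parsed text
        print(parsed)
    gen.close()                                    # tears down the TF session/graph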
Example #2
 def print_accuracy(self, accumulators, time, prefix='Train'):
   """ """
   
   acc_dict = self.process_accumulators(accumulators, time=time)
   strings = []
   strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
   strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan'))
   strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
   strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'))
   string = ctext('{0}  ', 'bold') + ' | '.join(strings)
   print(string.format(prefix, **acc_dict), file=sys.stderr)
   return
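The format string assumes that process_accumulators returns a dict containing at least the keys Loss, TS, SS and Seq_rate. A self-contained sketch of the same report line, with made-up values and the color helpers replaced by plain text:

    import sys

    # Same layout as above, minus the ANSI color helpers; values are made up.
    acc_dict = {'Loss': 0.123, 'TS': 91.25, 'SS': 78.50, 'Seq_rate': 310.4}
    strings = ['Loss: {Loss:7.3f}', 'TS: {TS:5.2f}%', 'SS: {SS:5.2f}%',
               'Speed: {Seq_rate:6.1f} seqs/sec']
    print(('{0}  ' + ' | '.join(strings)).format('Train', **acc_dict), file=sys.stderr)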
Example #3
    def from_dataset(cls, dataset, *args, **kwargs):
        """ """

        multibucket = cls.from_configurable(dataset, *args, **kwargs)
        indices = []
        for multibucket_ in dataset:
            indices.append(multibucket_.indices)
        #for i in xrange(1, len(indices)):
        #  assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
        multibucket._indices = np.array(multibucket_.indices)
        buckets = [
            Bucket.from_dataset(dataset, i, *args, **kwargs)
            for i in range(len(multibucket_))
        ]
        multibucket._buckets = buckets
        if dataset.verbose:
            for bucket in multibucket:
                print('Bucket {name} is {shape}'.format(
                    name=bucket.name,
                    shape=ctext(
                        ' x '.join(str(x) for x in bucket.indices.shape),
                        'bright_blue')),
                      file=sys.stderr)
        return multibucket
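A hedged usage sketch; the Multibucket class name, the `dataset` object and the iteration behavior are assumptions inferred from the snippet rather than confirmed by it:

    # Hypothetical call: build a multibucket that copies the bucket layout
    # of an existing dataset (extra *args/**kwargs are forwarded on to
    # from_configurable and Bucket.from_dataset).
    multibucket = Multibucket.from_dataset(dataset)
    for bucket in multibucket:                     # iterating yields the underlying buckets
        print(bucket.name, bucket.indices.shape)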
Example #4
    def parse(self, input_files, output_dir=None, output_file=None):
        """ """

        if not isinstance(input_files, types.GeneratorType):
            if not isinstance(input_files, (tuple, list)):
                input_files = [input_files]
            if len(input_files) > 1 and output_file is not None:
                raise ValueError(
                    'Cannot provide a value for --output_file when parsing multiple files'
                )

        with tf.Graph().as_default():
            config_proto = tf.ConfigProto()
            if self.per_process_gpu_memory_fraction == -1:
                config_proto.gpu_options.allow_growth = True
            else:
                config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
            with tf.Session(config=config_proto) as sess:
                # load the model and prep the parse set

                print("SELF.TRAIN_FILES", self.train_files, file=sys.stderr)
                self.add_file_vocabs(self.train_files)
                self.setup_vocabs()
                trainset = Trainset.from_configurable(self,
                                                      self.vocabs,
                                                      nlp_model=self.nlp_model)
                with tf.variable_scope(self.name.title()):
                    train_tensors = trainset()
                train_outputs = [
                    train_tensors[train_key]
                    for train_key in trainset.train_keys
                ]

                saver = tf.train.Saver(self.save_vars, max_to_keep=1)
                for var in self.non_save_vars:
                    sess.run(var.initializer)
                saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))

                start_time = time.time()
                for input_file in input_files:

                    #print("Parseset vocab")
                    self.add_file_vocabs([input_file])

                    #print("Beg Parseset.from_configurable")
                    parseset = Parseset.from_configurable(
                        self,
                        self.vocabs,
                        parse_files=input_file,
                        nlp_model=self.nlp_model)
                    #print("Done Parseset.from_configurable")
                    with tf.variable_scope(self.name.title(), reuse=True):
                        parse_tensors = parseset(moving_params=self.optimizer)
                    parse_outputs = [
                        parse_tensors[parse_key]
                        for parse_key in parseset.parse_keys
                    ]

                    if not isinstance(input_file, io.StringIO):
                        input_dir, input_file = os.path.split(input_file)
                        if output_dir is None and output_file is None:
                            output_dir = self.save_dir
                        if output_dir == input_dir and output_file is None:
                            output_path = os.path.join(input_dir,
                                                       'parsed-' + input_file)
                        elif output_file is None:
                            output_path = os.path.join(output_dir, input_file)
                        else:
                            output_path = output_file
                    else:
                        assert output_file is not None
                        output_path = output_file  # expected to be an already-open file object

                    probs = []
                    sents = []
                    for feed_dict, tokens in parseset.iterbatches(
                            shuffle=False):
                        probs.append(
                            sess.run(parse_outputs, feed_dict=feed_dict))
                        sents.append(tokens)
                    parseset.write_probs(sents, output_path, probs,
                                         parseset._metadata)
                    del parseset
            del trainset
            if self.verbose:
                try:
                    print(ctext(
                        'Parsing {0} file(s) took {1} seconds'.format(
                            len(input_files),
                            time.time() - start_time), 'bright_green'),
                          file=sys.stderr)
                except:
                    print(ctext(
                        'Parsing took {} seconds'.format(time.time() -
                                                         start_time),
                        'bright_green'),
                          file=sys.stderr)
        return
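A minimal usage sketch of this method under the two input types it handles; `network`, `conllu_text` and the file names are placeholders, and only behaviour visible in the snippet is assumed:

    import io

    # Path input: with no output_dir/output_file, results go under save_dir
    # (or get a 'parsed-' prefix if save_dir is also the input directory).
    network.parse(['data/test.conllu'])

    # StringIO input: the snippet asserts that output_file is given, and
    # expects it to be an already-open file object.
    with open('parsed.conllu', 'w') as out:
        network.parse(io.StringIO(conllu_text), output_file=out)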
Example #5
    def train(self, load=False):
        """ """

        # prep the configurables

        self.add_file_vocabs(self.parse_files)
        self.setup_vocabs()
        trainset = Trainset.from_configurable(self,
                                              self.vocabs,
                                              nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title()):
            train_tensors = trainset()
        print("train_tensors: ", train_tensors)
        train = self.optimizer(tf.losses.get_total_loss())
        train_outputs = [
            train_tensors[train_key] for train_key in trainset.train_keys
        ]
        saver = tf.train.Saver(self.save_vars, max_to_keep=1)
        validset = Parseset.from_configurable(self,
                                              self.vocabs,
                                              nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title(), reuse=True):
            valid_tensors = validset(moving_params=self.optimizer)
        valid_outputs = [
            valid_tensors[train_key] for train_key in validset.train_keys
        ]
        valid_outputs2 = [
            valid_tensors[valid_key] for valid_key in validset.valid_keys
        ]
        current_acc = 0
        best_acc = 0
        n_iters_since_improvement = 0
        n_iters_in_epoch = 0

        # calling these properties is inefficient so we save them in separate variables
        min_train_iters = self.min_train_iters
        max_train_iters = self.max_train_iters
        validate_every = self.validate_every
        save_every = self.save_every
        verbose = self.verbose
        quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement

        # load or prep the history
        if load:
            with open(os.path.join(self.save_dir, 'history.pkl'), 'rb') as f:
                self.history = pkl.load(f)
        else:
            self.history = {
                'train': defaultdict(list),
                'valid': defaultdict(list)
            }

        # start up the session
        config_proto = tf.ConfigProto()
        #if self.per_process_gpu_memory_fraction == -1:
        config_proto.gpu_options.allow_growth = True
        #else:
        #  config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
        with tf.Session(config=config_proto) as sess:
            sess.run(tf.global_variables_initializer())
            if load:
                saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            total_train_iters = sess.run(self.global_step)
            train_accumulators = np.zeros(len(train_outputs))
            train_time = 0
            # training loop
            while total_train_iters < max_train_iters:
                print(total_train_iters)
                for feed_dict in trainset.iterbatches():
                    # print("feed_dict: ",feed_dict)
                    start_time = time.time()
                    batch_values = sess.run(train_outputs + [train],
                                            feed_dict=feed_dict)[:-1]
                    batch_time = time.time() - start_time
                    # update accumulators
                    total_train_iters += 1
                    n_iters_since_improvement += 1
                    train_accumulators += batch_values
                    train_time += batch_time
                    # possibly validate
                    if total_train_iters == 1 or (total_train_iters %
                                                  validate_every == 0):
                        valid_accumulators = np.zeros(len(train_outputs))
                        valid_time = 0
                        with codecs.open(os.path.join(self.save_dir,
                                                      'sanity_check'),
                                         'w',
                                         encoding='utf-8',
                                         errors='ignore') as f:
                            for feed_dict, sents in validset.iterbatches(
                                    return_check=True):
                                #print("sent: ", sents[0])
                                start_time = time.time()
                                batch_values = sess.run(valid_outputs +
                                                        valid_outputs2,
                                                        feed_dict=feed_dict)
                                batch_time = time.time() - start_time
                                # update accumulators
                                valid_accumulators += batch_values[:len(
                                    valid_outputs)]
                                valid_preds = batch_values[len(valid_outputs):]
                                valid_time += batch_time
                                validset.check(valid_preds, sents, f)
                        # update history
                        trainset.update_history(self.history['train'],
                                                train_accumulators)
                        current_acc = validset.update_history(
                            self.history['valid'], valid_accumulators)
                        # print
                        if verbose:
                            print(
                                ctext('{0:6d}'.format(int(total_train_iters)),
                                      'bold') + ')')
                            sys.stdout.flush()
                            trainset.print_accuracy(train_accumulators,
                                                    train_time)
                            validset.print_accuracy(valid_accumulators,
                                                    valid_time)
                        train_accumulators = np.zeros(len(train_outputs))
                        train_time = 0
                        if current_acc > best_acc:
                            if verbose:
                                print(ctext('Saving model...',
                                            'bright_yellow'),
                                      file=sys.stderr)
                                sys.stderr.flush()
                            best_acc = current_acc
                            n_iters_since_improvement = 0
                            saver.save(
                                sess,
                                os.path.join(self.save_dir, self.name.lower()),
                                #global_step=self.global_epoch,
                                write_meta_graph=False)
                            with open(
                                    os.path.join(self.save_dir, 'history.pkl'),
                                    'wb') as f:
                                pkl.dump(dict(self.history), f)
                        elif n_iters_since_improvement >= quit_after_n_iters_without_improvement and total_train_iters > min_train_iters:
                            break
                else:
                    # We've completed one epoch
                    if total_train_iters <= min_train_iters:
                        saver.save(
                            sess,
                            os.path.join(self.save_dir, self.name.lower()),
                            #global_step=self.global_epoch,
                            write_meta_graph=False)
                        with open(os.path.join(self.save_dir, 'history.pkl'),
                                  'wb') as f:
                            pkl.dump(dict(self.history), f)
                    sess.run(self.global_epoch.assign_add(1.))
                    continue
                break
            # Now parse the training and testing files
            input_files = self.train_files + self.parse_files
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            for input_file in input_files:
                parseset = Parseset.from_configurable(self,
                                                      self.vocabs,
                                                      parse_files=input_file,
                                                      nlp_model=self.nlp_model)
                with tf.variable_scope(self.name.title(), reuse=True):
                    parse_tensors = parseset(moving_params=self.optimizer)
                parse_outputs = [
                    parse_tensors[parse_key]
                    for parse_key in parseset.parse_keys
                ]

                input_dir, input_file = os.path.split(input_file)
                output_dir = self.save_dir
                output_file = input_file

                start_time = time.time()
                probs = []
                sents = []
                for feed_dict, tokens in parseset.iterbatches(shuffle=False):
                    probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
                    sents.append(tokens)
                parseset.write_probs(sents,
                                     os.path.join(output_dir, output_file),
                                     probs, parseset._metadata)
        if self.verbose:
            print(ctext(
                'Parsing {0} file(s) took {1} seconds'.format(
                    len(input_files),
                    time.time() - start_time), 'bright_green'),
                  file=sys.stderr)
        return
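Finally, a hedged sketch of driving this training loop; only the `load` flag comes from the snippet, and the `network` object and its configuration are assumed:

    # Fresh run: variables are initialized, history starts empty, and the
    # best checkpoint plus history.pkl are written to save_dir.
    network.train(load=False)

    # Resuming: restores the latest checkpoint in save_dir and reloads
    # history.pkl before continuing the loop.
    network.train(load=True)

    # After training, the method restores the best checkpoint and re-parses
    # every file in train_files + parse_files into save_dir.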