Example #1
    def train_step(self, cases, weights, caching):
        if len(cases) != len(weights):
            raise ValueError('cases and weights must have the same length.')

        if len(cases) == 0:
            #logging.warn('Training on zero cases.')
            print >> sys.stderr, " WARNING: Zero cases   \033[F"
            # still increment the step
            sess = tf.get_default_session()
            sess.run(self._increment_step)
        elif not self._max_batch_size or len(cases) <= self._max_batch_size:
            print >> sys.stderr, " Updating ({} cases)   \033[F".format(len(cases))
            self.compute(self._take_step, cases, weights, caching)
        else:
            print >> sys.stderr, " Updating ({} cases)   \033[F".format(len(cases))
            assert not caching
            grads = None
            slices = range(0, len(cases), self._max_batch_size)
            for i in verboserate(slices, desc='Computing gradients ({} cases)'.format(len(cases))):
                cases_slice = cases[i:i + self._max_batch_size]
                weights_slice = weights[i:i + self._max_batch_size]
                grads_slice = self.compute(self._grad_tensors,
                                           cases_slice, weights_slice, False)
                if grads is None:
                    grads = grads_slice
                else:
                    for i in xrange(len(self._grad_tensors)):
                        grads[i] += grads_slice[i]
            sess = tf.get_default_session()
            feed_dict = dict(zip(self._combined_grad_placeholders, grads))
            sess.run(self._apply_gradients, feed_dict)
            sess.run(self._increment_step)
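Every snippet on this page loops through verboserate from gtd.chrono. As a point of reference, here is a minimal stand-in sketch, assuming verboserate simply wraps an iterable with a tqdm-style progress display (an assumption; the real helper may differ in details such as update frequency or logging destination):

# Hypothetical minimal stand-in for gtd.chrono.verboserate (not the actual
# implementation): wrap an iterable with a tqdm progress bar.
from tqdm import tqdm

def verboserate(iterable, desc=None, total=None, initial=0):
    """Yield items from the iterable while displaying progress."""
    return tqdm(iterable, desc=desc, total=total, initial=initial)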
Example #2
    def __init__(self, vocab_size=400000):
        """Load GloveEmbeddings.

        Args:
            vocab_size (int): max # of words in the vocab. If not specified, uses all available GloVe vectors.

        Returns:
            (np.array, SemgenVocab)
        """
        embed_dim = 100
        if vocab_size < 5000:
            raise ValueError('Need to at least use 5000 words.')

        glove_path = join(DataDirectory.glove, 'glove.6B.100d.txt')
        download_path = 'http://nlp.stanford.edu/data/glove.6B.zip'
        if not os.path.exists(glove_path):
            raise RuntimeError('Missing file: {}. Download it here: {}'.format(glove_path, download_path))

        # embeddings for special words
        words = list(UtteranceVocab.SPECIAL_TOKENS)
        num_special = len(words)
        embeds = [np.zeros(embed_dim, dtype=np.float32) for _ in words]  # zeros are just placeholders for now

        with open(glove_path, 'r') as f:
            lines = verboserate(f, desc='Loading GloVe embeddings', total=vocab_size, initial=num_special)
            for i, line in enumerate(lines, start=num_special):
                if i == vocab_size: break
                tokens = line.split()
                word, embed = tokens[0], np.array([float(tok) for tok in tokens[1:]])
                words.append(word)
                embeds.append(embed)

        vocab = UtteranceVocab(words)
        embed_matrix = np.stack(embeds)

        special_embeds = emulate_distribution((num_special, embed_dim), embed_matrix[:5000, :], seed=2)
        embed_matrix[:num_special, :] = special_embeds
        assert embed_matrix.shape[1] == 100

        super(GloveEmbeddings, self).__init__(embed_matrix, vocab)
Example #3
def _get_all_hits(get_page):
    """Given a function that retrieves a single page of HITs, retrieve all HITs.

    WARNING:
        - this function can be quite slow.
        - results are returned in no particular order.

    Args:
        get_page (Callable[[int, int], list[HIT]]): a function which takes a page size and page number,
            and returns a list of HITs.

            kwargs:
                page_size (int)
                page_number (int)

    Returns:
        generator[HIT]
    """
    page_size = 100  # HITs per page

    # compute the pages that need to be fetched
    search_results = get_page(page_size=page_size, page_number=1)
    total_hits = int(search_results.TotalNumResults)
    total_pages = total_hits / page_size + bool(total_hits % page_size)
    page_nums = list(range(1, total_pages + 1))

    # fetch all the pages in parallel
    fetch_page = lambda i: get_page(page_size=page_size, page_number=i)
    with SimpleExecutor(fetch_page) as executor:
        for i in page_nums:
            executor.submit(i, i)
        for i, page in verboserate(executor.results(),
                                   desc='Fetching pages of HITs',
                                   total=total_pages):
            if isinstance(page, Failure):
                print page.traceback
                continue
            for hit in page:
                yield hit
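_get_all_hits only needs a get_page callable that accepts page_size and page_number keyword arguments. A hypothetical adapter around a boto2 MTurkConnection could look like the sketch below (the connection object and the exact search_hits keyword arguments are assumptions, not taken from this code base):

# Hypothetical adapter turning an MTurk connection into the get_page callable
# expected by _get_all_hits; the search_hits keywords are assumed here.
def make_get_page(connection):
    def get_page(page_size, page_number):
        return connection.search_hits(page_size=page_size,
                                      page_number=page_number)
    return get_page

# hits = list(_get_all_hits(make_get_page(connection)))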
Example #4
    def train_edit(self, use_lsh, topk):
        # TODO(kelvin): do something to preserve random state upon reload?
        train_state = self.train_state
        examples = self._examples
        config = self.config
        workspace = self.workspace

        vae_editor = train_state.model.vae_model
        ret_model = train_state.model.ret_model
        edit_model = train_state.model.edit_model

        # Set up static editor training
        step = train_state.train_steps
        while step < 3 * config.optim.max_iters:
            train_eval = ret_model.ret_and_make_ex(examples.train, use_lsh,
                                                   examples.train, 1)
            valid_eval = ret_model.ret_and_make_ex(examples.valid, use_lsh,
                                                   examples.train, 0)
            ret_batches = similar_size_batches(train_eval,
                                               config.optim.batch_size)
            # random.shuffle(train_batches)
            random.shuffle(ret_batches)
            for batch in verboserate(ret_batches,
                                     desc='Streaming training for retrieval'):
                # Set up pairs to edit on
                fict_batch = edit_model.ident_mapper(batch,
                                                     config.model.ident_pr)
                edit_loss, _, _ = edit_model.loss(fict_batch)
                loss = edit_loss
                finite_grads, grad_norm = self._take_grad_step(
                    train_state, loss)
                self.check_gradnan(finite_grads, train_state, workspace)
                step = train_state.train_steps
                self.eval_and_save(edit_model, step, train_state, config,
                                   grad_norm, train_eval, valid_eval)

                if step >= 3 * config.optim.max_iters:
                    break
                pass
Example #5
    def _train(cls, config, train_state, examples):
        model = train_state.model
        optimizer = train_state.optimizer
        train_batches = similar_size_batches(
            examples.train, config.optim.batch_size, size=lambda ex: len(ex))

        while True:
            random.shuffle(train_batches)
            i = 0  # cannot enumerate(verboserate(...))
            for batch in verboserate(train_batches, desc='Streaming training examples'):
                loss = model.loss(batch, cls._train_state.train_steps)
                cls._take_grad_step(train_state, loss)
                if (i % 100) == 0:
                    cls.evaluate()
                if (i % 1000) == 0:
                    if config.model.type == 1: # SVAE
                        # write interpolations to file
                        fname = "interps_batches_{}".format(i)
                        num_ex = 10
                        a_idx = np.random.randint(len(batch), size=num_ex)
                        b_idx = np.random.randint(len(batch), size=num_ex)
                        interps = []
                        for a, b in zip(a_idx, b_idx):
                            ex_a = batch[a]
                            ex_b = batch[b]
                            interpolation = model._interpolate_examples(ex_a, ex_b)
                            interpolation_repr = []
                            interpolation_repr.append(" ".join(ex_a))
                            interpolation_repr.extend(
                                [" ".join(ex) for ex in interpolation])
                            interpolation_repr.append(" ".join(ex_b))
                            interps.append(interpolation_repr)
                        with open(join(cls._interps_dir, fname), 'w') as fout:
                            data = "\n\n".join(["\n".join(ex) for ex in interps])
                            fout.write(data.encode('utf-8'))
                if (i % 5000) == 0:
                    cls.checkpoints.save(train_state)
                i += 1
Example #6
        def examples_from_file(path):
            """Return list[EditExample] from file path."""
            examples = []

            # count total lines before loading
            total_lines = int(local('wc -l {}'.format(path), capture=True).split()[0])
            print("----> TOTAL LINES", total_lines)

            with codecs.open(path, 'r', encoding='utf-8') as f:
                for line in verboserate(f, desc='Reading data file.', total=total_lines):
		    print("BEZIGMET", line)
                    src, trg = line.strip().lower().split('\t')
                    src_words = src.split(' ')
                    trg_words = trg.split(' ')
                    assert len(src_words) > 0
                    assert len(trg_words) > 0

                    if use_diff:
                        ex = EditExample.salient_diff(src_words, trg_words, free_set)
                    else:
                        ex = EditExample.whitelist_blacklist(src_words, trg_words)
                    examples.append(ex)
            return examples
Example #7
    def view(self, select=lambda path: True):
        """View runs.
        
        Args:
            select (Callable[str, bool]): given a path to a run, returns True if we want to display the
                run, False otherwise.
        """
        field_names = list(self._renderers.keys())
        table = PrettyTable(field_names=field_names)
        types = OrderedDict((n, set()) for n in field_names)

        for i, path in verboserate(list(self._runs._int_dirs.items()),
                                   desc='Scanning runs.'):
            if not select(path):
                continue

            row = []
            for render in list(self._renderers.values()):
                try:
                    s = render(path)
                except:
                    s = ''
                row.append(s)

            # record types
            for name, elem in zip(field_names, row):
                types[name].add(type(elem))

            table.add_row(row)

        self._print_table(table)

        # display types for each attribute
        type_table = PrettyTable(['attribute', 'types'])
        for name, type_set in types.items():
            type_table.add_row([name, ', '.join(t.__name__ for t in type_set)])
        self._print_table(type_table)
Example #8
    def edit(self,
             examples,
             max_seq_length=150,
             beam_size=5,
             batch_size=64,
             constrain_vocab=False,
             verbose=False):
        """Performs edits on a batch of source sentences.

        Args:
            examples (list[EditExample])
            max_seq_length (int): max # timesteps to generate for
            beam_size (int): for beam decoding
            batch_size (int): max number of examples to pass into the RNN decoder at a time.
                The total # examples decoded in parallel = batch_size / beam_size.
            constrain_vocab (bool): default is False
            verbose (bool): default is False

        Returns:
            beam_list (list[list[list[unicode]]]): a batch of beams.
            edit_traces (list[EditTrace])
        """
        self.eval()  # set to evaluation mode, for dropout to work correctly
        beam_list = []
        edit_traces = []

        batches = chunks(examples, batch_size / beam_size)
        batches = verboserate(batches,
                              desc='Decoding examples') if verbose else batches
        for batch in batches:
            beams, traces = self._edit_batch(batch, max_seq_length, beam_size,
                                             constrain_vocab)
            beam_list.extend(beams)
            edit_traces.extend(traces)
        self.train()  # set back to train mode
        return beam_list, edit_traces
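A hypothetical call to edit (editor and test_examples are placeholder names, not from this code base) that decodes a batch and prints the highest-scoring candidate for each example:

# Hypothetical usage sketch; editor and test_examples are placeholders.
beam_list, edit_traces = editor.edit(test_examples, beam_size=5,
                                     batch_size=64, verbose=True)
for ex, beam in zip(test_examples, beam_list):
    print(' '.join(beam[0]))  # top beam candidate for this example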
Example #9
    def _train(cls, config, train_state, examples, workspace, metadata, tb_logger):
        """Train a model.

        NOTE: modifies TrainState in place.
        - parameters of the Editor and Optimizer are updated
        - train_steps is updated
        - random number generator states are updated at every checkpoint

        Args:
            config (Config)
            train_state (TrainState): initial TrainState. Includes the Editor and Optimizer.
            examples (EditDataSplits)
            workspace (Workspace)
            metadata (Metadata)
            tb_logger (tensorboard_logger.Logger)
        """
        with random_state(train_state.random_state):
            editor = train_state.editor
            optimizer = train_state.optimizer
            noiser = EditNoiser(config.editor.ident_pr, config.editor.attend_pr)
            train_batches = similar_size_batches(examples.train, config.optim.batch_size)

            # test batching!
            # commenting out for now, not certain why there is a batching error. 
            #editor.test_batch(noiser(train_batches[0]))

            while True:
                # TODO(kelvin): this shuffle and the position within the shuffle is not properly restored upon reload
                random.shuffle(train_batches)

                for batch in verboserate(train_batches, desc='Streaming training examples'):
                    # compute gradients
                    optimizer.zero_grad()
                    if config.editor.edit_dropout:
                        noised_batch = noiser(batch)
                    else:
                        noised_batch = batch
                    #loss = editor.loss(noised_batch, draw_samples=config.editor.enable_vae)
                    var_loss, var_params, var_param_grads = editor.loss(noised_batch, draw_samples=config.editor.enable_vae)
                    #reg_loss.backward()
                    #loss.backward()

                    """
                    # clip gradients
                    if train_state.train_steps < 50:
                        # don't clip, just observe the gradient norm
                        grad_norm = clip_grad_norm(editor.parameters(), float('inf'), norm_type=2)
                        train_state.track_grad_norms(grad_norm)
                        metadata['max_grad_norm'] = train_state.max_grad_norm
                    else:
                        # clip according to the max allowed grad norm
                        grad_norm = clip_grad_norm(editor.parameters(), train_state.max_grad_norm)
                        # this returns the gradient norm BEFORE clipping
                    """

                    # Always do gradient clipping
                    # To-do: make this tunable, not hard-coded
                    grad_norm = clip_grad_norm(editor.parameters(), 5.)
                    #storch.nn.utils.clip_grad_norm(editor.parameters(), 5.0) 

                    finite_grads = cls._finite_grads(editor.parameters())
                    #cur = [param for param in editor.parameters()]

                    # take a step if the grads are finite
                    if finite_grads:
                        optimizer.step()

                    # increment step count
                    train_state.increment_train_steps()

                    # somehow we encountered NaN
                    if not finite_grads:
                        # dump parameters
                        train_state.save(workspace.nan_checkpoints)

                        # dump offending example batch
                        examples_path = join(workspace.nan_checkpoints, '{}.examples'.format(train_state.train_steps))
                        with open(examples_path, 'w') as f:
                            pickle.dump(noised_batch, f)

                        print 'Gradient was NaN/inf on step {}.'.format(train_state.train_steps)

                        # if there were more than 5 NaNs in the last 10 steps, drop into the debugger
                        nan_steps = cls._checkpoint_numbers(workspace.nan_checkpoints)
                        recent_nans = [s for s in nan_steps if s > train_state.train_steps - 10]
                        if len(recent_nans) > 5:
                            print 'Too many NaNs encountered recently: {}. Entering debugger.'.format(recent_nans)
                            import pdb
                            pdb.set_trace()

                    # run periodic evaluation and saving
                    if train_state.train_steps % config.eval.eval_steps == 0:
                        cls._evaluate(config, editor, examples, metadata, tb_logger, train_state.train_steps, noiser, big_eval=False)
                        tb_logger.log_value('grad_norm', grad_norm, train_state.train_steps)

                    if train_state.train_steps % config.eval.big_eval_steps == 0:
                        cls._evaluate(config, editor, examples, metadata, tb_logger, train_state.train_steps, noiser, big_eval=True)

                    if train_state.train_steps % config.eval.save_steps == 0:
                        train_state.update_random_state()
                        train_state.save(workspace.checkpoints)

                    if train_state.train_steps >= config.optim.max_iters:
                        return
Example #10
from git import Repo
from os.path import join

import sys
print sys.path

from gtd.git_utils import commit_diff
from gtd.chrono import verboserate


repo_path = sys.argv[1]
max_count = int(sys.argv[2])  # number of commits to scan
files = set(sys.argv[3:])

def format_commit(c):
    msg = c.message.split('\n')[0]
    return '{}\t{}'.format(c.hexsha, msg)

repo = Repo(repo_path)
commits = list(repo.iter_commits('master', max_count=max_count))
lines = []
for c in verboserate(commits, desc='Scanning commits', total=max_count):
    if len(files & commit_diff(c)) == 0:
        continue
    lines.append(format_commit(c))

log_path = join(repo_path, 'git-logs.tsv')
with open(log_path, 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')
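This script reads its configuration from the command line. A sketch of the assumed invocation (the script filename and paths are placeholders):

# Assumed command line; filename and paths are placeholders:
#   python commit_log.py /path/to/repo 500 gtd/chrono.py gtd/git_utils.py
# sys.argv[1] is the repo path, sys.argv[2] the number of commits to scan,
# and sys.argv[3:] the files whose commits should be kept.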
Example #11
    def _compute_metrics(cls, editor, examples, num_evaluate_examples,
                         batch_size):
        """

        Args:
            editor (Editor)
            examples (list[EditExample])
            num_evaluate_examples (int)
            batch_size (int)

        Returns:
            stats (dict[str, float])
            edit_traces (list[EditTrace]): of length num_evaluate_examples
            loss_traces (list[LossTrace]): of length num_evaluate_examples

        """
        sample = sample_if_large(examples,
                                 num_evaluate_examples,
                                 replace=False)

        # compute loss
        # need to break the sample into batches, in case the sample is too large to fit in GPU memory
        losses, loss_traces, weights, enc_losses = [], [], [], []

        for batch in verboserate(chunks(sample, batch_size),
                                 desc='Computing loss on examples'):
            weights.append(len(batch))
            loss_var, loss_trace_batch, enc_loss = editor.loss(batch)

            # convert loss Variable into float
            loss_val = loss_var.data[0]
            assert isinstance(loss_val, float)
            losses.append(loss_val)
            enc_losses.append(enc_loss)

            loss_traces.extend(loss_trace_batch)

        losses, weights = np.array(losses), np.array(weights)
        loss = np.sum(losses * weights) / np.sum(weights)  # weighted average
        enc_loss = np.sum(np.array(enc_losses) * weights) / np.sum(weights)

        punct_table = dict.fromkeys(
            i for i in xrange(sys.maxunicode)
            if unicodedata.category(unichr(i)).startswith('P'))

        def remove_punct(s):
            new_s = []
            for t in s:
                t = unicode(t).translate(punct_table)
                if t != '':
                    new_s.append(t)
            return new_s

        metrics = {
            'bleu': (bleu, max),
            'edit_dist': (lambda s, t: edit_dist(s, t)[0] / len(s)
                          if len(s) > 0 else len(t), min),
            'exact_match':
            (lambda s, t: 1.0
             if remove_punct(s) == remove_punct(t) else 0.0, max)
        }

        top_results = defaultdict(list)
        top5_results = defaultdict(list)

        # compute predictions
        beams, edit_traces = editor.edit(sample,
                                         batch_size=batch_size,
                                         max_seq_length=150,
                                         verbose=True)
        for ex, beam in izip(sample, beams):
            top = beam[0]
            top5 = beam[:5]
            target = ex.target_words
            for name, (fxn, best) in metrics.items():
                top_results[name].append(fxn(top, target))
                top5_results[name].append(
                    best(fxn(predict, target) for predict in top5))

        # compute averages
        stats_top = {name: np.mean(vals) for name, vals in top_results.items()}
        stats_top5 = {
            '{}_top5'.format(name): np.mean(vals)
            for name, vals in top5_results.items()
        }

        # combine into a single stats object
        stats = {'loss': loss, 'enc_loss': enc_loss}
        stats.update(stats_top)
        stats.update(stats_top5)

        return stats, edit_traces, loss_traces
Example #12
 def examples_from_file(data_paths, seq_length_limit, fname):
     examples = {}
     MAX_LINE_LENGTH = 128
     name = '{}.pickle'.format(fname)
     file = pathlib2.Path.cwd(
     ) / 'github_data' / 'processed_repo_pkl' / name
     # if os.path.exists(str(file)):
     #     with open(str(file), 'rb') as f:
     #         examples = pickle.load(f)
     #     f.close()
     #     return list(examples.values())
     # count total lines before loading
     num_direct = len(data_paths)
     for line in verboserate(data_paths,
                             desc='Reading data file.',
                             total=num_direct):
         df = pd.read_csv(line,
                          skiprows=2,
                          header=None,
                          names=[0, 1],
                          dtype=str).fillna(NO_CONTEXT_WORD)
         df[0] = df[0].apply(lambda x: tokenize_fine_grained(x))
         # df[0] = df[0].apply(lambda x: preprocess_tokens(x, MAX_LINE_LENGTH))
         df[1] = df[1].apply(lambda x: tokenize_fine_grained(x))
         try:
             ex = []
             for i, row in df.iterrows():
                 try:
                     ex.append(EditExample(row[0], row[1]))
                 except:
                     # print 'bad formatting in file ' + str(line).split('/')[-1]
                     # print line
                     count = 1
             # skip sequences that are too long, because they use up memory
             # if max_seq_length(ex) > seq_length_limit:
             #     continue
             ex = list(
                 ifilterfalse(
                     lambda x: max_seq_length(x) > seq_length_limit,
                     ex))
             # examples[(str(line).split('/')[-1], len(ex))] = ex
             file = pathlib2.Path.cwd(
             ) / 'github_data' / 'processed_repo_pkl' / fname
             result = {(str(line).split('/')[-1], len(ex)): ex}
             k = str(line).split('/')[-1].split('.')[0]
             pick_obj = {(str(line).split('/')[-1], len(ex)): ex}
             obj_name = str(file / k) + '.pickle'
             with open(obj_name, 'wb') as f:
                 pickle.dump(pick_obj, f)
             f.close()
         except Exception as e:
             print e
             print 'bad formatting in file ' + str(line).split('/')[-1]
             print line
     # name = '{}.pickle'.format(fname)
     # file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / name
     # if fname == 'train':
     # file = pathlib2.Path.cwd() / 'github_data' / 'processed_repo_pkl' / fname
     # for k, v in tqdm(examples.items()):
     #     obj_name = file / k[0].split('.')[0]
     #     pick_obj = {k : v}
     #     with open(str(obj_name), 'wb') as f:
     #         pickle.dump(pick_obj, f)
     #     f.close()
     # else:
     #     if not os.path.exists(str(file)):
     #         with open(str(file), 'wb') as f:
     #             pickle.dump(examples, f)
     #         f.close()
     return list(examples.values())
Example #13
def examples_to_supervised_cases(examples):
    """Return a Generator of supervised ParseCases."""
    for example in verboserate(examples,
                               desc='Streaming supervised ParseCases'):
        for case in example_to_supervised_cases(example):
            yield case
Example #14
 def batch_generator():
     while True:
         # WARNING: random state of train state does not exactly restore state anymore, due to this shuffle
         random.shuffle(train_batches)
         for batch in verboserate(train_batches, desc='Streaming example batches'):
             yield batch
Example #15
from git import Repo
from os.path import join

import sys
print(sys.path)

from gtd.git_utils import commit_diff
from gtd.chrono import verboserate

repo_path = sys.argv[1]
max_count = int(sys.argv[2])  # number of commits to scan
files = set(sys.argv[3:])


def format_commit(c):
    msg = c.message.split('\n')[0]
    return '{}\t{}'.format(c.hexsha, msg)


repo = Repo(repo_path)
commits = list(repo.iter_commits('master', max_count=max_count))
lines = []
for c in verboserate(commits, desc='Scanning commits', total=max_count):
    if len(files & commit_diff(c)) == 0:
        continue
    lines.append(format_commit(c))

log_path = join(repo_path, 'git-logs.tsv')
with open(log_path, 'w') as f:
    for line in lines:
        f.write(line)
        f.write('\n')
Example #16
    def decode(self,
               examples,
               encoder_output,
               weighted_value_estimators,
               beam_size,
               prefix_hints,
               sibling_penalty,
               max_seq_length=50,
               top_k=5,
               verbose=False):
        """Beam decode.

        Args:
            examples (list[Example])
            encoder_output (EncoderOutput)
            weighted_value_estimators (list[(ValueEstimator, float)]): a list of (estimator, weight) pairs.
            beam_size (int)
            prefix_hints (list[list[unicode]]): a batch of prefixes. For each example, all returned results will start
                with the specified prefix.
            sibling_penalty (float)
            max_seq_length (int): maximum allowable length of outputted sequences
            top_k (int): number of beam candidates to show in trace
            verbose (bool): default is False

        Returns:
            beams (list[list[list[unicode]]]): a batch of beams of decoded sequences
            traces (list[BeamDecoderTrace])
        """
        rnn_state_orig, states_orig = self._initialize(self.decoder_cell,
                                                       examples)

        # duplicate everything to beam_size
        duplicate = BeamDuplicator(beam_size)
        rnn_state = duplicate(rnn_state_orig)
        encoder_output = duplicate(encoder_output)

        states = []
        for state in states_orig:
            states.append(state)
            # these states are guaranteed to die on the first round, because their sequence_prob = 0
            # they are just here as padding
            # TODO(kelvin): WARNING! In the future, the ValueEstimators in BeamDecoder._advance might break
            # my assumption that any extension of a sequence with 0 prob will also have 0 prob.
            # If this assumption is broken, the BeamDecoder will return a beam of identical results.
            doomed = [DecoderState.initial_doomed(state.example)
                      ] * (beam_size - 1)
            states.extend(doomed)

        # perform iterations of beam search
        time_steps = range(max_seq_length)
        if verbose:
            time_steps = verboserate(time_steps,
                                     desc='Beam decoding sequences')

        states_over_time = []
        for _ in time_steps:
            # stop if all sequences have terminated
            if all(state.terminated for state in states): break
            rnn_state, states = self._advance(encoder_output,
                                              weighted_value_estimators,
                                              beam_size, rnn_state, states,
                                              sibling_penalty)
            states_over_time.append(states)

        return self._recover_sequences(states_over_time, beam_size, top_k)