Example #1
    def __init__(self,
                 all_instances,
                 word_vocab=None,
                 char_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 has_ref=True,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (question, passage, entity_start, entity_end, edges, candidates,
             ref, ids, candidates_str) in all_instances:
            question_idx = word_vocab.to_index_sequence_for_list(question)
            passage_idx = word_vocab.to_index_sequence_for_list(passage)
            question_chars_idx = None
            passage_chars_idx = None
            if options.with_char:
                question_chars_idx = char_vocab.to_character_matrix_for_list(
                    question, max_char_per_word=options.max_char_per_word)
                passage_chars_idx = char_vocab.to_character_matrix_for_list(
                    passage, max_char_per_word=options.max_char_per_word)
            instances.append((question_idx, question_chars_idx, passage_idx,
                              passage_chars_idx, entity_start, entity_end,
                              edges, candidates, ref, ids, candidates_str))

        all_instances = instances
        instances = None

        # sort instances based on length
        if isSort:
            all_instances = sorted(all_instances,
                                   key=lambda inst: len(inst[2]))

        self.num_instances = len(all_instances)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances,
                                                 batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_instances = []
            for i in xrange(batch_start, batch_end):
                cur_instances.append(all_instances[i])
            cur_batch = Batch(cur_instances,
                              options,
                              word_vocab=word_vocab,
                              has_ref=has_ref)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #2
def create_json_file(all_instances, outpath, batch_size=5000):
    import padding_utils
    batch_spans = padding_utils.make_batches(len(all_instances), batch_size)
    for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
        cur_instances = all_instances[batch_start:batch_end]
        cur_outpath = outpath + ".{}".format(batch_index)
        print("Dump {} instances out to {}".format(len(cur_instances),
                                                   cur_outpath))
        dump_out_to_json(cur_instances, cur_outpath)
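Note: every example on this page batches its data through padding_utils.make_batches(size, batch_size), whose source is not shown here. Based on how its return value is unpacked into (batch_start, batch_end) index pairs, the following is a minimal hypothetical sketch of such a helper; it is an assumption for illustration, not the actual padding_utils implementation:

import numpy as np

def make_batches(size, batch_size):
    # Presumed behavior: split `size` items into contiguous index spans of at
    # most `batch_size` items each, returned as (start, end) pairs.
    nb_batch = int(np.ceil(size / float(batch_size)))
    return [(batch_index * batch_size,
             min(size, (batch_index + 1) * batch_size))
            for batch_index in range(nb_batch)]

The examples then slice or index their instance lists with these spans, so any helper with the same return shape would behave identically.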
Example #3
    def __init__(self,
                 all_instances,
                 word_vocab_enc=None,
                 word_vocab_dec=None,
                 char_vocab=None,
                 edgelabel_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        src_cover = 0.0
        src_total = 0.0
        node_cover = 0.0
        node_total = 0.0
        for (nodes, in_neigh_indices, in_neigh_edges, out_neigh_indices,
             out_neigh_edges, source, sentence_idx, sentence,
             id) in all_instances:
            src_cover += sum(x != word_vocab_enc.vocab_size for x in source)
            src_total += len(source)
            node_cover += sum(x != word_vocab_enc.vocab_size for x in nodes)
            node_total += len(nodes)
        if src_cover / src_total < 0.9 or node_cover / node_total < 0.9:
            print('source coverage rate: {}'.format(src_cover / src_total))
            print('node coverage rate: {}'.format(node_cover / node_total))
            print('=====')

        # sort instances based on length
        if isSort:
            all_instances = sorted(all_instances,
                                   key=lambda inst:
                                   (len(inst[0]), len(inst[5])))

        self.num_instances = len(all_instances)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances,
                                                 batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_instances = []
            for i in xrange(batch_start, batch_end):
                cur_instances.append(all_instances[i])
            cur_batch = G2SBatch(cur_instances,
                                 options,
                                 word_vocab=word_vocab_dec)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #4
def create_json_file(hash_path_dict, urlpath, outpath, batch_size=5000):
    all_urls = read_text_file(urlpath)
    import padding_utils
    batch_spans = padding_utils.make_batches(len(all_urls), batch_size)
    for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
        cur_urls = all_urls[batch_start:batch_end]
        cur_outpath = outpath + ".{}".format(batch_index)
        print("Dump {} instances out to {}".format(len(cur_urls), cur_outpath))
        dump_out_to_json(hash_path_dict, cur_urls, cur_outpath)
Example #5
    def __init__(self,
                 all_instances,
                 word_vocab=None,
                 char_vocab=None,
                 edgelabel_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (nodes, in_neigh_indices, in_neigh_edges, out_neigh_indices,
             out_neigh_edges, sentence, sentence_pos, id) in all_instances:
            if options.max_node_num != -1 and len(nodes) > options.max_node_num:
                continue # remove very long passages
            in_neigh_indices = [x[:options.max_in_neigh_num] for x in in_neigh_indices]
            in_neigh_edges = [x[:options.max_in_neigh_num] for x in in_neigh_edges]
            out_neigh_indices = [x[:options.max_out_neigh_num] for x in out_neigh_indices]
            out_neigh_edges = [x[:options.max_out_neigh_num] for x in out_neigh_edges]

            nodes_idx = word_vocab.to_index_sequence_for_list(nodes)
            nodes_chars_idx = None
            if options.with_char:
                nodes_chars_idx = char_vocab.to_character_matrix_for_list(
                    nodes, max_char_per_word=options.max_char_per_word)
            in_neigh_edges_idx = [
                edgelabel_vocab.to_index_sequence_for_list(edges)
                for edges in in_neigh_edges
            ]
            out_neigh_edges_idx = [
                edgelabel_vocab.to_index_sequence_for_list(edges)
                for edges in out_neigh_edges
            ]
            sentence_idx = word_vocab.to_index_sequence_for_list(
                sentence[:options.max_answer_len])
            sentence_pos_idx = word_vocab.to_index_sequence_for_list(
                sentence_pos[:options.max_answer_len])
            instances.append(
                (nodes_idx, nodes_chars_idx, in_neigh_indices,
                 in_neigh_edges_idx, out_neigh_indices, out_neigh_edges_idx,
                 sentence_idx, sentence, sentence_pos_idx, sentence_pos, id))

        all_instances = instances
        instances = None

        # sort instances based on length
        if isSort:
            all_instances = sorted(all_instances, key=lambda inst: (len(inst[0]), len(inst[-4])))

        self.num_instances = len(all_instances)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_instances = []
            for i in xrange(batch_start, batch_end):
                cur_instances.append(all_instances[i])
            cur_batch = G2SBatch(cur_instances, options, word_vocab=word_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #6
    def __init__(self,
                 all_questions,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 NER_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (sent1, sent2, sent3) in all_questions:# sent1 is the long passage or article
            if options.max_passage_len!=-1:
                if sent1.get_length()> options.max_passage_len: continue # remove very long passages
            if sent2.get_length() < 3: continue # filter out very short questions (len<3)
            sent1.convert2index(word_vocab, char_vocab, POS_vocab, NER_vocab,
                                max_char_per_word=options.max_char_per_word)
            #if len(sent1.word_idx_seq) != len(sent1.POS_idx_seq):
            #    print '!!sent1', len(sent1.word_idx_seq), len(sent1.POS_idx_seq)
            sent2.convert2index(word_vocab, char_vocab, POS_vocab, NER_vocab,
                                max_char_per_word=options.max_char_per_word)
            if sent3 is not None:
                sent3.convert2index(
                    word_vocab, char_vocab, POS_vocab, NER_vocab,
                    max_char_per_word=options.max_char_per_word)
                #if len(sent3.word_idx_seq) != len(sent3.POS_idx_seq):
                #    print '!!sent3', len(sent3.word_idx_seq), len(sent3.POS_idx_seq)

            instances.append((sent1, sent2, sent3))

        all_questions = instances
        instances = None

        # sort instances based on length
        if isSort:
            all_questions = sorted(all_questions,
                                   key=lambda question:
                                   (question[0].get_length(),
                                    question[1].get_length()))
        else:
            random.shuffle(all_questions)
            random.shuffle(all_questions)
        self.num_instances = len(all_questions)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_questions = []
            for i in xrange(batch_start, batch_end):
                cur_questions.append(all_questions[i])
            cur_batch = QAQuestionBatch(cur_questions,
                                        options,
                                        word_vocab=word_vocab,
                                        char_vocab=char_vocab,
                                        POS_vocab=POS_vocab,
                                        NER_vocab=NER_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #7
    def __init__(self,
                 all_instances,
                 word_vocab=None,
                 char_vocab=None,
                 edgelabel_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=False,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (lex, poses, in_neigh, in_neigh_hidden, in_label, entity_indices, y) in all_instances:
            if options.max_node_num != -1 and len(lex) > options.max_node_num:
                continue # remove very long passages
            in_neigh = [x[:options.max_in_neigh_num] for x in in_neigh]
            in_neigh_hidden = [x[:options.max_in_neigh_num] for x in in_neigh_hidden]
            in_label = [x[:options.max_in_neigh_num] for x in in_label]

            lex_idx = word_vocab.to_index_sequence_for_list(lex)
            lex_chars_idx = None
            if options.with_char:
                lex_chars_idx = char_vocab.to_character_matrix_for_list(
                    lex, max_char_per_word=options.max_char_per_word)
            in_label_idx = [
                edgelabel_vocab.to_index_sequence_for_list(edges)
                for edges in in_label
            ]
            instances.append((lex_idx, lex_chars_idx, in_neigh,
                              in_neigh_hidden, in_label_idx, entity_indices,
                              y))

        all_instances = instances
        instances = None

        # sort instances based on length
        #if isSort:
        #    all_instances = sorted(all_instances, key=lambda inst: len(inst[0]))
        #else:
        #    random.shuffle(all_instances)
        #    random.shuffle(all_instances)
        self.num_instances = len(all_instances)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_instances = []
            for i in xrange(batch_start, batch_end):
                cur_instances.append(all_instances[i])
            cur_batch = G2SBatch(cur_instances, options, word_vocab=word_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        #self.isShuffle = isShuffle
        #if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #8
    def __init__(self,
                 all_questions,
                 word_vocab=None,
                 char_vocab=None,
                 POS_vocab=None,
                 feat_vocab=None,
                 action_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 batch_size=-1,
                 decode=False):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (input_sent, concepts, cid2wid, feats, actions, action2cid,
             action2wid) in all_questions:
            if options.max_passage_len != -1 and not decode:
                if input_sent.get_length() > options.max_passage_len:
                    continue  # remove very long passages
            input_sent.convert2index(
                word_vocab, char_vocab, POS_vocab,
                max_char_per_word=options.max_char_per_word)
            concepts_idx = word_vocab.to_index_sequence_for_list(concepts)
            feats_idx = [
                feat_vocab.to_index_sequence_for_list(x) for x in feats
            ]
            for x in feats_idx:
                assert len(x) == options.feat_num, len(x)
            actions_idx = action_vocab.to_index_sequence_for_list(actions)
            instances.append((input_sent, concepts_idx, cid2wid, feats_idx,
                              actions_idx, action2cid, action2wid))

        all_questions = instances
        instances = None

        # sort instances based on length
        if isSort:
            all_questions = sorted(all_questions,
                                   key=lambda question:
                                   (question[0].get_length(),
                                    len(question[4])))
        elif isShuffle:
            random.shuffle(all_questions)
            random.shuffle(all_questions)
        self.num_instances = len(all_questions)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances, batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_questions = []
            for i in xrange(batch_start, batch_end):
                cur_questions.append(all_questions[i])
            cur_batch = Batch(cur_questions,
                              options,
                              word_vocab=word_vocab,
                              char_vocab=char_vocab,
                              POS_vocab=POS_vocab,
                              feat_vocab=feat_vocab,
                              action_vocab=action_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #9
def process_all_sentences(sentences,
                          sess,
                          batcher,
                          sent_embeddings,
                          sent_token_ids,
                          outpath,
                          batch_size=2,
                          max_length=-1,
                          use_h5=True):
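    # Sort sentences longest-first, batch them, run the ELMo graph once per
    # batch, and store each sentence's per-token embeddings either into an
    # HDF5 file (use_h5=True) or into an in-memory dict that is saved at the
    # end.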
    sentences = sorted(sentences, key=lambda sent: -len(sent[1]))
    batch_spans = padding_utils.make_batches(len(sentences), batch_size)
    all_results = {}
    hf = None
    if use_h5:
        hf = h5py.File(outpath, 'w')

    for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
        cur_ids = []
        cur_sents = []
        cur_lengths = []
        for i in xrange(batch_start, batch_end):
            (cur_id, cur_sent) = sentences[i]
            cur_length = len(cur_sent)
            if max_length != -1:
                if cur_length > max_length:
                    cur_length = max_length
                    cur_sent = cur_sent[:cur_length]
            cur_ids.append(cur_id)
            cur_sents.append(cur_sent)
            cur_lengths.append(cur_length)

        # Create batches of data.
        sent_ids = batcher.batch_sentences(cur_sents)
        # Compute ELMo representations (here for the input only, for simplicity).
        st = time.time()
        elmo_sent_output = sess.run(sent_embeddings,
                                    feed_dict={
                                        sent_token_ids: sent_ids
                                    })  # [batch_size, sent_length, 3*lm_dim]
        print('Length: {}, time: {}'.format(elmo_sent_output.shape,
                                            time.time() - st))
        sys.stdout.flush()

        st = time.time()
        for i in xrange(len(cur_ids)):
            cur_id = cur_ids[i]
            cur_length = cur_lengths[i]
            if use_h5:
                if not isinstance(cur_id, unicode):
                    cur_id = cur_id.encode('hex').decode('utf-8')
                if cur_id not in hf.keys():
                    embedding = elmo_sent_output[i, :cur_length, :]
                    hf.create_dataset(cur_id,
                                      embedding.shape,
                                      dtype='float32',
                                      data=embedding)
            else:
                all_results[cur_id] = elmo_sent_output[i, :cur_length, :]
        print('Storing time: {}'.format(time.time() - st))
    if use_h5:
        hf.close()
    else:
        import compress_utils
        compress_utils.save(all_results, outpath)
Example #10
    def __init__(self,
                 options,
                 all_instances,
                 word_vocab,
                 char_vocab,
                 pos_vocab,
                 edgelabel_vocab,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 is_training=False):
        self.options = options
        batch_size = options.batch_size
        # index tokens and filter the dataset
        processed_instances = []
        unk_count, total_count = 0.0, 0.0
        unk_idx = word_vocab.getIndex('UNK')
        for (toks, poses, nes, entity_indices, in_neigh, in_label, in_prob,
             out_neigh, out_label, out_prob, ref, id) in all_instances:
            in_neigh = [x[:options.max_in_neigh_num] for x in in_neigh]
            in_label = [x[:options.max_in_neigh_num] for x in in_label]
            out_neigh = [x[:options.max_out_neigh_num] for x in out_neigh]
            out_label = [x[:options.max_out_neigh_num] for x in out_label]
            if in_prob is not None:
                in_prob = [x[:options.max_in_neigh_num] for x in in_prob]
                out_prob = [x[:options.max_out_neigh_num] for x in out_prob]

            toks_idx = word_vocab.to_index_sequence_for_list(toks)
            unk_count += sum([x == unk_idx for x in toks_idx])
            total_count += len(toks_idx)

            if is_training and options.word_dropout_rate > 0.0:
                for i in range(len(toks_idx)):
                    if random.random() < options.word_dropout_rate:
                        toks_idx[i] = unk_idx

            toks_chars_idx = None
            if options.with_char:
                toks_chars_idx = char_vocab.to_character_matrix_for_list(
                    toks, max_char_per_word=options.max_char_per_word)

            poses_idx = None
            if options.with_POS:
                poses_idx = word_vocab.to_index_sequence_for_list(poses)

            in_label_idx = [
                edgelabel_vocab.to_index_sequence_for_list(il)
                for il in in_label
            ]
            out_label_idx = [
                edgelabel_vocab.to_index_sequence_for_list(ol)
                for ol in out_label
            ]

            processed_instances.append(
                (toks_idx, toks_chars_idx, poses_idx, nes, entity_indices,
                 in_neigh, in_label_idx, in_prob, out_neigh, out_label_idx,
                 out_prob, ref, id))

        print('UNK percent {}'.format(unk_count / total_count))
        all_instances = processed_instances

        # sort instances based on length
        if isSort:
            all_instances = sorted(all_instances,
                                   key=lambda inst: len(inst[0]))
        elif isShuffle:
            random.shuffle(all_instances)
            random.shuffle(all_instances)
        self.num_instances = len(all_instances)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances,
                                                 batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_instances = []
            for i in xrange(batch_start, batch_end):
                cur_instances.append(all_instances[i])
            cur_batch = G2SBatch(cur_instances, options, word_vocab=word_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
Example #11
    def __init__(self,
                 all_questions,
                 enc_word_vocab=None,
                 dec_word_vocab=None,
                 char_vocab=None,
                 options=None,
                 isShuffle=False,
                 isLoop=False,
                 isSort=True,
                 batch_size=-1):
        self.options = options
        if batch_size == -1: batch_size = options.batch_size
        # index tokens and filter the dataset
        instances = []
        for (sent1, sent2,
             id) in all_questions:  # sent1 is the long passage or article
            sent1_idx = enc_word_vocab.to_index_sequence(sent1)
            sent2_idx = dec_word_vocab.to_index_sequence(sent2)
            oov_rate1 = 1.0 * np.sum(x == enc_word_vocab.vocab_size
                                     for x in sent1_idx) / len(sent1_idx)
            oov_rate2 = 1.0 * np.sum(x == dec_word_vocab.vocab_size
                                     for x in sent2_idx) / len(sent2_idx)
            if oov_rate1 > 0.2 or oov_rate2 > 0.2:
                print('!!!!!oov_rate for ENC {} and DEC {}'.format(
                    oov_rate1, oov_rate2))
                print(sent1)
                print(sent2)
                print('==============')
            if options.max_passage_len != -1:
                sent1_idx = sent1_idx[:options.max_passage_len]
            if options.max_answer_len != -1:
                sent2_idx = sent2_idx[:options.max_answer_len]
            instances.append((sent1_idx, sent2_idx, sent1, sent2, id))

        all_questions = instances
        instances = None

        # sort instances based on length
        if isSort:
            all_questions = sorted(all_questions,
                                   key=lambda xxx: (len(xxx[0]), len(xxx[1])))
        else:
            pass
        self.num_instances = len(all_questions)

        # distribute questions into different buckets
        batch_spans = padding_utils.make_batches(self.num_instances,
                                                 batch_size)
        self.batches = []
        for batch_index, (batch_start, batch_end) in enumerate(batch_spans):
            cur_questions = []
            for i in xrange(batch_start, batch_end):
                cur_questions.append(all_questions[i])
            cur_batch = Batch(cur_questions,
                              options,
                              word_vocab=dec_word_vocab,
                              char_vocab=char_vocab)
            self.batches.append(cur_batch)

        self.num_batch = len(self.batches)
        self.index_array = np.arange(self.num_batch)
        self.isShuffle = isShuffle
        if self.isShuffle: np.random.shuffle(self.index_array)
        self.isLoop = isLoop
        self.cur_pointer = 0
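All of the __init__ examples above end by preparing self.batches, self.index_array, self.isLoop, and self.cur_pointer, but the accessor methods that consume them are not shown on this page. The following is a minimal hypothetical sketch of how such a stream is commonly iterated; the method names and logic are assumptions for illustration, not code taken from these examples:

    def get_num_batch(self):
        return self.num_batch

    def nextBatch(self):
        # Walk the batches in the (possibly shuffled) order stored in
        # index_array; either stop or wrap around once the end is reached.
        if self.cur_pointer >= self.num_batch:
            if not self.isLoop:
                return None
            self.cur_pointer = 0
            # Some of the examples above never set self.isShuffle, so guard
            # the reshuffle with getattr.
            if getattr(self, 'isShuffle', False):
                np.random.shuffle(self.index_array)
        cur_batch = self.batches[self.index_array[self.cur_pointer]]
        self.cur_pointer += 1
        return cur_batch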