Example #1
    def __init__(self,
                 data_files,
                 vocab_file,
                 label_file,
                 batch_size=32,
                 reverse=False,
                 split_word=True,
                 max_len=1200):
        self.reverse = reverse
        self.split_word = split_word
        self.data_files = data_files
        self.batch_size = batch_size
        self.max_len = max_len

        # load the token and label vocabularies and build the inverse lookups
        self.vocab, self.w2i = read_vocab(vocab_file)
        self.i2w = {v: k for k, v in self.w2i.items()}
        self.label_names, self.l2i = read_vocab(label_file)
        self.i2l = {v: k for k, v in self.l2i.items()}

        # binary tag vocabulary used when classifying each label
        self.tag_l2i = {"0": 0, "1": 1}
        self.tag_i2l = {v: k for k, v in self.tag_l2i.items()}

        self._raw_data = []
        self.items = []
        # read the data files into DataItem records and compute batch counts
        self._preprocess()
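
A minimal instantiation sketch for the constructor above; the class name DataSet and the file paths are assumptions, since the snippet does not show the class declaration:

# hypothetical usage of the loader whose __init__ is shown above
loader = DataSet(data_files=['data/train.json'],
                 vocab_file='data/vocab.txt',
                 label_file='data/labels.txt',
                 batch_size=32)
print(loader.num_batches)
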
Example #2
# imports needed by this snippet; read_vocab, search_process, process_item,
# load_hparams, get_config_proto and Model come from the surrounding project
from collections import defaultdict
import json

import numpy as np
import tensorflow as tf  # TF 1.x style Session API


def output_newquery(inputs):
    inputs = str(inputs)
    # vocab_file='scripts/data/vocab.txt'
    # label_file='scripts/data/labels.txt'
    # checkpoint_dir='scripts/data/elmo_ema_0120'
    # out_file='scripts/data/new_query.json'

    # vocab_file = '/home/kg/PycharmProjects/nlp_p2/nlp_2/static/modelData/vocab.txt'
    # label_file = '/home/kg/PycharmProjects/nlp_p2/nlp_2/static/modelData/labels.txt'
    # checkpoint_dir = '/home/kg/PycharmProjects/nlp_p2/nlp_2/static/modelData/elmo_ema_0120'
    # out_file = '/home/kg/PycharmProjects/nlp_p2/nlp_2/static/modelData/new_query.json'

    vocab_file = 'static/modelData/vocab.txt'
    label_file = 'static/modelData/labels.txt'
    checkpoint_dir = 'static/modelData/elmo_ema_0120'
    out_file = 'static/modelData/new_query.json'

    feature_list = [
        "location_traffic_convenience",
        "location_distance_from_business_district", "location_easy_to_find",
        "service_wait_time", "service_waiters_attitude",
        "service_parking_convenience", "service_serving_speed", "price_level",
        "price_cost_effective", "price_discount", "environment_decoration",
        "environment_noise", "environment_space", "environment_cleaness",
        "dish_portion", "dish_taste", "dish_look", "dish_recommendation",
        "others_overall_experience", "others_willing_to_consume_again"
    ]
    # defaultdict() without a default_factory behaves like a plain dict here
    new_dict = defaultdict()
    vocab, w2i = read_vocab(vocab_file)
    label_names, l2i = read_vocab(label_file)
    i2l = {v: k for k, v in l2i.items()}
    # map the four per-aspect sentiment tags to class indices and back
    tag_l2i = {"1": 0, "0": 1, "-1": 2, "-2": 3}
    tag_i2l = {v: k for k, v in tag_l2i.items()}

    model_item = search_process(inputs, new_dict, feature_list, out_file, w2i)
    if isinstance(model_item, list):
        hparams = load_hparams(
            checkpoint_dir, {
                "mode": 'inference',
                'checkpoint_dir': checkpoint_dir + "/best_eval",
                'embed_file': None
            })
        with tf.Session(config=get_config_proto(
                log_device_placement=False)) as sess:
            model = Model(hparams)
            model.build()
            try:
                model.restore_model(sess)  # restore the best checkpoint
            except Exception as e:
                print("unable to restore model with exception", e)
                exit(1)
            (source, lengths, _, ids) = process_item(model_item)
            predict, logits = model.inference_clf_one_batch(
                sess, source, lengths)
            for i, (p, l) in enumerate(zip(predict, logits)):
                new_dict['id'] = 'new_query'
                new_dict['content'] = inputs
                # record the predicted sentiment tag for each of the 20 aspects
                for j in range(20):
                    label_name = i2l[j]
                    tag = tag_i2l[np.argmax(p[j])]
                    new_dict[label_name] = tag
            with open(out_file, 'w') as f:
                f.write(json.dumps(new_dict, ensure_ascii=False) + '\n')
    return new_dict
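
A short usage sketch; the review text and the inspected aspect key are illustrative, and the model files are expected under static/modelData as in the paths above:

# hypothetical call: classify one review and inspect a single aspect tag
result = output_newquery("服务态度很好，上菜速度也快")
print(result['service_waiters_attitude'])
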
Example #3
                # convert the raw labels of this item into tag indices
                labels = self.get_label(labels, self.tag_l2i)
                item_labels.append(labels)
            # cache the record as a DataItem with its content, labels, length and id
            self._raw_data.append(
                DataItem(content=content,
                         labels=np.asarray(item_labels),
                         length=len(content),
                         id=int(item['id'])))
            self.items.append(item)

        self.num_batches = len(self._raw_data) // self.batch_size
        self.data_size = len(self._raw_data)
        print_out("# Got %d data items with %d batches" %
                  (self.data_size, self.num_batches))


# module-level vocabulary loaded from the command-line vocab file
vocab, w2i = read_vocab(flags.vocab_file)

# ids reserved for the unknown, start-of-sequence and end-of-sequence tokens
UNK_ID = 0
SOS_ID = 1
EOS_ID = 2


def _tokenize(content, w2i, max_tokens=1200, reverse=False, split=True):
    def get_tokens(content):
        tokens = content.strip().split()
        ids = []
        for t in tokens:
            if t in w2i:
                ids.append(w2i[t])
            else:
                # out-of-vocabulary token: fall back to its individual characters
                for c in t: