def run_online(device):
    """Predict labels interactively for tab-separated sentence pairs.

    Reads one line at a time from stdin; each line must hold two
    sentences separated by a tab.  For every pair, prints the predicted
    label, its softmax score and the normalized tokens of both
    sentences.  Blank or malformed lines are echoed as '# blank line'.
    """
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        l_lst = l.split('\t')
        # BUG FIX: the original tested `l_lst < 2`, comparing a list to an
        # int, which raises TypeError on Python 3 for any non-blank line
        # with fewer than two fields.  Compare the field count instead.
        if not l or len(l_lst) < 2:
            print('# blank line')
            continue
        text1 = nlp_utils.normalize_text(l_lst[0])
        text2 = nlp_utils.normalize_text(l_lst[1])
        words1 = nlp_utils.split_text(text1, char_based=setup['char_based'])
        words2 = nlp_utils.split_text(text2, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array2(
            [[words1, words2]], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        # Inference only: disable train-mode behavior and gradient graph.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs['xs1'], xs['xs2'], softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}\t{}'.format(
            answer, score, ' '.join(words1), ' '.join(words2)))
def run_batch(device, batchsize=64):
    """Predict labels for stdin lines, flushing inputs in mini-batches."""

    def predict_batch(words_batch):
        # One forward pass over the queued batch; print one tab-separated
        # line per input: predicted label, score, and the tokens.
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    pending = []
    for line in sys.stdin:
        line = line.strip()
        if not line:
            # Blank input flushes anything queued, then is echoed back.
            if pending:
                predict_batch(pending)
                pending = []
            print('# blank line')
            continue
        normalized = nlp_utils.normalize_text(line)
        tokens = nlp_utils.split_text(
            normalized, char_based=setup['char_based'])
        pending.append(tokens)
        if len(pending) >= batchsize:
            predict_batch(pending)
            pending = []
    # Flush the final partial batch, if any.
    if pending:
        predict_batch(pending)
def run_batch(device, batchsize=64):
    """Prompt for input, then predict labels in mini-batches from stdin."""

    def predict_batch(words_batch):
        # Score the whole queued batch in a single forward pass and emit
        # 'label<TAB>score<TAB>tokens' for each input line.
        xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs, softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    queue = []
    print('Enter inputs for Batch Predictions')
    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            # A blank line forces an early flush before being echoed.
            if queue:
                predict_batch(queue)
                queue = []
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(raw)
        queue.append(
            nlp_utils.split_text(text, char_based=setup['char_based']))
        if len(queue) >= batchsize:
            predict_batch(queue)
            queue = []
    if queue:
        predict_batch(queue)
def read_snli(path, split, shrink=1, char_based=False):
    """Load one split of the SNLI corpus.

    Args:
        path: directory containing the 'snli_1.0_<split>.jsonl' files.
        split: split name, e.g. 'train', 'dev' or 'test'.
        shrink: keep only every `shrink`-th example (1 keeps all).
        char_based: tokenize into characters instead of words.

    Returns:
        A list of (premise_tokens, hypothesis_tokens, label) tuples with
        labels 0=entailment, 1=neutral, 2=contradiction.
    """
    path = os.path.join(path, 'snli_1.0_{}.jsonl'.format(split))
    dataset = []
    labels = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    with open(path) as f:
        # Iterate the file lazily instead of f.readlines(): the original
        # materialized the entire (large) jsonl file in memory first.
        for i, line in enumerate(f):
            if i % shrink != 0:
                continue
            x = json.loads(line)
            if x['gold_label'] in labels:
                label = labels[x['gold_label']]
            else:
                # No consensus gold label ('-'): fall back to the
                # majority vote among annotator labels.
                label = labels[most_common(x['annotator_labels'])]
            premise = split_text(normalize_text(x['sentence1']), char_based)
            hypothesis = split_text(normalize_text(x['sentence2']), char_based)
            dataset.append((premise, hypothesis, label))
    return dataset
def read_dbpedia(tf, split, shrink=1, char_based=False):
    """Read one DBpedia split out of an opened tarfile.

    Args:
        tf: an open `tarfile.TarFile` containing 'dbpedia_csv/<split>.csv'.
        split: split name, e.g. 'train' or 'test'.
        shrink: keep only every `shrink`-th example (1 keeps all).
        char_based: tokenize into characters instead of words.

    Returns:
        A list of (tokens, label) tuples with 0-based integer labels.
    """
    import io  # local import: only needed for the text wrapper below
    dataset = []
    # BUG FIX: tarfile.extractfile() returns a *binary* stream, but
    # csv.reader requires text on Python 3 — decode through TextIOWrapper.
    f = io.TextIOWrapper(
        tf.extractfile('dbpedia_csv/{}.csv'.format(split)), encoding='utf-8')
    for i, (label, title, text) in enumerate(csv.reader(f)):
        if i % shrink != 0:
            continue
        label = int(label) - 1  # Index begins from 1
        tokens = split_text(normalize_text(text), char_based)
        dataset.append((tokens, label))
    return dataset
def read_other_dataset(path, shrink=1, char_based=False):
    """Load a '<label> <text>' per-line dataset as (tokens, label) pairs."""
    dataset = []
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for i, raw in enumerate(f):
            stripped = raw.strip()
            # Skip subsampled rows and lines too short to hold a label
            # plus at least one token (fewer than 3 characters).
            if i % shrink != 0 or len(stripped) < 3:
                continue
            label, text = stripped.split(None, 1)
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, int(label)))
    return dataset
def read_dbpedia(dbpedia_dir, split, shrink=1, char_based=False):
    """Read one DBpedia split from a directory of CSV files.

    Args:
        dbpedia_dir: directory containing '<split>.csv'.
        split: split name, e.g. 'train' or 'test'.
        shrink: keep only every `shrink`-th example (1 keeps all).
        char_based: tokenize into characters instead of words.

    Returns:
        A list of (tokens, label) tuples with 0-based integer labels.
    """
    dataset = []
    # FIX: use a context manager — the original open()/close() pair leaked
    # the file handle whenever parsing raised an exception mid-loop.
    with open(os.path.join(dbpedia_dir, '{}.csv'.format(split))) as f:
        for i, (label, title, text) in enumerate(csv.reader(f)):
            if i % shrink != 0:
                continue
            label = int(label) - 1  # Index begins from 1
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
def read_sst(sst_dir, split, shrink=1, char_based=False):
    """Read one Stanford Sentiment Treebank split of parse-tree lines.

    Args:
        sst_dir: directory containing '<split>.txt' (one s-expression
            tree per line, root label = sentiment class).
        split: split name, e.g. 'train', 'dev' or 'test'.
        shrink: keep only every `shrink`-th example (1 keeps all).
        char_based: tokenize into characters instead of words.

    Returns:
        A list of (tokens, label) tuples.
    """
    dataset = []
    # FIX: context manager instead of manual open()/close() (the original
    # leaked the handle on exception), and iterate lazily rather than
    # loading the whole file via readlines().
    with open(os.path.join(sst_dir, '{}.txt'.format(split))) as f:
        for i, line in enumerate(f):
            if i % shrink != 0:
                continue
            tree = Tree.fromstring(line)
            # The leaves are the sentence tokens; the root label is the
            # sentiment class.
            tokens = ' '.join(tree.leaves())
            tokens = split_text(normalize_text(tokens), char_based)
            label = int(tree.label())
            dataset.append((tokens, label))
    return dataset
def predict_fn(input_data, model):
    """Run batch text-classification inference for the SageMaker Chainer
    container.

    `input_data` is the NumPy array deserialized by the container's
    `input_fn`; `model` is the `(trained_model, vocab)` pair returned by
    `model_fn`.  Each input sentence is normalized, tokenized and scored
    in a single forward pass.

    Args:
        input_data: a numpy array of input sentences.
        model: the return value of `model_fn`.

    Returns:
        A NumPy array of [joined_tokens, predicted_label, score] rows,
        which the container's `output_fn` serializes back to the client.

    See the sagemaker-python-sdk and sagemaker-chainer-containers
    repositories for details on `input_fn`, `predict_fn` and `output_fn`:
    https://github.com/aws/sagemaker-python-sdk
    https://github.com/aws/sagemaker-chainer-containers
    """
    trained_model, vocab = model

    # Pre-process every sentence into a token list.
    words_batch = [
        split_text(normalize_text(sentence))
        for sentence in input_data.tolist()
    ]

    xs = transform_to_array(words_batch, vocab, with_label=False)
    xs = convert_seq(xs, with_label=False)

    # Inference only: no train-mode behavior, no gradient graph.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = trained_model.predict(xs, softmax=True)
    answers = trained_model.xp.argmax(probs, axis=1)
    scores = probs[trained_model.xp.arange(answers.size), answers].tolist()

    output = [
        [' '.join(words), answer, score]
        for words, answer, score in zip(words_batch, answers, scores)
    ]
    return np.array(output)
def run_batch(device, batchsize=64):
    """Predict labels for tab-separated sentence pairs in mini-batches.

    Reads lines from stdin; each line must hold two sentences separated
    by a tab.  Pairs are queued and scored `batchsize` at a time; blank
    or malformed lines flush the queue and are echoed as '# blank line'.
    """
    # predict labels by batch
    def predict_batch(words_batch):
        # One forward pass over the queued pairs; print one line per pair.
        xs = nlp_utils.transform_to_array2(words_batch, vocab,
                                           with_label=False)
        xs = nlp_utils.convert_seq2(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            probs = model.predict(xs['xs1'], xs['xs2'], softmax=True)
        answers = model.xp.argmax(probs, axis=1)
        scores = probs[model.xp.arange(answers.size), answers].tolist()
        for words, answer, score in zip(words_batch, answers, scores):
            print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))

    batch = []
    for l in sys.stdin:
        l = l.strip()
        l_lst = l.split('\t')
        # BUG FIX: the original tested `l_lst < 2`, comparing a list to an
        # int, which raises TypeError on Python 3 for any non-blank line
        # with fewer than two fields.  Compare the field count instead.
        if not l or len(l_lst) < 2:
            if batch:
                predict_batch(batch)
                batch = []
            print('# blank line')
            continue
        text1 = nlp_utils.normalize_text(l_lst[0])
        text2 = nlp_utils.normalize_text(l_lst[1])
        words1 = nlp_utils.split_text(text1, char_based=setup['char_based'])
        words2 = nlp_utils.split_text(text2, char_based=setup['char_based'])
        batch.append((words1, words2))
        if len(batch) >= batchsize:
            predict_batch(batch)
            batch = []
    # Flush the final partial batch, if any.
    if batch:
        predict_batch(batch)
def run_online(device):
    """Predict a label for each stdin line, one at a time.

    For every non-blank line, prints the predicted label, its softmax
    score and the normalized tokens; blank lines are echoed as comments.
    """
    for raw in sys.stdin:
        raw = raw.strip()
        if not raw:
            print('# blank line')
            continue
        normalized = nlp_utils.normalize_text(raw)
        words = nlp_utils.split_text(
            normalized, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        # Inference only: disable train-mode behavior and gradient graph.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
def read_and_label(posneg, label):
    """Collect (tokens, label) pairs from aclImdb/<split>/<posneg>/*.

    Closes over `path`, `split`, `shrink`, `char_based`, `fine_grained`
    and `fg_label_dict` from the enclosing scope.
    """
    dataset = []
    pattern = os.path.join(path, 'aclImdb', split, posneg, '*')
    for i, f_path in enumerate(glob.glob(pattern)):
        if i % shrink != 0:
            continue
        with io.open(f_path, encoding='utf-8', errors='ignore') as f:
            text = f.read().strip()
        tokens = split_text(normalize_text(text), char_based)
        # Fine-grained mode derives the rating from the file name,
        # e.g. .../pos/200_8.txt -> 8; otherwise keep the coarse label.
        if fine_grained:
            label = fg_label_dict[f_path.split('_')[-1][:-4]]
        dataset.append((tokens, label))
    return dataset
def run_online(gpu):
    """Score stdin lines one by one on the given device.

    Prints 'label<TAB>score<TAB>tokens' for each non-blank line and
    echoes '# blank line' for empty input.
    """
    for line in sys.stdin:
        stripped = line.strip()
        if not stripped:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(stripped)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=gpu, with_label=False)
        # Inference only: no train-mode behavior, no gradient graph.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))