import io

import chainer
import numpy

import nlp_utils
from nlp_utils import normalize_text, split_text


def read_lines_separated(path, shrink=1, char_based=False):
    dataset = []
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for i, l in enumerate(f):
            # Keep only every `shrink`-th line, and skip lines too short
            # to hold a label and text.
            if i % shrink != 0 or len(l.strip()) < 3:
                continue
            label, text = l.strip().split(None, 1)
            # TODO: shift labels into a contiguous range instead of folding
            # them with modulo; this only supports binary labels.
            label = int(label) % 2
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
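# A minimal usage sketch for read_lines_separated. The path below is
# hypothetical; the reader expects one example per line in the form
# "<integer label><whitespace><text>", e.g.:
#     1 this movie was great
#     0 terrible acting and a dull plot
def _demo_read_lines_separated():
    train = read_lines_separated('data/train.txt', shrink=10)
    print('loaded %d examples; first: %r' % (len(train), train[0]))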
def predict(model, sentence):
    # `model` is a (model, vocab, setup) triple produced at training time.
    model, vocab, setup = model
    sentence = sentence.strip()
    text = nlp_utils.normalize_text(sentence)
    words = nlp_utils.split_text(text, char_based=setup['char_based'])
    xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # TODO: support GPU
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        prob = model.predict(xs, softmax=True)[0]
    answer = int(model.xp.argmax(prob))
    score = float(prob[answer])
    return answer, score
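# A usage sketch for predict, assuming `model_triple` was built elsewhere
# (e.g. by loading a trained model, its vocabulary, and its setup dict;
# the name is hypothetical).
def _demo_predict(model_triple):
    answer, score = predict(model_triple, 'this movie was great')
    print('label=%d  confidence=%.3f' % (answer, score))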
def read_other_dataset(path, shrink=1, char_based=False):
    dataset = []
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for i, l in enumerate(f):
            if i % shrink != 0 or len(l.strip()) < 3:
                continue
            label, text = l.strip().split(None, 1)
            label = int(label) % 2  # TODO: only binary classification is supported
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
def get_vectors(model, sentences):
    # `model` is a (model, vocab, setup) triple produced at training time.
    model, vocab, setup = model
    vectors = []
    for sentence in sentences:
        sentence = sentence.strip()
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # TODO: support GPU
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            vector = model.encoder(xs)
        vectors.append(vector.data[0])
    return numpy.asarray(vectors)
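# A sketch of using get_vectors to embed sentences and compare them with
# cosine similarity; `model_triple` is the same hypothetical
# (model, vocab, setup) triple as above.
def _demo_get_vectors(model_triple):
    sentences = ['this movie was great', 'terrible acting and a dull plot']
    vecs = get_vectors(model_triple, sentences)
    # Cosine similarity between the two sentence vectors.
    a, b = vecs[0], vecs[1]
    sim = float(numpy.dot(a, b)
                / (numpy.linalg.norm(a) * numpy.linalg.norm(b) + 1e-8))
    print('vectors shape:', vecs.shape, 'cosine similarity: %.3f' % sim)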