Example #1
def read_lines_separated(path, shrink=1, char_based=False):
    # Read a "<label> <text>" file into a list of (tokens, label) pairs.
    # normalize_text and split_text are helpers defined alongside this
    # function in the surrounding module.
    dataset = []
    with open(path, encoding='utf-8', errors='ignore') as f:
        for i, l in enumerate(f):
            # Keep every shrink-th line and skip lines too short to hold
            # a label plus some text.
            if i % shrink != 0 or len(l.strip()) < 3:
                continue
            label, text = l.strip().split(None, 1)
            label = int(label) % 2  # TODO: replace this hack with a proper label shift
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
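
A minimal usage sketch, not part of the original source: the path and the shrink value are hypothetical, and each input line is assumed to look like "1 this movie was great".

train = read_lines_separated('data/train.txt', shrink=10)  # hypothetical path
tokens, label = train[0]
print(len(train), label, tokens[:5])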
Example #2

import chainer

import nlp_utils

def predict(model, sentence):
    # ``model`` is the (model, vocab, setup) triple produced at training time.
    model, vocab, setup = model
    sentence = sentence.strip()
    text = nlp_utils.normalize_text(sentence)
    words = nlp_utils.split_text(text, char_based=setup['char_based'])
    xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # TODO: support GPU devices
    # Evaluate deterministically and without building a backprop graph.
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        prob = model.predict(xs, softmax=True)[0]
    answer = int(model.xp.argmax(prob))
    score = float(prob[answer])
    return answer, score
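
A hedged usage sketch: setup_model is a hypothetical loader standing in for however the surrounding project restores the (model, vocab, setup) triple.

model = setup_model('result/best_model.npz')  # hypothetical helper and path
answer, score = predict(model, 'This movie was surprisingly good.')
print('label: {}, confidence: {:.3f}'.format(answer, score))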
Example #3
import io

def read_other_dataset(path, shrink=1, char_based=False):
    # Same line format as above: "<label> <text>".
    dataset = []
    with io.open(path, encoding='utf-8', errors='ignore') as f:
        for i, l in enumerate(f):
            # Keep every shrink-th line and skip lines too short to hold
            # a label plus some text.
            if i % shrink != 0 or len(l.strip()) < 3:
                continue
            label, text = l.strip().split(None, 1)
            label = int(label) % 2  # TODO: only binary classification is supported
            tokens = split_text(normalize_text(text), char_based)
            dataset.append((tokens, label))
    return dataset
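
Another short sketch with assumed inputs: shrink is a cheap way to subsample a large file for smoke tests.

sample = read_other_dataset('data/custom.tsv', shrink=100)  # hypothetical file
labels = [label for _, label in sample]
print(len(sample), 'examples,', sum(labels), 'labeled 1')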
Example #4

import chainer
import numpy

import nlp_utils

def get_vectors(model, sentences):
    # ``model`` is the (model, vocab, setup) triple produced at training time.
    model, vocab, setup = model
    vectors = []
    for sentence in sentences:
        sentence = sentence.strip()
        text = nlp_utils.normalize_text(sentence)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # TODO: support GPU devices
        # Encode without building a backprop graph and keep the raw array.
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            vector = model.encoder(xs)
            vectors.append(vector.data[0])
    return numpy.asarray(vectors)
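
As a sketch of what the returned matrix supports, a plain-numpy cosine similarity between the first sentence and the rest; the sentence list and the model triple are assumptions, not part of the original.

sentences = ['a great film', 'an awful film', 'the weather is nice']
vecs = get_vectors(model, sentences)  # shape: (n_sentences, encoder_units)
norms = numpy.linalg.norm(vecs, axis=1)
sims = numpy.dot(vecs, vecs[0]) / (norms * norms[0])  # cosine similarities
print(sims)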