Example 1
def get_imdb(vocab=None, shrink=1, fine_grained=False, char_based=False):
    tmp_path = download_imdb()

    print('read imdb')
    train = read_imdb(tmp_path,
                      'train',
                      shrink=shrink,
                      fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(tmp_path,
                     'test',
                     shrink=shrink,
                     fine_grained=fine_grained,
                     char_based=char_based)

    shutil.rmtree(tmp_path)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
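Every loader on this page leans on a pair of helpers from the Chainer text-classification example's nlp_utils module, make_vocab and transform_to_array, whose implementations are not shown here. The following is a minimal sketch inferred only from how they are called above, assuming each dataset is a list of (token_list, label) pairs and a vocab is a dict mapping tokens to integer ids with '<unk>' reserved for out-of-vocabulary tokens; the size and frequency thresholds are assumptions.

import numpy

def make_vocab(dataset, max_vocab_size=20000, min_freq=2):
    # Count token frequencies over the training set (sketch; thresholds are assumptions).
    counts = {}
    for tokens, _ in dataset:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    vocab = {'<eos>': 0, '<unk>': 1}
    for token, count in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
        if len(vocab) >= max_vocab_size or count < min_freq:
            break
        vocab[token] = len(vocab)
    return vocab

def transform_to_array(dataset, vocab, with_label=True):
    # Map each token to its id ('<unk>' when missing); wrap labels as int32 arrays.
    unk = vocab['<unk>']
    if with_label:
        return [(numpy.array([vocab.get(t, unk) for t in tokens], numpy.int32),
                 numpy.array([label], numpy.int32))
                for tokens, label in dataset]
    return [numpy.array([vocab.get(t, unk) for t in tokens], numpy.int32)
            for tokens in dataset]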
Example 2
def get_other_text_dataset(name,
                           vocab=None,
                           shrink=1,
                           char_based=False,
                           seed=777):
    assert (name in [
        'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa', 'rt-polarity',
        'subj'
    ])
    datasets = download_other_dataset(name)
    train = read_other_dataset(datasets[0],
                               shrink=shrink,
                               char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(datasets[1],
                                  shrink=shrink,
                                  char_based=char_based)
    else:
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example 3
def get_dbpedia(vocab=None, shrink=1, char_based=False):
    tf = download_dbpedia()

    print('read dbpedia')
    train = read_dbpedia(tf, 'train', shrink=shrink, char_based=char_based)
    test = read_dbpedia(tf, 'test', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example 4
def get_quizbowl(data_dir='data/nn_guesser',
                 split_sentences=True,
                 num_answers=-1,
                 min_answer_freq=-1):
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    train_dir = os.path.join(data_dir, 'train.json')
    dev_dir = os.path.join(data_dir, 'dev.json')
    answers_dir = os.path.join(data_dir, 'answers.json')
    existence = [os.path.isfile(x) for x in [train_dir, dev_dir, answers_dir]]

    if all(existence):
        with open(train_dir, 'r') as f:
            train = json.loads(f.read())
        with open(dev_dir, 'r') as f:
            dev = json.loads(f.read())
        with open(answers_dir, 'r') as f:
            answers = json.loads(f.read())
    else:
        train, dev, answers = load_quizbowl(split_sentences, num_answers,
                                            min_answer_freq)
        with open(train_dir, 'w') as f:
            f.write(json.dumps(train))
        with open(dev_dir, 'w') as f:
            f.write(json.dumps(dev))
        with open(answers_dir, 'w') as f:
            f.write(json.dumps(answers))

    print('# train data: {}'.format(len(train)))
    print('# dev data: {}'.format(len(dev)))
    print('# class: {}'.format(len(answers)))

    vocab_dir = os.path.join(data_dir, 'vocab.json')
    if os.path.isfile(vocab_dir):
        with open(vocab_dir, 'r') as f:
            vocab = json.loads(f.read())
    else:
        vocab = make_vocab(train)
        with open(vocab_dir, 'w') as f:
            f.write(json.dumps(vocab))

    print('# vocab: {}'.format(len(vocab)))

    train = transform_to_array(train, vocab)
    dev = transform_to_array(dev, vocab)

    return train, dev, vocab, answers
Example 5
def convert_into_features_using_vocab(sentences, vocab):
    contents = []
    for doc_id, sent, tokens in sentences:
        features = [token['lemma'] for token in tokens]
        contents.append(features)

    features = transform_to_array(contents, vocab, with_label=False)
    return features
Example 6
def get_sst(vocab=None, shrink=1, char_based=False):
    sst_dir = os.path.join(DATA_DIR, 'trees')
    if not os.path.exists(sst_dir):
        download_sst()

    print('read sst')
    train = read_sst(sst_dir, 'train', shrink=shrink, char_based=char_based)
    test = read_sst(sst_dir, 'dev', shrink=shrink, char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example 8
def predict_batch(words_batch):
    xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = model.predict(xs, softmax=True)
    answers = model.xp.argmax(probs, axis=1)
    scores = probs[model.xp.arange(answers.size), answers].tolist()
    for words, answer, score in zip(words_batch, answers, scores):
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
Example 10
def get_imdb(vocab=None, shrink=1, fine_grained=False,
             char_based=False):
    tmp_path = download_imdb()

    print('read imdb')
    train = read_imdb(tmp_path, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(tmp_path, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    shutil.rmtree(tmp_path)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example 11
def get_imdb(vocab=None, shrink=1, fine_grained=False,
             char_based=False):
    imdb_path = os.path.join(DATA_DIR, 'aclImdb')
    if not os.path.exists(imdb_path):
        download_imdb()

    print('read imdb')
    train = read_imdb(DATA_DIR, 'train',
                      shrink=shrink, fine_grained=fine_grained,
                      char_based=char_based)
    test = read_imdb(DATA_DIR, 'test',
                     shrink=shrink, fine_grained=fine_grained,
                     char_based=char_based)

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
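For context, here is a hedged usage sketch of how the (train, test, vocab) triple returned by these loaders is typically consumed in a Chainer setup; the batch size, the shrink value, and the choice of SerialIterator are assumptions and not part of the examples above.

import chainer

# Hypothetical usage sketch: keep 1/100 of the data for a quick run and wrap the
# transformed datasets in iterators for a standard Chainer training loop.
train, test, vocab = get_imdb(shrink=100)
train_iter = chainer.iterators.SerialIterator(train, batch_size=64)
test_iter = chainer.iterators.SerialIterator(test, batch_size=64,
                                             repeat=False, shuffle=False)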
Example 12
def predict_fn(input_data, model):
    """
    This function receives a NumPy array and makes a prediction on it using the model returned
    by `model_fn`.
    
    The default predictor used by `Chainer` serializes input data to the 'npy' format:
    https://docs.scipy.org/doc/numpy-1.14.0/neps/npy-format.html

    The Chainer container provides an overridable pre-processing function `input_fn`
    that accepts the serialized input data and deserializes it into a NumPy array.
    `input_fn` is invoked before `predict_fn` and passes its return value to this function
    (as `input_data`).
    
    The Chainer container provides an overridable post-processing function `output_fn`
    that accepts this function's return value and serializes it back into `npy` format, which
    the Chainer predictor can deserialize back into a NumPy array on the client.

    Args:
        input_data: a numpy array containing the data serialized by the Chainer predictor
        model: the return value of `model_fn`
    Returns:
        a NumPy array containing predictions which will be returned to the client


    For more on `input_fn`, `predict_fn` and `output_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    trained_model, vocab = model

    words_batch = []
    for sentence in input_data.tolist():
        text = normalize_text(sentence)
        words = split_text(text)
        words_batch.append(words)

    xs = transform_to_array(words_batch, vocab, with_label=False)
    xs = convert_seq(xs, with_label=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = trained_model.predict(xs, softmax=True)
    answers = trained_model.xp.argmax(probs, axis=1)
    scores = probs[trained_model.xp.arange(answers.size), answers].tolist()

    output = []
    for words, answer, score in zip(words_batch, answers, scores):
        output.append([' '.join(words), answer, score])

    return np.array(output)
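The docstring above refers to a `model_fn` whose return value is unpacked into (trained_model, vocab). A minimal sketch of what such a `model_fn` could look like, assuming (hypothetically) that training saved the network with chainer.serializers.save_npz as model.npz and the vocabulary as vocab.json under model_dir; `build_classifier` is a stand-in for whatever network constructor the training script actually used.

import json
import os

import chainer

def model_fn(model_dir):
    # Load the vocabulary saved at training time (assumed file name).
    with open(os.path.join(model_dir, 'vocab.json')) as f:
        vocab = json.load(f)
    # `build_classifier` is hypothetical; it must rebuild the same network
    # architecture that was trained, sized to the vocabulary.
    trained_model = build_classifier(n_vocab=len(vocab))
    chainer.serializers.load_npz(
        os.path.join(model_dir, 'model.npz'), trained_model)
    return trained_model, vocab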
Example 14
def get_other_text_dataset(name, vocab=None, shrink=1,
                           char_based=False, seed=777):
    assert(name in ['TREC', 'stsa.binary', 'stsa.fine',
                    'custrev', 'mpqa', 'rt-polarity', 'subj'])
    datasets = download_other_dataset(name)
    train = read_other_dataset(
        datasets[0], shrink=shrink, char_based=char_based)
    if len(datasets) == 2:
        test = read_other_dataset(
            datasets[1], shrink=shrink, char_based=char_based)
    else:
        numpy.random.seed(seed)
        alldata = numpy.random.permutation(train)
        train = alldata[:-len(alldata) // 10]
        test = alldata[-len(alldata) // 10:]

    if vocab is None:
        print('construct vocabulary based on frequency')
        vocab = make_vocab(train)

    train = transform_to_array(train, vocab)
    test = transform_to_array(test, vocab)

    return train, test, vocab
Example 15
def run_online(device):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
Example 16
def run_online(gpu):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=gpu, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
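Both run_online variants, like predict_batch above, also depend on nlp_utils.convert_seq, which is not shown on this page. A minimal sketch, assuming it only moves each variable-length sequence to the requested device (None or a negative id meaning CPU) and, when with_label=True, splits (tokens, label) pairs into a dict with 'xs' and 'ys' keys.

import chainer

def convert_seq(batch, device=None, with_label=True):
    # Sketch: transfer every array to the target device, keeping ragged lengths.
    def to_device_batch(xs):
        if device is None or device < 0:
            return [chainer.backends.cuda.to_cpu(x) for x in xs]
        return [chainer.backends.cuda.to_gpu(x, device) for x in xs]

    if with_label:
        return {'xs': to_device_batch([x for x, _ in batch]),
                'ys': to_device_batch([y for _, y in batch])}
    return to_device_batch(batch)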