import pandas as pd
from fastai.text.transform import Tokenizer, Vocab


def parse_test_data():
    """
    - test_texts
    - test_labels
    :return:
    """
    pass


def tokenizer(texts):  # tokenize a list of raw strings with fastai's default rules
    tok = Tokenizer(lang='en')
    return tok.process_all(texts)


if __name__ == "__main__":
    # 1. Download data
    # untar_data(URI)

    # 2. Read data and save with 'normal' format: text, label
    # texts, labels, label_index = parse_text_data()
    # df = pd.DataFrame.from_dict({'text': texts, 'label': labels})
    # df.to_csv('./data/20_newsgroup.csv', index=None)

    # 3. Tokenize text to create vocabulary
    df = pd.read_csv('./data/20_newsgroup.csv')

    # Tokenize a small sample (the first 10 documents) to keep the demo quick
    tokens = tokenizer(df[:10]['text'].tolist())
    vocab = Vocab.create(tokens, max_vocab=1000, min_freq=2)
    print(vocab.itos)  # list mapping index -> token
    print(vocab.stoi)  # dict mapping token -> index
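    # An added illustration, not in the original script: numericalize the
    # first document's tokens with the new vocabulary
    sample_ids = vocab.numericalize(tokens[0])
    print(sample_ids[:20])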

    # 4. create embedding matrix from pretrained word vectors
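    # A minimal sketch of this step, assuming a GloVe-format text file
    # (one word followed by its floats per line); the path and the 100-d
    # size below are placeholders, not part of the original script.
    import numpy as np
    emb_dim = 100
    word_vectors = {}
    with open('./data/glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word_vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    # One row per vocabulary entry; words without a pretrained vector
    # keep the all-zero initialization.
    embedding_matrix = np.zeros((len(vocab.itos), emb_dim), dtype=np.float32)
    for i, word in enumerate(vocab.itos):
        vec = word_vectors.get(word)
        if vec is not None:
            embedding_matrix[i] = vec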
import numpy
import torch
from torch.nn import functional
from torch import nn
from fastai.text.transform import Vocab
import unidecode
import string

# Taken from https://gist.github.com/jvns/b6dda36b2fdcc02b833ed5b0c7a09112
# Download Hans Christian Anderson's fairy tales
# !wget -O fairy-tales.txt https://www.gutenberg.org/cache/epub/27200/pg27200.txt > /dev/null 2>&1
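# Or, without wget (a hedged alternative, not part of the original gist):
# import urllib.request
# urllib.request.urlretrieve(
#     'https://www.gutenberg.org/cache/epub/27200/pg27200.txt',
#     'fairy-tales.txt')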

# Normalize to plain ASCII so the character vocabulary stays small
raw_text = unidecode.unidecode(open('fairy-tales.txt').read())
# Remove the table of contents & Gutenberg preamble
text = raw_text[5000:]
# Build a character-level vocabulary (each "token" is a single character)
v = Vocab.create((x for x in text), max_vocab=400, min_freq=1)
num_letters = len(v.itos)
# training_set = torch.Tensor(v.numericalize([x for x in text])).type(torch.LongTensor).cuda()
training_set = torch.Tensor(v.numericalize([x for x in text])).type(torch.LongTensor)
training_set = training_set[:100000]  # keep only the first 100k characters
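
# A hedged sketch, not from the original gist: one way to slice the flat
# id tensor into (input, target) rows for next-character prediction.
# seq_len is an assumed placeholder, not a value the gist prescribes.
seq_len = 40
n = (len(training_set) - 1) // seq_len * seq_len
inputs = training_set[:n].view(-1, seq_len)        # rows of seq_len char ids
targets = training_set[1:n + 1].view(-1, seq_len)  # same rows shifted by one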


class MyLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.h2o = nn.Linear(hidden_size, input_size)  # hidden state -> one logit per character
        self.input_size = input_size
        self.hidden = None  # placeholder for the LSTM's (h, c) state