Example #1
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (roughly 1 MB of text).
        """

        max_length = kwargs.get('max_length', 10**6)

        if self.language == 'de':
            nlp = de_core_news_md.load(max_length=max_length)
        else:
            nlp = spacy.load(self.language, max_length=max_length)
        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
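Roughly the same preprocessing can be run standalone; the sketch below only builds the per-sentence dictionaries and leaves out the surrounding reader class and the `Document` wrapper, which belong to the host project and are not shown in this excerpt:

import de_core_news_md

nlp = de_core_news_md.load()
spacy_doc = nlp("Berlin ist die Hauptstadt von Deutschland. Die Stadt ist groß.")

sentences = []
for sentence in spacy_doc.sents:
    sentences.append({
        "words": [token.text for token in sentence],
        "lemmas": [token.lemma_ for token in sentence],
        "POS": [token.pos_ for token in sentence],
        "char_offsets": [(token.idx, token.idx + len(token.text))
                         for token in sentence],
    })

print(sentences[0]["words"])  # tokens of the first sentence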
Example #2
def prepare_data():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                        fields=(german,
                                                                english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    return train_data, valid_data, test_data, german, english
Example #3
def lemmatize(sentences, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize all words i.e. gefunden -> finden"""
    texts_out = []
    nlp = de_core_news_md.load()
    for sent in sentences:
        doc = nlp(sent)
        texts_out.append(' '.join(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags]))
    return texts_out
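A small usage sketch for lemmatize (the sample sentences and the output shown in the comment are illustrative only; note that the function loads the model on every call, so for larger corpora you may want to create nlp once outside the function):

sample_sentences = [
    "Die Kinder haben im Park gespielt.",
    "Wir haben gestern ein gutes Ergebnis gefunden.",
]
print(lemmatize(sample_sentences))
# e.g. ['Kind Park spielen', 'gestern gut Ergebnis finden'];
# the exact lemmas depend on the de_core_news_md model version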
Example #4
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''returns the spacy nlp function corresponding to the language of a document'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        # avoid a NameError on the return below when the language is unsupported
        nlp = None
    return nlp
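A hedged usage sketch for get_spacy_tokenizer; the list of supported language names below is an assumption based on the branches above, and each branch requires the corresponding spaCy model package to be installed:

supported = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]

nlp_small = get_spacy_tokenizer("German", supported, bigmodel_required=False)
nlp_medium = get_spacy_tokenizer("German", supported, bigmodel_required=True)

doc = nlp_medium("Das ist ein kurzer Beispielsatz.")
print([(token.text, token.pos_) for token in doc])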
Example #5
from typing import List

import de_core_news_md
import en_core_web_md

spacy_de = de_core_news_md.load()
spacy_en = en_core_web_md.load()


def tokenize_de(text: str) -> List[str]:
    '''Tokenize German text from a string into a list of strings.'''
    return [token.text for token in spacy_de.tokenizer(text)]


def tokenize_en(text: str) -> List[str]:
    '''Tokenize English text from a string into a list of strings.'''
    return [token.text for token in spacy_en.tokenizer(text)]
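Quick sanity check for both tokenizers (the token lists in the comments are illustrative; the exact splits depend on the installed model versions):

print(tokenize_de("Zwei Männer stehen am Ufer des Flusses."))
# ['Zwei', 'Männer', 'stehen', 'am', 'Ufer', 'des', 'Flusses', '.']

print(tokenize_en("Two men are standing on the river bank."))
# ['Two', 'men', 'are', 'standing', 'on', 'the', 'river', 'bank', '.']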
Example #6
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger,
                   lower=True,
                   init_token="<sos>",
                   eos_token="<eos>")

    english = Field(tokenize=tokenize_eng,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                        fields=(german,
                                                                english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get nice loss plot
    writer = SummaryWriter("runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)

    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab),
                    device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has {count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model,
                        optimizer)

    sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

    for epoch in range(num_epochs):
        print(
            f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]"
        )

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        # save_checkpoint(checkpoint)

        model.eval()

        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german,
                                                 english,
                                                 device,
                                                 max_length=50)

        print(f"Translated example sentence: \n {translated_sentence}")

        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and get to cuda
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)

            # print('\n')
            # print('Input', inp_data.shape)
            # print('Target', target.shape)
            # print('Output', output.shape)
            # print('---------------------')

            # Output is of shape (trg_len, batch_size, output_dim), but CrossEntropyLoss
            # doesn't accept input in that form. For example, with MNIST we want the
            # output to be (N, 10) and the targets just (N). Here we can view it in a
            # similar way: we have output_words * batch_size predictions to send into
            # our cost function, so we need to do some reshaping. While we're at it,
            # we also remove the start token (a small shape sketch follows this example).
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            # print("Training loss", loss)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score*100:.2f}")
Example #7
import nltk
nltk.download('punkt')

from germalemma import GermaLemma
from HanTa import HanoverTagger as ht
import math
import pandas as pd  # needed for pd.read_csv below
from langdetect import detect
from sklearn.model_selection import GridSearchCV, train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

import spacy
import de_core_news_md
nlp = de_core_news_md.load()

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

"""**Load the data**"""

train_df = pd.read_csv('train.csv', sep=";")
test_reduced_df = pd.read_csv('test_reduced.csv', sep=";")

tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# Run the preprocessing pipeline and use only nouns.
def preprocess(text):
    try:
Example #8
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator, bleu_score
import de_core_news_md
import en_core_web_sm

spacy_ger = de_core_news_md.load()
spacy_eng = en_core_web_sm.load()


def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_ger,
               lower=True,
               init_token="<sos>",
               eos_token="<eos>")
english = Field(tokenize=tokenize_eng,
                lower=True,
                init_token="<sos>",
                eos_token="<eos>")