def read(self, text, **kwargs):
    """Read the input file and use spacy to pre-process.

    Args:
        text (str): raw text to pre-process.
        max_length (int): maximum number of characters in a single text for
            spacy, defaults to 1,000,000 characters (1MB).
    """
    max_length = kwargs.get('max_length', 10**6)

    if self.language == 'de':
        nlp = de_core_news_md.load(max_length=max_length)
    else:
        nlp = spacy.load(self.language, max_length=max_length)

    spacy_doc = nlp(text)

    sentences = []
    for sentence_id, sentence in enumerate(spacy_doc.sents):
        sentences.append({
            "words": [token.text for token in sentence],
            "lemmas": [token.lemma_ for token in sentence],
            "POS": [token.pos_ for token in sentence],
            "char_offsets": [(token.idx, token.idx + len(token.text))
                             for token in sentence]
        })

    doc = Document.from_sentences(
        sentences, input_file=kwargs.get('input_file', None), **kwargs)

    return doc

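# A quick illustration of the per-sentence dictionaries the reader above builds.
# This is a standalone sketch (the function name and the German-only setup are
# assumptions for the example, not part of the original class).
import de_core_news_md

def preprocess_german_text(text, max_length=10**6):
    nlp = de_core_news_md.load()
    nlp.max_length = max_length  # allow long documents, mirroring the reader above
    doc = nlp(text)
    sentences = []
    for sentence in doc.sents:
        sentences.append({
            "words": [t.text for t in sentence],
            "lemmas": [t.lemma_ for t in sentence],
            "POS": [t.pos_ for t in sentence],
            "char_offsets": [(t.idx, t.idx + len(t.text)) for t in sentence],
        })
    return sentences

# Each sentence becomes one dict holding parallel token-level lists.
print(preprocess_german_text("Der Hund schläft. Die Katze spielt.")[0]["lemmas"])
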
def prepare_data():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger, lower=True,
                   init_token="<sos>", eos_token="<eos>")
    english = Field(tokenize=tokenize_eng, lower=True,
                    init_token="<sos>", eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".de", ".en"), fields=(german, english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    return train_data, valid_data, test_data, german, english

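# Usage sketch for prepare_data(), assuming the legacy torchtext API
# (Field / Multi30k / BucketIterator) that the function itself relies on.
# Variable names and the batch size are illustrative.
from torchtext.data import BucketIterator

train_data, valid_data, test_data, german, english = prepare_data()
print(f"German vocab: {len(german.vocab)}, English vocab: {len(english.vocab)}")

# Bucketing by source length keeps the amount of padding per batch small.
train_it, valid_it, test_it = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
)
batch = next(iter(train_it))
print(batch.src.shape, batch.trg.shape)  # (src_len, batch_size), (trg_len, batch_size)
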
def lemmatize(sentences, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize all words, e.g. gefunden -> finden."""
    texts_out = []
    nlp = de_core_news_md.load()
    for sent in sentences:
        doc = nlp(sent)
        texts_out.append(' '.join(
            [token.lemma_ for token in doc if token.pos_ in allowed_postags]))
    return texts_out

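# Example call (illustrative sentences; the exact lemmas depend on the model version):
sentences = [
    "Wir haben einen neuen Ansatz gefunden.",
    "Die Ergebnisse wurden gestern veröffentlicht.",
]
print(lemmatize(sentences))
# e.g. ['neu Ansatz finden', 'Ergebnis gestern veröffentlichen']
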
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    """Return the spaCy nlp object corresponding to the language of a document."""
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt md model, fall back to the small one
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it md model, fall back to the small one
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        nlp = None  # avoid an UnboundLocalError when the language is unsupported
    return nlp

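# Usage sketch, assuming the relevant model packages are installed; the
# supported_languages list below is just an example value.
SUPPORTED = ["German", "English", "Spanish", "French", "Portuguese", "Italian"]

nlp_small = get_spacy_tokenizer("German", SUPPORTED, bigmodel_required=False)   # de_core_news_sm
nlp_medium = get_spacy_tokenizer("German", SUPPORTED, bigmodel_required=True)   # de_core_news_md

doc = nlp_medium("Das ist ein kurzer Beispielsatz.")
print([(tok.text, tok.pos_) for tok in doc])
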
from typing import List

import de_core_news_md
import en_core_web_md

spacy_de = de_core_news_md.load()
spacy_en = en_core_web_md.load()


def tokenize_de(text: str) -> List[str]:
    """Tokenize German text from a string into a list of strings."""
    return [token.text for token in spacy_de.tokenizer(text)]


def tokenize_en(text: str) -> List[str]:
    """Tokenize English text from a string into a list of strings."""
    return [token.text for token in spacy_en.tokenizer(text)]

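# Quick check of both tokenizers (outputs shown for orientation; the exact
# splits depend on the spaCy version):
print(tokenize_de("Zwei Männer stehen am Ufer."))
# ['Zwei', 'Männer', 'stehen', 'am', 'Ufer', '.']
print(tokenize_en("Two men are standing on the shore."))
# ['Two', 'men', 'are', 'standing', 'on', 'the', 'shore', '.']
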
def train():
    spacy_ger = de_core_news_md.load()
    spacy_eng = en_core_web_sm.load()

    def tokenize_ger(text):
        return [tok.text for tok in spacy_ger.tokenizer(text)]

    def tokenize_eng(text):
        return [tok.text for tok in spacy_eng.tokenizer(text)]

    german = Field(tokenize=tokenize_ger, lower=True,
                   init_token="<sos>", eos_token="<eos>")
    english = Field(tokenize=tokenize_eng, lower=True,
                    init_token="<sos>", eos_token="<eos>")

    train_data, valid_data, test_data = Multi30k.splits(
        exts=(".de", ".en"), fields=(german, english))

    german.build_vocab(train_data, max_size=10000, min_freq=2)
    english.build_vocab(train_data, max_size=10000, min_freq=2)

    ### We're ready to define everything we need for training our Seq2Seq model ###

    # Training hyperparameters
    num_epochs = 20
    learning_rate = 0.001
    batch_size = 64

    # Model hyperparameters
    load_model = False
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_size_encoder = len(german.vocab)
    input_size_decoder = len(english.vocab)
    output_size = len(english.vocab)
    encoder_embedding_size = 300
    decoder_embedding_size = 300
    hidden_size = 1024  # Needs to be the same for both RNNs
    num_layers = 2
    enc_dropout = 0.5
    dec_dropout = 0.5

    # Tensorboard to get a nice loss plot
    writer = SummaryWriter("runs/loss_plot")
    step = 0

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                          hidden_size, num_layers, enc_dropout).to(device)

    decoder_net = Decoder(
        input_size_decoder,
        decoder_embedding_size,
        hidden_size,
        output_size,
        num_layers,
        dec_dropout,
    ).to(device)

    model = Seq2Seq(encoder_net, decoder_net, len(english.vocab), device).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print(
        f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: The model has "
        f"{count_parameters(model):,} trainable parameters"
    )

    pad_idx = english.vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint_2_2.pth.tar"), model, optimizer)

    sentence = ("ein boot mit mehreren männern darauf wird von einem großen "
                "pferdegespann ans ufer gezogen.")

    for epoch in range(num_epochs):
        print(f"{time.strftime('%Y/%m/%d-%H:%M:%S')}: [Epoch {epoch} / {num_epochs}]")

        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        # save_checkpoint(checkpoint)

        model.eval()
        translated_sentence = translate_sentence(
            model, sentence, german, english, device, max_length=50)
        print(f"Translated example sentence: \n {translated_sentence}")
        model.train()

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and move them to the device
            inp_data = batch.src.to(device)
            target = batch.trg.to(device)

            # Forward prop
            output = model(inp_data, target)

            # Output is of shape (trg_len, batch_size, output_dim), but CrossEntropyLoss
            # doesn't take input in that form. For example, with MNIST we want the
            # output to be (N, 10) and the targets just (N). Here we can view it in a
            # similar way: we have output_words * batch_size predictions to send into
            # our cost function, so we need to do some reshaping. While we're at it,
            # let's also remove the start token.
            output = output[1:].reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()
            loss = criterion(output, target)

            # Back prop
            loss.backward()

            # Clip to avoid exploding gradients; keeps grads within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # Plot to tensorboard
            writer.add_scalar("Training loss", loss, global_step=step)
            step += 1

    score = bleu(test_data[1:100], model, german, english, device)
    print(f"Bleu score {score * 100:.2f}")

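# count_parameters, load_checkpoint, translate_sentence and bleu are helpers
# defined elsewhere in this codebase. For reference, a typical count_parameters
# looks like the sketch below (an assumption, not necessarily the original one):
def count_parameters(model):
    # Sum the element counts of all trainable tensors in the model.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
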
import nltk
nltk.download('punkt')

from germalemma import GermaLemma
from HanTa import HanoverTagger as ht
import math
from langdetect import detect
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer
import spacy
import de_core_news_md

nlp = de_core_news_md.load()

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

"""**Load the data**"""

train_df = pd.read_csv('train.csv', sep=";")
test_reduced_df = pd.read_csv('test_reduced.csv', sep=";")

tagger = ht.HanoverTagger('morphmodel_ger.pgz')


# Run the preprocessing pipeline and keep only nouns.
def preprocess(text):
    try:
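# As context only: a minimal noun-filtering step with the loaded German pipeline
# could look like the standalone sketch below (an illustration, not the original
# preprocess implementation, which is cut off above).
def keep_nouns(text):
    # Lemmatize and keep only tokens tagged as nouns by the German model.
    doc = nlp(text)
    return ' '.join(tok.lemma_ for tok in doc if tok.pos_ == 'NOUN')

print(keep_nouns("Die schnelle Katze jagt die kleinen Mäuse im Garten."))
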
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torch.utils.tensorboard import SummaryWriter
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
from torchtext.data.metrics import bleu_score
import de_core_news_md
import en_core_web_sm

spacy_ger = de_core_news_md.load()
spacy_eng = en_core_web_sm.load()


def tokenize_ger(text):
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_ger, lower=True,
               init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenize_eng, lower=True,
                init_token="<sos>", eos_token="<eos>")
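
# Sanity check: Field.preprocess applies the tokenizer and lowercasing
# (illustrative sentence; <sos>/<eos> and padding are only added later,
# during batching/numericalization after build_vocab has been called):
print(german.preprocess("Ein Boot wird ans Ufer gezogen."))
# ['ein', 'boot', 'wird', 'ans', 'ufer', 'gezogen', '.']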