def load_model(self):
    """
    Loads the German language model.

    :return: the spaCy pipeline for German
    """
    nlp = de_core_news_sm.load()
    return nlp
def __init__(self):
    import spacy  # nlp = spacy.load('de', disable=['ner', 'parser'])
    import de_core_news_sm  # !python -m spacy download de_core_news_sm

    nlp = de_core_news_sm.load(disable=['parser', 'ner'])
    self.processor = nlp
def __init__(self, lang):
    if lang == LANG.EN:
        self.nlp = en_core_web_md.load()
    else:
        self.nlp = de_core_news_sm.load()
    self.stanford_ner = StanfordNERTagger(model, '../models/stanford-ner.jar', encoding='utf-8')
def de_lang(cls):
    me_list = ['ich', 'mein', 'meine']
    embeddings_model = FlairEmbeddingModels().de_lang()
    nlp = de_core_news_sm.load()
    relationship_list = [
        'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
        'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann', 'ehefrau',
        'onkel', 'tante', 'freund'
    ]
    return cls(me_list, embeddings_model, nlp, relationship_list)
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load German tokenizer
    spacy_ger = de_core_news_sm.load()

    # Create tokens using spaCy, lowercased to match the vocabulary
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <sos> and <eos> at the beginning and end, respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Convert each German token to its vocabulary index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to tensor with a batch dimension
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden and cell states
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Stop when the model predicts the end of the sentence
        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # Remove start token
    return translated_sentence[1:]
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
    model.eval()

    # Tokenize the input
    if isinstance(sentence, str):
        nlp = de_core_news_sm.load()
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <sos> and <eos>
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    # Convert tokens to vocabulary indices
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]

    # Add a batch dimension and convert to a tensor
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    with torch.no_grad():
        encoder_outputs = model.encoder(src_tensor)

    hidden = encoder_outputs

    # First decoder input is the index of <sos>
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)

        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)

        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    # Remove start token
    return trg_tokens[1:]
def de_lang(cls):
    nlp = de_core_news_sm.load()
    embeddings_model = FlairEmbeddingModels().de_lang()

    # PP: e.g. 'Ich habe einen Sohn', 'Ich habe einen kleinen Bruder'
    # NP: e.g. 'Meine kleine Schwester'
    grammar = r"""
    PP: {<PRON><AUX><DET><ADJ>?<NOUN>}
    NP: {<DET><ADJ>?<NOUN>}
    REL: {<PP>|<NP>}"""

    relationship_list = [
        'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
        'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann', 'ehefrau',
        'onkel', 'tante', 'freund'
    ]
    me_list = ['ich', 'mein', 'meine']

    return cls(nlp, grammar, relationship_list, me_list, embeddings_model)
def get_spacy_tokenizer(default_lingo, supported_languages, bigmodel_required):
    '''Returns the spaCy nlp object corresponding to the language of a document.'''
    if default_lingo in supported_languages:
        if not bigmodel_required:
            if default_lingo == "German":
                import de_core_news_sm
                nlp = de_core_news_sm.load()
            elif default_lingo == "English":
                import en_core_web_sm
                nlp = en_core_web_sm.load()
            elif default_lingo == "Spanish":
                import es_core_news_sm
                nlp = es_core_news_sm.load()
            elif default_lingo == "French":
                import fr_core_news_sm
                nlp = fr_core_news_sm.load()
            elif default_lingo == "Portuguese":
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                import it_core_news_sm
                nlp = it_core_news_sm.load()
        else:
            if default_lingo == "German":
                import de_core_news_md
                nlp = de_core_news_md.load()
            elif default_lingo == "English":
                import en_core_web_md
                nlp = en_core_web_md.load()
            elif default_lingo == "Spanish":
                import es_core_news_md
                nlp = es_core_news_md.load()
            elif default_lingo == "French":
                import fr_core_news_md
                nlp = fr_core_news_md.load()
            elif default_lingo == "Portuguese":
                # there is no pt_md model
                import pt_core_news_sm
                nlp = pt_core_news_sm.load()
            else:
                # there is no it_md model
                import it_core_news_sm
                nlp = it_core_news_sm.load()
    else:
        print("NOT A SUPPORTED LANGUAGE!")
        return None
    return nlp
def _nlp(spacy_module: str) -> Optional[NLP]:
    print(f"Loading spaCy language model for '{spacy_module}'")
    if spacy_module == 'en':
        nlp = en_core_web_sm.load()
    elif spacy_module == 'es':
        nlp = es_core_news_sm.load()
    elif spacy_module == 'de':
        nlp = de_core_news_sm.load()
    elif spacy_module == 'fr':
        nlp = fr_core_news_sm.load()
    elif spacy_module == 'it':
        nlp = it_core_news_sm.load()
    elif spacy_module == 'pt':
        nlp = pt_core_news_sm.load()
    else:
        raise ValueError(f'Unsupported language {spacy_module}')
    return nlp
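# Illustrative usage of the _nlp dispatcher above (not from the original source;
# the sample sentence and the printed fields are assumptions for demonstration).
nlp = _nlp('de')
doc = nlp('Berlin ist die Hauptstadt von Deutschland.')
print([(ent.text, ent.label_) for ent in doc.ents])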
def get_sentiment_scores(data, emoji_dict):
    nlp = de_core_news_sm.load()
    sentiws = spaCySentiWS(sentiws_path="data/sentiws")
    nlp.add_pipe(sentiws)
    scores = np.zeros((len(data), 1))
    for i in range(len(data)):
        doc = nlp(data[i])
        for token in doc:
            # SentiWS polarity score, if the token is covered by the lexicon
            if token._.sentiws:
                scores[i][0] += token._.sentiws
            # otherwise check for escaped emoji codes of the form 'U000XXXXX'
            elif str(token).startswith('U0') and len(str(token)) == 10:
                emoji = str(token)
                emoji = emoji.replace("U000", "0x")
                emoji = emoji.lower()
                if emoji in emoji_dict:
                    scores[i][0] += emoji_dict[emoji]
    return scores
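# A minimal, hypothetical call of get_sentiment_scores above: the sample texts and
# the emoji-to-score mapping are illustrative assumptions, not values from the source.
sample_texts = ["Das Essen war wirklich hervorragend.", "Der Service war leider enttäuschend."]
sample_emoji_dict = {"0x1f600": 0.5, "0x1f621": -0.5}
print(get_sentiment_scores(sample_texts, sample_emoji_dict))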
def find_location_in_query(self, query):
    self.found_cities = []
    nlp = de_core_news_sm.load()
    doc = nlp(query)
    found_locations = set()
    for ent in doc.ents:
        if ent.label_ == "LOC":
            found_locations.add(ent.text)
            for token in doc:
                # If the city name consists of a single word, check whether that word
                # carries the tag "NE", since city names always carry this tag.
                if token.text == ent.text and token.tag_ != "NE":
                    # Words that cannot be assigned the tag "NE" are not recognized as cities.
                    found_locations.discard(token.text)
    self.found_cities = list(found_locations)
    if len(self.found_cities) == 0:
        return None
    return self.found_cities[0]
def translate_sentence(model, sentence, german, english, device, max_length=50):
    # Load German tokenizer
    import de_core_news_sm
    spacy_ger = de_core_news_sm.load()
    # spacy_ger = spacy.load("de")

    # Create tokens using spaCy, lowercased to match the vocabulary
    if isinstance(sentence, str):
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Add <sos> and <eos> at the beginning and end, respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Convert each German token to its vocabulary index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to tensor with a batch dimension
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    outputs = [english.vocab.stoi["<sos>"]]
    for i in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)

        with torch.no_grad():
            output = model(sentence_tensor, trg_tensor)

        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # Remove start token
    return translated_sentence[1:]
def doPreprocessing(DM):
    # read the text
    text = DM.readText()

    # load the spaCy German model
    nlp = de_core_news_sm.load()

    sentences = []
    # mainIndex is needed for the comparison with xml-data
    mainIndex = 1

    # process the text sentence by sentence
    for sent in text:
        sentence = []
        # get the tags from spaCy
        doc = nlp(sent)
        for tok in doc:
            # ignore spaces
            if tok.pos_ != "SPACE":
                # prune the token: not all spaCy infos are needed
                sentence.append(pruneToken(tok, mainIndex))
                mainIndex += 1
        # expand the tokens with morphological information
        expandToken(sentence)
        sentences.append(sentence)

    printAllToFile(sentences, DM)
def translate_annotated_encoder_decoder_de_en(
    model: annotated_encoder_decoder_de_en.EncoderDecoder,
    meta: Dict[str, Any],
    source_text: str,
) -> str:
    spacy_de: German = de_core_news_sm.load()

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    src_tok: List[str] = tokenize_de(source_text)
    src_idx: List[int] = [meta["SRC.vocab.stoi"][x] for x in src_tok] + [
        meta["SRC.vocab.stoi"][meta["EOS_TOKEN"]]
    ]
    src: Tensor = torch.LongTensor(src_idx)
    src_mask: Tensor = (src != meta["SRC.vocab.stoi"][meta["PAD_TOKEN"]]).unsqueeze(-2)
    src_length: Tensor = torch.tensor(len(src))

    # convert to batch size 1
    src = src.unsqueeze(0)
    src_mask = src_mask.unsqueeze(0)
    src_length = src_length.unsqueeze(0)

    output = annotated_encoder_decoder_de_en.greedy_decode(
        model,
        src,
        src_mask,
        src_length,
        max_len=100,
        sos_index=meta["TRG.vocab.stoi"][meta["SOS_TOKEN"]],
        eos_index=meta["TRG.vocab.stoi"][meta["EOS_TOKEN"]],
    )

    return " ".join([meta["TRG.vocab.itos"][x] for x in output])
    else:
        return location
    return location


# Main method
if __name__ == '__main__':
    try:
        # house_number = Housenumber()
        # Read tweets from file and write detected locations to another file
        fileTweets = open("tweets.txt", "r")
        fileLocation = open("locations.txt", "w")
        nlp = de_core_news_sm.load()
        for line in fileTweets:
            tweet = json.loads(line)
            tweetText = tweet["text"]
            # Entity detection
            nlpTweet = nlp(tweetText)
            entities = [(i, i.label_, i.label) for i in nlpTweet.ents]
            # Write results in JSON format to the file
            for obj in entities:
                # try:
                #     location = house_number.add_housenumber(str(obj[0]), tweet["text"])
                # except (Exception) as error:
                location = str(obj[0])
import random
import spacy

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

import en_core_web_sm
import de_core_news_sm

spacy_eng = en_core_web_sm.load()
spacy_ger = de_core_news_sm.load()


def tokenize_ger(text):
    # Tokenization of a German sentence
    return [tok.text for tok in spacy_ger.tokenizer(text)]


def tokenize_eng(text):
    # Tokenization of an English sentence
    return [tok.text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>")
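# A possible continuation of the Field setup above (not in the original snippet):
# with the legacy torchtext API, one would typically load a parallel corpus such as
# Multi30k and build the vocabularies. The dataset choice and the size/frequency
# cut-offs here are assumptions for illustration.
from torchtext.datasets import Multi30k

train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(german, english))
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)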
import string
import unidecode
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
import unicodedata
from tqdm.auto import tqdm
from nltk.corpus import wordnet as wn

import fr_core_news_sm
nlp_fr = fr_core_news_sm.load()
import en_core_web_sm
nlp_en = en_core_web_sm.load()
import de_core_news_sm
nlp_de = de_core_news_sm.load()
import es_core_news_sm
nlp_es = es_core_news_sm.load()
import it_core_news_sm
nlp_it = it_core_news_sm.load()
import pt_core_news_sm
nlp_pt = pt_core_news_sm.load()
import nl_core_news_sm
nlp_nl = nl_core_news_sm.load()

# global variables
wnl = WordNetLemmatizer()
def check_spacy_models(main, lang, pipeline):
    if lang == 'other':
        lang = 'eng'

    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['tokenization', 'sentence_tokenization']:
        nlp_pipelines = ['sbd']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)

    if 'sbd' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sbd' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
def evaulate_parsers(article_de, article_en, article_fr):
    # Get all articles from the database
    # print(f"German Article to parse: {article_de}")
    # print(f"English Article to parse: {article_en}")
    # print(f"French Article to parse: {article_fr}")

    # TODO Create AllenNLP parsing function and call it with the proper values
    allen_scores = []
    allen_scores.append(50)
    allen_scores.append(60)
    allen_scores.append(70)

    # Stanford Parser baseline code
    """
    The Stanford Parser is the baseline for this application; we compare every
    other parser to the output of the Stanford Parser.
    Define models, assign DataFrames to a list.
    """
    config = "tokenize,mwt,pos,lemma,depparse"
    nlp_en = stanza.Pipeline(lang='en', processors=config)
    nlp_de = stanza.Pipeline(lang='de', processors=config)
    nlp_fr = stanza.Pipeline(lang='fr', processors=config)

    df_stanford_en = stanford.parse_stan(article_en, nlp_en)
    df_stanford_de = stanford.parse_stan(article_de, nlp_de)
    df_stanford_fr = stanford.parse_stan(article_fr, nlp_fr)

    """
    spaCy parser: define spaCy models, assign DataFrames to a list.
    """
    df_spacy_de = spacyparser.parse_spacy(article_de, de_core_news_sm.load())
    df_spacy_en = spacyparser.parse_spacy(article_en, en_core_web_sm.load())
    df_spacy_fr = spacyparser.parse_spacy(article_fr, fr_core_news_sm.load())

    # Evaluate parsers against each other
    df_complete_de = pd.concat([df_stanford_de, df_spacy_de], axis=1, sort=False)
    df_complete_en = pd.concat([df_stanford_en, df_spacy_en], axis=1, sort=False)
    df_complete_fr = pd.concat([df_stanford_fr, df_spacy_fr], axis=1, sort=False)

    # German DataFrame combined
    df_complete_de['spacy_eval_upos'] = df_complete_de['upos'].str.lower() == df_complete_de['sp_upos'].str.lower()
    df_complete_de['spacy_eval_deprel'] = df_complete_de['deprel'].str.lower() == df_complete_de['sp_deprel'].str.lower()
    df_complete_de['spacy_eval'] = df_complete_de['spacy_eval_upos'] == df_complete_de['spacy_eval_deprel']

    # English DataFrame combined
    df_complete_en['spacy_eval_upos'] = df_complete_en['upos'].str.lower() == df_complete_en['sp_upos'].str.lower()
    df_complete_en['spacy_eval_deprel'] = df_complete_en['deprel'].str.lower() == df_complete_en['sp_deprel'].str.lower()
    df_complete_en['spacy_eval'] = df_complete_en['spacy_eval_upos'] == df_complete_en['spacy_eval_deprel']

    # French DataFrame combined
    df_complete_fr['spacy_eval_upos'] = df_complete_fr['upos'].str.lower() == df_complete_fr['sp_upos'].str.lower()
    df_complete_fr['spacy_eval_deprel'] = df_complete_fr['deprel'].str.lower() == df_complete_fr['sp_deprel'].str.lower()
    df_complete_fr['spacy_eval'] = df_complete_fr['spacy_eval_upos'] == df_complete_fr['spacy_eval_deprel']

    # Evaluate the parsers against the Stanford parse
    # print(df_stanford_de.equals(df_spacy_de))
    allen_scores = [0, 0, 0]
    spacy_scores = []
    stanford_scores = [1, 1, 1]

    spacy_de_scores = df_complete_de.spacy_eval.value_counts().tolist()
    spacy_en_scores = df_complete_en.spacy_eval.value_counts().tolist()
    spacy_fr_scores = df_complete_fr.spacy_eval.value_counts().tolist()

    spacy_de_score = calculate_score(spacy_de_scores[0], spacy_de_scores[0] + spacy_de_scores[1])
    spacy_en_score = calculate_score(spacy_en_scores[0], spacy_en_scores[0] + spacy_en_scores[1])
    spacy_fr_score = calculate_score(spacy_fr_scores[0], spacy_fr_scores[0] + spacy_fr_scores[1])

    spacy_scores.append(spacy_de_score)
    spacy_scores.append(spacy_en_score)
    spacy_scores.append(spacy_fr_score)

    # The report data sets the Stanford Parser output to 100 by default, as it is the
    # parser we want to compare against. The other parsers are scored by their counts
    # of true and false values in comparison to the Stanford Parser.
    report_data = {
        'de_stan': stanford_scores[0],
        'en_stan': stanford_scores[1],
        'fr_stan': stanford_scores[2],
        'de_spacy': spacy_scores[0],
        'en_spacy': spacy_scores[1],
        'fr_spacy': spacy_scores[2],
        'de_allen': allen_scores[0],
        'en_allen': allen_scores[1],
        'fr_allen': allen_scores[2]
    }
    # report_data = {'de_stan': 100, 'en_stan': 100, 'fr_stan': 100, 'de_spacy': 93.2, 'en_spacy': 92.6, 'fr_spacy': 90.7, 'de_allen': 87.9, 'en_allen': 88.6, 'fr_allen': 90.2}

    return report_data
def __init__(self):
    self.nlp = de_core_news_sm.load()
    # grammar for spaCy POS tags
    # extracts noun phrases (NP) and relationships (REL)
    self.grammar = r"""NP: {<DET>?<ADJ>*<NOUN>?<PROPN|PRON>*}"""
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy

import en_core_web_sm
import de_core_news_sm

spacy_en = en_core_web_sm.load()
spacy_de = de_core_news_sm.load()

from IPython import embed
from model import NAT

input = torch.randint(0, 100, (16, 20)).long()  # input: [N, S]
# torch.cuda.empty_cache()

# vocab_src, vocab_tgt, S, d_embed=512, L=50, nhead=8, num_encoder_layers=6,
# num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation="relu"
model = NAT(vocab_src=100, vocab_tgt=50, S=20, num_encoder_layers=4, num_decoder_layers=4,
            dim_feedforward=512)

num_parameters_train = sum(p.numel() for p in model.parameters() if p.requires_grad)
def check_spacy_models(main, lang, pipeline):
    if pipeline == 'word_tokenization':
        nlp_pipelines = []
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['sentence_tokenization', 'tokenization']:
        nlp_pipelines = ['sentencizer']
        nlp_disable = ['tagger', 'parser', 'ner']
    elif pipeline in ['pos_tagging', 'lemmatization']:
        nlp_pipelines = ['tagger']
        nlp_disable = ['parser', 'ner']

    # Languages with models
    if lang in ['nld', 'eng', 'fra', 'deu', 'ell', 'ita', 'por', 'spa', 'other']:
        if f'spacy_nlp_{lang}' in main.__dict__:
            if main.__dict__[f'spacy_nlp_{lang}'].pipe_names != nlp_pipelines:
                del main.__dict__[f'spacy_nlp_{lang}']

        if f'spacy_nlp_{lang}' not in main.__dict__:
            # Dutch
            if lang == 'nld':
                import nl_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = nl_core_news_sm.load(disable=nlp_disable)
            # English
            elif lang == 'eng':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
            # French
            elif lang == 'fra':
                import fr_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = fr_core_news_sm.load(disable=nlp_disable)
            # German
            elif lang == 'deu':
                import de_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = de_core_news_sm.load(disable=nlp_disable)
            # Greek (Modern)
            elif lang == 'ell':
                import el_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = el_core_news_sm.load(disable=nlp_disable)
            # Italian
            elif lang == 'ita':
                import it_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = it_core_news_sm.load(disable=nlp_disable)
            # Portuguese
            elif lang == 'por':
                import pt_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = pt_core_news_sm.load(disable=nlp_disable)
            # Spanish
            elif lang == 'spa':
                import es_core_news_sm
                main.__dict__[f'spacy_nlp_{lang}'] = es_core_news_sm.load(disable=nlp_disable)
            # Other languages
            elif lang == 'other':
                import en_core_web_sm
                main.__dict__[f'spacy_nlp_{lang}'] = en_core_web_sm.load(disable=nlp_disable)
    # Languages without models
    else:
        # Serbian (Cyrillic) & Serbian (Latin)
        if lang in ['srp_cyrl', 'srp_latn']:
            main.__dict__['spacy_nlp_srp_cyrl'] = spacy.blank('rs')
            main.__dict__['spacy_nlp_srp_latn'] = spacy.blank('rs')
        else:
            main.__dict__[f'spacy_nlp_{lang}'] = spacy.blank(wordless_conversion.to_iso_639_1(main, lang))

    if 'sentencizer' in nlp_pipelines:
        nlp = main.__dict__[f'spacy_nlp_{lang}']

        if 'sentencizer' not in nlp.pipe_names:
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
def text_analysis(numberOfTopics, numberOfTopWords, textData, perplexity):
    """
    Preprocesses the text and fits an LDA model with the given number of topics.

    args:
        numberOfTopics: the number of topics over all documents
        numberOfTopWords: the number of top words to be displayed
        textData: DataFrame of text documents

    returns:
        [t-SNE embedding DataFrame, DataFrame with all results, DataFrame with top words per topic]
    """
    # text preprocessing
    # initialize nlp
    nlp = de_core_news_sm.load()

    # feed the documents into the pipeline
    document_list = [nlp(answer) for answer in textData]

    # delete stop words
    without_stop_words = []
    for doc in document_list:
        without_stop_words.append([token for token in doc if not token.is_stop])

    # lemmatize
    lemma_list = []
    for doc in without_stop_words:
        lemma_list.append([token.lemma_ for token in doc])

    # expressions to clean out
    expressions = [
        ":", "-", "(", ")", "\n", "\n\n", "?", ":", "\'", '\"', ".", ",", "'s",
        "...", "&", "+", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";-)",
        " ", ";", "/", "z.", "b."
    ]

    # actual cleaning
    cleaned_lemma = []
    for doc in lemma_list:
        cleaned_lemma.append([token for token in doc if token not in expressions])

    # convert text to lowercase
    low = []
    for doc in cleaned_lemma:
        low.append([token.lower() for token in doc])

    # drop empty or one-word documents
    final = []
    text_list = []
    for doc, t in zip(low, textData):
        if len(doc) > 1:
            final.append(doc)
            text_list.append(t)

    # remove non-informative words
    final2 = []
    for doc in final:
        final2.append([token for token in doc if token not in ["risiko", "chance", "ki"]])

    # text mining
    # call the vectorizer
    cV = CountVectorizer(tokenizer=dummy, preprocessor=dummy)
    # fit the vectorizer
    cV.fit(final2)
    # create the bag-of-words corpus
    bow_corpus_sk = cV.transform(final2)

    # LDA
    alpha = 0.5  # the higher, the more topics per document
    beta = 0.1   # the higher, the more words of the corpus per topic

    # call the LDA object
    lda_sk = LatentDirichletAllocation(n_components=numberOfTopics,
                                       doc_topic_prior=alpha,
                                       topic_word_prior=beta,
                                       random_state=1)
    # fitting
    lda_sk.fit(bow_corpus_sk)

    """
    # currently disabled
    # pyLDAvis
    vis_sk = pyLDAvis.sklearn.prepare(lda_sk, bow_corpus_sk, cV)
    vis_html = pyLDAvis.prepared_data_to_html(vis_sk, template_type="simple")
    """

    # prepare the DataFrames

    # probability of each word in a topic
    wordPerTopic_sk = pd.DataFrame(
        lda_sk.components_,
        index=["topic" + str(num) for num in range(lda_sk.n_components)],
        columns=cV.get_feature_names())

    # top words for each topic
    top = numberOfTopWords
    topWordPerTopic_sk = pd.DataFrame(
        [[name, rows.sort_values(ascending=False).index.tolist()[:top]]
         for name, rows in wordPerTopic_sk.iterrows()])

    # probability of each topic per document
    topicPerDoc_sk = pd.DataFrame(
        lda_sk.transform(bow_corpus_sk),
        index=["commentary" + str(i) for i in range(len(final2))],
        columns=["topic" + str(i) for i in range(lda_sk.n_components)])
    topTopicPerDoc_sk = topicPerDoc_sk.T.apply(lambda x: x.idxmax())

    # merge the different parts
    merged = pd.DataFrame(topTopicPerDoc_sk).merge(topWordPerTopic_sk, how="left")
    merged = pd.concat([merged, pd.Series(text_list)], axis=1)

    # rename
    merged.columns = ["topic", "words", "text"]

    # split words into separate columns
    merged[["word" + str(n) for n in range(top)]] = pd.DataFrame(merged["words"].tolist())
    newSorting = ["topic"] + ["word" + str(i) for i in range(top)] + ["text"]
    merged = merged.loc[:, newSorting]

    topWords = topWordPerTopic_sk.iloc[:, 0]
    topWords = pd.concat([
        topWords,
        pd.DataFrame(topWordPerTopic_sk.iloc[:, 1].tolist(),
                     columns=["word " + str(n + 1) for n in range(top)])
    ], axis=1)
    topWords.rename(columns={0: "Topic"}, inplace=True)

    # t-SNE for visualization of the document-topic distributions
    bow_embedded = pd.DataFrame(
        TSNE(n_components=2, random_state=5,
             perplexity=perplexity).fit_transform(lda_sk.transform(bow_corpus_sk)))
    bow_embedded["topic"] = merged["topic"]
    bow_embedded["text"] = merged["text"]

    return [bow_embedded, merged, topWords]
import re

import torch

from trainers.misc import embedding_dict
from torchtext import data
from torchtext.datasets.sequence_tagging import CoNLL2000Chunking
from macros import DATA_PATH
from loaders.dictionary import Dictionary, PretrainedDictionary

import en_core_web_sm
import de_core_news_sm

spacy_en = en_core_web_sm.load()
spacy_de = de_core_news_sm.load()

url = re.compile('(<url>.*</url>)')


def chunking102(pretrain=False, emb_type='word'):
    corpus = CoNLLCorpus(emb_type='word', pretrain=pretrain)
    corp = {}
    corp['word2ind'] = corpus.dictionary.word2idx
    print("Vocabulary Size: {}".format(len(corp['word2ind'])))
    corp['ind2word'] = corpus.dictionary.idx2word
    corp['word2vec'] = corpus.dictionary.wv
    corp['id2vec'] = None
    return corp


def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]
def load():
    # spacy.load() won't work with models installed over pip; use de_core_news_sm.load() instead.
    # See https://spacy.io/usage/models#models-loading
    import de_core_news_sm
    return de_core_news_sm.load()
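# Illustrative usage of the load() wrapper above (not part of the original snippet;
# the sample sentence is an assumption).
nlp = load()
doc = nlp("Angela Merkel besuchte gestern Berlin.")
print([(token.text, token.pos_) for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])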
def de_lang(cls):
    nlp = de_core_news_sm.load()
    me_list = ['ich', 'mein', 'meine']
    spacy_per_symbol = 'PER'
    return cls(nlp, me_list, spacy_per_symbol)
#!/usr/bin/python3
import spacy
from spacy import displacy
import pt_core_news_sm
import de_core_news_sm
import en_core_web_sm
from fuzzywuzzy import fuzz, process

NAMED_ENTITY_MINIMUM_LENGTH = 3
SIMILARITY_RATIO_THRESHOLD = 70

NLP_PT = pt_core_news_sm.load()
NLP_DE = de_core_news_sm.load()
NLP_EN = en_core_web_sm.load()

MODELS = {
    'de': NLP_DE,
    'en': NLP_EN,
    'pt': NLP_PT,
}


def _get_nlp_model(language):
    return MODELS[language]


def _get_named_entities(text, language):
    nlp_model = _get_nlp_model(language)
    named_entities = nlp_model(text).ents
    return named_entities
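# Hypothetical usage of the helpers above (the sample text is an illustrative
# assumption, not taken from the original source).
entities = _get_named_entities("Angela Merkel traf sich in Berlin mit Emmanuel Macron.", "de")
for ent in entities:
    print(ent.text, ent.label_)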