def spacy_sent_benchmark(datasets):
    """Benchmark the spaCy sentiment (polarity) model on the given datasets.

    For every dataset name the corpus is loaded with pandas, its ``valence``
    column is mapped through ``to_label``, each text is classified with the
    spaCy text categorizer, and gold and predicted labels are passed to
    ``report``.

    Args:
        datasets: Iterable of dataset names. Supported names are
            ``'euparlsent'`` and ``'lccsent'``.

    Raises:
        ValueError: If a dataset name is not supported. Previously an unknown
            name either raised ``NameError`` or silently re-used the corpus
            loaded in the preceding iteration.
    """
    nlpS = load_spacy_model(textcat='sentiment', vectorError=True)

    # Map the model's category names onto the English label names.
    # Built once, since it does not depend on the input text.
    labels = {'positiv': 'positive', 'neutral': 'neutral', 'negativ': 'negative'}

    def predict(x):
        """Return the English label of the highest-scoring category for *x*."""
        doc = nlpS(x)
        pred = max(doc.cats.items(), key=operator.itemgetter(1))[0]
        return labels[pred]

    for dataset in datasets:
        if dataset == 'euparlsent':
            data = EuroparlSentiment1()
        elif dataset == 'lccsent':
            data = LccSentiment()
        else:
            # Fail loudly instead of benchmarking stale or undefined data.
            raise ValueError(
                "Unknown dataset {!r}; expected 'euparlsent' or 'lccsent'".format(dataset)
            )

        df = data.load_with_pandas()
        df['valence'] = df['valence'].map(to_label)

        # Predict with spaCy sentiment.
        df['pred'] = df.text.map(predict)

        report(df['valence'], df['pred'], 'Spacy sentiment (polarity)', dataset)
def spacy_sent_benchmark(datasets):
    """Benchmark speed and F1 of the spaCy sentiment (polarity) model.

    For every dataset name the corpus is loaded with pandas, its ``valence``
    column is mapped through ``sentiment_score_to_label``, each text is
    classified with the spaCy text categorizer (timed with
    ``print_speed_performance``), and gold and predicted labels are passed to
    ``f1_report``.

    Args:
        datasets: Iterable of dataset names. Supported names are
            ``'euparlsent'`` and ``'lccsent'``.

    Raises:
        ValueError: If a dataset name is not supported. Previously an unknown
            name either raised ``NameError`` or silently re-used the corpus
            loaded in the preceding iteration.
    """
    nlpS = load_spacy_model(textcat='sentiment', vectorError=True)

    # Both lookup tables are loop-invariant, so they are built once.
    # ``labels`` turns the model's category names into English names and
    # ``spellings_map`` turns English names back into the Danish spellings;
    # for the three polarity classes the composition is the identity, and any
    # other category raises ``KeyError`` exactly as before.
    labels = {'positiv': 'positive', 'neutral': 'neutral', 'negativ': 'negative'}
    spellings_map = {'subjective': 'subjektivt', 'objective': 'objektivt',
                     'positive': 'positiv', 'negative': 'negativ',
                     'neutral': 'neutral'}

    def predict(x):
        """Return the English label of the highest-scoring category for *x*."""
        doc = nlpS(x)
        pred = max(doc.cats.items(), key=operator.itemgetter(1))[0]
        return labels[pred]

    for dataset in datasets:
        if dataset == 'euparlsent':
            data = EuroparlSentiment1()
        elif dataset == 'lccsent':
            data = LccSentiment()
        else:
            # Fail loudly instead of benchmarking stale or undefined data.
            raise ValueError(
                "Unknown dataset {!r}; expected 'euparlsent' or 'lccsent'".format(dataset)
            )

        df = data.load_with_pandas()
        df['valence'] = df['valence'].map(sentiment_score_to_label)

        # Only the prediction pass is timed.
        start = time.time()
        df['pred'] = df.text.map(lambda x: spellings_map[predict(x)])
        print_speed_performance(start, len(df))

        f1_report(df['valence'], df['pred'], 'Spacy sentiment (polarity)', dataset)
def benchmark_spacy_mdl():
    """Run the spaCy dependency parser on the benchmark sentences and report.

    Reads ``sentences_tokens``, ``num_sentences``, ``num_tokens`` and
    ``deps_true`` from the enclosing scope. Each sentence is converted to a
    ``Doc`` from its token list, parsed, and turned into a list of
    ``(relation, head)`` pairs. Timing is printed through
    ``print_speed_performance`` and the result of ``dependency_report`` is
    printed last.
    """
    def normalize_spacy_head(i, hd):
        # A token that is its own head gets head 0; any other head index is
        # shifted up by one.
        if i == hd:
            return 0
        return hd + 1

    nlp = load_spacy_model()
    parser = nlp.parser

    start = time.time()

    deps_pred = []
    for words in sentences_tokens:
        parsed = parser(nlp.tokenizer.tokens_from_list(words))
        deps_pred.append([
            (tok.dep_.lower(), normalize_spacy_head(idx, tok.head.i))
            for idx, tok in enumerate(parsed)
        ])

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    # Sanity checks: one prediction list per sentence, one pair per token.
    assert len(deps_pred) == num_sentences
    assert sum(len(pairs) for pairs in deps_pred) == num_tokens

    print(dependency_report(deps_true, deps_pred))
def benchmark_spacy_mdl():
    """Run the spaCy entity recognizer on the benchmark sentences and report.

    Reads ``sentences_tokens``, ``num_sentences``, ``num_tokens`` and
    ``sentences_entities`` from the enclosing scope. For every sentence the
    tokens are tagged as ``'O'`` or ``'<iob>-<type>'``. The elapsed time and a
    ``classification_report`` (after ``remove_miscs``) are printed.
    """
    nlp = load_spacy_model()
    ner = nlp.entity

    predictions = []
    start = time.time()

    for words in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(words)
        # The return value is unused; the tags are read from ``doc`` below.
        ner(doc)
        predictions.append([
            t.ent_iob_ if t.ent_iob_ == 'O' else t.ent_iob_ + "-" + t.ent_type_
            for t in doc
        ])

    summary = "Made predictions on {} sentences and {} tokens in {}s"
    print(summary.format(num_sentences, num_tokens, time.time() - start))

    # Sanity check: one prediction list per sentence.
    assert len(predictions) == num_sentences

    print(classification_report(sentences_entities,
                                remove_miscs(predictions),
                                digits=4))
def benchmark_spacy_mdl():
    """Run the spaCy POS tagger on the benchmark sentences and report.

    Reads ``sentences_tokens``, ``num_sentences``, ``num_tokens`` and
    ``tags_true`` from the enclosing scope. Each sentence is converted to a
    ``Doc`` from its token list and tagged; the ``pos_`` of every token is
    collected. The elapsed time and a ``classification_report`` are printed.
    """
    nlp = load_spacy_model()
    tagger = nlp.tagger

    start = time.time()

    tags_pred = []
    for words in sentences_tokens:
        tagged = tagger(nlp.tokenizer.tokens_from_list(words))
        tags_pred.append([tok.pos_ for tok in tagged])

    print('**Spacy model**')
    summary = "Made predictions on {} sentences and {} tokens in {}s"
    print(summary.format(num_sentences, num_tokens, time.time() - start))

    # Sanity checks: one tag list per sentence, one tag per token.
    assert len(tags_pred) == num_sentences
    assert sum(len(tags) for tags in tags_pred) == num_tokens

    print(classification_report(tags_true, tags_pred, digits=4))
def test_predictions(self):
    """Check that the default spaCy model parses, tags and runs NER on a text."""
    pipeline = load_spacy_model()
    analysed = pipeline("Jeg gik en tur med Lars")

    # Checked in this order: parser, entity recognizer, tagger.
    for flag_name in ('is_parsed', 'is_nered', 'is_tagged'):
        self.assertTrue(getattr(analysed, flag_name))
def test_predictions(self):
    """Check the sentiment-enabled spaCy model on a clearly positive sentence.

    The document must be parsed, NER-annotated and tagged, and the
    highest-scoring text category must be ``'positiv'``.
    """
    pipeline = load_spacy_model(textcat='sentiment')
    analysed = pipeline("Vi er glade for spacy!")

    # Checked in this order: parser, entity recognizer, tagger.
    for flag_name in ('is_parsed', 'is_nered', 'is_tagged'):
        self.assertTrue(getattr(analysed, flag_name))

    # Category with the largest score.
    best_category, _ = max(analysed.cats.items(), key=operator.itemgetter(1))
    self.assertEqual(best_category, 'positiv')
def spacy_benchmark():
    """Classify the validation texts with spaCy sentiment and report.

    Reads ``df_val`` from the enclosing scope, stores the predicted category
    of each ``text`` in a new ``'spacy'`` column, and passes the ``polarity``
    and ``spacy`` columns to ``report``.
    """
    nlpS = load_spacy_model(textcat='sentiment', vectorError=True)

    def predict(x):
        """Return the name of the highest-scoring category for *x*."""
        scores = nlpS(x).cats
        best_category, _ = max(scores.items(), key=operator.itemgetter(1))
        return best_category

    df_val['spacy'] = df_val.text.map(predict)

    report(df_val['polarity'], df_val['spacy'], 'Spacy', "twitter_sentiment(val)")
def test_predictions(self):
    """Check the spaCy pipeline and the chunking model built on top of it.

    The document must be parsed, NER-annotated and tagged. The chunker must
    return the same result for the raw text as for the list of token texts,
    and that result must have one entry per token in the document.
    """
    pipeline = load_spacy_model()
    sentence = "Jeg gik en tur med Lars Bo Jensen i går"
    analysed = pipeline(sentence)

    # Checked in this order: parser, entity recognizer, tagger.
    for flag_name in ('is_parsed', 'is_nered', 'is_tagged'):
        self.assertTrue(getattr(analysed, flag_name))

    chunker = load_spacy_chunking_model(spacy_model=pipeline)
    from_text = chunker.predict(sentence)
    token_texts = [token.text for token in analysed]
    from_tokens = chunker.predict(token_texts)

    self.assertEqual(from_text, from_tokens)
    self.assertEqual(len(from_text), len(analysed))
def __init__(self, hisia=True):
    """Load every available sentiment/emotion backend on a best-effort basis.

    Each backend is imported and constructed independently. When a backend
    cannot be imported or initialised, a message is printed and the
    corresponding attribute is set to ``False``, so the remaining backends
    still load.

    Failures are caught with ``except Exception`` rather than a bare
    ``except``, so ``KeyboardInterrupt`` and ``SystemExit`` are no longer
    swallowed while a slow model is loading.

    Attributes set:
        afinn: ``Afinn(language='da')`` or ``False``.
        sent: ``Sentida()`` or ``False``.
        classifier: result of ``load_bert_emotion_model()`` or ``False``.
        classifier_tone: result of ``load_bert_tone_model()`` or ``False``.
        nlp: spaCy model loaded with ``textcat='sentiment'`` or ``False``.
        hisia: the ``Hisia`` class itself (not an instance) or ``False``.

    Args:
        hisia: When true, try to import ``Hisia``; when false, ``self.hisia``
            is set to ``False`` without attempting the import.
    """
    try:
        from afinn import Afinn
        self.afinn = Afinn(language='da')
    except Exception:
        print('afinn not installed')
        self.afinn = False

    try:
        from sentida import Sentida
        self.sent = Sentida()
    except Exception:
        print('sentida not loading')
        self.sent = False

    try:
        from danlp.models import load_bert_emotion_model
        self.classifier = load_bert_emotion_model()
    except Exception:
        self.classifier = False
        print('bert emotion not loading')

    try:
        from danlp.models import load_bert_tone_model
        self.classifier_tone = load_bert_tone_model()
    except Exception:
        print('bert tone not working')
        self.classifier_tone = False

    try:
        from danlp.models import load_spacy_model
        # vectorError=True is a temporary workaround for the
        # "da.vectors not found" error.
        self.nlp = load_spacy_model(textcat='sentiment', vectorError=True)
    except Exception:
        print('spacy sentiment not working')
        self.nlp = False

    if hisia:
        try:
            from hisia import Hisia
            # The class is stored, not an instance.
            self.hisia = Hisia
        except Exception:
            self.hisia = False
            print('hisia not working')
    else:
        self.hisia = False
def benchmark_spacy_mdl():
    """Run the spaCy entity recognizer on the benchmark sentences and report F1.

    Reads ``sentences_tokens``, ``num_sentences``, ``num_tokens`` and
    ``sentences_entities`` from the enclosing scope. For every sentence the
    tokens are tagged as ``'O'`` or ``'<iob>-<type>'``. Timing is printed
    through ``print_speed_performance`` and the result of ``f1_report``
    (after ``remove_miscs``, with ``bio=True``) is printed last.
    """
    nlp = load_spacy_model()
    ner = nlp.entity

    predictions = []
    start = time.time()

    for words in sentences_tokens:
        doc = nlp.tokenizer.tokens_from_list(words)
        # The return value is unused; the tags are read from ``doc`` below.
        ner(doc)
        predictions.append([
            t.ent_iob_ if t.ent_iob_ == 'O' else t.ent_iob_ + "-" + t.ent_type_
            for t in doc
        ])

    print('spaCy:')
    print_speed_performance(start, num_sentences, num_tokens)

    # Sanity check: one prediction list per sentence.
    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
def benchmark_spacy_mdl():
    """Run the spaCy POS tagger on the benchmark sentences and report accuracy.

    Reads ``sentences_tokens``, ``num_sentences``, ``num_tokens`` and
    ``tags_true`` from the enclosing scope. Each sentence is converted to a
    ``Doc`` from its token list and tagged; the ``pos_`` of every token is
    collected. Timing is printed through ``print_speed_performance`` and the
    result of ``accuracy_report`` is printed, followed by a blank line.
    """
    nlp = load_spacy_model()
    tagger = nlp.tagger

    start = time.time()

    tags_pred = []
    for words in sentences_tokens:
        tagged = tagger(nlp.tokenizer.tokens_from_list(words))
        tags_pred.append([tok.pos_ for tok in tagged])

    print('**Spacy model**')
    print_speed_performance(start, num_sentences, num_tokens)

    # Sanity checks: one tag list per sentence, one tag per token.
    assert len(tags_pred) == num_sentences
    assert sum(len(tags) for tags in tags_pred) == num_tokens

    print(accuracy_report(tags_true, tags_pred), end="\n\n")
from utils.lipht_visualization import topic_distribution_barplot
from utils.lipht_lda_utils import PrepareDictionary, lda_predict_df, df_lda_features, get_topics_and_probability, get_lda_topics, df_lda_preprocessing, TrainLDAModel
from utils.lipht_data import getEngine, save_model, list_to_stringlist, stringlist_to_list
import sys
import warnings
import logging.config

# Silence all warnings unless warning options were supplied explicitly
# (sys.warnoptions is then non-empty). This runs before the danlp import
# below, so the order of these statements matters.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Load the Danish spaCy model (done once, at import time of this script).
from danlp import spacy
from danlp.models import load_spacy_model
nlp = load_spacy_model()

# Load models to train
from train_pipeline import pipe as train_pipeline

###############
#### Define global variables
###############

# Establish the connection to the SQL server.
# NOTE(review): the two arguments look like a host name and a database
# name -- confirm against getEngine's signature.
engine = getEngine('THN-P53','GRE')

###############
#### Start main loop