def get_and_save_ner(unprocessed_text: str, lang_code: str, doc_title: str):
    """Run named-entity recognition on a text and save the rendered markup.

    The English spaCy model is loaded when ``lang_code`` is ``'en'``; any
    other language code falls back to the multilingual model.

    Args:
        unprocessed_text: Raw text to run through the spaCy pipeline.
        lang_code: Language code of the text; only ``'en'`` is special-cased.
        doc_title: Title used to derive the output file name. Every non-word
            or whitespace character is replaced by ``_`` and one trailing
            underscore is removed.

    Returns:
        A 4-tuple ``(entity_label_pairs, label_counts, ner_text, html_title)``:
        the ``(text, label)`` pair of every entity found, a ``Counter`` of the
        entity labels, the value returned by ``displacy.render`` and the path
        of the file that value was written to.

    Raises:
        OSError: If the output file under ``static/labelled_texts/`` cannot
            be opened for writing.
    """
    if lang_code == 'en':
        nlp = en_core_web_sm.load()
    else:
        nlp = xx_ent_wiki_sm.load()
    doc = nlp(unprocessed_text)

    entity_label_pairs = [(ent.text, ent.label_) for ent in doc.ents]
    label_counts = Counter(label for _, label in entity_label_pairs)

    html_title = re.sub(r'[\W\s]', '_', doc_title)
    # endswith() rather than indexing [-1]: an empty title must not raise
    # IndexError, it simply yields the file name "html_.html".
    if html_title.endswith('_'):
        html_title = html_title[:-1]
    html_title = f'static/labelled_texts/html_{html_title}.html'

    ner_text = displacy.render(doc, style='ent')
    with open(html_title, 'w', encoding='utf-8') as html_file:
        html_file.write(ner_text)
    return entity_label_pairs, label_counts, ner_text, html_title
def run_english(sent):
    """Tag every space-separated token of each named entity in ``sent``.

    The text is processed with the multilingual ``xx_ent_wiki_sm`` model,
    which is loaded on every call. Each entity's text is split on single
    spaces and every resulting piece is paired with the entity's mapped label:
    ``PER`` becomes ``PERSON``, ``ORG`` becomes ``ORGANIZATION``, ``GPE``
    stays ``GPE`` and any other label becomes ``MISC``.

    :param sent: text to analyse
    :return: list of ``(token, label)`` tuples in entity order
    """
    import spacy
    import xx_ent_wiki_sm

    label_names = {'PER': 'PERSON', 'ORG': 'ORGANIZATION', 'GPE': 'GPE'}

    pipeline = xx_ent_wiki_sm.load()
    parsed = pipeline(sent)

    tagged_tokens = []
    for entity in parsed.ents:
        mapped_label = label_names.get(entity.label_, 'MISC')
        # A single-word entity splits into exactly one piece, so one loop
        # covers both the single-word and the multi-word case.
        for piece in entity.text.split(' '):
            tagged_tokens.append((piece, mapped_label))
    return tagged_tokens


# Example call: run_english("Alex prezinta licenta.")
import json
import re
import pandas as pd
import numpy as np
from post import Post
from polyglot.detect import Detector
import spacy
import xx_ent_wiki_sm
import multiprocessing as mp
from pathlib import Path

# Module-level setup: all of the following runs at import time.
# Per the spaCy docs, require_gpu() raises if no GPU can be used.
spacy.require_gpu()
# Multilingual spaCy pipeline shared by the module.
nlp = xx_ent_wiki_sm.load()
# Presumably language codes that should not count as real detections
# -- TODO confirm against the code that consumes this set.
false_langs = {"kn", "un", "or", "chr", "xx"}
# First column of translation_prob.csv read as strings, then turned into a
# set so membership checks are cheap.
translation_words = np.loadtxt("translation_prob.csv", usecols=0, dtype="str")
translation_words = set(translation_words)


def code_switch_polyglot(country: str, translation: bool = True):
    """
    Function to find code switch posts given a country

    :param translation: bool if we should remove translation posts
    :param country: str country to find code switching posts in
    :return: array of Post objects
    """
    # NOTE(review): valid_countries is not defined in the visible part of
    # the file -- confirm where it is assigned.
    global valid_countries
    comments = []
    # Hard-coded absolute folder; presumably holds the posts to scan
    # -- TODO confirm.
    input_folder = Path("/ais/hal9000/masih/codeswitch/allposts/")
    # NOTE(review): the remainder of this function is not visible in the
    # reviewed chunk.
import csv
import json

import pandas as pd
import spacy
import xx_ent_wiki_sm  # Multi-language
# import en_core_web_sm

# Multilingual pipeline used by ner(); the English model is the commented
# alternative.
en_model = xx_ent_wiki_sm.load()  # en_core_web_sm.load()

sample_text = "WTF, die grüne Grenze zwischen Österreich und Deutschland ist 800 km lang. Kein Problem für #Flüchtlinge. #Grenzkontrollen ist keine Lösung!"

nlp = spacy.load('xx_ent_wiki_sm')  # ('en_core_web_sm')

# Disabled: build sample_text from the raw tweet CSV instead.
# df = pd.read_csv(r"C:\Users\insan\OneDrive\Desktop\task_initial\Tweet_data\raw_tweet.csv")
# for index, row in df.iterrows():
#     # print(row['Tweet'])
#     # sample_text += (row['Tweet'])


def ner(text):
    """Return the named entities found in ``text``.

    Args:
        text: Text to run through the module-level ``en_model`` pipeline.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, empty when the
        model finds no entities.
    """
    ent_docs = en_model(text)
    return [(ent.text, ent.label_) for ent in ent_docs.ents]


# Run the model once and reuse the result for printing and for the table.
entities = ner(sample_text)
print(entities)

# Build the columns with comprehensions instead of unpacking zip(*entities):
# unpacking raises ValueError when no entity is found, whereas this yields
# two empty columns.
text_list = [text for text, _ in entities]
ne_list = [label for _, label in entities]

d = {'Tweet': text_list, 'Named Entity': ne_list}
df_2 = pd.DataFrame(data=d, columns=['Tweet', 'Named Entity'])
print(df_2)

doc = nlp(sample_text)
import re
import time
from urllib.request import urlopen, Request

import en_core_web_sm  # download via python -m spacy download en_core_web_sm
import nltk
import xx_ent_wiki_sm  # download via python -m spacy download xx_ent_wiki_sm

nlpmultilang = xx_ent_wiki_sm.load()
nlpeng = en_core_web_sm.load()

# Restored the garbled nltk.download(...) calls: these are the resources
# behind nltk.word_tokenize and nltk.pos_tag used in tokenize_title().
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

TIMEOUT_LIMIT = 32

# POS tags dropped from titles: CC (coordinating conjunction), DT (determiner),
# EX (existential there), IN (preposition or subordinating conjunction),
# PDT, PRP, PRP$, RP, TO, UH, WP, WP$, WRB, WDT.
_EXCLUDED_POS_TAGS = frozenset({
    'CC', 'DT', 'EX', 'IN', 'PDT', 'PRP', 'PRP$',
    'RP', 'TO', 'UH', 'WP', 'WP$', 'WRB', 'WDT',
})


def tokenize_title(title):
    """Split title into initial tokens.

    The title is word-tokenized and POS-tagged with NLTK, then every
    ``(token, tag)`` pair whose tag is in ``_EXCLUDED_POS_TAGS`` is removed.

    Args:
        title: Title text to tokenize.
    """
    tokens = nltk.word_tokenize(title)
    tagged = nltk.pos_tag(tokens)
    # Keep only the (token, tag) pairs whose tag is not excluded.
    tagged = [pair for pair in tagged if pair[1] not in _EXCLUDED_POS_TAGS]
    # TODO: potentially develop this to gather by clauses
    # NOTE(review): the remainder of this function is not visible in the
    # reviewed chunk.