Example #1
from collections import Counter
import re

import en_core_web_sm
import xx_ent_wiki_sm
from spacy import displacy


def get_and_save_ner(unprocessed_text: str, lang_code: str, doc_title: str):
    if lang_code == 'en':
        nlp = en_core_web_sm.load()
    else:
        nlp = xx_ent_wiki_sm.load()
    doc = nlp(unprocessed_text)
    entity_label_pairs = [(x.text, x.label_) for x in doc.ents]
    label_counts = Counter([x.label_ for x in doc.ents])

    # Build a filesystem-safe file name from the document title.
    html_title = re.sub(r'[\W\s]', '_', doc_title)
    if html_title.endswith('_'):
        html_title = html_title[:-1]
    html_title = f'static/labelled_texts/html_{html_title}.html'
    # displacy.serve(doc, style='ent')
    ner_text = displacy.render(doc, style='ent')
    with open(html_title, 'w', encoding='utf-8') as html_file:
        html_file.write(ner_text)

    return entity_label_pairs, label_counts, ner_text, html_title
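
# A minimal usage sketch, not part of the original example: it assumes both
# spaCy models are installed and that the static/labelled_texts/ directory
# already exists; the sample sentence and expected output are illustrative.
pairs, counts, html, path = get_and_save_ner(
    "Apple is opening a new office in Berlin.", "en", "Apple Berlin office")
print(counts)  # e.g. Counter({'ORG': 1, 'GPE': 1})
print(path)    # static/labelled_texts/html_Apple_Berlin_office.html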
Example #2
def run_english(sent):
    import spacy
    import xx_ent_wiki_sm

    nlp = xx_ent_wiki_sm.load()
    sentence = nlp(sent)

    entities = [(X.text, X.label_) for X in sentence.ents]

    # Map the multi-language model's labels onto the names used downstream;
    # anything not in the map becomes MISC.
    label_map = {'PER': 'PERSON', 'ORG': 'ORGANIZATION', 'GPE': 'GPE'}

    to_return = []
    for text, label in entities:
        mapped = label_map.get(label, 'MISC')
        # Multi-word entities are split so each token gets its own tuple.
        for token in text.split(' '):
            to_return.append((token, mapped))

    return to_return

#print(run_english("Alex prezinta licenta."))
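
# A minimal usage sketch, not part of the original example; exact entities
# depend on the xx_ent_wiki_sm model version. Multi-word entities come back as
# one tuple per token, and the WikiNER model typically labels places as LOC
# rather than GPE, so they fall through to MISC here.
print(run_english("Barack Obama visited New York."))
# e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ('New', 'MISC'), ('York', 'MISC')]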
Example #3
import json
import re
import pandas as pd
import numpy as np
from post import Post
from polyglot.detect import Detector

import spacy
import xx_ent_wiki_sm
import multiprocessing as mp
from pathlib import Path

spacy.require_gpu()
nlp = xx_ent_wiki_sm.load()
false_langs = {"kn", "un", "or", "chr", "xx"}
translation_words = np.loadtxt("translation_prob.csv", usecols=0, dtype="str")
translation_words = set(translation_words)
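
# A minimal sketch, not part of the original code, of how the polyglot Detector
# import and the false_langs set above are typically combined to decide whether
# a detected language can be trusted; the helper name is hypothetical.
def detect_language(text):
    """Return the detected language code, or None if detection is unreliable."""
    try:
        code = Detector(text, quiet=True).language.code
    except Exception:
        return None
    return None if code in false_langs else code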


def code_switch_polyglot(country, translation=True):
    """
    Find code-switching posts for a given country.
    :param country: str
        country to look for code-switching posts in
    :param translation: bool
        whether posts that are translations should be removed
    :return: array
        array of Post objects
    """
    global valid_countries
    comments = []
    input_folder = Path("/ais/hal9000/masih/codeswitch/allposts/")
Example #4
import spacy
import pandas as pd
import xx_ent_wiki_sm  # multi-language model
# import en_core_web_sm
en_model = xx_ent_wiki_sm.load()
# en_core_web_sm.load()
import csv
import json
df_2 = pd.DataFrame([])

sample_text = "WTF, die grüne Grenze zwischen Österreich und Deutschland ist 800 km lang. Kein Problem für #Flüchtlinge. #Grenzkontrollen ist keine Lösung!"
nlp = spacy.load('xx_ent_wiki_sm')  #('en_core_web_sm')
# df= pd.read_csv(r"C:\Users\insan\OneDrive\Desktop\task_initial\Tweet_data\raw_tweet.csv")
# for index, row in df.iterrows():
#     #print(row['Tweet'])
#     #sample_text+=(row['Tweet'])


def ner(text):
    # Return (entity text, entity label) pairs found by the multi-language model.
    ent_docs = en_model(text)
    return [(ent.text, ent.label_) for ent in ent_docs.ents]


print(ner(sample_text))
# Note: the 'Tweet' column below actually holds the entity text, not the full tweet.
text_list, ne_list = map(list, zip(*ner(sample_text)))
d = {'Tweet': text_list, 'Named Entity': ne_list}
df_2 = pd.DataFrame(data=d, columns=['Tweet', 'Named Entity'])

print(df_2)
doc = nlp(sample_text)
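
# A minimal sketch, not part of the original example, of the commented-out CSV
# loop above: run ner() over an iterable of tweets and keep one row per entity.
# The helper name and column names are illustrative assumptions.
def ner_dataframe(tweets):
    rows = [(tweet, ent_text, ent_label)
            for tweet in tweets
            for ent_text, ent_label in ner(tweet)]
    return pd.DataFrame(rows, columns=['Tweet', 'Entity Text', 'Named Entity'])

print(ner_dataframe([sample_text]))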
Example #5
import re
import time
from urllib.request import urlopen, Request

import en_core_web_sm  # download via python -m spacy download en_core_web_sm
import nltk
import xx_ent_wiki_sm  # download via python -m spacy download xx_ent_wiki_sm

nlpmultilang = xx_ent_wiki_sm.load()
nlpeng = en_core_web_sm.load()
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

TIMEOUT_LIMIT = 32
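
# A minimal sketch, not part of the original example, of how the urllib imports
# and TIMEOUT_LIMIT above might be used to fetch a page; the helper name and
# User-Agent header are illustrative assumptions.
def fetch_page(url):
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urlopen(request, timeout=TIMEOUT_LIMIT) as response:
            return response.read().decode('utf-8', errors='ignore')
    except Exception:
        return None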


def tokenize_title(title):
    """split title into initial tokens"""
    tokens = nltk.word_tokenize(title)
    tagged = nltk.pos_tag(tokens)

    # remove tokens tagged as CC (coordinating conjunction), DT (determiner),
    # EX (existential there), IN (preposition or subordinating conjunction),
    # PDT, PRP, PRP$, RP, TO, UH, WP, WP$, WRB, or WDT
    excluded_tags = {'CC', 'DT', 'EX', 'IN', 'PDT', 'PRP', 'PRP$', 'RP',
                     'TO', 'UH', 'WP', 'WP$', 'WRB', 'WDT'}
    tagged = [token for token in tagged if token[1] not in excluded_tags]

    # TODO: potentially develop this to gather by clauses