Example #1
def search():
    query = request.args.get("search")
    sentiment = request.args.get("sentiment")

    db_history_search = es.search(index='post_index', doc_type='post', q=query)

    items = list()
    for post in db_history_search['hits']['hits']:
        score = post['_source']['score']
        if ((sentiment == "positive" and score > 0.1)
                or (sentiment == "negative" and score < -0.1)
                or (sentiment == "neutral" and -0.1 < score < 0.1)):
            items.append([
                post['_source']['message'], score,
                post['_source']['views'], post['_source']['likes'],
                post['_source']['date']
            ])

    if len(items) == 0:
        vk_session = vk_api.VkApi(login, password)
        vk_session.auth()
        vk = vk_session.get_api()
        groups = ["40316705", "15755094"]
        posts = list()
        for group in groups:
            news = vk.wall.search(owner_id=("-" + group),
                                  query=query,
                                  count=100,
                                  v="5.92",
                                  owners_only=0)
            news = news['items']

            for article in news:
                if article['post_type'] == 'post':
                    posts.append(article)

        posts_clean = list()
        for post in posts:
            if post['text'] != '':
                post_date = datetime.utcfromtimestamp(int(
                    post['date'])).strftime('%Y-%m-%d')
                posts_clean.append([
                    post['text'], post['views']['count'],
                    post['likes']['count'], post_date
                ])

        for post in posts_clean:
            text = re.sub(r"http\S+", "", post[0])
            text = Text(text)
            # calculate polarity
            polarity = 0
            norm = 0
            for w in text.words:
                polarity += w.polarity
                if polarity == w.polarity:
                    norm += 0.1
                else:
                    norm += 1
            # guard against URL-only posts whose cleaned text has no words
            polarity = round(polarity / norm, 2) if norm else 0.0
            if ((polarity > 0.1 and sentiment == "positive")
                    or (polarity < -0.1 and sentiment == "negative")
                    or (-0.1 < polarity < 0.1 and sentiment == "neutral")):
                items.append([post[0], polarity, post[1], post[2], post[3]])
            es.index(index='post_index',
                     doc_type="post",
                     body={
                         'message': post[0],
                         'score': polarity,
                         'views': post[1],
                         'likes': post[2],
                         'date': post[3]
                     })

    # items columns: message, score, views, likes, date
    return render_template('search.html', items=items)
Example #2
positive_sentence = []
negative_sentence = []
neutral_sentence = []
positive_with_negative_adjective = []
positive_with_negative_verb = []

positive_with_positive_adjective = []
positive_with_positive_verb = []

negative_with_positive_adjective = []
negative_with_positive_verb = []

negative_with_negative_adjective = []
negative_with_negative_verb = []

text = Text(raw, hint_language_code='fi')

for sentence in text.sentences:
    for word in words:
        #check if comment has entity in it
        if word in sentence:
            sentence_polarity = sentence.polarity
            sentiment += sentence_polarity
            if sentence_polarity < 0:
                negative_sentence.append(sentence)
            elif sentence_polarity > 0:
                positive_sentence.append(sentence)
            elif sentence_polarity == 0:
                neutral_sentence.append(sentence)

print("amount of positive sentences")
def polygloter(t):
    try:
        return Text(t, hint_language_code='nl').polarity
    except:
        return 0
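# Hedged usage sketch (not part of the original snippet): polygloter wraps the
# polyglot polarity call so that any failure falls back to a neutral score of 0.
# The Dutch sample sentence below is hypothetical.
print(polygloter("Dit is een geweldige dag"))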
#DET: determiner
#INTJ: interjection
#NOUN: noun
#NUM: numeral
#PART: particle
#PRON: pronoun
#PROPN: proper noun
#PUNCT: punctuation
#SCONJ: subordinating conjunction
#SYM: symbol
#VERB: verb
#X: other

blob = """Je veux la moyenne d'âge des agents en fonction de leur salaire."""
blob2 = """ Quel est la moyenne d'âge des personnes travaillant en mairie et qui gagnent plus 3000 par mois?"""
text = Text(blob, hint_language_code='fr')
print(text.pos_tags)


def parser(text):
    """Parses the text into sub groups of words
    Returns a list of the subgroups"""
    agregation = []
    metric = []
    dimension = [[]]
    filters = [[]]

    text = Text(text, hint_language_code='fr')
    pos_tags = text.pos_tags

    #Finding the agregation
def main(textarea):
    pattern = re.compile(u'[\u0980-\u09FF]+', re.UNICODE)
    new_path1 = cur_path + '/entertainmentwordswithscore.txt'
    totaltext = Text(textarea)
    sample_file2 = open(new_path1, 'r', encoding='utf8')
    text = sample_file2.read()
    textual = Text(text)
    s = ""
    model = Sentence2Vec(cur_path + '/trainedsentence.model')
    scoresht = 0
    storescore = []
    sentencecount = 0
    eachsentu = []
    normfactortextr = 0.0
    normfactorsenti = 0.0
    normfactorkey = 0.0
    ll = 0

    for eachsent in totaltext.sentences:
        sentencecount = sentencecount + 1
        for wordsi in eachsent.words:
            if (wordsi == "," or wordsi == "'"):
                continue
            strings = wordsi
            try:
                answer = textual.find(strings)
                sd = len(strings)
                dot = ""
                for i in range(0, 12):
                    dot = dot + textual[answer + sd + 4 + i]
                normfactorkey = normfactorkey + (float(dot))
            except:
                print("S")
            if wordsi.polarity == 0:
                normfactorsenti = normfactorsenti + .0001

        strings = ""
        for wordssx in eachsent.words:
            strings += wordssx + ' '
        for xx in range(0, len(totaltext.sentences)):
            if xx == sentencecount - 1:
                continue
            stringx = ""
            for words2x in totaltext.sentences[xx].words:
                stringx += words2x + ' '
            simscore = 0.0
            simscore = model.similarity(stringx, strings)
            normfactortextr = normfactortextr + (float(simscore))
    sentencecount = 0

    for eachsent in totaltext.sentences:
        sentencecount = sentencecount + 1
        eachsentu.append(eachsent)
        scoresht = 0.0
        scoresht2 = 0.0

        for wordsi in eachsent.words:
            if (wordsi == "," or wordsi == "'"):
                continue
            strings = wordsi
            try:
                answer = textual.find(strings)
                sd = len(strings)
                dot = ""
                for i in range(0, 12):
                    dot = dot + textual[answer + sd + 4 + i]
                    scoresht = scoresht + (float(dot))
            except:
                print("S")
            if wordsi.polarity == 0:
                scoresht2 = scoresht2 + .0001

        scoresht3 = 0.0

        strings = ""
        for wordssx in eachsent.words:
            strings += wordssx + ' '
        for xx in range(0, len(totaltext.sentences)):
            if xx == sentencecount - 1:
                continue
            stringx = ""
            for words2x in totaltext.sentences[xx].words:
                stringx += words2x + ' '
            simscore = 0.0
            simscore = model.similarity(stringx, strings)
            scoresht3 = scoresht3 + (float(simscore))
        mixedscore = .5 * (scoresht3 / normfactortextr) + .3 * (
            scoresht / normfactorkey) + .2 * (scoresht2 / normfactorsenti)
        storescore.append(mixedscore)

    n = sentencecount

    for i in range(n):
        for j in range(0, n - i - 1):
            if storescore[j] < storescore[j + 1]:
                storescore[j], storescore[j + 1] = storescore[j + 1], storescore[j]
                eachsentu[j], eachsentu[j + 1] = eachsentu[j + 1], eachsentu[j]
                totaltext.sentences[j], totaltext.sentences[j + 1] = \
                    totaltext.sentences[j + 1], totaltext.sentences[j]

    for k in range(0, int(sentencecount * .40)):
        if int(len(eachsentu[k])) < 10:
            continue
        s = s + str(eachsentu[k]) + '\n'
    return s
Example #6
from polyglot.text import Text
txt = Text(r"""Lina del Castillo es profesora en el Instituto de Estudios Latinoamericanos Teresa Lozano Long (LLILAS) y el Departamento de Historia de la Universidad de Texas en Austin. Ella será la moderadora del panel “Los Mundos Políticos de Gabriel García Márquez” este viernes, Oct. 30, en el simposio Gabriel García Márquez: Vida y Legado.


LIna del Castillo


Actualmente, sus investigaciones abarcan la intersección de cartografía, disputas a las demandas de tierra y recursos, y la formación del n...el tren de medianoche que lleva a miles y miles de cadáveres uno encima del otro como tantos racimos del banano que acabarán tirados al mar. Ningún recuento periodístico podría provocar nuestra imaginación y nuestra memoria como este relato de García Márquez.


Contenido Relacionado


Lea más artículos sobre el archivo de Gabriel García Márquez


Reciba mensualmente las últimas noticias e información del Harry Ransom Center con eNews, nuestro correo electrónico mensual. ¡Suscríbase hoy!
""")
Example #7
from polyglot.text import Word, Text

words = "həmişə bütün hüquq normalarda hər üç element olmur".split(" ")

for w in words:
    w = Word(w, language="az")
    print("{:<20}{}".format(w, w.morphemes))
"""

həmişə              ['həmişə']
bütün               ['bütün']
hüquq               ['hüquq']
normalarda          ['norma', 'larda']
hər                 ['hər']
üç                  ['üç']
element             ['element']
olmur               ['olmur']

"""

text = "həmişəbütünhüquqnormalardahərüçelementolmur"

splitted_text = Text(text)
splitted_text.language = "az"
print(splitted_text.morphemes)
"""

['həmişə', 'bütün', 'hüquq', 'norma', 'larda', 'hər', 'üç', 'element', 'olmur']

"""
Example #8
def polySentTokenize(text):
    sent_array = set()
    text = Text(text)
    for sent in text.sentences:
        sent_array.add(sent)
    return sent_array
def webscraper(url_boys,url_girls,pos_boys_filename,pos_girls_filename,text_boys_filename,text_girls_filename):
    boys_urllist=filelister(url_boys)
    girls_urllist=filelister(url_girls)
    girl_names=sorted(filelister('pigenavne.txt'),key=len,reverse=True)
    boy_names=sorted(filelister('drengenavne.txt'),key=len,reverse=True)
    unisex_names=sorted(filelister('unisexnavne.txt'),key=len,reverse=True) 
        
    boy_names_edt=[boy_name.strip().lower() for boy_name in boy_names if boy_name not in unisex_names]
    girl_names_edt=[girl_name.strip().lower() for girl_name in girl_names if girl_name not in unisex_names] 
    
    
    text_boys_file=open(text_boys_filename,'wb')
    text_girls_file=open(text_girls_filename,'wb')
    pos_boys_file=open(pos_boys_filename,'wb')
    pos_girls_file=open(pos_girls_filename,'wb')

    text_list_boys=[]
    pos_list_boys=[]
    text_list_girls=[]
    pos_list_girls=[]
    
    links_boys=set()
    links_girls=set()
    for url_boyname in boys_urllist:
        url_boyname_split=url_boyname.split(',')
        link=url_boyname_split[0]
        author_forename=None  # reset per link so a missing author tag is not credited to the previous author
        if not link.startswith('https'):
            try:
                page=urllib.request.urlopen(link)
                try:
                    soup = BeautifulSoup(page,'html5lib')
                    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
                    try:
                        if soup.find(class_='author')!=None:
                            author=soup.find(class_='author').text
                            author_forename=author.split()[0].strip().lower()
                        elif soup.find(rel='author')!=None:
                            author=soup.find(rel='author').text
                            author_forename=author.split()[0].strip().lower()
                    except (ValueError,IndexError):
                        print('ValueError or IndexError')
                    for article in soup.find_all('article'):
                        for p in article.find_all('p'):
                            try:
                                if author_forename in boy_names_edt  and link not in links_boys:
                                    print(link)
                                    links_boys.add(link)
                                    visible_text = p.get_text() 
                                    text=Text(visible_text,hint_language_code='da')
                                    text_list_boys.append((author_forename,visible_text))
                                    pos=text.pos_tags
                                    pos_list_boys.append((author_forename,pos))
                                elif author_forename in girl_names_edt  and link not in links_boys:
                                    print(link)
                                    links_boys.add(link)
                                    visible_text = p.get_text()
                                    text=Text(visible_text,hint_language_code='da')
                                    text_list_girls.append((author_forename,visible_text))
                                    pos=text.pos_tags
                                    pos_list_girls.append((author_forename,pos))
                            except ValueError:
                                pass
                except TypeError:
                    pass
            except HTTPError as e:
                print('Error message:', e.msg)
                continue
            except URLError as e:
                print('Error reason:',e.reason)
            except http.client.IncompleteRead as e:
                page = e.partial
            except socket.gaierror:
                pass
            except TimeoutError:
                pass
            except ValueError:
                pass
            
    for url_girlname in girls_urllist:
        url_girlname_split=url_girlname.split(',')
        link=url_girlname_split[0]
        author_forename=None  # reset per link so a missing author tag is not credited to the previous author
        if not link.startswith('https'):
            try:
                page=urllib.request.urlopen(link)
                try:
                    soup = BeautifulSoup(page,'html5lib')
                    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
                    try:
                        if soup.find(class_='author')!=None:
                            author=soup.find(class_='author').text
                            author_forename=author.split()[0].strip().lower()
                        elif soup.find(rel='author')!=None:
                            author=soup.find(rel='author').text
                            author_forename=author.split()[0].strip().lower()
                    except (ValueError,IndexError):
                        print('ValueError or IndexError')
                    for article in soup.find_all('article'):
                        for p in article.find_all('p'):
                            try:
                                if author_forename in boy_names_edt and link not in links_girls:
                                    print(link)
                                    links_girls.add(link)
                                    visible_text = p.get_text() 
                                    text=Text(visible_text,hint_language_code='da')
                                    text_list_boys.append((author_forename,visible_text))
                                    pos=text.pos_tags
                                    pos_list_boys.append((author_forename,pos))
                                elif author_forename in girl_names_edt and link not in links_girls:
                                    print(link)
                                    links_girls.add(link)
                                    visible_text = p.get_text()
                                    text=Text(visible_text,hint_language_code='da')
                                    text_list_girls.append((author_forename,visible_text))
                                    pos=text.pos_tags
                                    pos_list_girls.append((author_forename,pos))
                            except ValueError:
                                pass
                except TypeError:
                    pass
            except HTTPError as e:
                print('Error message:', e.msg)
                continue
            except URLError as e:
                print('Error reason:',e.reason)
            except http.client.IncompleteRead as e:
                page = e.partial
            except socket.gaierror:
                pass
            except TimeoutError:
                pass
            except ValueError:
                pass        
        

    try:
        pickle.dump(text_list_boys,text_boys_file)
        pickle.dump(pos_list_boys,pos_boys_file)
        pickle.dump(text_list_girls,text_girls_file)
        pickle.dump(pos_list_girls,pos_girls_file)
    except ValueError as e:
        pass

    pos_boys_file.close()
    pos_girls_file.close()
    text_boys_file.close()
    text_girls_file.close()
Example #10
    def get_tagged_tokens(self, text):
        ptext = Text(text)  # output can be re-organised
        # @TODO # do this per sentence
        entities = [(" ".join(entity), entity.tag)
                    for entity in ptext.entities]
        return entities
Example #11
import polyglot
from polyglot.text import Text, Word
# EXECUTE THIS COMMAND ON YOUR TERMINAL
# polyglot download embeddings2.en pos2.en
text = Text("Bonjour, Mesdames.")
print("Language Detected: Code={}, Name={}\n".format(text.language.code,
                                                     text.language.name))

zen = Text("Beautiful is better than ugly. "
           "Explicit is better than implicit. "
           "Simple is better than complex.")
print(zen.words)
text = Text("This is a car")

print("{:<16}{}".format("Word", "POS Tag") + "\n" + "-" * 30)
for word, tag in text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))
Example #12
    'acaoAutor', 'acaoVitima', 'acesso', 'armaAutor', 'autor', 'bensVitima',
    'caracteristicaFisicaPessoa', 'caracteristicaVeiculo', 'deslocamentoAutor',
    'idadeAutor', 'instrumentoAutor', 'local', 'quantidade', 'vestimentaAutor'
]

for filename in glob.glob(os.path.join(path, '*.json')):
    keys_file = open(filename, 'r', encoding='utf8')
    keys_json = json.loads(keys_file.read())  # Convert the file contents to JSON

    for entidade in entidades:
        print("Arquivo: " + filename + " - Entidade:  " + entidade)
        acaoAutor = keys_json[entidade]
        # Split each sentence into words
        for dadosText in acaoAutor:
            vtTexto = dadosText
            text = Text(vtTexto, hint_language_code='pt')

            # Store the words
            try:
                # Identify which word class each word belongs to (article, adjective, pronoun, numeral, noun, verb)
                for word, tag in text.pos_tags:
                    neighbors = embeddings.nearest_neighbors(word)
                    for w, d in zip(neighbors,
                                    embeddings.distances(word, neighbors)):
                        print("{:<8}{:.4f}".format(w, d))

                    # Ensure no duplicate words are stored
                    sql_search_palavra = " Select count(id) from palavras Where palavra = ? And tag = ? "
                    where = (word, tag)
                    for count in cur.execute(sql_search_palavra,
                                             where).fetchall():
Example #13
    def on_data(self, data):
        #while True:

        try:
            all_data = json.loads(data)
            #print(all_data)

        except Exception as e:
            raise e
        #print ("Debug Message: ",all_data)
        try:

            tweetTime = all_data["created_at"]
            #tweet = all_data["text"]
            originaltweet = all_data["text"]
            if all_data["truncated"] == "true":
                originaltweet = all_data["extended_tweet"]["full_text"]
            tweet_in_reply_to_status_id = all_data["in_reply_to_status_id"]
            tweet_in_reply_to_screen_name = all_data["in_reply_to_screen_name"]
            #tweet_mentions_screen_name = all_data["extended_tweet"]["entities"]["user_mentions"]["screen_name"] # []
            #tweet_mentions_name = all_data["extended_tweet"]["entities"]["user_mentions"]["name"] # []
            #tweet_mentions_is = all_data["extended_tweet"]["entities"]["user_mentions"]["id"] # []

            #tweet = self.clean_tweet(originaltweet)

            tweet = self.tweet_preprocessor(self.links_remover(originaltweet))

            #username twitter
            username = all_data["user"]["screen_name"]
            #name twitter
            name = all_data["user"]["name"]
            userid = all_data["user"]["id"]
            userdesc = all_data["user"]["description"]
            user_follower = all_data["user"]["followers_count"]
            user_location = all_data["user"]["location"]
            #user_place = all_data["user"]["place"]
            tweet_id = all_data["id"]
            #tweet language
            language = all_data["lang"]
            #what we found
            print(
                "___________________________________________________________________________________________________"
            )
            print("ORIGINAL: Lang", language, "User:"******"Follower:",
                  user_follower, "Tweet:", originaltweet)
            print("MODIFIED: Lang", language, "User:"******"Follower:",
                  user_follower, "Tweet:", tweet)
        except:
            print("Unexpected error:", sys.exc_info()[0])
            #pass
            raise

        avoid = False

        for profilo_twitter in AVOID_PROFILES:
            if profilo_twitter in str(tweet.encode('utf8')):
                avoid = True
                pass

        try:

            if avoid:
                print("Tweet have not passed semanthic filters")
            else:

                reply = False
                retweet = False
                like = False
                avoid = False
                follow = False
                message = False

                pos = 0
                neg = 0

                understood = False

                polytext = Text(str(tweet))

                #TODO: polytext.entities get a cursor with entities[]
                try:
                    print(polytext.pos_tags)
                except Exception as e:
                    pass

            # ADJ: adjective
            # ADP: adposition
            # ADV: adverb
            # AUX: auxiliary verb
            # CONJ: coordinating conjunction
            # DET: determiner
            # INTJ: interjection
            # NOUN: noun
            # NUM: numeral
            # PART: particle
            # PRON: pronoun
            # PROPN: proper noun
            # PUNCT: punctuation
            # SCONJ: subordinating conjunction
            # SYM: symbol
            # VERB: verb
            # X: other

                for idx, topicfound in enumerate(TOPICS):
                    if str(topicfound[0]).lower() in tweet.lower():
                        topic = TOPICS[idx]
                        print("topic=", topic)
                        understood = True

                if understood:
                    # FIRST LEVEL OF RESPONSE
                    try:
                        # put the tweet into a polyglot object
                        sentences = polytext.sentences
                        print("BEFORE LOGIC")
                        like, reply, message = self.make_logic_reaction(
                            sentences, topic[0], topic[1])
                        print("AFTER LOGIC")
                    except Exception as e:
                        raise e
                        #pass

                    print("Processing: REPLY:", reply, " RETWEET:", retweet,
                          " LIKE:", like, " MSG: ", message)
                    if reply:
                        api.update_status(message,
                                          in_reply_to_status_id=tweet_id)
                        REPLIES_USERS.append(username)
                        print("######--> Replyed with:", message)
                    if retweet:
                        api.retweet(tweet_id)
                        print("######--> Retweet")
                        time.sleep(5)
                    if like:
                        api.create_favorite(tweet_id)
                        print("######--> Like")
                    if follow:
                        api.create_friendship(username)
                        print("######--> Follow ", username)

                    #if API.exists_friendship(user_a, user_b):
                    if message:
                        api.send_direct_message(username, "interessante :)")
                    return True
                else:
                    print("Put tis tweet in the trashcan .... ")
                    return True

        except Exception as exx:
            print("aborted", exx)
            print(sys.exc_info()[0])

            pass
        finally:
            time.sleep(20)

        return True
Example #14
# assumed opening of the truncated CREATE TABLE statement for miniDicionario
sql_create = 'create table miniDicionario ' \
'(' \
'  id integer primary key AUTOINCREMENT, '\
'   word varchar(50), ' \
'   radical varchar(50), ' \
'   tag varchar(50)' \
')'
cur.execute(sql_create)
sql_insert = ' insert into miniDicionario (radical, word, tag) values (?, ?, ?) '
sql_update = ' update miniDicionario  set radical = ? Where word = ? And tag = ? '

path = './dados-base'
for filename in glob.glob(os.path.join(path, '*.dic')):
    arquivo = open(filename, 'r', encoding='utf8')

    for line in arquivo.readlines():
        line = line.split('/')
        text = line[0].replace('\n', '')
        print(text)
        text = Text(text, hint_language_code='pt')
        for word, tag in text.pos_tags:
            sql_search_palavra = " Select count(id) from miniDicionario Where word = ? And tag = ? "
            #radicao = text.morphemes
            where = (word, tag)
            for count in cur.execute(sql_search_palavra, where).fetchall():
                rec = ('', word, tag)
                if count[0] == 0:
                    cur.execute(sql_insert, rec)
                else:
                    cur.execute(sql_update, rec)

        con.commit()
Example #15
"""
POS tagger for sermons in content.dat
"""
import pandas as pd
import nltk.data
from polyglot.text import Text

# data
DF = pd.read_csv("content.dat", header=0, index_col=None)
content = DF["content"].tolist()
fnames = DF["id"].tolist()

tokenizer = nltk.data.load("tokenizers/punkt/norwegian.pickle")
DATA_pos = []
i = 0
for i, text in enumerate(content):
    print("file {}".format(i))
    # sentence disambiguation
    sents = tokenizer.tokenize(text)
    # POS
    text_pos = []
    for blob in sents:
        textblob = Text(blob, hint_language_code='da')
        if textblob.pos_tags:
            text_pos.append(textblob.pos_tags)
    DATA_pos.append([fnames[i], text_pos])

DF_pos = pd.DataFrame(DATA_pos)
DF_pos.columns = ["id", "POS"]
DF_pos.to_csv("content_pos.dat", index=False)
    def get_pos_tags(self, input_text):
        text = Text(input_text, hint_language_code='bg')
        pos_tags = [pt[1] for pt in text.pos_tags]
        return pos_tags
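# Hedged standalone sketch of the same pattern (not from the original snippet);
# assumes the Bulgarian polyglot models are installed. "Това е кола" is a
# hypothetical sample sentence ("This is a car").
from polyglot.text import Text
print([tag for _, tag in Text("Това е кола", hint_language_code='bg').pos_tags])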
Example #17
#In this exercise and the next, you'll use the polyglot library to identify French entities. The library functions slightly differently than spacy, so you'll use a few of the new things you learned in the last video to display the named entity text and category.
#
#You have access to the full article string in article. Additionally, the Text class of polyglot has been imported from polyglot.text.

from polyglot.text import Text

article = """
French NER with polyglot I

In this exercise and the next, you'll use the polyglot library to identify French entities. The library functions slightly differently than spacy, so you'll use a few of the new things you learned in the last video to display the named entity text and category.

You have access to the full article string in article. Additionally, the Text class of polyglot has been imported from polyglot.text.
"""

# Create a new text object using Polyglot's Text class: txt
txt = Text(article)

# Print each of the entities found
for ent in txt.entities:
    print(ent)

# Print the type of ent
print(type(ent))

#French NER with polyglot II
#
#Here, you'll complete the work you began in the previous exercise.
#
#Your task is to use a list comprehension to create a list of tuples, in which the first element is the entity tag, and the second element is the full string of the entity text.

# Create the list of tuples: entities
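# A possible completion for the exercise above (not in the original snippet);
# the same (tag, text) pattern appears in the NER walkthrough near the end of
# this page.
entities = [(ent.tag, ' '.join(ent)) for ent in txt.entities]
print(entities)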
Example #18
from konlpy.corpus import my_corpus
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot
from konlpy.tag import Kkma

from polyglot.text import Text

with open('vegetariankoreansister.txt', 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

kkma = Kkma()

sentlist = (kkma.sentences(data))

sentpolarity = 0
senttotal = 0
sentcount = 0

for i in sentlist:
    text = Text(i, hint_language_code='ko')

    for w in text.words:
        senttotal += w.polarity
        sentcount += 1

    if sentcount:
        sentpolarity = senttotal / sentcount
    print(i, sentpolarity)
    sentpolarity = 0
    senttotal = 0
    sentcount = 0
# %% [markdown] {"heading_collapsed": true}
# # sentiments

# %% {"hidden": true}
# compare polyglot, textblob
sents = [
    "he is strongly criticised", "he died yesterday", "he was killed by a car",
    "he is the least popular", "he is never any good",
    "he visited a cancer charity", "Exquisite jewels were worn by the Queen",
    "he is good", "he is bad", "he avoids the issue", "he hates jews",
    "jews hate him", "everyone hates him", "he is the most ineffective"
]
print("**** textblob, polyglot ****")
for sent in sents:
    print(round(tb(sent).polarity, 1), Text(sent).polarity, sent)

# %% {"hidden": true}
pol = []
for doc in docs:
    for s in doc.sents:
        if s.text.find("Brexiteer") >= 0:
            print(s.text.strip())
            try:
                pg = Text(s.text).polarity
                print(pg)
            except:
                pg = 0
            pol.append(pg)

# %% {"hidden": true}
Example #20
def morphological(text):
    tokens = Text(text)
    return tokens.pos_tags
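# Hedged usage sketch (not part of the original snippet): despite its name, the
# function returns polyglot POS tags; the English sample reuses the sentence
# from the tutorial below and needs the embeddings2/pos2 models.
print(morphological("This is a car"))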
## installation
# !pip install polyglot

# installing dependency packages [2]
get_ipython().system('pip install pyicu morfessor pycld2')


# # Quick Tutorial [1]

import polyglot
from polyglot.text import Text, Word


# ## Language Detection

text = Text("Bonjour, Mesdames.")
print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name))


zen = Text("Beautiful is better than ugly. "
           "Explicit is better than implicit. "
           "Simple is better than complex.")
print(zen.words)


print(zen.sentences)


# ## POS Tagging

## dependencies
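# Hedged continuation sketch (the converted notebook is cut off above): POS
# tagging follows the same pattern as the other examples on this page once the
# embeddings2 and pos2 models for the language are downloaded.
pos_text = Text("This is a car")
for word, tag in pos_text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))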
Example #22
def webscraper(url_filename, urls_boys_filename, urls_girls_filename,
               pos_boys_filename, pos_girls_filename, text_boys_filename,
               text_girls_filename):
    urllist = filelister(url_filename)
    girl_names = sorted(filelister('pigenavne.txt'), key=len, reverse=True)
    boy_names = sorted(filelister('drengenavne.txt'), key=len, reverse=True)
    unisex_names = sorted(filelister('unisexnavne.txt'), key=len, reverse=True)

    boy_names_edt = [
        boy_name.strip().lower() for boy_name in boy_names
        if boy_name not in unisex_names
    ]
    girl_names_edt = [
        girl_name.strip().lower() for girl_name in girl_names
        if girl_name not in unisex_names
    ]

    urls_boys_file = codecs.open(urls_boys_filename, 'w', encoding='utf-8')
    urls_girls_file = codecs.open(urls_girls_filename, 'w', encoding='utf-8')

    text_boys_file = open(text_boys_filename, 'wb')
    text_girls_file = open(text_girls_filename, 'wb')
    pos_boys_file = open(pos_boys_filename, 'wb')
    pos_girls_file = open(pos_girls_filename, 'wb')

    text_list_boys = []
    pos_list_boys = []
    text_list_girls = []
    pos_list_girls = []

    for url in urllist:
        url = url.strip()
        author = None
        author_forename = None
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html5lib')
        try:
            for lines in soup.find_all('div', class_='article_date'):
                if lines.find('a', class_='authorName') != None:
                    print(url)
                    author = lines.find('a', class_='authorName').text
                    author_forename = author.split()[0].strip().lower()
        except (ValueError, IndexError):
            print('ValueError or IndexError')
        if author_forename in boy_names_edt:
            urls_boys_file.write(url.strip() + ',' + author_forename)
            urls_boys_file.write('\n')
            for article in soup.find_all('div', class_='remaining_paragraphs'):
                for p in article.find_all('p'):
                    try:
                        visible_text = p.get_text()
                        text = Text(visible_text, hint_language_code='da')
                        text_list_boys.append((author_forename, visible_text))
                        pos = text.pos_tags
                        pos_list_boys.append((author_forename, pos))
                    except ValueError:
                        pass
        elif author_forename in girl_names_edt:
            urls_girls_file.write(url.strip() + ',' + author_forename)
            urls_girls_file.write('\n')
            for article in soup.find_all('div', class_='remaining_paragraphs'):
                for p in article.find_all('p'):
                    try:
                        visible_text = p.get_text()
                        text = Text(visible_text, hint_language_code='da')
                        text_list_girls.append((author_forename, visible_text))
                        pos = text.pos_tags
                        pos_list_girls.append((author_forename, pos))
                    except ValueError:
                        pass

    try:
        pickle.dump(text_list_boys, text_boys_file)
        pickle.dump(pos_list_boys, pos_boys_file)
        pickle.dump(text_list_girls, text_girls_file)
        pickle.dump(pos_list_girls, pos_girls_file)
    except ValueError as e:
        pass
    urls_boys_file.close()
    urls_girls_file.close()
    text_boys_file.close()
    text_girls_file.close()
Example #23
def parser(text):
    """Parses the text into sub groups of words
    Returns a list of the subgroups"""
    agregation = []
    metric = []
    dimension = [[]]
    filters = [[]]

    text = Text(text, hint_language_code='fr')
    pos_tags = text.pos_tags

    #Finding the agregation
    verb_counter = 0
    noun = None
    i = 0

    while noun == None and i < len(pos_tags):

        if pos_tags[i][1] == 'VERB':
            verb_counter += 1

        if verb_counter > 0:

            if pos_tags[i][1] == 'NOUN':
                noun = pos_tags[i][0]
                agregation_pos = i

        i += 1

    agregation.append(noun)

    #Finding the metric
    i = agregation_pos + 1

    while i < len(pos_tags) and pos_tags[i][1] != 'ADP':

        metric.append(pos_tags[i][0])
        i += 1

    end_metric_pos = i - 1

    #Finding the dimensions
    i = end_metric_pos + 1
    dimension_counter = 0

    while i < len(pos_tags) and pos_tags[i][1] != 'CONJ':
        print(pos_tags[i][1])

        if pos_tags[i][0] == 'et':
            dimension_counter += 1
            dimension.append([])

        else:
            dimension[dimension_counter].append(pos_tags[i][0])

        i += 1

    end_dimension_pos = i - 1

    #Finding the filters
    i = end_dimension_pos + 1
    filter_counter = 0

    while i < len(pos_tags):

        if pos_tags[i][0] == 'et':
            filter_counter += 1
            filters.append([])

        else:
            filters[filter_counter].append(pos_tags[i][0])

        i += 1

    return ([agregation, metric, dimension, filters])
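# Hedged usage sketch (not part of the original snippet), reusing the French
# sample question defined earlier on this page; requires the French polyglot
# models.
print(parser("Je veux la moyenne d'âge des agents en fonction de leur salaire."))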
Example #24
    @param text The text that must be split into sentences.
    """
    # sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentence_delimiters = re.compile(
        u'[৷\u002f\u0053\u0056\u00a0\u00ad\u00d0\u00da\u00e6\u00f2\u00f3\u2013\u2014\\[\\]\n!?,;:\।\t\\"\\(\\)\\\'\‘\‘‘]')
    sentences = sentence_delimiters.split(text)
    # print(sentences, file=open("bangla_splitted_sentence.txt", "a", encoding='utf8'))
    return sentences

sample_file1 = open('Accident/Codes For Accident/Documents/accidentwordswithscore.txt', 'r', encoding='utf8')
sample_file2 = open('Economics/Codes For Economics/Documents/economicswordswithscore.txt', 'r', encoding='utf8')
sample_file3 = open('Entertainment/Codes For Entertainment/Documents/entertainmentwordswithscore.txt', 'r', encoding='utf8')
sample_file4 = open('Politics/Codes For Politics/Documents/politicswordswithscore.txt', 'r', encoding='utf8')

text1 = sample_file1.read()
textual1 = Text(text1)
scoresht1 = 0

text2 = sample_file2.read()
textual2 = Text(text2)
scoresht2 = 0

text3 = sample_file3.read()
textual3 = Text(text3)
scoresht3 = 0

text4 = sample_file4.read()
textual4 = Text(text4)
scoresht4 = 0

countacc=0
def emotechintentionmodel(SentenceToBe, synonym_num):
    ########
    # detector = Detector(SentenceToBe)
    # if detector.language.code == "vi":
    #     langthesaurusload("vn",FileLocThe)
    #     langmodelload("vn",LibLocLang)
    # else:
    #     langthesaurusload(detector.language.code,FileLocThe)
    #     langmodelload(detector.language.code,LibLocLang)
    # langthesaurusload("en", FileLocThe)
    # langmodelload("en",LibLocLang)
    ########
    intention_filters = ['PRON', "VERB"]
    object_filters = ['NOUN', 'PROPN']
    ########
    sentences = model.tokenize(SentenceToBe)
    ########
    for s in sentences:
        model.tag(s)  # inplace tagging
        model.parse(s)  # inplace parsing
    datause = pd.read_csv(StringIO(model.write(sentences, "conllu")), sep="\t", header=None, skiprows=4)
    PosTagIntention = datause[datause.columns[1:4]].values.tolist()
    print(PosTagIntention)
    ################################
    #     ADJ: adjective
    #     ADP: adposition
    #     ADV: adverb
    #     AUX: auxiliary
    #     CCONJ: coordinating conjunction
    #     DET: determiner
    #     INTJ: interjection
    #     NOUN: noun
    #     NUM: numeral
    #     PART: particle
    #     PRON: pronoun
    #     PROPN: proper noun
    #     PUNCT: punctuation
    #     SCONJ: subordinating conjunction
    #     SYM: symbol
    #     VERB: verb
    #     X: other
    ################################
    sentence_intention = []
    sentence_object = []
    ####
    if len(PosTagIntention) > 1:
        for i in range(0, len(PosTagIntention)):
            #####
            if i == 0:
                if any(str(word).lower() in str(PosTagIntention[i][0]).lower() for word in question_words):
                    sentence_intention.append("Question")
            #####
            else:
                if all(str(word).lower() != str(PosTagIntention[i][0]).lower() for word in stop_words):
                    if any(str(word).lower() in str(PosTagIntention[i][2]).lower() for word in intention_filters):
                        sentence_intention.append(PosTagIntention[i][0])
                    if any(str(word).lower() in str(PosTagIntention[i][2]).lower() for word in object_filters):
                        sentence_object.append(PosTagIntention[i][0])
        #####
        sentence_intention = list(set(sentence_intention))
        sentence_object = list(set(sentence_object))
    #####
    else:
        sentence_intention = []
        sentence_object = []
    #####
    intlength = len(sentence_intention)
    for i in range(0, intlength):
        temp = thesaurus[(thesaurus['subject'] == sentence_intention[i].lower()) & (thesaurus['weight'] >= 0.9)]
        # get top 2
        sorted_temp = temp.sort_values(by=['weight'], ascending=False)
        try:
            sorted_temp = sorted_temp[0:synonym_num]
        except IndexError:
            print("not enough items")
        if len(sorted_temp) > 1:
            sentence_intention.extend((sorted_temp['word']))
    #####
    intlength = len(sentence_object)
    for i in range(0, intlength):
        temp = thesaurus[(thesaurus['subject'] == sentence_object[i].lower()) & (thesaurus['weight'] >= 0.9)]
        # get top 2
        sorted_temp = temp.sort_values(by=['weight'], ascending=False)
        try:
            sorted_temp = sorted_temp[0:synonym_num]
        except IndexError:
            print("not enough items")
        if len(sorted_temp) > 1:
            sentence_object.extend((sorted_temp['word']))
    #####
    NERobj = Text(SentenceToBe, hint_language_code="en").entities
    #####
    sentence_intention = list(set(sentence_intention))
    sentence_object = list(set(sentence_object))
    #####
    overall = sentence_intention + sentence_object
    return (sentence_intention, sentence_object, overall, NERobj)
Example #26
from polyglot.text import Text, Word
import arabic_reshaper
from bidi.algorithm import get_display

# print(downloader.supported_languages_table("sentiment2", 3))

# text = Text("The movie was really good.")
blob = """"آمریکا و چین در عالیترین سطح امنیتی، درباره چه موضوعاتی مذاکره می‌کنند."""

print("{:<16}{}".format("Word", "Polarity") + "\n" + "-" * 30)
# for w in text.words:
#     text_ = arabic_reshaper.reshape(w)
#     bidi_text = get_display(text_)
#     print("{:<16}{:>2}".format(bidi_text, w.polarity))
# blob = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world"."""
text = Text(blob)
for sent in text.sentences:
    print(sent, "\n")
    for entity in sent.entities:
        print(entity.tag, entity)

first_sentence = text.sentences[0]
first_entity = first_sentence.entities[0]
# print(first_entity)
# # print(first_entity.positive_sentiment)
# # print(first_entity.negative_sentiment)
for w in first_sentence.entities:
    # text_ = arabic_reshaper.reshape(w)
    # bidi_text = get_display(text_)
    print("{} :  positive: {:<2} | negative: {:>2}".format(
        w, str(w.positive_sentiment), str(w.negative_sentiment)))
Example #27
    def parse(self, response):
        claimdf = pd.DataFrame()
        #Extructing microdata or json in RDFA format
        data = extruct.extract(response.text, response.url)
        #Domain Name
        domain = urlparse(response.url).netloc.strip('www').strip('.com')
        #Selecting Microdata
        selected = [
            properties for properties in data['microdata']
            if properties['type'] == 'http://schema.org/ClaimReview'
        ]
        if selected:
            mode = 'micro'
        else:
            #If micro fails, selecting JSON
            try:
                selected = [
                    properties for properties in data['json-ld']
                    if properties['@type'] == 'ClaimReview'
                    or properties['@type'] == ["ClaimReview"]
                ]
            except KeyError:
                selected = [
                    properties for properties in data['json-ld'][0]['@graph']
                    if properties['@type'] == 'ClaimReview'
                ]
            mode = 'json'
        if selected:
            #If JSON or micro succeed
            for elements in selected:
                if mode == 'micro':
                    elements = elements['properties']
                for key in elements:
                    if type(elements[key]) == list:
                        elements[key] = elements[key][0]
                ##Flattening Dictionary
                scraped_data = pd.io.json.json_normalize(elements)
                ##Renaming the columns of the dataframe
                scraped_data.columns = map(self.column_mapper,
                                           list(scraped_data.columns))
                ##Dropping unimportant columns
                scraped_data = scraped_data.drop([None], axis=1)
                ##Checking if fact_checker_name exists or review_author_name
                try:
                    scraped_data.loc[:, 'fact_checker_name'] = scraped_data[
                        'fact_checker_name']
                except KeyError:
                    try:
                        scraped_data.loc[:,
                                         'fact_checker_name'] = scraped_data[
                                             'review_author_name']
                    except KeyError:
                        #As a last resort extracting domain name from the url
                        domname = urlparse(
                            scraped_data.loc[0, 'fact_checker_url']
                        ).hostname.split('.')[1].capitalize()
                        scraped_data['fact_checker_name'] = domname
                        scraped_data['review_author_name'] = domname
                try:
                    scraped_data.loc[:,
                                     'claim_text'] = scraped_data['claim_text']
                except KeyError:
                    scraped_data.loc[:, 'claim_text'] = scraped_data[
                        'claim_description']
                    scraped_data = scraped_data.drop(['claim_description'],
                                                     axis=1)
                ##Appending to the dataframe
                claimdf = claimdf.append(scraped_data, ignore_index=True)
            ##Filtering columns needed for Claim_review table
            claim_review = claimdf.filter(
                regex=
                '(review_url|review_date|claimID|fact_checkerID|best_rating|worst_rating|rating_value|rating_name|review_author_name|review_rating_img|review_modified_date|review_headline|review_img|review_description)'
            )
            claim_review.loc[:, 'claimID'] = 0
            claim_review.loc[:, 'fact_checkerID'] = 0
            claim_review.loc[:, 'review_crawl_date'] = str(
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
            claim = claimdf.filter(
                regex=
                '(claim_text|claim_description|claim_author_name|claim_url|claim_date|claim_author_img|claim_author_job|claim_location|claim_location_url)'
            )
            claim.loc[:, 'claim_crawl_date'] = str(
                datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
            ##Filtering columns needed for Fact_checker table
            fact_checker = claimdf.filter(
                regex='(fact_checker_name|fact_checker_url|fact_checker_img)')
            ####Creating MySQL Engine
            engine = create_engine(URL(**settings.DATABASE),
                                   connect_args={'charset': 'utf8'})
            #################Checking if fact checker exists
            fact_checker_check = pd.read_sql_query(
                'select * from fact_checker where fact_checker_name="%s"' %
                (fact_checker.loc[0, 'fact_checker_name']),
                con=engine)
            if len(fact_checker_check) == 0:
                ###################If fact checker does not exist
                pd.DataFrame(fact_checker.iloc[0]).T.to_sql('fact_checker',
                                                            engine,
                                                            if_exists='append',
                                                            index=False)
                fact_checker_check = pd.read_sql_query(
                    'select * from fact_checker where fact_checker_name="%s"' %
                    (fact_checker.loc[0, 'fact_checker_name']),
                    con=engine)
                fact_checkerID = fact_checker_check.loc[0, 'fact_checkerID']
            else:
                ##Storing fact_checkerID for claim_review
                fact_checkerID = fact_checker_check.loc[0, 'fact_checkerID']

            #############Iterating through the Claim_reiew Dataframe
            for i in range(len(claim_review)):
                ################Checking if claim exists
                flag = 0
                claim_review_check = pd.read_sql_query(
                    "select * from claim_review where review_url='%s'" %
                    (claim_review.loc[i, 'review_url']),
                    con=engine)
                if len(claim_review_check) == 0:
                    #Row for this review does not exist
                    flag = 1
                else:
                    #Row exists
                    modified_date = claim_review_check.loc[
                        0, 'review_modified_date']
                    try:
                        if claim_review.loc[
                                i, 'review_modified_date'] != modified_date:
                            #If review_modified_data has changed
                            flag = 1
                    except KeyError:
                        #If review_modified_data does not exist in the parsed dataframe
                        if modified_date != None:
                            flag = 1
                if flag == 1:
                    claim_check = pd.read_sql_query(
                        "select * from claim where claim_text='%s'" %
                        (claim.loc[i, 'claim_text'].replace("%", "%%").replace(
                            "'", "''")),
                        con=engine)
                    if len(claim_check) == 0:
                        #####################If claim does not exist
                        pd.DataFrame(claim.iloc[i]).T.to_sql(
                            'claim', engine, if_exists='append', index=False)
                        claim_check = pd.read_sql_query(
                            "select * from claim where claim_text='%s'" %
                            (claim.loc[i, 'claim_text'].replace(
                                "%", "%%").replace("'", "''")),
                            con=engine)
                        claimID = claim_check.loc[0, 'claimID']
                        claim_review.loc[i, 'claimID'] = claimID
                        ######Polyglot ner Language specific to spider and domain
                        #extracting entities from claim text and setting lang
                        text1 = Text(claim.loc[i, 'claim_text'].strip('"'))
                        text1.language = domainlang
                        try:
                            #extracting entities from description and setting lang
                            text2 = Text(claim.loc[i, 'claim_description'])
                            text2.language = domainlang
                            ner_entities = text1.entities + text2.entities
                        except KeyError:
                            #if description does not exist
                            ner_entities = text1.entities
                        for entity in ner_entities:
                            entity_tag = str(entity.tag)
                            entity = " ".join(entity).replace("\\", "")
                            entity_check = pd.read_sql_query(
                                'select * from entity where entity_text="%s"' %
                                (entity),
                                con=engine)
                            if len(entity_check) == 0:
                                pd.DataFrame([[entity_tag, entity]],
                                             columns=['type',
                                                      'entity_text']).to_sql(
                                                          'entity',
                                                          engine,
                                                          if_exists='append',
                                                          index=False)
                                entity_check = pd.read_sql_query(
                                    'select * from entity where entity_text="%s"'
                                    % (entity),
                                    con=engine)
                                entityID = entity_check.loc[0, 'entityID']
                                pd.DataFrame([[entityID, claimID]],
                                             columns=['entityID',
                                                      'claimID']).to_sql(
                                                          'claim_entity',
                                                          engine,
                                                          if_exists='append',
                                                          index=False)
                            else:
                                entityID = entity_check.loc[0, 'entityID']
                                pd.DataFrame([[entityID, claimID]],
                                             columns=['entityID',
                                                      'claimID']).to_sql(
                                                          'claim_entity',
                                                          engine,
                                                          if_exists='append',
                                                          index=False)
                        ##############################################
                    else:
                        claimID = claim_check.loc[0, 'claimID']
                        claim_review.loc[i, 'claimID'] = claimID
                    claim_review.loc[i, 'fact_checkerID'] = fact_checkerID
                    pd.DataFrame(claim_review.iloc[i]).T.to_sql(
                        'claim_review',
                        engine,
                        if_exists='append',
                        index=False)
        return claimdf.to_dict()
if len(content)<2 or content==" " or content=="":
    printmessage("There is no text content in the submitted file")
    printmessage("Please use some ocr software before submitting a file")
    printmessage("Later on we will add a functionality to ocr things automatically")
else:
    printmessage("Content = {}".format(content))
    printmessage("Encoding = {}".format(encoding))
    """
    Before parsing the pdf structure, let's ner the content if requested
    """
    if useNERRecognition:
        languages = langDetect(content)
        sortedLangDict = sorted(languages.items(), key=lambda i:i[1], reverse=True)
        lang, conf = sortedLangDict[0]
        printmessage("{} {}".format(lang, conf))
        content = Text(content, hint_language_code=lang)

        ners = content.entities
        nersInText = []
        printmessage("NERS = {}".format(ners))
        for onener in ners:
            if onener.tag == 'I-PER':
                for name in onener:
                    #printmessage(name)
                    """Add value to storage"""
                    nersInText.append(name)

    encodingstr = "encoding='"+str(encoding)+"'"
    pdfMinerData = minePDF.startParsingPDF(fullitempath) #pdfMinerData = [document, interpreter, device]
    printmessage("pdf document parsed, length of return string is {}".format(len(pdfMinerData)))
    printmessage(pdfMinerData)
# This script shows named entity recognition (NER) with polyglot.
from polyglot.text import Text
text = '''Abraham Lincoln fue un politico y abogado estadounidense 
          que ejercio como decimosexto presidente de los 
          Estados Unidos de America'''
ptext = Text(
    text)  # No need to specify language here; recognized automatically.
ptext.entities  # `entities` attribute; see a list of chunks (with label).
for ent in ptext.entities:  # Print each of the entities found.
    print(ent)
type(ent)  # Print the type of the (last) entity.
ent.tag  # Tag of (last) entity.
'los' in ent  # Check if 'los' is in the (last) entity.
'Abraham' in ent  # Is 'Abraham' in the (last) entity?
# List comprehension to get tuples. First tuple element is the entity
# tag, the second is the full string of the entity text (separate by
# space).
[(ent.tag, ' '.join(ent)) for ent in ptext.entities]
# The `pos_tags` attribute queries all the tagged words.
for word, tag in ptext.pos_tags:
    print(word, tag)
Example #30
def keywords(sent):
	is_noun = lambda pos: pos[:2] == 'NN'
	is_nounp = lambda pos: pos[:2] == 'NNP'
	li_bad = ['moderate','security','update','perform','failure','violation','perform','vulnerabilities','us','end','life','project','Critical',
	'privileges','execution','Keys','account','configuration','low','bug','fix','deserialization','vulnerability','files','deviation','Version','common','findings','vulnerabilites',
	'ibm','spectrum','protect','storage','manager','windows','macintosh','client','commons','fileupload','managed','file','transfer','component','affects','r',
	'message','headers','transmission','channels','data','error','important','application','@','secure','creation','may','tool','fixes',
	'code','guidance','side-channel','response','spectre','meltdown','operations','center','management','service','updates','critical',
	'process','designer','business','automation','workflow','speculative','store','bypass','patch','live','speculative','enhancement','Analysis',
	'violations','scan','please','refer','description','column','details','medium','behavior','insecure','permission','privilege',
	'escalation',']','keys','single','sign-on','april','information','disclosure','remote','confidential','escallation','[','format','multiple','path','libraries','july',
	'july','circumstances','algorithm','password','users','installation','support','cheklist','kt','s/accesses','/','id','access',
	'managment','team','ac2','identity','connect','activity','history','upgrade','patches','virtualization','new','function','propagation','flaw','certificate',
	'extension','ipfdressfamily','queue','clients','channel','server','entity','attack','injection','vulnerable','external','modules',
	'buffer','impacts','announce','hosted','validation','overwrite','tools','new','admin','console','packages','tm','edition','technology','health','check','container','platform',
	'novell','suse','impact','bulletin','control','vulnerabities','unsafe','driver','party','library','hardware','power','\x96','high',
	'space','environments','protection','performance','liberty','product','denial','gateways','appliance','caching','splitting','proxy','edge',
	'faces','affect','source','open','malformed','certficate','editor','protection','forgery','request','cross','site','machine','guest','key','os',
	'advisory','generation','august','june','desktop','protocol','malware','engine','elevation','variant','time','implementation','midrange',
	'applications','servers','s/accesses','contracts','online','below','need','activities','compliance','policies','date','target','days','openssh-unix-announce',
	'softwares','pack','risk','system','belongs','endpoint','issues','mar','apr','au02uap875ghox2.ahe.au.ibm.com','version','sk.ibm.com','g01cxnp20065','b03zcimq101.boulder.ibm.com',
	"'s/accesses",'.the','port','gi_svc_spo_s1','rsk100018054','services1bte.pok.ibm.com','suport','gi_cba_sap_s3','gi_svc_gsc_s1','canada','ownership','revalidation',
	'corrección','apr2017','exc','desviaciones','cycle','chapter','currency','ends','unsecured','group1','analysis','protocols','g01aciwas062.ahe.pok.ibm.com','b06cxhd0p0230.portsmouth.uk.ibm.com', 'b06cxhd0p0330.portsmouth.uk.ibm.com', 'b06cxnr0p0231.portsmouth.uk.ibm.com',
	'b06cxnr0p0232.portsmouth.uk.ibm.com','analysis','scanning-jan-2016','protocols','g01aciwas062.ahe.pok.ibm.com','instance','smallbluep8.pok','authorization',
	'smay18','production','argentina']

	useful=['flash-plugin','qemu-kvm-rhev','plexus-archiver','chromium-browser','openslp','yum-utils','kernel-alt','firefox','vdsm',
	'qemu-kvm','libvirt','rhevm-setup-plugins','rhvm-appliance','kernel-rt','ghostscript-library','nautilus','nagios-nrpe','util-linux',
	'unixODBC','ucode-intel','nautilus','AS2','OpenSSH','AIX']

	tokenized = nltk.word_tokenize(sent)
	
	nouns = set(word for (word, pos) in nltk.pos_tag(tokenized) if (is_noun(pos) or is_nounp(pos)) and word.lower() not in li_bad)
	#print(nouns)
	keywords_ab = ''
	try:
		text = Text(sent)
		for entity in text.entities:
			if entity.tag == "I-ORG":
				nouns.add(" ".join(entity))
	except :
		pass
	nouns=list(nouns)
	java = ''
	for j in tokenized:
		if j[0:4] =='java' or j[0:8] =='rh-mysql' or j[0:6]=='python' or j[0:6] =='xmlrpc' or j[0:6] =='rh-php' or j[0:10]=='go-toolset' or j[0:7]=='rh-java' or j[0:8]=='rh-maven':
			java=j
	if java not in nouns and java !='':
		nouns.append(java)
	for k in nouns:
		if k[0:4] == 'RHSA' or k[0:4] =='SUSE' or k[0:3]=='CVE' or k[0:3]=='SIA' or k[0:4]=='CVE-' or k[0:5]=='MEESA':
			nouns.remove(k)
	for k in nouns:
		if k[0:3]=='CVE' or k[0:3] =='CVE' or k[0:4]=='bo3z' or k[0:6] =='AC2_GI':
			nouns.remove(k)
	if 'bug' in tokenized and 'fix' in tokenized:
		nouns.append('bug fix')
	if 'dhcp' in tokenized:
		nouns.append('dhcp')
	for j in tokenized:
		if j in useful and j not in nouns:
			nouns.append(j)
	try:
		if 'SLE' in nouns:
			index = tokenized.index('SLE')
			nouns = [iter.replace('SLE',tokenized[index]+" "+tokenized[index+1]+' '+tokenized[index+2]) for iter in nouns]
			nouns.remove(tokenized[index+2])
	except:
		pass

	return nouns
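# Hedged usage sketch (not part of the original snippet); the advisory title is
# a hypothetical example and the call assumes the nltk tokenizer/tagger data
# and polyglot NER models are installed.
print(keywords("RHSA advisory: qemu-kvm security update for SUSE Linux Enterprise"))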