def search():
    query = request.args.get("search")
    sentiment = request.args.get("sentiment")
    db_history_search = es.search(index='post_index', doc_type='post', q=query)
    items = list()
    for post in db_history_search['hits']['hits']:
        if sentiment == "positive" and post['_source']['score'] > 0.1:
            items.append([
                post['_source']['message'], post['_source']['score'],
                post['_source']['views'], post['_source']['likes'],
                post['_source']['date']
            ])
        elif sentiment == "negative" and post['_source']['score'] < -0.1:
            items.append([
                post['_source']['message'], post['_source']['score'],
                post['_source']['views'], post['_source']['likes'],
                post['_source']['date']
            ])
        elif sentiment == "neutral" and post['_source']['score'] > -0.1 and post['_source']['score'] < 0.1:
            items.append([
                post['_source']['message'], post['_source']['score'],
                post['_source']['views'], post['_source']['likes'],
                post['_source']['date']
            ])
    if len(items) == 0:
        vk_session = vk_api.VkApi(login, password)
        vk_session.auth()
        vk = vk_session.get_api()
        groups = ["40316705", "15755094"]
        posts = list()
        for group in groups:
            news = vk.wall.search(owner_id=("-" + group), query=query,
                                  count=100, v="5.92", owners_only=0)
            news = news['items']
            for article in news:
                if article['post_type'] == 'post':
                    posts.append(article)
        posts_clean = list()
        for post in posts:
            if post['text'] != '':
                post_date = datetime.utcfromtimestamp(int(post['date'])).strftime('%Y-%m-%d')
                posts_clean.append([
                    post['text'], post['views']['count'],
                    post['likes']['count'], post_date
                ])
        for post in posts_clean:
            text = re.sub(r"http\S+", "", post[0])
            text = Text(text)
            # calculate polarity
            polarity = 0
            norm = 0
            for w in text.words:
                polarity += w.polarity
                if polarity == w.polarity:
                    norm += 0.1
                else:
                    norm += 1
            polarity /= norm
            polarity = round(polarity, 2)
            if polarity > 0.1 and sentiment == "positive":
                items.append([post[0], polarity, post[1], post[2], post[3]])
            elif polarity < -0.1 and sentiment == "negative":
                items.append([post[0], polarity, post[1], post[2], post[3]])
            elif polarity > -0.1 and polarity < 0.1 and sentiment == "neutral":
                items.append([post[0], polarity, post[1], post[2], post[3]])
            es.index(index='post_index', doc_type="post",
                     body={
                         'message': post[0],
                         'score': polarity,
                         'views': post[1],
                         'likes': post[2],
                         'date': post[3]
                     })
            # message score tags author date
    return render_template('search.html', items=items)
positive_sentence = []
negative_sentence = []
neutral_sentence = []
positive_with_negative_adjective = []
positive_with_negative_verb = []
positive_with_positive_adjective = []
positive_with_positive_verb = []
negative_with_positive_adjective = []
negative_with_positive_verb = []
negative_with_negative_adjective = []
negative_with_negative_verb = []

text = Text(raw, hint_language_code='fi')
for sentence in text.sentences:
    for word in words:
        # check if the comment mentions the entity
        if word in sentence:
            sentence_polarity = sentence.polarity
            sentiment += sentence_polarity
            if sentence_polarity < 0:
                negative_sentence.append(sentence)
            elif sentence_polarity > 0:
                positive_sentence.append(sentence)
            elif sentence_polarity == 0:
                neutral_sentence.append(sentence)
print("amount of positive sentences")
def polygloter(t):
    try:
        return Text(t, hint_language_code='NL').polarity
    except:
        return 0
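# A small usage sketch (assumed, not from the original source): score a couple of
# Dutch comments with polygloter; anything that fails scoring falls back to 0.
comments = ["Dit is geweldig.", "Dit is verschrikkelijk."]
scores = [polygloter(c) for c in comments]
print(scores)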
# DET: determiner
# INTJ: interjection
# NOUN: noun
# NUM: numeral
# PART: particle
# PRON: pronoun
# PROPN: proper noun
# PUNCT: punctuation
# SCONJ: subordinating conjunction
# SYM: symbol
# VERB: verb
# X: other

blob = """Je veux la moyenne d'âge des agents en fonction de leur salaire."""
blob2 = """ Quel est la moyenne d'âge des personnes travaillant en mairie et qui gagnent plus 3000 par mois?"""

text = Text(blob, hint_language_code='fr')
print(text.pos_tags)


def parser(text):
    """Parses the text into subgroups of words.

    Returns a list of the subgroups."""
    agregation = []
    metric = []
    dimension = [[]]
    filters = [[]]
    text = Text(text, hint_language_code='fr')
    pos_tags = text.pos_tags
    # Finding the agregation
def main(textarea):
    pattern = re.compile(u'[\u0980-\u09FF]+', re.UNICODE)
    new_path1 = cur_path + '/entertainmentwordswithscore.txt'
    totaltext = Text(textarea)
    sample_file2 = open(new_path1, 'r', encoding='utf8')
    text = sample_file2.read()
    textual = Text(text)
    s = ""
    model = Sentence2Vec(cur_path + '/trainedsentence.model')
    scoresht = 0
    storescore = []
    sentencecount = 0
    eachsentu = []
    normfactortextr = 0.0
    normfactorsenti = 0.0
    normfactorkey = 0.0
    ll = 0
    for eachsent in totaltext.sentences:
        sentencecount = sentencecount + 1
        for wordsi in eachsent.words:
            if (wordsi == "," or wordsi == "'"):
                continue
            strings = wordsi
            try:
                answer = textual.find(strings)
                sd = len(strings)
                dot = ""
                for i in range(0, 12):
                    dot = dot + textual[answer + sd + 4 + i]
                normfactorkey = normfactorkey + (float(dot))
            except:
                print("S")
            if wordsi.polarity == 0:
                normfactorsenti = normfactorsenti + .0001
        strings = ""
        for wordssx in eachsent.words:
            strings += wordssx + ' '
        for xx in range(0, len(totaltext.sentences)):
            if xx == sentencecount - 1:
                continue
            stringx = ""
            for words2x in totaltext.sentences[xx].words:
                stringx += words2x + ' '
            simscore = 0.0
            simscore = model.similarity(stringx, strings)
            normfactortextr = normfactortextr + (float(simscore))
    sentencecount = 0
    for eachsent in totaltext.sentences:
        sentencecount = sentencecount + 1
        eachsentu.append(eachsent)
        scoresht = 0.0
        scoresht2 = 0.0
        for wordsi in eachsent.words:
            if (wordsi == "," or wordsi == "'"):
                continue
            strings = wordsi
            try:
                answer = textual.find(strings)
                sd = len(strings)
                dot = ""
                for i in range(0, 12):
                    dot = dot + textual[answer + sd + 4 + i]
                scoresht = scoresht + (float(dot))
            except:
                print("S")
            if wordsi.polarity == 0:
                scoresht2 = scoresht2 + .0001
        scoresht3 = 0.0
        strings = ""
        for wordssx in eachsent.words:
            strings += wordssx + ' '
        for xx in range(0, len(totaltext.sentences)):
            if xx == sentencecount - 1:
                continue
            stringx = ""
            for words2x in totaltext.sentences[xx].words:
                stringx += words2x + ' '
            simscore = 0.0
            simscore = model.similarity(stringx, strings)
            scoresht3 = scoresht3 + (float(simscore))
        mixedscore = (.5 * (scoresht3 / normfactortextr)
                      + .3 * (scoresht / normfactorkey)
                      + .2 * (scoresht2 / normfactorsenti))
        storescore.append(mixedscore)
    n = sentencecount
    for i in range(n):
        for j in range(0, n - i - 1):
            if storescore[j] < storescore[j + 1]:
                storescore[j], storescore[j + 1] = storescore[j + 1], storescore[j]
                eachsentu[j], eachsentu[j + 1] = eachsentu[j + 1], eachsentu[j]
                totaltext.sentences[j], totaltext.sentences[j + 1] = \
                    totaltext.sentences[j + 1], totaltext.sentences[j]
    for k in range(0, int(sentencecount * .40)):
        if int(len(eachsentu[k])) < 10:
            continue
        s = s + str(eachsentu[k]) + '\n'
    return s
from polyglot.text import Text

txt = Text(r"""Lina del Castillo es profesora en el Instituto de Estudios Latinoamericanos Teresa Lozano Long (LLILAS) y el Departamento de Historia de la Universidad de Texas en Austin. Ella será la moderadora del panel “Los Mundos Políticos de Gabriel García Márquez” este viernes, Oct. 30, en el simposio Gabriel García Márquez: Vida y Legado. LIna del Castillo Actualmente, sus investigaciones abarcan la intersección de cartografía, disputas a las demandas de tierra y recursos, y la formación del n...el tren de medianoche que lleva a miles y miles de cadáveres uno encima del otro como tantos racimos del banano que acabarán tirados al mar. Ningún recuento periodístico podría provocar nuestra imaginación y nuestra memoria como este relato de García Márquez. Contenido Relacionado Lea más artículos sobre el archivo de Gabriel García Márquez Reciba mensualmente las últimas noticias e información del Harry Ransom Center con eNews, nuestro correo electrónico mensual. ¡Suscríbase hoy! """)
from polyglot.text import Word, Text

words = "həmişə bütün hüquq normalarda hər üç element olmur".split(" ")
for w in words:
    w = Word(w, language="az")
    print("{:<20}{}".format(w, w.morphemes))
"""
həmişə              ['həmişə']
bütün               ['bütün']
hüquq               ['hüquq']
normalarda          ['norma', 'larda']
hər                 ['hər']
üç                  ['üç']
element             ['element']
olmur               ['olmur']
"""

text = "həmişəbütünhüquqnormalardahərüçelementolmur"
splitted_text = Text(text)
splitted_text.language = "az"
print(splitted_text.morphemes)
"""
['həmişə', 'bütün', 'hüquq', 'norma', 'larda', 'hər', 'üç', 'element', 'olmur']
"""
def polySentTokenize(text):
    sent_array = set()
    text = Text(text)
    for sent in text.sentences:
        sent_array.add(sent)
    return sent_array
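# Usage sketch (the input sentence is an assumed example): because polySentTokenize
# returns a set, sentence order and duplicates are not preserved.
for sent in polySentTokenize("First sentence. Second sentence. First sentence."):
    print(sent)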
def webscraper(url_boys, url_girls, pos_boys_filename, pos_girls_filename,
               text_boys_filename, text_girls_filename):
    boys_urllist = filelister(url_boys)
    girls_urllist = filelister(url_girls)
    girl_names = sorted(filelister('pigenavne.txt'), key=len, reverse=True)
    boy_names = sorted(filelister('drengenavne.txt'), key=len, reverse=True)
    unisex_names = sorted(filelister('unisexnavne.txt'), key=len, reverse=True)
    boy_names_edt = [boy_name.strip().lower() for boy_name in boy_names
                     if boy_name not in unisex_names]
    girl_names_edt = [girl_name.strip().lower() for girl_name in girl_names
                      if girl_name not in unisex_names]
    text_boys_file = open(text_boys_filename, 'wb')
    text_girls_file = open(text_girls_filename, 'wb')
    pos_boys_file = open(pos_boys_filename, 'wb')
    pos_girls_file = open(pos_girls_filename, 'wb')
    text_list_boys = []
    pos_list_boys = []
    text_list_girls = []
    pos_list_girls = []
    links_boys = set()
    links_girls = set()
    for url_boyname in boys_urllist:
        url_boyname_split = url_boyname.split(',')
        link = url_boyname_split[0]
        if not link.startswith('https'):
            try:
                page = urllib.request.urlopen(link)
                try:
                    soup = BeautifulSoup(page, 'html5lib')
                    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
                    try:
                        if soup.find(class_='author') != None:
                            author = soup.find(class_='author').text
                            author_forename = author.split()[0].strip().lower()
                        elif soup.find(rel='author') != None:
                            author = soup.find(rel='author').text
                            author_forename = author.split()[0].strip().lower()
                    except (ValueError, IndexError):
                        print('ValueError or IndexError')
                    for article in soup.find_all('article'):
                        for p in article.find_all('p'):
                            try:
                                if author_forename in boy_names_edt and link not in links_boys:
                                    print(link)
                                    links_boys.add(link)
                                    visible_text = p.get_text()
                                    text = Text(visible_text, hint_language_code='da')
                                    text_list_boys.append((author_forename, visible_text))
                                    pos = text.pos_tags
                                    pos_list_boys.append((author_forename, pos))
                                elif author_forename in girl_names_edt and link not in links_boys:
                                    print(link)
                                    links_boys.add(link)
                                    visible_text = p.get_text()
                                    text = Text(visible_text, hint_language_code='da')
                                    text_list_girls.append((author_forename, visible_text))
                                    pos = text.pos_tags
                                    pos_list_girls.append((author_forename, pos))
                            except ValueError:
                                pass
                except TypeError:
                    pass
            except HTTPError as e:
                print('Error message:', e.msg)
                continue
            except URLError as e:
                print('Error reason:', e.reason)
            except http.client.IncompleteRead as e:
                page = e.partial
            except socket.gaierror:
                pass
            except TimeoutError:
                pass
            except ValueError:
                pass
    for url_girlname in girls_urllist:
        url_girlname_split = url_girlname.split(',')
        link = url_girlname_split[0]
        if not link.startswith('https'):
            try:
                page = urllib.request.urlopen(link)
                try:
                    soup = BeautifulSoup(page, 'html5lib')
                    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
                    try:
                        if soup.find(class_='author') != None:
                            author = soup.find(class_='author').text
                            author_forename = author.split()[0].strip().lower()
                        elif soup.find(rel='author') != None:
                            author = soup.find(rel='author').text
                            author_forename = author.split()[0].strip().lower()
                    except (ValueError, IndexError):
                        print('ValueError or IndexError')
                    for article in soup.find_all('article'):
                        for p in article.find_all('p'):
                            try:
                                if author_forename in boy_names_edt and link not in links_girls:
                                    print(link)
                                    links_girls.add(link)
                                    visible_text = p.get_text()
                                    text = Text(visible_text, hint_language_code='da')
                                    text_list_boys.append((author_forename, visible_text))
                                    pos = text.pos_tags
                                    pos_list_boys.append((author_forename, pos))
                                elif author_forename in girl_names_edt and link not in links_girls:
                                    print(link)
                                    links_girls.add(link)
                                    visible_text = p.get_text()
                                    text = Text(visible_text, hint_language_code='da')
                                    text_list_girls.append((author_forename, visible_text))
                                    pos = text.pos_tags
                                    pos_list_girls.append((author_forename, pos))
                            except ValueError:
                                pass
                except TypeError:
                    pass
            except HTTPError as e:
                print('Error message:', e.msg)
                continue
            except URLError as e:
                print('Error reason:', e.reason)
            except http.client.IncompleteRead as e:
                page = e.partial
            except socket.gaierror:
                pass
            except TimeoutError:
                pass
            except ValueError:
                pass
    try:
        pickle.dump(text_list_boys, text_boys_file)
        pickle.dump(pos_list_boys, pos_boys_file)
        pickle.dump(text_list_girls, text_girls_file)
        pickle.dump(pos_list_girls, pos_girls_file)
    except ValueError as e:
        pass
    pos_boys_file.close()
    pos_girls_file.close()
    text_boys_file.close()
    text_girls_file.close()
def get_tagged_tokens(self, text):
    ptext = Text(text)
    # output can be re-organised
    # @TODO
    # do this per sentence
    entities = [(" ".join(entity), entity.tag) for entity in ptext.entities]
    return entities
import polyglot
from polyglot.text import Text, Word

# EXECUTE THIS COMMAND ON YOUR TERMINAL
# polyglot download embeddings2.en pos2.en

text = Text("Bonjour, Mesdames.")
print("Language Detected: Code={}, Name={}\n".format(text.language.code,
                                                     text.language.name))

zen = Text("Beautiful is better than ugly. "
           "Explicit is better than implicit. "
           "Simple is better than complex.")
print(zen.words)

text = Text("This is a car")
print("{:<16}{}".format("Word", "POS Tag") + "\n" + "-" * 30)
for word, tag in text.pos_tags:
    print(u"{:<16}{:>2}".format(word, tag))
    'acaoAutor', 'acaoVitima', 'acesso', 'armaAutor', 'autor', 'bensVitima',
    'caracteristicaFisicaPessoa', 'caracteristicaVeiculo', 'deslocamentoAutor',
    'idadeAutor', 'instrumentoAutor', 'local', 'quantidade', 'vestimentaAutor'
]

for filename in glob.glob(os.path.join(path, '*.json')):
    keys_file = open(filename, 'r', encoding='utf8')
    keys_json = json.loads(keys_file.read())  # Parse the file contents as JSON
    for entidade in entidades:
        print("Arquivo: " + filename + " - Entidade: " + entidade)
        acaoAutor = keys_json[entidade]
        # Split each phrase into words
        for dadosText in acaoAutor:
            vtTexto = dadosText
            text = Text(vtTexto, hint_language_code='pt')
            # Save the words
            try:
                # Identify which word class each word belongs to
                # (article, adjective, pronoun, numeral, noun, verb)
                for word, tag in text.pos_tags:
                    neighbors = embeddings.nearest_neighbors(word)
                    for w, d in zip(neighbors, embeddings.distances(word, neighbors)):
                        print("{:<8}{:.4f}".format(w, d))
                    # Make sure no duplicate words are stored
                    sql_search_palavra = " Select count(id) from palavras Where palavra = ? And tag = ? "
                    where = (word, tag)
                    for count in cur.execute(sql_search_palavra, where).fetchall():
def on_data(self, data):
    # while True:
    try:
        all_data = json.loads(data)
        # print(all_data)
    except Exception as e:
        raise e
    # print("Debug Message: ", all_data)
    try:
        tweetTime = all_data["created_at"]
        # tweet = all_data["text"]
        originaltweet = all_data["text"]
        if all_data["truncated"] == "true":
            originaltweet = all_data["extended_tweet"]["full_text"]
        tweet_in_reply_to_status_id = all_data["in_reply_to_status_id"]
        tweet_in_reply_to_screen_name = all_data["in_reply_to_screen_name"]
        # tweet_mentions_screen_name = all_data["extended_tweet"]["entities"]["user_mentions"]["screen_name"]  # []
        # tweet_mentions_name = all_data["extended_tweet"]["entities"]["user_mentions"]["name"]  # []
        # tweet_mentions_is = all_data["extended_tweet"]["entities"]["user_mentions"]["id"]  # []
        # tweet = self.clean_tweet(originaltweet)
        tweet = self.tweet_preprocessor(self.links_remover(originaltweet))
        # Twitter username
        username = all_data["user"]["screen_name"]
        # Twitter display name
        name = all_data["user"]["name"]
        userid = all_data["user"]["id"]
        userdesc = all_data["user"]["description"]
        user_follower = all_data["user"]["followers_count"]
        user_location = all_data["user"]["location"]
        # user_place = all_data["user"]["place"]
        tweet_id = all_data["id"]
        # language of the tweet
        language = all_data["lang"]
        # what we found
        print("___________________________________________________________________________________________________")
        print("ORIGINAL: Lang", language, "User:"******"Follower:", user_follower, "Tweet:", originaltweet)
        print("MODIFIED: Lang", language, "User:"******"Follower:", user_follower, "Tweet:", tweet)
    except:
        print("Unexpected error:", sys.exc_info()[0])
        # pass
        raise
    avoid = False
    for profilo_twitter in AVOID_PROFILES:
        if profilo_twitter in str(tweet.encode('utf8')):
            avoid = True
            pass
    try:
        if avoid:
            print("Tweet has not passed semantic filters")
        else:
            reply = False
            retweet = False
            like = False
            avoid = False
            follow = False
            message = False
            pos = 0
            neg = 0
            understood = False
            polytext = Text(str(tweet))
            # TODO: polytext.entities gets a cursor with entities[]
            try:
                print(polytext.pos_tags)
            except Exception as e:
                pass
            # ADJ: adjective
            # ADP: adposition
            # ADV: adverb
            # AUX: auxiliary verb
            # CONJ: coordinating conjunction
            # DET: determiner
            # INTJ: interjection
            # NOUN: noun
            # NUM: numeral
            # PART: particle
            # PRON: pronoun
            # PROPN: proper noun
            # PUNCT: punctuation
            # SCONJ: subordinating conjunction
            # SYM: symbol
            # VERB: verb
            # X: other
            for idx, topicfound in enumerate(TOPICS):
                if str(topicfound[0]).lower() in tweet.lower():
                    topic = TOPICS[idx]
                    print("topic=", topic)
                    understood = True
            if understood:
                # FIRST LEVEL OF RESPONSE
                try:
                    # put the tweet in a polyglot object
                    sentences = polytext.sentences
                    print("BEFORE LOGIC")
                    like, reply, message = self.make_logic_reaction(sentences, topic[0], topic[1])
                    print("AFTER LOGIC")
                except Exception as e:
                    raise e
                    # pass
                print("Processing: REPLY:", reply, " RETWEET:", retweet, " LIKE:", like, " MSG: ", message)
                if reply:
                    api.update_status(message, in_reply_to_status_id=tweet_id)
                    REPLIES_USERS.append(username)
                    print("######--> Replied with:", message)
                if retweet:
                    api.retweet(tweet_id)
                    print("######--> Retweet")
                    time.sleep(5)
                if like:
                    api.create_favorite(tweet_id)
                    print("######--> Like")
                if follow:
                    api.create_friendship(username)
                    print("######--> Follow ", username)
                # if API.exists_friendship(user_a, user_b):
                if message:
                    api.send_direct_message(username, "interessante :)")
                return True
            else:
                print("Put this tweet in the trashcan ....\n")
                return True
    except Exception as exx:
        print("aborted", exx)
        print(sys.exc_info()[0])
        pass
    finally:
        time.sleep(20)
    return True
    '(' \
    ' id integer primary key AUTOINCREMENT, ' \
    ' word varchar(50), ' \
    ' radical varchar(50), ' \
    ' tag varchar(50)' \
    ')'
cur.execute(sql_create)

sql_insert = ' insert into miniDicionario (radical, word, tag) values (?, ?, ?) '
sql_update = ' update miniDicionario set radical = ? Where word = ? And tag = ? '

path = './dados-base'
for filename in glob.glob(os.path.join(path, '*.dic')):
    arquivo = open(filename, 'r', encoding='utf8')
    for line in arquivo.readlines():
        line = line.split('/')
        text = line[0].replace('\n', '')
        print(text)
        text = Text(text, hint_language_code='pt')
        for word, tag in text.pos_tags:
            sql_search_palavra = " Select count(id) from miniDicionario Where word = ? And tag = ? "
            # radicao = text.morphemes
            where = (word, tag)
            for count in cur.execute(sql_search_palavra, where).fetchall():
                rec = ('', word, tag)
                if count[0] == 0:
                    cur.execute(sql_insert, rec)
                else:
                    cur.execute(sql_update, rec)
                con.commit()
""" POS tagger for sermons in content.dat """ import pandas as pd import nltk.data from polyglot.text import Text # data DF = pd.read_csv("content.dat", header=0, index_col=None) content = DF["content"].tolist() fnames = DF["id"].tolist() tokenizer = nltk.data.load("tokenizers/punkt/norwegian.pickle") DATA_pos = [] i = 0 for i, text in enumerate(content): print("file {}".format(i)) # sentence disambiguation sents = tokenizer.tokenize(text) # POS text_pos = [] for blob in sents: textblob = Text(blob, hint_language_code='da') if textblob.pos_tags: text_pos.append(textblob.pos_tags) DATA_pos.append([fnames[i], text_pos]) DF_pos = pd.DataFrame(DATA_pos) DF_pos.columns = ["id", "POS"] DF_pos.to_csv("content_pos.dat", index=False)
def get_pos_tags(self, input_text):
    pos_tags = []
    text = Text(input_text, hint_language_code='bg')
    pos_tags = text.pos_tags
    pos_tags = [pt[1] for pt in pos_tags]
    return pos_tags
# In this exercise and the next, you'll use the polyglot library to identify French
# entities. The library functions slightly differently than spacy, so you'll use a few
# of the new things you learned in the last video to display the named entity text
# and category.
#
# You have access to the full article string in article. Additionally, the Text class
# of polyglot has been imported from polyglot.text.

from polyglot.text import Text

article = """ French NER with polyglot I
In this exercise and the next, you'll use the polyglot library to identify French entities. The library functions slightly differently than spacy, so you'll use a few of the new things you learned in the last video to display the named entity text and category.
You have access to the full article string in article. Additionally, the Text class of polyglot has been imported from polyglot.text.
"""

# Create a new text object using Polyglot's Text class: txt
txt = Text(article)

# Print each of the entities found
for ent in txt.entities:
    print(ent)
    # Print the type of ent
    print(type(ent))

# French NER with polyglot II
#
# Here, you'll complete the work you began in the previous exercise.
#
# Your task is to use a list comprehension to create a list of tuples, in which the
# first element is the entity tag, and the second element is the full string of the
# entity text.

# Create the list of tuples: entities
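# One possible completion of the exercise (a sketch, not necessarily the official
# solution): pair each entity's tag with the space-joined entity text, mirroring the
# list-comprehension pattern used elsewhere in this collection.
entities = [(ent.tag, ' '.join(ent)) for ent in txt.entities]
print(entities)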
from konlpy.corpus import my_corpus
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot
from konlpy.tag import Kkma
from polyglot.text import Text

with open('vegetariankoreansister.txt', 'r') as myfile:
    data = myfile.read().replace('\n', ' ')

kkma = Kkma()
sentlist = (kkma.sentences(data))

sentpolarity = 0
senttotal = 0
sentcount = 0
for i in sentlist:
    text = Text(i, hint_language_code='ko')
    for w in text.words:
        senttotal += w.polarity
        sentcount += 1
    sentpolarity = senttotal / sentcount
    print(i)
    sentpolarity = 0
    senttotal = 0
    sentcount = 0
# %% [markdown] {"heading_collapsed": true}
# # sentiments

# %% {"hidden": true}
# compare polyglot, textblob
sents = [
    "he is strongly criticised", "he died yesterday", "he was killed by a car",
    "he is the least popular", "he is never any good", "he visited a cancer charity",
    "Exquisite jewels were worn by the Queen", "he is good", "he is bad",
    "he avoids the issue", "he hates jews", "jews hate him", "everyone hates him",
    "he is the most ineffective"
]
print("**** textblob, polyglot ****")
for sent in sents:
    print(round(tb(sent).polarity, 1), Text(sent).polarity, sent)

# %% {"hidden": true}
pol = []
for doc in docs:
    for s in doc.sents:
        if s.text.find("Brexiteer") >= 0:
            print(s.text.strip())
            try:
                pg = Text(s.text).polarity
                print(pg)
            except:
                pg = 0
            pol.append(pg)

# %% {"hidden": true}
def morphological(text):
    tokens = Text(text)
    return tokens.pos_tags
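# Quick illustrative call (the sentence is an assumed example, and the exact tags
# depend on the downloaded polyglot POS model): each item is a (word, tag) pair.
print(morphological("This is a car"))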
## installation
# !pip install polyglot

# installing dependency packages [2]
get_ipython().system('pip install pyicu morfessor pycld2')

# # Quick Tutorial [1]
import polyglot
from polyglot.text import Text, Word

# ## Language Detection
text = Text("Bonjour, Mesdames.")
print("Language Detected: Code={}, Name={}\n".format(text.language.code,
                                                     text.language.name))

zen = Text("Beautiful is better than ugly. "
           "Explicit is better than implicit. "
           "Simple is better than complex.")
print(zen.words)
print(zen.sentences)

# ## POS Tagging
## dependencies
def webscraper(url_filename, urls_boys_filename, urls_girls_filename,
               pos_boys_filename, pos_girls_filename, text_boys_filename,
               text_girls_filename):
    urllist = filelister(url_filename)
    girl_names = sorted(filelister('pigenavne.txt'), key=len, reverse=True)
    boy_names = sorted(filelister('drengenavne.txt'), key=len, reverse=True)
    unisex_names = sorted(filelister('unisexnavne.txt'), key=len, reverse=True)
    boy_names_edt = [
        boy_name.strip().lower() for boy_name in boy_names
        if boy_name not in unisex_names
    ]
    girl_names_edt = [
        girl_name.strip().lower() for girl_name in girl_names
        if girl_name not in unisex_names
    ]
    urls_boys_file = codecs.open(urls_boys_filename, 'w', encoding='utf-8')
    urls_girls_file = codecs.open(urls_girls_filename, 'w', encoding='utf-8')
    text_boys_file = open(text_boys_filename, 'wb')
    text_girls_file = open(text_girls_filename, 'wb')
    pos_boys_file = open(pos_boys_filename, 'wb')
    pos_girls_file = open(pos_girls_filename, 'wb')
    text_list_boys = []
    pos_list_boys = []
    text_list_girls = []
    pos_list_girls = []
    for url in urllist:
        url = url.strip()
        author = None
        author_forename = None
        page = urllib.request.urlopen(url)
        soup = BeautifulSoup(page, 'html5lib')
        try:
            for lines in soup.find_all('div', class_='article_date'):
                if lines.find('a', class_='authorName') != None:
                    print(url)
                    author = lines.find('a', class_='authorName').text
                    author_forename = author.split()[0].strip().lower()
        except (ValueError, IndexError):
            print('ValueError or IndexError')
        if author_forename in boy_names_edt:
            urls_boys_file.write(url.strip() + ',' + author_forename)
            urls_boys_file.write('\n')
            for article in soup.find_all('div', class_='remaining_paragraphs'):
                for p in article.find_all('p'):
                    try:
                        visible_text = p.get_text()
                        text = Text(visible_text, hint_language_code='da')
                        text_list_boys.append((author_forename, visible_text))
                        pos = text.pos_tags
                        pos_list_boys.append((author_forename, pos))
                    except ValueError:
                        pass
        elif author_forename in girl_names_edt:
            urls_girls_file.write(url.strip() + ',' + author_forename)
            urls_girls_file.write('\n')
            for article in soup.find_all('div', class_='remaining_paragraphs'):
                for p in article.find_all('p'):
                    try:
                        visible_text = p.get_text()
                        text = Text(visible_text, hint_language_code='da')
                        text_list_girls.append((author_forename, visible_text))
                        pos = text.pos_tags
                        pos_list_girls.append((author_forename, pos))
                    except ValueError:
                        pass
    try:
        pickle.dump(text_list_boys, text_boys_file)
        pickle.dump(pos_list_boys, pos_boys_file)
        pickle.dump(text_list_girls, text_girls_file)
        pickle.dump(pos_list_girls, pos_girls_file)
    except ValueError as e:
        pass
    urls_boys_file.close()
    urls_girls_file.close()
    text_boys_file.close()
    text_girls_file.close()
def parser(text):
    """Parses the text into subgroups of words.

    Returns a list of the subgroups."""
    agregation = []
    metric = []
    dimension = [[]]
    filters = [[]]
    text = Text(text, hint_language_code='fr')
    pos_tags = text.pos_tags

    # Finding the agregation
    verb_counter = 0
    noun = None
    i = 0
    while noun == None and i < len(pos_tags):
        if pos_tags[i][1] == 'VERB':
            verb_counter += 1
        if verb_counter > 0:
            if pos_tags[i][1] == 'NOUN':
                noun = pos_tags[i][0]
                agregation_pos = i
        i += 1
    agregation.append(noun)

    # Finding the metric
    i = agregation_pos + 1
    while i < len(pos_tags) and pos_tags[i][1] != 'ADP':
        metric.append(pos_tags[i][0])
        i += 1
    end_metric_pos = i - 1

    # Finding the dimensions
    i = end_metric_pos + 1
    dimension_counter = 0
    while i < len(pos_tags) and pos_tags[i][1] != 'CONJ':
        print(pos_tags[i][1])
        if pos_tags[i][0] == 'et':
            dimension_counter += 1
            dimension.append([])
        else:
            dimension[dimension_counter].append(pos_tags[i][0])
        i += 1
    end_dimension_pos = i - 1

    # Finding the filters
    i = end_dimension_pos + 1
    filter_counter = 0
    while i < len(pos_tags):
        if pos_tags[i][0] == 'et':
            filter_counter += 1
            filters.append([])
        else:
            filters[filter_counter].append(pos_tags[i][0])
        i += 1

    return ([agregation, metric, dimension, filters])
    @param text The text that must be split into sentences.
    """
    # sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    sentence_delimiters = re.compile(
        u'[৷\u002f\u0053\u0056\u00a0\u00ad\u00d0\u00da\u00e6\u00f2\u00f3\u2013\u2014\\[\\]\n!?,;:\।\t\\"\\(\\)\\\'\‘\‘‘]')
    sentences = sentence_delimiters.split(text)
    # print(sentences, file=open("bangla_splitted_sentence.txt", "a", encoding='utf8'))
    return sentences


sample_file1 = open('Accident/Codes For Accident/Documents/accidentwordswithscore.txt', 'r', encoding='utf8')
sample_file2 = open('Economics/Codes For Economics/Documents/economicswordswithscore.txt', 'r', encoding='utf8')
sample_file3 = open('Entertainment/Codes For Entertainment/Documents/entertainmentwordswithscore.txt', 'r', encoding='utf8')
sample_file4 = open('Politics/Codes For Politics/Documents/politicswordswithscore.txt', 'r', encoding='utf8')

text1 = sample_file1.read()
textual1 = Text(text1)
scoresht1 = 0

text2 = sample_file2.read()
textual2 = Text(text2)
scoresht2 = 0

text3 = sample_file3.read()
textual3 = Text(text3)
scoresht3 = 0

text4 = sample_file4.read()
textual4 = Text(text4)
scoresht4 = 0

countacc = 0
def emotechintentionmodel(SentenceToBe, synonym_num):
    ########
    # detector = Detector(SentenceToBe)
    # if detector.language.code == "vi":
    #     langthesaurusload("vn", FileLocThe)
    #     langmodelload("vn", LibLocLang)
    # else:
    #     langthesaurusload(detector.language.code, FileLocThe)
    #     langmodelload(detector.language.code, LibLocLang)
    # langthesaurusload("en", FileLocThe)
    # langmodelload("en", LibLocLang)
    ########
    intention_filters = ['PRON', "VERB"]
    object_filters = ['NOUN', 'PROPN']
    ########
    sentences = model.tokenize(SentenceToBe)
    ########
    for s in sentences:
        model.tag(s)    # inplace tagging
        model.parse(s)  # inplace parsing
    datause = pd.read_csv(StringIO(model.write(sentences, "conllu")),
                          sep="\t", header=None, skiprows=4)
    PosTagIntention = datause[datause.columns[1:4]].values.tolist()
    print(PosTagIntention)
    ################################
    # ADJ: adjective
    # ADP: adposition
    # ADV: adverb
    # AUX: auxiliary
    # CCONJ: coordinating conjunction
    # DET: determiner
    # INTJ: interjection
    # NOUN: noun
    # NUM: numeral
    # PART: particle
    # PRON: pronoun
    # PROPN: proper noun
    # PUNCT: punctuation
    # SCONJ: subordinating conjunction
    # SYM: symbol
    # VERB: verb
    # X: other
    ################################
    sentence_intention = []
    sentence_object = []
    ####
    if len(PosTagIntention) > 1:
        for i in range(0, len(PosTagIntention)):
            #####
            if i == 0:
                if any(str(word).lower() in str(PosTagIntention[i][0]).lower()
                       for word in question_words):
                    sentence_intention.append("Question")
            #####
            else:
                if all(str(word).lower() != str(PosTagIntention[i][0]).lower()
                       for word in stop_words):
                    if any(str(word).lower() in str(PosTagIntention[i][2]).lower()
                           for word in intention_filters):
                        sentence_intention.append(PosTagIntention[i][0])
                    if any(str(word).lower() in str(PosTagIntention[i][2]).lower()
                           for word in object_filters):
                        sentence_object.append(PosTagIntention[i][0])
        #####
        sentence_intention = list(set(sentence_intention))
        sentence_object = list(set(sentence_object))
    #####
    else:
        sentence_intention = []
        sentence_object = []
    #####
    intlength = len(sentence_intention)
    for i in range(0, intlength):
        temp = thesaurus[(thesaurus['subject'] == sentence_intention[i].lower())
                         & (thesaurus['weight'] >= 0.9)]
        # get top 2
        sorted_temp = temp.sort_values(by=['weight'], ascending=False)
        try:
            sorted_temp = sorted_temp[0:synonym_num]
        except IndexError:
            print("not enough items")
        if len(sorted_temp) > 1:
            sentence_intention.extend((sorted_temp['word']))
    #####
    intlength = len(sentence_object)
    for i in range(0, intlength):
        temp = thesaurus[(thesaurus['subject'] == sentence_object[i].lower())
                         & (thesaurus['weight'] >= 0.9)]
        # get top 2
        sorted_temp = temp.sort_values(by=['weight'], ascending=False)
        try:
            sorted_temp = sorted_temp[0:synonym_num]
        except IndexError:
            print("not enough items")
        if len(sorted_temp) > 1:
            sentence_object.extend((sorted_temp['word']))
    #####
    NERobj = Text(SentenceToBe, hint_language_code="en").entities
    #####
    sentence_intention = list(set(sentence_intention))
    sentence_object = list(set(sentence_object))
    #####
    overall = sentence_intention + sentence_object
    return (sentence_intention, sentence_object, overall, NERobj)
from polyglot.text import Text, Word
import arabic_reshaper
from bidi.algorithm import get_display

# print(downloader.supported_languages_table("sentiment2", 3))

# text = Text("The movie was really good.")
blob = """"آمریکا و چین در عالیترین سطح امنیتی، درباره چه موضوعاتی مذاکره میکنند."""

print("{:<16}{}".format("Word", "Polarity") + "\n" + "-" * 30)
# for w in text.words:
#     text_ = arabic_reshaper.reshape(w)
#     bidi_text = get_display(text_)
#     print("{:<16}{:>2}".format(bidi_text, w.polarity))

# blob = """The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world"."""
text = Text(blob)
for sent in text.sentences:
    print(sent, "\n")
    for entity in sent.entities:
        print(entity.tag, entity)

first_sentence = text.sentences[0]
first_entity = first_sentence.entities[0]
# print(first_entity)
# print(first_entity.positive_sentiment)
# print(first_entity.negative_sentiment)

for w in first_sentence.entities:
    # text_ = arabic_reshaper.reshape(w)
    # bidi_text = get_display(text_)
    print("{} : positive: {:<2} | negative: {:>2}".format(
        w, str(w.positive_sentiment), str(w.negative_sentiment)))
def parse(self, response):
    claimdf = pd.DataFrame()
    # Extracting microdata or JSON-LD in RDFa format
    data = extruct.extract(response.text, response.url)
    # Domain name
    domain = urlparse(response.url).netloc.strip('www').strip('.com')
    # Selecting microdata
    selected = [
        properties for properties in data['microdata']
        if properties['type'] == 'http://schema.org/ClaimReview'
    ]
    if selected:
        mode = 'micro'
    else:
        # If micro fails, selecting JSON
        try:
            selected = [
                properties for properties in data['json-ld']
                if properties['@type'] == 'ClaimReview'
                or properties['@type'] == ["ClaimReview"]
            ]
        except KeyError:
            selected = [
                properties for properties in data['json-ld'][0]['@graph']
                if properties['@type'] == 'ClaimReview'
            ]
        mode = 'json'
    if selected:
        # If JSON or micro succeeded
        for elements in selected:
            if mode == 'micro':
                elements = elements['properties']
            for key in elements:
                if type(elements[key]) == list:
                    elements[key] = elements[key][0]
            ## Flattening dictionary
            scraped_data = pd.io.json.json_normalize(elements)
            ## Renaming the columns of the dataframe
            scraped_data.columns = map(self.column_mapper, list(scraped_data.columns))
            ## Dropping unimportant columns
            scraped_data = scraped_data.drop([None], axis=1)
            ## Checking if fact_checker_name exists, or review_author_name
            try:
                scraped_data.loc[:, 'fact_checker_name'] = scraped_data['fact_checker_name']
            except KeyError:
                try:
                    scraped_data.loc[:, 'fact_checker_name'] = scraped_data['review_author_name']
                except KeyError:
                    # As a last resort, extract the domain name from the URL
                    domname = urlparse(
                        scraped_data.loc[0, 'fact_checker_url']
                    ).hostname.split('.')[1].capitalize()
                    scraped_data['fact_checker_name'] = domname
                    scraped_data['review_author_name'] = domname
            try:
                scraped_data.loc[:, 'claim_text'] = scraped_data['claim_text']
            except KeyError:
                scraped_data.loc[:, 'claim_text'] = scraped_data['claim_description']
                scraped_data = scraped_data.drop(['claim_description'], axis=1)
            ## Appending to the dataframe
            claimdf = claimdf.append(scraped_data, ignore_index=True)
        ## Filtering columns needed for the claim_review table
        claim_review = claimdf.filter(
            regex='(review_url|review_date|claimID|fact_checkerID|best_rating|worst_rating|rating_value|rating_name|review_author_name|review_rating_img|review_modified_date|review_headline|review_img|review_description)'
        )
        claim_review.loc[:, 'claimID'] = 0
        claim_review.loc[:, 'fact_checkerID'] = 0
        claim_review.loc[:, 'review_crawl_date'] = str(
            datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
        claim = claimdf.filter(
            regex='(claim_text|claim_description|claim_author_name|claim_url|claim_date|claim_author_img|claim_author_job|claim_location|claim_location_url)'
        )
        claim.loc[:, 'claim_crawl_date'] = str(
            datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S'))
        ## Filtering columns needed for the fact_checker table
        fact_checker = claimdf.filter(
            regex='(fact_checker_name|fact_checker_url|fact_checker_img)')
        #### Creating MySQL engine
        engine = create_engine(URL(**settings.DATABASE),
                               connect_args={'charset': 'utf8'})
        ################# Checking if the fact checker exists
        fact_checker_check = pd.read_sql_query(
            'select * from fact_checker where fact_checker_name="%s"' %
            (fact_checker.loc[0, 'fact_checker_name']), con=engine)
        if len(fact_checker_check) == 0:
            ################### If the fact checker does not exist
            pd.DataFrame(fact_checker.iloc[0]).T.to_sql('fact_checker', engine,
                                                        if_exists='append',
                                                        index=False)
            fact_checker_check = pd.read_sql_query(
                'select * from fact_checker where fact_checker_name="%s"' %
                (fact_checker.loc[0, 'fact_checker_name']), con=engine)
            fact_checkerID = fact_checker_check.loc[0, 'fact_checkerID']
        else:
            ## Storing fact_checkerID for claim_review
            fact_checkerID = fact_checker_check.loc[0, 'fact_checkerID']
        ############# Iterating through the claim_review dataframe
        for i in range(len(claim_review)):
            ################ Checking if the claim exists
            flag = 0
            claim_review_check = pd.read_sql_query(
                "select * from claim_review where review_url='%s'" %
                (claim_review.loc[i, 'review_url']), con=engine)
            if len(claim_review_check) == 0:
                # Row for this review does not exist
                flag = 1
            else:
                # Row exists
                modified_date = claim_review_check.loc[0, 'review_modified_date']
                try:
                    if claim_review.loc[i, 'review_modified_date'] != modified_date:
                        # If review_modified_date has changed
                        flag = 1
                except KeyError:
                    # If review_modified_date does not exist in the parsed dataframe
                    if modified_date != None:
                        flag = 1
            if flag == 1:
                claim_check = pd.read_sql_query(
                    "select * from claim where claim_text='%s'" %
                    (claim.loc[i, 'claim_text'].replace("%", "%%").replace("'", "''")),
                    con=engine)
                if len(claim_check) == 0:
                    ##################### If the claim does not exist
                    pd.DataFrame(claim.iloc[i]).T.to_sql('claim', engine,
                                                         if_exists='append',
                                                         index=False)
                    claim_check = pd.read_sql_query(
                        "select * from claim where claim_text='%s'" %
                        (claim.loc[i, 'claim_text'].replace("%", "%%").replace("'", "''")),
                        con=engine)
                    claimID = claim_check.loc[0, 'claimID']
                    claim_review.loc[i, 'claimID'] = claimID
                    ###### Polyglot NER, language specific to spider and domain
                    # Extracting entities from the claim text and setting the language
                    text1 = Text(claim.loc[i, 'claim_text'].strip('"'))
                    text1.language = domainlang
                    try:
                        # Extracting entities from the description and setting the language
                        text2 = Text(claim.loc[i, 'claim_description'])
                        text2.language = domainlang
                        ner_entities = text1.entities + text2.entities
                    except KeyError:
                        # If the description does not exist
                        ner_entities = text1.entities
                    for entity in ner_entities:
                        entity_tag = str(entity.tag)
                        entity = " ".join(entity).replace("\\", "")
                        entity_check = pd.read_sql_query(
                            'select * from entity where entity_text="%s"' % (entity),
                            con=engine)
                        if len(entity_check) == 0:
                            pd.DataFrame([[entity_tag, entity]],
                                         columns=['type', 'entity_text']).to_sql(
                                             'entity', engine, if_exists='append',
                                             index=False)
                            entity_check = pd.read_sql_query(
                                'select * from entity where entity_text="%s"' % (entity),
                                con=engine)
                            entityID = entity_check.loc[0, 'entityID']
                            pd.DataFrame([[entityID, claimID]],
                                         columns=['entityID', 'claimID']).to_sql(
                                             'claim_entity', engine,
                                             if_exists='append', index=False)
                        else:
                            entityID = entity_check.loc[0, 'entityID']
                            pd.DataFrame([[entityID, claimID]],
                                         columns=['entityID', 'claimID']).to_sql(
                                             'claim_entity', engine,
                                             if_exists='append', index=False)
                    ##############################################
                else:
                    claimID = claim_check.loc[0, 'claimID']
                    claim_review.loc[i, 'claimID'] = claimID
                claim_review.loc[i, 'fact_checkerID'] = fact_checkerID
                pd.DataFrame(claim_review.iloc[i]).T.to_sql('claim_review', engine,
                                                            if_exists='append',
                                                            index=False)
    return claimdf.to_dict()
if len(content) < 2 or content == " " or content == "":
    printmessage("There is no text content in the submitted file")
    printmessage("Please use some OCR software before submitting a file")
    printmessage("Later on we will add functionality to OCR files automatically")
else:
    printmessage("Content = {}".format(content))
    printmessage("Encoding = {}".format(encoding))
    """
    Before parsing the pdf structure, let's run NER over the content if requested
    """
    if useNERRecognition:
        languages = langDetect(content)
        sortedLangDict = sorted(languages.items(), key=lambda i: i[1], reverse=True)
        lang, conf = sortedLangDict[0]
        printmessage("{} {}".format(lang, conf))
        content = Text(content, hint_language_code=lang)
        ners = content.entities
        nersInText = []
        printmessage("NERS = {}".format(ners))
        for onener in ners:
            if onener.tag == 'I-PER':
                for name in onener:
                    # printmessage(name)
                    """Add value to storage"""
                    nersInText.append(name)
    encodingstr = "encoding='" + str(encoding) + "'"
    pdfMinerData = minePDF.startParsingPDF(fullitempath)
    # pdfMinerData = [document, interpreter, device]
    printmessage("pdf document parsed, length of return string is {}".format(len(pdfMinerData)))
    printmessage(pdfMinerData)
# This script shows named entity recognition (NER) with polyglot.
from polyglot.text import Text

text = '''Abraham Lincoln fue un politico y abogado estadounidense que ejercio como decimosexto presidente de los Estados Unidos de America'''

ptext = Text(text)  # No need to specify language here; recognized automatically.

ptext.entities  # `entities` attribute; see a list of chunks (with label).

for ent in ptext.entities:  # Print each of the entities found.
    print(ent)

type(ent)  # Print the type of the (last) entity.
ent.tag  # Tag of the (last) entity.
'los' in ent  # Check if 'los' is in the (last) entity.
'Abraham' in ent  # Is 'Abraham' in the (last) entity?

# List comprehension to get tuples. The first tuple element is the entity
# tag, the second is the full string of the entity text (joined by a space).
[(ent.tag, ' '.join(ent)) for ent in ptext.entities]

# The `pos_tags` attribute queries all the tagged words.
for word, tag in ptext.pos_tags:
    print(word, tag)
def keywords(sent):
    is_noun = lambda pos: pos[:2] == 'NN'
    is_nounp = lambda pos: pos[:2] == 'NNP'
    li_bad = [
        'moderate', 'security', 'update', 'perform', 'failure', 'violation', 'perform',
        'vulnerabilities', 'us', 'end', 'life', 'project', 'Critical', 'privileges',
        'execution', 'Keys', 'account', 'configuration', 'low', 'bug', 'fix',
        'deserialization', 'vulnerability', 'files', 'deviation', 'Version', 'common',
        'findings', 'vulnerabilites', 'ibm', 'spectrum', 'protect', 'storage', 'manager',
        'windows', 'macintosh', 'client', 'commons', 'fileupload', 'managed', 'file',
        'transfer', 'component', 'affects', 'r', 'message', 'headers', 'transmission',
        'channels', 'data', 'error', 'important', 'application', '@', 'secure',
        'creation', 'may', 'tool', 'fixes', 'code', 'guidance', 'side-channel',
        'response', 'spectre', 'meltdown', 'operations', 'center', 'management',
        'service', 'updates', 'critical', 'process', 'designer', 'business',
        'automation', 'workflow', 'speculative', 'store', 'bypass', 'patch', 'live',
        'speculative', 'enhancement', 'Analysis', 'violations', 'scan', 'please',
        'refer', 'description', 'column', 'details', 'medium', 'behavior', 'insecure',
        'permission', 'privilege', 'escalation', ']', 'keys', 'single', 'sign-on',
        'april', 'information', 'disclosure', 'remote', 'confidential', 'escallation',
        '[', 'format', 'multiple', 'path', 'libraries', 'july', 'july', 'circumstances',
        'algorithm', 'password', 'users', 'installation', 'support', 'cheklist', 'kt',
        's/accesses', '/', 'id', 'access', 'managment', 'team', 'ac2', 'identity',
        'connect', 'activity', 'history', 'upgrade', 'patches', 'virtualization', 'new',
        'function', 'propagation', 'flaw', 'certificate', 'extension', 'ipfdressfamily',
        'queue', 'clients', 'channel', 'server', 'entity', 'attack', 'injection',
        'vulnerable', 'external', 'modules', 'buffer', 'impacts', 'announce', 'hosted',
        'validation', 'overwrite', 'tools', 'new', 'admin', 'console', 'packages', 'tm',
        'edition', 'technology', 'health', 'check', 'container', 'platform', 'novell',
        'suse', 'impact', 'bulletin', 'control', 'vulnerabities', 'unsafe', 'driver',
        'party', 'library', 'hardware', 'power', '\x96', 'high', 'space', 'environments',
        'protection', 'performance', 'liberty', 'product', 'denial', 'gateways',
        'appliance', 'caching', 'splitting', 'proxy', 'edge', 'faces', 'affect',
        'source', 'open', 'malformed', 'certficate', 'editor', 'protection', 'forgery',
        'request', 'cross', 'site', 'machine', 'guest', 'key', 'os', 'advisory',
        'generation', 'august', 'june', 'desktop', 'protocol', 'malware', 'engine',
        'elevation', 'variant', 'time', 'implementation', 'midrange', 'applications',
        'servers', 's/accesses', 'contracts', 'online', 'below', 'need', 'activities',
        'compliance', 'policies', 'date', 'target', 'days', 'openssh-unix-announce',
        'softwares', 'pack', 'risk', 'system', 'belongs', 'endpoint', 'issues', 'mar',
        'apr', 'au02uap875ghox2.ahe.au.ibm.com', 'version', 'sk.ibm.com', 'g01cxnp20065',
        'b03zcimq101.boulder.ibm.com', "'s/accesses", '.the', 'port', 'gi_svc_spo_s1',
        'rsk100018054', 'services1bte.pok.ibm.com', 'suport', 'gi_cba_sap_s3',
        'gi_svc_gsc_s1', 'canada', 'ownership', 'revalidation', 'corrección', 'apr2017',
        'exc', 'desviaciones', 'cycle', 'chapter', 'currency', 'ends', 'unsecured',
        'group1', 'analysis', 'protocols', 'g01aciwas062.ahe.pok.ibm.com',
        'b06cxhd0p0230.portsmouth.uk.ibm.com', 'b06cxhd0p0330.portsmouth.uk.ibm.com',
        'b06cxnr0p0231.portsmouth.uk.ibm.com', 'b06cxnr0p0232.portsmouth.uk.ibm.com',
        'analysis', 'scanning-jan-2016', 'protocols', 'g01aciwas062.ahe.pok.ibm.com',
        'instance', 'smallbluep8.pok', 'authorization', 'smay18', 'production',
        'argentina'
    ]
    useful = [
        'flash-plugin', 'qemu-kvm-rhev', 'plexus-archiver', 'chromium-browser',
        'openslp', 'yum-utils', 'kernel-alt', 'firefox', 'vdsm', 'qemu-kvm', 'libvirt',
        'rhevm-setup-plugins', 'rhvm-appliance', 'kernel-rt', 'ghostscript-library',
        'nautilus', 'nagios-nrpe', 'util-linux', 'unixODBC', 'ucode-intel', 'nautilus',
        'AS2', 'OpenSSH', 'AIX'
    ]
    tokenized = nltk.word_tokenize(sent)
    nouns = set(word for (word, pos) in nltk.pos_tag(tokenized)
                if (is_noun(pos) or is_nounp(pos)) and word.lower() not in li_bad)
    # print(nouns)
    keywords_ab = ''
    try:
        text = Text(sent)
        for entity in text.entities:
            if entity.tag == "I-ORG":
                nouns.add(entity)
    except:
        pass
    nouns = list(nouns)
    java = ''
    for j in tokenized:
        if (j[0:4] == 'java' or j[0:8] == 'rh-mysql' or j[0:6] == 'python'
                or j[0:6] == 'xmlrpc' or j[0:6] == 'rh-php' or j[0:10] == 'go-toolset'
                or j[0:7] == 'rh-java' or j[0:8] == 'rh-maven'):
            java = j
    if java not in nouns and java != '':
        nouns.append(java)
    # Iterate over copies so removing items does not skip entries
    for k in list(nouns):
        if (k[0:4] == 'RHSA' or k[0:4] == 'SUSE' or k[0:3] == 'CVE' or k[0:3] == 'SIA'
                or k[0:4] == 'CVE-' or k[0:5] == 'MEESA'):
            nouns.remove(k)
    for k in list(nouns):
        if k[0:3] == 'CVE' or k[0:3] == 'CVE' or k[0:4] == 'bo3z' or k[0:6] == 'AC2_GI':
            nouns.remove(k)
    if 'bug' in tokenized and 'fix' in tokenized:
        nouns.append('bug fix')
    if 'dhcp' in tokenized:
        nouns.append('dhcp')
    for j in tokenized:
        if j in useful and j not in nouns:
            nouns.append(j)
    try:
        if 'SLE' in nouns:
            index = tokenized.index('SLE')
            nouns = [iter.replace('SLE', tokenized[index] + " " + tokenized[index + 1]
                                  + ' ' + tokenized[index + 2]) for iter in nouns]
            nouns.remove(tokenized[index + 2])
    except:
        pass
    return nouns