def predict(link, mode):
    """Classify an article as mostly true (1) or mostly false (0) with the ONNX model.

    Parameters:
        link: a News object (mode "corpus"), a URL string (mode "url"),
              or raw article text (mode "eval").
        mode: one of "corpus", "url", "eval".

    Returns:
        0 if the model predicts 'mostly false', 1 otherwise.

    Raises:
        ValueError: if mode is not one of the three supported values
        (previously this fell through and crashed with NameError on `article`).
    """
    if mode == "corpus":
        # link already is a News object
        article = link
    elif mode == "url":
        # download and parse the page with newspaper3k
        toi_article = Article(link, language="en")
        toi_article.download()
        toi_article.parse()
        toi_article.nlp()
        text = toi_article.text
        article = News("author", text, "links", "orientation", "unk", "title")
    elif mode == "eval":
        # link is the article text itself
        article = News("author", link, "links", "orientation", "unk", "title")
    else:
        raise ValueError("mode must be 'corpus', 'url' or 'eval', got %r" % (mode,))
    article.clean_text()
    text = article.getCleanedText()
    joined_Text = np.array([' '.join(text)])
    # run the ONNX session on the joined text
    output_name = sess.get_outputs()[0].name
    input_name = sess.get_inputs()[0].name
    pred = sess.run([output_name], {input_name: joined_Text})
    print(pred)
    display("prediction = " + str(pred[0]) + " % True", "yellow")
    if pred[0][0] == 'mostly false':
        return (0)
    return (1)
def _readCorpusCsv(path, veracity):
    """Read one corpus CSV (rows shaped title,text,subject,date) into News objects.

    Parameters:
        path: CSV file name.
        veracity: veracity label applied to every article in the file.
    Return: List of News
    """
    news = []
    with open(path, "r") as f:
        # Parse of the CSV
        reader = csv.reader(f, delimiter=',', quotechar='"')
        for index, line in enumerate(reader):
            # The csv is on the shape title,text,subject,date
            text = line[1]
            title = line[0]
            orientation = line[2]
            news.append(News("author", text, "links", orientation, veracity, title))
    return news


def getNewsFromCorpus2():
    """
    Creation with the second corpus (The big one)
    Return: List of News
    """
    # the two files share the exact same layout; only the label differs
    return (_readCorpusCsv("Fake.csv", "mostly false")
            + _readCorpusCsv("True.csv", "mostly true"))
def GetNews():
    """Drain the global url_set: fetch each page, scrape title/keywords/author/date/body
    from its <meta> tags and main div, then persist the article via SaveNews.

    NOTE(review): uses module globals url_set/url_old and helpers News/SaveNews;
    `urllib.urlopen` is the Python 2 API — confirm interpreter version. A missing
    <meta> tag raises AttributeError, which the broad except turns into a skip.
    """
    while len(url_set) != 0:
        try:
            # take the next URL and mark it as visited
            url = url_set.pop()
            url_old.add(url)
            # fetch and parse the page
            article = News.News()
            article.url = url  # URL info
            html = urllib.urlopen(article.url).read()
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            # keywords from <meta name="keywords" content="...">
            keywords = 'keywords'
            res0 = re.compile(keywords)
            if soup.find('meta', {'name': res0}).__getitem__('name') == "keywords":
                article.keywords = soup.find('meta', {'name': res0}).__getitem__('content')
            else:
                article.keywords = ""
            # author from <meta name="author" content="...">
            author = 'author'
            res = re.compile(author)
            if soup.find('meta', {'name': res}).__getitem__('name') == "author":
                article.author = soup.find('meta', {'name': res}).__getitem__('content')
            else:
                article.author = ""
            # publish date from <meta name="publishdate" content="...">
            published_time = 'publishdate'
            res1 = re.compile(published_time)
            if soup.find('meta', {'name': res1}).__getitem__('name') == "publishdate":
                article.date = soup.find('meta', {'name': res1}).__getitem__('content')
            else:
                article.date = ""
            # article body text
            article.content = soup.find('div', {'class': 'main_text'}).get_text()
            SaveNews(article)
        except Exception as e:
            # any scraping failure skips this URL and moves on
            print(e)
            continue
def GetNews(url): global NewsCount, MaxNewsCount #全局记录新闻数量 while len(url_set) != 0: try: # 获取链接 url = url_set.pop() url_old.add(url) # 获取代码 html = urllib2.urlopen(url).read().decode('utf8') # 解析 soup = bs4.BeautifulSoup(html, 'html.parser') pattern = 'http://\w+\.baijia\.baidu\.com/article/\w+' # 链接匹配规则 links = soup.find_all('a', href=re.compile(pattern)) # 获取URL for link in links: if link['href'] not in url_old: url_set.add(link['href']) # 获取信息 article = News.News() article.url = url # URL信息 page = soup.find('div', {'id': 'page'}) article.title = page.find('h1').get_text() # 标题信息 info = page.find('div', {'class': 'article-info'}) article.author = info.find('a', { 'class': 'name' }).get_text() # 作者信息 article.date = info.find('span', { 'class': 'time' }).get_text() # 日期信息 article.about = page.find('blockquote').get_text() pnode = page.find('div', { 'class': 'article-detail' }).find_all('p') article.content = '' for node in pnode: # 获取文章段落 article.content += node.get_text() + '\n' # 追加段落信息 SaveNews(article) print NewsCount break except Exception as e: print(e) continue else: print(article.title) NewsCount += 1 finally: # 判断数据是否收集完成 if NewsCount == MaxNewsCount: break
def create_newsObject(titles, dates, details, company, links, newsItems):
    """
    Build a News object from the given fields, append it to newsItems,
    and return the (mutated) newsItems list.
    """
    print("creating news objects list...")
    # instantiate the news item and accumulate it on the caller's list
    item = News.News(titles, dates, details, company, links)
    newsItems.append(item)
    return newsItems
def GetNews():
    """Drain the global url_set: fetch each page (gbk-encoded site), scrape
    title/keywords/author/date and the '.neirong' body, then persist via SaveNews.

    NOTE(review): uses module globals url_set/url_old and helpers News/SaveNews;
    `urllib.urlopen` is the Python 2 API — confirm interpreter version. The page
    is decoded from gbk and re-encoded to utf8 before parsing.
    """
    while len(url_set) != 0:
        try:
            # take the next URL and mark it as visited
            url = url_set.pop()
            url_old.add(url)
            # fetch the page and normalize its encoding gbk -> utf8
            article = News.News()
            article.url = url  # URL info
            html = urllib.urlopen(article.url).read()
            html = html.decode('gbk')
            html = html.encode('utf8')
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            # keywords from <meta name="KEYWords" content="..."> (site uses this odd casing)
            keywords = 'KEYWords'
            res0 = re.compile(keywords)
            if soup.find('meta', {
                    'name': res0
            }).__getitem__('name') == "KEYWords":
                article.keywords = soup.find('meta', {
                    'name': res0
                }).__getitem__('content')
            else:
                article.keywords = ""
            # author from <meta name="Author" content="...">
            author = 'Author'
            res = re.compile(author)
            if soup.find('meta', {
                    'name': res
            }).__getitem__('name') == "Author":
                article.author = soup.find('meta', {
                    'name': res
                }).__getitem__('content')
            else:
                article.author = ""
            # publish date from <meta name="publishdate" content="...">
            published_time = 'publishdate'
            res1 = re.compile(published_time)
            if soup.find('meta', {
                    'name': res1
            }).__getitem__('name') == "publishdate":
                article.date = soup.find('meta', {
                    'name': res1
                }).__getitem__('content')
            else:
                article.date = ""
            # article body: first element with class "neirong" (Chinese for "content")
            content = soup.select('.neirong')
            article.content = content[0].text
            SaveNews(article)
        except Exception as e:
            # any scraping failure skips this URL and moves on
            print(e)
            continue
def getNewsFromCorpus3():
    """Load the third corpus (Corpus4.csv, rows shaped text,veracity) into News objects.

    Return: List of News (author/links are placeholders; title and orientation are empty).
    """
    articles = []
    with open("Corpus4.csv", "r") as handle:
        # parse the CSV; column 0 is the article text, column 1 the veracity label
        reader = csv.reader(handle, delimiter=',', quotechar='"')
        for row in reader:
            articles.append(News("author", row[0], "links", "", row[1], ""))
    return articles
def GetNews():
    """Drain the global url_set: fetch each page, scrape title/keywords, take the
    article body from <meta name="Description">, the author from '.qq_editor',
    and persist via SaveNews.

    NOTE(review): uses module globals url_set/url_old and helpers News/SaveNews;
    `urllib.urlopen` is the Python 2 API — confirm interpreter version.
    """
    while len(url_set) != 0:
        try:
            # take the next URL and mark it as visited
            url = url_set.pop()
            url_old.add(url)
            # fetch and parse the page
            article = News.News()
            article.url = url  # URL info
            html = urllib.urlopen(article.url).read()
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            # keywords from <meta name="keywords" content="...">
            keywords = 'keywords'
            res0 = re.compile(keywords)
            if soup.find('meta', {
                    'name': res0
            }).__getitem__('name') == "keywords":
                article.keywords = soup.find('meta', {
                    'name': res0
                }).__getitem__('content')
            else:
                article.keywords = ""
            # body/summary from <meta name="Description" content="...">
            content = 'Description'
            res1 = re.compile(content)
            if soup.find('meta', {
                    'name': res1
            }).__getitem__('name') == "Description":
                article.content = soup.find('meta', {
                    'name': res1
                }).__getitem__('content')
            else:
                article.content = ""
            # author: first element with class "qq_editor"
            author = soup.select('.qq_editor')
            article.author = author[0].text
            SaveNews(article)
        except Exception as e:
            # any scraping failure skips this URL and moves on
            print(e)
            continue
def getNewsFromXML(link):
    """
    Transform an article of the first news (xml) corpus into a news object.

    Parameters:
        link : Name of the file (must be in the articles folder)
    """
    # Convert the xml file into a python dictionary
    article = schema.to_dict(link)
    # Pull each field out of the dict, falling back to a default when absent
    author = article.get('author', "Anonyme")
    mainText = article.get('mainText', "empty")
    hyperlink = article.get('hyperlink', [])
    orientation = article.get('orientation', "default")
    title = article.get('title', "default")
    # Possible values: mostly true / mixture of true and false / mostly false / no factual content.
    # For the needs of the learning, 'mixture of true and false' is folded into 'mostly false'.
    veracity = article.get('veracity', "default")
    if veracity == 'mixture of true and false':
        veracity = "mostly false"
    # Build and return the News instance
    return News(author, mainText, hyperlink, orientation, veracity, title)
def GetNews(url, i):
    """Fetch one news page, regex-extract its <h2> titles and '.article' divs,
    and write them to '/tmp/luo/news <i>.txt'.

    Parameters:
        url: page URL to fetch.
        i: index used in the output file name.

    Fixes: the output file was opened without a with-statement, so the handle
    leaked if write() raised; response.close() now runs in finally so the HTTP
    connection is released on every path.
    """
    response = requests.get(url)
    html = response.text
    article = News.News()
    try:
        article.title = re.findall(r'<h2 id=".*?">(.+?)</h2>', html)
        article.content = re.findall(r'<div class="article">([\w\W]*?)</div>', html)
        t = ""
        for j in article.title:
            t += str('标题:' + j + '\n')
        c = ""
        for m in article.content:
            c += str(m)
        # collapse runs of ideographic spaces into newlines
        article.content1 = ' ' + '\n'.join(c.split(' ')).strip()
        # with-statement guarantees the file is closed even if write() raises
        with codecs.open('/tmp/luo/news ' + str(i) + '.txt', 'w+') as out:
            out.write(t + "\t" + article.content1)
        print('ok')
    except Exception as e:
        print('Error1:', e)
    finally:
        # always release the HTTP connection
        response.close()
def GetNewsEs(url):
    """Parse an Eastmoney news/report page into a News.News object with a list
    of News.Section paragraphs.

    Two page layouts are recognized by the nesting of two divs:
    - report style: newsContent nested inside ContentBody
    - news style:   ContentBody nested inside newsContent

    NOTE(review): `raise '...'` raises a str — a TypeError under Python 3;
    should be e.g. `raise ValueError(...)`.
    NOTE(review): the final `secContent != '' or secContent != ''` is a
    tautology whenever secContent is non-empty and checks secContent twice —
    presumably meant `secTitle != '' or secContent != ''` (mirrors the flush
    inside the loop); verify against intended behavior.
    """
    soup = News.GetSoup(url, 'lxml')
    # basic page structure
    newsContent = soup.body.select('div[class="newsContent"]')[0]
    contentBody = soup.body.select('div[id="ContentBody"]')[0]
    if newsContent.parent == contentBody:
        # research-report style page
        title = soup.body.select(
            'div[class="report-title"]')[0].h1.text.strip()
        newsInfo = soup.body.select('div[class="report-infos"]')[0]
        # date format is localized, e.g. "2020年01月02日 09:30"
        time = dt.datetime.strptime(newsInfo.contents[3].text.strip(),
                                    '%Y年%m月%d日 %H:%M')
        source = newsInfo.contents[5].text.strip(
        ) + ' ' + newsInfo.contents[7].text.strip()
        abstract = ''
        newsBody = newsContent
    elif contentBody.parent == newsContent:
        # regular news style page
        title = newsContent.h1.text.strip()
        newsInfo = newsContent.select('div[class="Info"]')[0]
        newsBody = contentBody
        time = dt.datetime.strptime(
            newsInfo.select('div[class="time"]')[0].text.strip(),
            '%Y年%m月%d日 %H:%M')
        source = newsInfo.img['alt'] if newsInfo.img is not None else ''
        absTagList = newsBody.select('div[class="b-review"]')
        if len(absTagList) == 0:
            abstract = ''
        else:
            abstract = absTagList[0].text.strip()
    else:
        raise 'Unknown page style: url = ' + url
    sectionList = []
    news = News.News(url, time, title, source, abstract, '', sectionList)
    # split the body into (title, content) sections
    secTitle = ''
    secContent = ''
    for c in newsBody.contents:
        if c.name == 'p' and len(c.attrs) == 0:
            # Section titles and body text both live in attribute-less <p> tags.
            # Title heuristic: the whole <p> text is bold — it contains a <strong>
            # with no <span> child and no non-blank text outside the <strong>.
            if c.strong is not None and c.strong.span is None:
                isTitle = True
                for cc in c.contents:
                    if not (isinstance(cc, str) and cc.strip() == ''
                            or cc.name == 'strong'):
                        isTitle = False
                        break
                if c.strong.text.strip() == c.text.strip():
                    isTitle = True
                if isTitle:
                    # a new title starts a new section: flush the previous one
                    if secTitle != '' or secContent != '':
                        sectionList.append(
                            News.Section(secTitle, secContent, news, url,
                                         len(sectionList)))
                        secTitle = ''
                        secContent = ''
                    secTitle = c.text.strip()
                    continue
            # Body heuristic: at least one non-blank text node directly inside
            # the <p> (not inside a child tag)
            for cc in c.contents:
                if isinstance(cc, str) and cc.strip() != '':
                    secContent += c.text + os.linesep
                    break
    # flush the trailing section (see review note on this condition)
    if secContent != '' or secContent != '':
        sectionList.append(
            News.Section(secTitle, secContent, news, url, len(sectionList)))
    return news
# NOTE(review): this is a fragment — it appears to be the interior of an
# enclosing loop/function that is not visible here, and it ends abruptly
# (`keywords` is selected but never used). Presumably `url` iterates a list
# of news-index pages and `url_old` is the visited set — confirm in caller.
if url not in url_old:
    # fetch the news-index page as utf-8 text
    wbdata = requests.get(url)
    wbdata.encoding = 'utf-8'
    #print(wbdata.text)
    # parse the fetched text
    soup = BeautifulSoup(wbdata.text, 'lxml')
    #print(soup.text)
    # locate every news entry via the .news-item selector (returns a list)
    news_titles = soup.select('.news-item')
    # walk the entries and scrape each article
    for n in news_titles:
        # only entries that carry an <h2> are real articles
        h2 = n.select('h2')
        if len(h2) > 0:
            article = News.News()
            title = h2[0].text  # title text
            article.title = title.encode("utf-8")
            #print("title:" + article.title)
            article.url = h2[0].select('a')[0]['href']  # article URL
            #print("url:" + article.url)
            date = n.select('.time')[0].text  # publish date
            article.date = date.encode("utf-8")
            #print("date:" + article.date)
            # follow the article link and scrape its body
            wbdata0 = requests.get(article.url)
            wbdata0.encoding = 'utf-8'
            soup0 = BeautifulSoup(wbdata0.text, 'lxml')
            content = soup0.select('.article')
            article.content = content[0].text
            # print("content:" + article.content)
            # NOTE(review): result unused — fragment looks truncated here
            keywords = soup0.select('.keywords')
def main():
    """Fetch the top headlines into a News object and pull out its article list."""
    fetcher = News.News()
    fetcher.grab_article_title_descrip(**top_headlines)
    # rebind the local to the scraped articles (value is discarded at return)
    news = fetcher.newsArticles
def manage_utils(bot, text, author_id, thread_id):
    """Dispatch one chat message: data deletion, a random joke, help text,
    the class plan, news headlines, or the average grade.

    Parameters:
        bot: messenger client used to send replies.
        text: the user's message.
        author_id: used to look up stored credentials.
        thread_id: conversation to reply into.

    Returns -1 only from the (unreachable) fallback of the joke branch;
    otherwise None.

    Fixes: `x is 0` … `x is 10` identity comparisons replaced with `==`
    (identity of small ints is a CPython implementation detail and a
    SyntaxWarning on modern Python); `Utils.wantToGetPlan(text)` is now
    evaluated once instead of up to ten times.
    """
    login = Utils.getLogin(author_id)
    password = Utils.getPassword(author_id)
    plan = Plan(login, password)
    news = News(login, password)
    avggrade = AvgGrade(login, password)
    # user data deletion
    if Utils.wantToDeleteData(text):
        Utils.delete_my_data(author_id)
        bot.send(Message(text='Kim Ty jesteś?'), thread_id=thread_id)
    # fun facts
    elif Utils.wantToHearFunFact(text):
        x = random.randint(0, 10)
        if x == 0:
            bot.send(Message(text='Jakie papierosy palą studenci EE?'),
                     thread_id=thread_id)
            bot.send(Message(text='Elektryczne!'), thread_id=thread_id)
        elif x == 1:
            bot.send(Message(
                text=
                'Na lekcji programowania obiektowego student łapie koleżankę obok za pierś. Na to ona: „To prywatne!!!”, a on odpowiada: „Myślałem że jesteśmy w tej samej klasie :D „'
            ), thread_id=thread_id)
        elif x == 2:
            bot.send(Message(
                text=
                'Javoviec jakimś cudem spłodził dziecko. Miał wymyślić imię dla dziecka. Na wszelki wypadek przygotował 2, jakby urodziły się bliźniaki. Na nieszczęście urodziły się trojaczki i dostały imiona: Jaś, Staś, ArrayIndexOutOfBoundsException'
            ), thread_id=thread_id)
        elif x == 3:
            bot.send(Message(
                text=
                'Spotyka się dwóch programistów:\n– Słyszałem, że straciłeś pracę. Jak to jest być bezrobotnym?\n– To było najgorsze pół godziny mojego życia!'
            ), thread_id=thread_id)
        elif x == 4:
            bot.send(Message(
                text=
                'Doktorze, każdej nocy śni mi się jeden i ten sam koszmar. Jestem na Antarktydzie a wokół pełno pingwinów. I ciągle przybywają i przybywają. Zbliżają się do mnie, napierają na mnie, przepychają mnie do urwiska i za każdym razem spychają mnie do lodowatej wody.\n– Normalnie leczymy takie przypadki w jeden dzień. Ale z Panem możemy mieć większe problemy, Panie Gates…'
            ), thread_id=thread_id)
        elif x == 5:
            bot.send(Message(
                text=
                'Jadą samochodem 3 koledzy i jeden z nich był programistą. Samochód się psuje, pasażerowie siedzą w środku i dywagują: świece, rozrusznik, benzyna, skończył sie olej… Nagle programista mówi: a może wyjdźmy z samochodu poczekajmy chwilę i potem wejdźmy :D'
            ), thread_id=thread_id)
        elif x == 6:
            bot.send(Message(
                text=
                'Z programowaniem jak z budową katedry, budujesz,budujesz a potem się modlisz (żeby wszystko działało)'
            ), thread_id=thread_id)
        elif x == 7:
            bot.send(Message(
                text=
                'Programista otwiera lodówkę, sięga po masło i patrząc na napis „82%” mówi:\n– a to jeszcze chwilka i będzie gotowe.'
            ), thread_id=thread_id)
        elif x == 8:
            bot.send(Message(
                text=
                'Na świecie jest 10 rodzajów ludzi: ci, którzy rozumieją system binarny i ci, którzy go nie rozumieją.'
            ), thread_id=thread_id)
        elif x == 9:
            bot.send(Message(
                text=
                'Żona do programisty: idź do sklepu kup 5 bułek, a jak będą jajka kup 10.\nProgramista będąc w sklepie: – Są jajka?\nSprzedawczyni: – Tak, są.\nProgramista: To poproszę 10 bułek.'
            ), thread_id=thread_id)
        elif x == 10:
            bot.send(Message(
                text=
                'Dlaczego programiści mylą Boże Narodzenie z Halloween ?\nBo 25 Dec = 31 Oct'
            ), thread_id=thread_id)
        else:
            # unreachable: randint(0, 10) is always handled above
            return -1
    # helpdesk
    elif Utils.needHelp(text):
        bot.send(Message(
            text=
            'Oto lista dostępnych poleceń po wykonanej autoryzacji:\n średnia - podaje średnią na semestr\n następne zajęcia - podaje najbliższe chronologicznie zajęcia\n usuń - polecenie usuwa dane użytkownika z systemu.\n plan <dzień tygodnia> - polecenie wyświetli plan na podany dzień tygodnia.\n aktualności - polecenie wyświetla nagłówki 5 ostatnich aktualności.\n żart - polecenie wyświetla losowo wybrany z systemu żart.\n pomóż - polecenie wyświetla ten komunikat.'
        ), thread_id=thread_id)
    else:
        # plan section — evaluate the intent classifier once
        plan_choice = Utils.wantToGetPlan(text)
        if plan_choice in (1, 2, 3, 4, 5):
            # weekdays map directly to the daily plan
            bot.send(Message(text=plan.get_plan_daily(plan_choice)),
                     thread_id=thread_id)
        elif plan_choice == 6:
            bot.send(Message(text='W weekend nie masz zajęć :)'),
                     thread_id=thread_id)
        elif plan_choice == 7:
            # whole week: one message per day
            weekplan = plan.get_plan_weekly()
            for i in weekplan:
                bot.send(Message(text=i), thread_id=thread_id)
        elif plan_choice == 8:
            bot.send(Message(text=plan.get_next_class()), thread_id=thread_id)
        elif plan_choice == -1:
            bot.send(Message(
                text=
                'Może to ja niedomagam, ale nie wiem na kiedy chcesz ten plan. Wyrażaj się jaśniej proszę'
            ), thread_id=thread_id)
        # News section
        elif Utils.wantToGetNews(text) == True:
            news_list = news.getlastnews()
            for obj in news_list:
                bot.send(Message(text=obj), thread_id=thread_id)
        elif Utils.wantToGetAvgGrade(text):
            grade = avggrade.getAvgGrade()
            bot.send(Message(text=grade.text), thread_id=thread_id)
        else:
            Utils.messageNotRecognized(bot, thread_id)