Example #1
def predict(link, mode):

    if mode == "corpus":
        article = link
    if mode == "url":
        toi_article = Article(link, language="en")
        toi_article.download()
        toi_article.parse()
        toi_article.nlp()
        text = toi_article.text
        article = News("author", text, "links", "orientation", "unk", "title")
    if mode == "eval":
        article = News("author", link, "links", "orientation", "unk", "title")
    article.clean_text()
    text = article.getCleanedText()
    joined_text = np.array([' '.join(text)])

    # Run the ONNX model on the cleaned, joined text.
    output_name = sess.get_outputs()[0].name
    input_name = sess.get_inputs()[0].name
    pred = sess.run([output_name], {input_name: joined_text})
    print(pred)

    display("prediction = " + str(pred[0]) + " % True", "yellow")
    if pred[0][0] == 'mostly false':
        return 0
    return 1
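
predict() relies on a module-level ONNX Runtime session named sess, whose get_inputs()/get_outputs()/run() calls match the onnxruntime InferenceSession API. A minimal sketch of how such a session could be created (the model filename here is hypothetical):

import onnxruntime as ort

# Hypothetical model file; the snippet does not show where `sess` comes from.
sess = ort.InferenceSession("fake_news_model.onnx")
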
Example #2
def getNewsFromCorpus2():
    """
    Creation with the second corpus (The big one)
    Return:
        List of News
    """
    news = []
    with open("Fake.csv", "r") as f:
        # Parse the CSV
        file = csv.reader(f, delimiter=',', quotechar='"')

        for index, line in enumerate(file):
            # Each CSV row has the shape: title,text,subject,date
            text = line[1]
            title = line[0]
            orientation = line[2]
            article = News("author", text, "links", orientation,
                           "mostly false", title)
            news.append(article)
    # Same for the 'True' file
    with open("True.csv", "r") as f:
        file = csv.reader(f, delimiter=',', quotechar='"')
        for index, line in enumerate(file):
            # Each CSV row has the shape: title,text,subject,date
            text = line[1]
            title = line[0]
            orientation = line[2]
            article = News("author", text, "links", orientation, "mostly true",
                           title)
            news.append(article)

    return news
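
These corpus loaders all construct News objects, but the class itself is never shown. A minimal stub inferred from the call sites above and in Example #1 (every attribute and method body here is an assumption, not the project's real implementation):

class News:
    # Hypothetical stub matching calls such as
    # News("author", text, "links", orientation, "mostly false", title).
    def __init__(self, author, text, links, orientation, veracity, title):
        self.author = author
        self.text = text
        self.links = links
        self.orientation = orientation
        self.veracity = veracity
        self.title = title

    def clean_text(self):
        # Placeholder for the real preprocessing, which is not shown.
        self.cleaned = self.text.lower().split()

    def getCleanedText(self):
        return self.cleaned
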
Example #3
def GetNews():
    while len(url_set) != 0:
        try:
            # Pop the next URL from the frontier
            url = url_set.pop()
            url_old.add(url)
            # Fetch and parse the page
            article = News.News()
            article.url = url  # source URL
            html = urllib.urlopen(article.url).read()
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            res0 = re.compile('keywords')
            if soup.find('meta', {'name': res0})['name'] == "keywords":
                article.keywords = soup.find('meta', {'name': res0})['content']  # keywords
            else:
                article.keywords = ""
            res = re.compile('author')
            if soup.find('meta', {'name': res})['name'] == "author":
                article.author = soup.find('meta', {'name': res})['content']  # author
            else:
                article.author = ""
            res1 = re.compile('publishdate')
            if soup.find('meta', {'name': res1})['name'] == "publishdate":
                article.date = soup.find('meta', {'name': res1})['content']  # publish date
            else:
                article.date = ""
            article.content = soup.find('div', {'class': 'main_text'}).get_text()
            SaveNews(article)
        except Exception as e:
            print(e)
            continue
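
soup.find() returns None when a meta tag is missing, so the chained lookups above survive only because of the enclosing try/except. A None-safe helper could replace each if/else block (the helper name is mine, not from the project):

def get_meta(soup, name):
    # Return the content attribute of the <meta name=...> tag, or "" if absent.
    tag = soup.find('meta', {'name': name})
    return tag.get('content', '') if tag is not None else ''

With it, each block above collapses to a single line such as article.keywords = get_meta(soup, 'keywords').
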
Example #4
def GetNews(url):
    global NewsCount, MaxNewsCount  # module-level news counters
    while len(url_set) != 0:
        try:
            # Pop the next URL from the frontier
            url = url_set.pop()
            url_old.add(url)

            # Fetch the page source
            html = urllib2.urlopen(url).read().decode('utf8')

            # Parse
            soup = bs4.BeautifulSoup(html, 'html.parser')
            pattern = r'http://\w+\.baijia\.baidu\.com/article/\w+'  # article-link pattern
            links = soup.find_all('a', href=re.compile(pattern))

            # Collect newly discovered URLs
            for link in links:
                if link['href'] not in url_old:
                    url_set.add(link['href'])

                    # Extract article info
                    article = News.News()
                    article.url = url  # source URL
                    page = soup.find('div', {'id': 'page'})
                    article.title = page.find('h1').get_text()  # title
                    info = page.find('div', {'class': 'article-info'})
                    article.author = info.find('a', {
                        'class': 'name'
                    }).get_text()  # author
                    article.date = info.find('span', {
                        'class': 'time'
                    }).get_text()  # date
                    article.about = page.find('blockquote').get_text()
                    pnode = page.find('div', {
                        'class': 'article-detail'
                    }).find_all('p')
                    article.content = ''
                    for node in pnode:  # iterate over the article paragraphs
                        article.content += node.get_text() + '\n'  # append paragraph text

                    SaveNews(article)

                    print(NewsCount)
                    break
        except Exception as e:
            print(e)
            continue
        else:
            print(article.title)
            NewsCount += 1
        finally:
            # Stop once enough articles have been collected
            if NewsCount == MaxNewsCount:
                break
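
Both crawler variants depend on module-level state (url_set, url_old, NewsCount, MaxNewsCount) that the snippets never define. A plausible setup, with a made-up seed URL chosen to match the link pattern above:

url_set = {'http://baijia.baidu.com/'}  # frontier of URLs still to visit (hypothetical seed)
url_old = set()                         # URLs already visited
NewsCount = 0                           # articles collected so far
MaxNewsCount = 100                      # assumed stopping threshold
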
Example #5
def create_newsObject(titles, dates, details, company, links, newsItems):
    """
    Returns a list of news objects

    This method uses the titles, date, details, company and links
    to instantiate a news object and append it to the list newspieces
    """
    print("creating news objects list...")
    #create a news object and add it to the newsItems list
    newspiece = News.News(titles, dates, details, company, links)
    newsItems.append(newspiece)
    return newsItems
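
A hypothetical call, showing how the same list accumulates across invocations (all argument values are made up):

newsItems = []
newsItems = create_newsObject("Sample title", "2020-01-01", "Some details",
                              "Example Co", "http://example.com", newsItems)
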
Example #6
def GetNews():
    while len(url_set) != 0:
        try:
            # Pop the next URL from the frontier
            url = url_set.pop()
            url_old.add(url)
            # Fetch and parse the page (GBK pages are transcoded to UTF-8)
            article = News.News()
            article.url = url  # source URL
            html = urllib.urlopen(article.url).read()
            html = html.decode('gbk')
            html = html.encode('utf8')
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            res0 = re.compile('KEYWords')
            if soup.find('meta', {'name': res0})['name'] == "KEYWords":
                article.keywords = soup.find('meta', {'name': res0})['content']  # keywords
            else:
                article.keywords = ""
            res = re.compile('Author')
            if soup.find('meta', {'name': res})['name'] == "Author":
                article.author = soup.find('meta', {'name': res})['content']  # author
            else:
                article.author = ""
            res1 = re.compile('publishdate')
            if soup.find('meta', {'name': res1})['name'] == "publishdate":
                article.date = soup.find('meta', {'name': res1})['content']  # publish date
            else:
                article.date = ""
            content = soup.select('.neirong')
            article.content = content[0].text
            SaveNews(article)
        except Exception as e:
            print(e)
            continue
Example #7
def getNewsFromCorpus3():
    news = []
    with open("Corpus4.csv", "r") as f:
        # Parse the CSV
        file = csv.reader(f, delimiter=',', quotechar='"')

        for index, line in enumerate(file):
            # Each CSV row has the shape: text,label
            text = line[0]
            title = ""
            orientation = ""
            article = News("author", text, "links", orientation, line[1],
                           title)
            news.append(article)
    return news
Example #8
def GetNews():
    while len(url_set) != 0:
        try:
            # Pop the next URL from the frontier
            url = url_set.pop()
            url_old.add(url)
            # Fetch and parse the page
            article = News.News()
            article.url = url  # source URL
            html = urllib.urlopen(article.url).read()
            soup = bs4.BeautifulSoup(html, 'html.parser')
            article.title = soup.find('title').get_text()  # page title
            res0 = re.compile('keywords')
            if soup.find('meta', {'name': res0})['name'] == "keywords":
                article.keywords = soup.find('meta', {'name': res0})['content']  # keywords
            else:
                article.keywords = ""
            res1 = re.compile('Description')
            if soup.find('meta', {'name': res1})['name'] == "Description":
                article.content = soup.find('meta', {'name': res1})['content']  # description used as content
            else:
                article.content = ""
            author = soup.select('.qq_editor')
            article.author = author[0].text  # editor byline used as author
            SaveNews(article)
        except Exception as e:
            print(e)
            continue
Example #9
def getNewsFromXML(link):
    """
    Transform an article of the first news (xml) corpus into a news object.
    Parameters:
        link : Name of the file (must be in the articles folder)
    """
    # Convert the XML into a Python dictionary
    article = schema.to_dict(link)
    # Default values, used when a field is missing from the XML
    author = "Anonyme"
    mainText = "empty"
    hyperlink = []
    orientation = "default"
    # Possible values: mostly true / mixture of true and false / mostly false / no factual content
    # For training purposes, 'mixture of true and false' is mapped to 'mostly false'.
    veracity = "default"
    title = "default"

    # Pull every available field out of the XML
    if 'author' in article:
        author = article['author']
    if 'mainText' in article:
        mainText = article['mainText']
    if 'hyperlink' in article:
        hyperlink = article['hyperlink']
    if 'orientation' in article:
        orientation = article['orientation']
    if 'veracity' in article:
        if article['veracity'] == 'mixture of true and false':
            veracity = "mostly false"
        else:
            veracity = article['veracity']
    if 'title' in article:
        title = article['title']
    # Creation of a News instance
    newsInstance = News(author, mainText, hyperlink, orientation, veracity,
                        title)
    return newsInstance
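
schema.to_dict(link) matches the API of the xmlschema package; if that is what the project uses, the module-level schema object would be built roughly as follows (the .xsd filename is hypothetical):

import xmlschema

# Hypothetical schema file; the snippet never shows how `schema` is created.
schema = xmlschema.XMLSchema("articles/article.xsd")
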
Example #10
def GetNews(url, i):
    response = requests.get(url)
    html = response.text
    article = News.News()
    try:
        article.title = re.findall(r'<h2 id=".*?">(.+?)</h2>', html)
        article.content = re.findall(r'<div class="article">([\w\W]*?)</div>',
                                     html)

        t = ""
        for j in article.title:
            t += str('标题:' + j + '\n')
        c = ""
        for m in article.content:
            c += str(m)
        article.content1 = ' ' + '\n'.join(c.split(' ')).strip()

        file = codecs.open('/tmp/luo/news ' + str(i) + '.txt', 'w+', encoding='utf-8')
        file.write(t + "\t" + article.content1)
        file.close()
        print('ok')
    except Exception as e:
        print('Error1:', e)
    response.close()
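
A hypothetical driver for this function; note that the hard-coded output directory must exist before the file is opened (the URL list here is made up):

import os

os.makedirs('/tmp/luo', exist_ok=True)  # create the output directory up front
urls = ['http://example.com/news/0', 'http://example.com/news/1']
for i, url in enumerate(urls):
    GetNews(url, i)
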
Example #11
File: NewsEs.py, Project: GongCQ/Text
def GetNewsEs(url):
    soup = News.GetSoup(url, 'lxml')

    # Extract the page's basic metadata
    newsContent = soup.body.select('div[class="newsContent"]')[0]
    contentBody = soup.body.select('div[id="ContentBody"]')[0]
    if newsContent.parent == contentBody:  # research-report layout
        title = soup.body.select(
            'div[class="report-title"]')[0].h1.text.strip()
        newsInfo = soup.body.select('div[class="report-infos"]')[0]
        time = dt.datetime.strptime(newsInfo.contents[3].text.strip(),
                                    '%Y年%m月%d日 %H:%M')
        source = newsInfo.contents[5].text.strip(
        ) + ' ' + newsInfo.contents[7].text.strip()
        abstract = ''
        newsBody = newsContent
    elif contentBody.parent == newsContent:  # news-article layout
        title = newsContent.h1.text.strip()
        newsInfo = newsContent.select('div[class="Info"]')[0]
        newsBody = contentBody
        time = dt.datetime.strptime(
            newsInfo.select('div[class="time"]')[0].text.strip(),
            '%Y年%m月%d日 %H:%M')
        source = newsInfo.img['alt'] if newsInfo.img is not None else ''
        absTagList = newsBody.select('div[class="b-review"]')
        if len(absTagList) == 0:
            abstract = ''
        else:
            abstract = absTagList[0].text.strip()
    else:
        raise ValueError('Unknown page style: url = ' + url)
    sectionList = []
    news = News.News(url, time, title, source, abstract, '', sectionList)

    # Split the body into titled sections
    secTitle = ''
    secContent = ''
    for c in newsBody.contents:
        if c.name == 'p' and len(c.attrs) == 0:  # section titles and body text both live in attribute-less <p> tags
            # Title test: the whole <p> text is bold, i.e. it has a <strong> child,
            # no non-whitespace text outside <strong>, and no <span> grandchild
            if c.strong is not None and c.strong.span is None:
                isTitle = True
                for cc in c.contents:
                    if not (isinstance(cc, str) and cc.strip() == ''
                            or cc.name == 'strong'):
                        isTitle = False
                        break
                if c.strong.text.strip() == c.text.strip():
                    isTitle = True
                if isTitle:  # a new title starts a new section; store the accumulated one first
                    if secTitle != '' or secContent != '':
                        sectionList.append(
                            News.Section(secTitle, secContent, news, url,
                                         len(sectionList)))
                    secTitle = c.text.strip()
                    secContent = ''
                    continue
            # Body test: the <p> directly contains at least one non-whitespace text node (not inside a child tag)
            for cc in c.contents:
                if isinstance(cc, str) and cc.strip() != '':
                    secContent += c.text + os.linesep
                    break
    if secTitle != '' or secContent != '':
        sectionList.append(
            News.Section(secTitle, secContent, news, url, len(sectionList)))
    return news
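
News.GetSoup is a project helper that is not shown here; judging by its call site it fetches a URL and returns a parsed tree. A sketch of an equivalent, with the behavior assumed rather than taken from the project:

import requests
import bs4

def GetSoup(url, parser):
    # Assumed behavior: fetch the page and return a BeautifulSoup tree.
    resp = requests.get(url)
    resp.encoding = resp.apparent_encoding  # let requests guess the charset
    return bs4.BeautifulSoup(resp.text, parser)
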
Example #12
 if url not in url_old:
     # Request the news index URL and read its text
     wbdata = requests.get(url)
     wbdata.encoding = 'utf-8'
     # Parse the fetched text
     soup = BeautifulSoup(wbdata.text, 'lxml')
     # Locate the news items with a CSS selector; returns a list
     news_titles = soup.select('.news-item')
     # Iterate over the returned list
     for n in news_titles:
         # Extract article info
         h2 = n.select('h2')
         if len(h2) > 0:
             article = News.News()
             title = h2[0].text  # title
             article.title = title.encode("utf-8")
             article.url = h2[0].select('a')[0]['href']  # URL
             date = n.select('.time')[0].text  # date
             article.date = date.encode("utf-8")
             wbdata0 = requests.get(article.url)
             wbdata0.encoding = 'utf-8'
             soup0 = BeautifulSoup(wbdata0.text, 'lxml')
             content = soup0.select('.article')
             article.content = content[0].text
             keywords = soup0.select('.keywords')
Example #13
def main():
    news = News.News()
    news.grab_article_title_descrip(**top_headlines)
    news = news.newsArticles
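
top_headlines is passed as keyword arguments but never defined in the snippet; its shape suggests a NewsAPI-style top-headlines query. A purely hypothetical example:

# Hypothetical query parameters; the real ones are not shown.
top_headlines = {'q': 'technology', 'language': 'en', 'country': 'us'}
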
Example #14
    def manage_utils(bot, text, author_id, thread_id):

        login = Utils.getLogin(author_id)
        password = Utils.getPassword(author_id)
        plan = Plan(login, password)
        news = News(login, password)
        avggrade = AvgGrade(login, password)

        # user data deletion

        if Utils.wantToDeleteData(text):
            Utils.delete_my_data(author_id)
            bot.send(Message(text='Kim Ty jesteś?'), thread_id=thread_id)

        # fun facts
        elif Utils.wantToHearFunFact(text):
            x = random.randint(0, 10)
            if x == 0:
                bot.send(Message(text='Jakie papierosy palą studenci EE?'),
                         thread_id=thread_id)
                bot.send(Message(text='Elektryczne!'), thread_id=thread_id)
            elif x == 1:
                bot.send(Message(
                    text=
                    'Na lekcji programowania obiektowego student łapie koleżankę obok za pierś. Na to ona: „To prywatne!!!”, a on odpowiada: „Myślałem że jesteśmy w tej samej klasie :D „'
                ),
                         thread_id=thread_id)
            elif x == 2:
                bot.send(Message(
                    text=
                    'Javoviec jakimś cudem spłodził dziecko. Miał wymyślić imię dla dziecka. Na wszelki wypadek przygotował 2, jakby urodziły się bliźniaki. Na nieszczęście urodziły się trojaczki i dostały imiona: Jaś, Staś, ArrayIndexOutOfBoundsException'
                ),
                         thread_id=thread_id)
            elif x == 3:
                bot.send(Message(
                    text=
                    'Spotyka się dwóch programistów:\n– Słyszałem, że straciłeś pracę. Jak to jest być bezrobotnym?\n– To było najgorsze pół godziny mojego życia!'
                ),
                         thread_id=thread_id)
            elif x == 4:
                bot.send(Message(
                    text=
                    'Doktorze, każdej nocy śni mi się jeden i ten sam koszmar. Jestem na Antarktydzie a wokół pełno pingwinów. I ciągle przybywają i przybywają. Zbliżają się do mnie, napierają na mnie, przepychają mnie do urwiska i za każdym razem spychają mnie do lodowatej wody.\n– Normalnie leczymy takie przypadki w jeden dzień. Ale z Panem możemy mieć większe problemy, Panie Gates…'
                ),
                         thread_id=thread_id)
            elif x == 5:
                bot.send(Message(
                    text=
                    'Jadą samochodem 3 koledzy i jeden z nich był programistą. Samochód się psuje, pasażerowie siedzą w środku i dywagują: świece, rozrusznik, benzyna, skończył sie olej… Nagle programista mówi: a może wyjdźmy z samochodu poczekajmy chwilę i potem wejdźmy :D'
                ),
                         thread_id=thread_id)
            elif x == 6:
                bot.send(Message(
                    text=
                    'Z programowaniem jak z budową katedry, budujesz,budujesz a potem się modlisz (żeby wszystko działało)'
                ),
                         thread_id=thread_id)
            elif x == 7:
                bot.send(Message(
                    text=
                    'Programista otwiera lodówkę, sięga po masło i patrząc na napis „82%” mówi:\n– a to jeszcze chwilka i będzie gotowe.'
                ),
                         thread_id=thread_id)
            elif x == 8:
                bot.send(Message(
                    text=
                    'Na świecie jest 10 rodzajów ludzi: ci, którzy rozumieją system binarny i ci, którzy go nie rozumieją.'
                ),
                         thread_id=thread_id)
            elif x == 9:
                bot.send(Message(
                    text=
                    'Żona do programisty: idź do sklepu kup 5 bułek, a jak będą jajka kup 10.\nProgramista będąc w sklepie: – Są jajka?\nSprzedawczyni: – Tak, są.\nProgramista: To poproszę 10 bułek.'
                ),
                         thread_id=thread_id)
            elif x == 10:
                bot.send(Message(
                    text=
                    'Dlaczego programiści mylą Boże Narodzenie z Halloween ?\nBo 25 Dec = 31 Oct'
                ),
                         thread_id=thread_id)
            else:
                return -1

        # helpdesk
        elif Utils.needHelp(text):
            bot.send(Message(
                text=
                'Oto lista dostępnych poleceń po wykonanej autoryzacji:\n średnia - podaje średnią na semestr\n następne zajęcia - podaje najbliższe chronologicznie zajęcia\n usuń - polecenie usuwa dane użytkownika z systemu.\n plan <dzień tygodnia> - polecenie wyświetli plan na podany dzień tygodnia.\n aktualności - polecenie wyświetla nagłówki 5 ostatnich aktualności.\n żart - polecenie wyświetla losowo wybrany z systemu żart.\n pomóż - polecenie wyświetla ten komunikat.'
            ),
                     thread_id=thread_id)

        # plan section

        elif Utils.wantToGetPlan(text) == 1:
            bot.send(Message(text=plan.get_plan_daily(1)), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 2:
            bot.send(Message(text=plan.get_plan_daily(2)), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 3:
            bot.send(Message(text=plan.get_plan_daily(3)), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 4:
            bot.send(Message(text=plan.get_plan_daily(4)), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 5:
            bot.send(Message(text=plan.get_plan_daily(5)), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 6:
            bot.send(Message(text='W weekend nie masz zajęć :)'),
                     thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 7:
            weekplan = plan.get_plan_weekly()
            for i in weekplan:
                bot.send(Message(text=i), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == 8:
            bot.send(Message(text=plan.get_next_class()), thread_id=thread_id)
        elif Utils.wantToGetPlan(text) == -1:
            bot.send(Message(
                text=
                'Może to ja niedomagam, ale nie wiem na kiedy chcesz ten plan. Wyrażaj się jaśniej proszę'
            ),
                     thread_id=thread_id)

        # News section

        elif Utils.wantToGetNews(text):
            news_list = news.getlastnews()
            for obj in news_list:
                bot.send(Message(text=obj), thread_id=thread_id)

        elif Utils.wantToGetAvgGrade(text):
            grade = avggrade.getAvgGrade()
            bot.send(Message(text=grade.text), thread_id=thread_id)

        else:
            Utils.messageNotRecognized(bot, thread_id)
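
The fun-fact dispatch above repeats the same bot.send pattern for every value of x; the usual Python idiom is a list plus random.choice. A sketch of that refactor, reusing two of the jokes from the branches above (the function name is mine):

import random

JOKES = [
    'Jakie papierosy palą studenci EE? Elektryczne!',
    'Na świecie jest 10 rodzajów ludzi: ci, którzy rozumieją system binarny '
    'i ci, którzy go nie rozumieją.',
    # ... the remaining jokes from the branches above
]

def send_fun_fact(bot, thread_id):
    # Pick one joke at random and send it as a single message.
    bot.send(Message(text=random.choice(JOKES)), thread_id=thread_id)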