Example #1
from bs4 import BeautifulSoup


def google_news(browser, url, cursor, db, category):
    # "browser" is assumed to be a requests-style session whose get()
    # returns a response with a .content attribute.
    try:
        page = browser.get(url)
        source = BeautifulSoup(page.content, "html.parser")
    except Exception:
        source = ""
    if source != "":
        article_lst = source.findAll("div", attrs={"class": "NiLAwe"})
        for article in article_lst:
            try:
                article_div = article.find("h3")
                article_title = article_div.text
                weblink = "https://news.google.com" + article_div.find("a")["href"]
                summary = article.find("div", attrs={"class": "Da10Tb"}).text
                date_time = article.find("time")["datetime"].split("T")[0]
            except Exception:
                article_title, weblink, summary, date_time = "", "", "", ""
            try:
                img_link = article.find("img")["src"]
            except Exception:
                img_link = ""

            topics = ""
            upload_article(article_title, date_time, "", summary, weblink,
                           "https://news.google.com", img_link, cursor, db,
                           category, topics)
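
Every example ends by handing its fields to a shared upload_article helper that is not shown here. A minimal sketch of what such a helper might look like, assuming a MySQL-style cursor/connection pair and an articles table (the table and column names are assumptions, not part of the original code):

def upload_article(title, date_time, author, summary, article_url,
                   source_url, img_link, cursor, db, category, topics):
    # Hypothetical sketch: the real schema is not shown in these examples.
    cursor.execute(
        """INSERT INTO articles
           (title, published, author, body, url, source, image, category, topics)
           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
        (title, str(date_time), author, summary, article_url,
         source_url, img_link, category, topics))
    db.commit()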
Example #2
from bs4 import BeautifulSoup
from dateutil import parser as dparser


def khonumthung(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        # iterate over the site's numeric category ids
        for j in range(1, 5):
            cat_url = url + "?cat=" + str(j)
            page = browser.get(cat_url)
            if page.status_code == 200:
                source = BeautifulSoup(page.content, "html.parser")
                ####### pagination ############
                pages = [1]
                pages.extend([int(a.text)
                              for a in source.findAll("a", attrs={"class": "page-numbers"})
                              if a.text != "Next"])
                # only the first listing page is crawled here;
                # use max(pages) to crawl them all
                page_count = 0

                for i in range(page_count + 1):
                    page_url = "https://khonumthung.org/?paged=" + str(i) + "&cat=" + str(j)
                    page = browser.get(page_url)
                    if page.status_code == 200:
                        source = BeautifulSoup(page.content, "html.parser")
                        article_lst = source.findAll("div", attrs={"class": "column half b-col"})
                        for article in article_lst:
                            article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""

                            article_title_div = article.find("h2", attrs={"class": "post-title"})
                            article_title = article_title_div.text.strip()
                            article_url = article_title_div.find("a")["href"]

                            date_time = article.find("time").text.strip()
                            try:
                                _div = article.find("a", attrs={"class": "image-link"})
                                img_link = _div.find("img")["src"]
                            except Exception:
                                pass
                            try:
                                summary_div = article.find("div", attrs={"class": "excerpt"})
                                summary = summary_div.find("p").text.strip()
                            except Exception:
                                pass
                            try:
                                # fetch the full article page for body and author
                                page = browser.get(article_url)
                                if page.status_code == 200:
                                    source = BeautifulSoup(page.content, "html.parser")
                                    article = source.find("article")
                            except Exception:
                                pass
                            try:
                                author = article.find("span", attrs={"class": "reviewer"}).text.strip()
                                summary_div = article.find("div", attrs={"class": "post-content description"})
                                summary += "\n".join([p.text.strip() for p in summary_div.findAll("p")])
                            except Exception:
                                pass
                            date_time = dparser.parse(date_time, fuzzy=True)

                            upload_article(article_title, date_time, author, summary,
                                           article_url, url, img_link, cursor, db,
                                           "news", "")
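
The fuzzy=True flag is what lets dparser.parse pull a date out of text that contains other words, which is why these scrapers can feed it raw byline strings. For example:

from dateutil import parser as dparser

# fuzzy=True skips tokens that are not part of the date
dt = dparser.parse("Posted on 5 January 2021", fuzzy=True)
print(dt.date())  # 2021-01-05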
Example #3
from bs4 import BeautifulSoup


def articlesourcefun(articles_lst, browser, category, cursor, db):
    url = "https://burma.irrawaddy.com/"
    for article_url in articles_lst:
        try:
            page = browser.get(article_url)
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            article = articlefun(source)
            entry_name, date_time, author, summary, img_link = article_details(article)
            topics = source.find("p", attrs={"class": "article-tags"}).text.strip()
            upload_article(entry_name, date_time, author, summary, article_url,
                           url, img_link, cursor, db, category, topics)
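
articlefun and article_details are site-specific helpers defined elsewhere; the code above only assumes article_details returns five fields in this order. A hypothetical minimal version (the selectors are illustrative, not the real helper):

def article_details(article):
    def _text(tag):
        return tag.text.strip() if tag else ""

    entry_name = _text(article.find("h1"))
    date_time = _text(article.find("time"))
    author = _text(article.find("span", attrs={"class": "author"}))
    summary = "\n".join(p.text.strip() for p in article.findAll("p"))
    img = article.find("img")
    img_link = img["src"] if img else ""
    return entry_name, date_time, author, summary, img_link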
Example #4
from bs4 import BeautifulSoup


def voa_main(browser, url, cursor, db):
    # section ids ("zones") on burmese.voanews.com; duplicates removed
    zone_urls = ['https://burmese.voanews.com/z/2513', 'https://burmese.voanews.com/z/2517',
                 'https://burmese.voanews.com/z/4380', 'https://burmese.voanews.com/z/4381',
                 'https://burmese.voanews.com/z/2524', 'https://burmese.voanews.com/z/2512',
                 'https://burmese.voanews.com/z/4843', 'https://burmese.voanews.com/z/4251',
                 'https://burmese.voanews.com/z/2525', 'https://burmese.voanews.com/z/4406',
                 'https://burmese.voanews.com/z/4853', 'https://burmese.voanews.com/z/4385',
                 'https://burmese.voanews.com/z/4382', 'https://burmese.voanews.com/z/4863',
                 'https://burmese.voanews.com/z/4384', 'https://burmese.voanews.com/z/4860',
                 'https://burmese.voanews.com/z/4861', 'https://burmese.voanews.com/z/4862',
                 'https://burmese.voanews.com/z/5180', 'https://burmese.voanews.com/z/4511',
                 'https://burmese.voanews.com/z/4582', 'https://burmese.voanews.com/z/5011']
    count = 0
    for page_url in zone_urls:
        article_lst = []
        count += 1
        for i in range(2):
            # build the paginated URL from the base zone URL each time,
            # instead of appending "?p=" to the same string repeatedly
            paged_url = page_url + "?p=" + str(i)
            try:
                page = browser.get(paged_url)
                source = BeautifulSoup(page.content, "html.parser")
            except Exception:
                source = ""
            if source != "":
                try:
                    article_lst.extend([a["href"] for a in source.findAll("a")
                                        if a.has_attr('href') and "/a/" in a["href"]])
                    article_lst = list(set(article_lst))
                    for article_url in article_lst:
                        if "burmese.voanews.com" not in article_url:
                            article_url = "https://burmese.voanews.com" + article_url
                            page = browser.get(article_url)
                            source = BeautifulSoup(page.content, "html.parser")

                            article_title = ",".join([_title.text.strip()
                                                      for _title in source.findAll("h1", attrs={"class": "pg-title"})])
                            date_time = ",".join([_time.text.strip()
                                                  for _date_time in source.findAll("div", attrs={"class": "col-publishing-details"})
                                                  for _time in _date_time.findAll("time")])
                            author = ",".join([a.text.strip()
                                               for a in source.findAll("a", attrs={"class": "links__item-link"})
                                               if a.has_attr('href') and "/author/" in a["href"]])
                            try:
                                summary = "\n".join([p.text.strip()
                                                     for p in source.find("div", attrs={"class": "wsw"}).findAll("p")])
                            except Exception:
                                summary = ""
                            try:
                                img_link = source.find("div", attrs={"class": "thumb"}).find("img")["src"]
                            except Exception:
                                img_link = ""
                            # lang_identifier_mm / translator are external helpers defined elsewhere
                            if lang_identifier_mm(date_time) == True:
                                date_time = translator.translate(date_time)
                            topics = ""
                            category = "news"
                            upload_article(article_title, date_time, author, summary,
                                           article_url, url, img_link, cursor, db,
                                           category, topics)
                except Exception:
                    pass
Example #5
from bs4 import BeautifulSoup


def articlesourcefun(articles_lst, browser, category, cursor, db):
    url = "http://www.7daydaily.com/"
    not_processed = []  # URLs that failed and could be retried later
    count = 0
    for article_url in articles_lst:
        count += 1
        try:
            page = browser.get(article_url)
            source = BeautifulSoup(page.content, "html.parser")
            entry_name, date_time, author, summary, img_link = article_details(source)
            topics = ""
            upload_article(entry_name, date_time, author, summary, article_url,
                           url, img_link, cursor, db, category, topics)
        except Exception:
            not_processed.append(article_url)
Example #6
from bs4 import BeautifulSoup


def article_globalnewlightofmyanmar(browser, url, cursor, db):
    # the category list could also be discovered via page_categories(source, url);
    # it is hard-coded here
    category_list = ['http://www.globalnewlightofmyanmar.com/category/editors-choice/',
                     'http://www.globalnewlightofmyanmar.com/category/regional-new/',
                     'http://www.globalnewlightofmyanmar.com/category/business/',
                     'http://www.globalnewlightofmyanmar.com/category/local-news/',
                     'http://www.globalnewlightofmyanmar.com/category/opinion/',
                     'http://www.globalnewlightofmyanmar.com/category/national/']
    for category_url in category_list:
        try:
            page = browser.get(category_url + "page/2")
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            # only the first listing page is crawled; page_count(source) would give them all
            pages = 0
            for i in range(pages + 1):
                suburl = category_url + "page/" + str(i)
                try:
                    page = browser.get(suburl)
                    source = BeautifulSoup(page.content, "html.parser")
                except Exception:
                    source = ""
                if source != "":
                    article_lst = source.findAll("li", attrs={"class": "post"})
                    for article in article_lst:
                        article_url = article.find("h2", attrs={"class": "cat-grid-title"}).find("a")["href"]
                        article_title = article.find("h2", attrs={"class": "cat-grid-title"}).text.strip()

                        author = article.find("a", attrs={"itemprop": "author"}).text.strip()
                        img_link = article.find("figure", attrs={"class": "post-thumbnail"}).find("img")["src"]
                        full_text = article.find("div", attrs={"class": "entry-content"}).text.strip()
                        try:
                            page = browser.get(article_url)
                            source = BeautifulSoup(page.content, "html.parser")
                            category = source.find("div", attrs={"class": "entry-cat"}).text.strip()
                            full_text = "\n".join([p.text.strip()
                                                   for p in source.find("div", attrs={"class": "entry-content"}).findAll("p")])
                            publication_date = source.find("time", attrs={"class": "entry-date"}).text.strip()
                        except Exception:
                            # keep the listing-page text and fall back to empty fields
                            publication_date = ""
                            category = ""
                        topics = ""
                        upload_article(article_title, publication_date, author, full_text,
                                       article_url, url, img_link, cursor, db,
                                       category, topics)
Example #7
from bs4 import BeautifulSoup


def se_main(browser, url, cursor, db):
    pages = 2
    for i in range(pages + 1):
        try:
            page = browser.get(url + "/page/" + str(i))
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            articles_lst = [a["href"]
                            for article in source.findAll("article")
                            for h2 in article.findAll("h2", attrs={"class": "entry-title"})
                            for a in h2.findAll("a")]
            not_processed_articles = []
            for article in articles_lst:
                try:
                    page = browser.get(article)
                    source = BeautifulSoup(page.content, "html.parser")
                    # sub_data is a site-specific helper defined elsewhere
                    summary, entry_name, date_time, Image, author = sub_data(source)
                    topics = ""
                    upload_article(entry_name, date_time, author, summary, article,
                                   url, Image, cursor, db, "news", topics)
                except Exception:
                    not_processed_articles.append(article)
Example #8
from bs4 import BeautifulSoup


def article_elevenmyanmar(browser, url, cursor, db):
    # the category list could also be discovered via page_categories(source, url)
    category_list = ["https://elevenmyanmar.com/editorial", "https://elevenmyanmar.com/politics",
                     "https://elevenmyanmar.com/opinion", "https://elevenmyanmar.com/crime",
                     "https://elevenmyanmar.com/business", "https://elevenmyanmar.com/interview",
                     "https://elevenmyanmar.com/economy"]
    for category_url in category_list:
        # derive a category label from the URL path
        category = category_url.replace(url, "").replace(
            "/", ",").replace("archives,", "").replace("category,", "")

        print("category_url ------", category_url, "\n")
        page = browser.get(category_url)
        source = BeautifulSoup(page.content, "html.parser")
        pages = 2  # could be derived with page_count(source)
        print(pages, "\n")
        for i in range(int(pages)):
            print("page_number = ", i)
            suburl = category_url + "?page=" + str(i)
            page = browser.get(suburl)
            source = BeautifulSoup(page.content, "html.parser")
            post_link_lst = data_collection_and_tagging(source)
            for post_link in post_link_lst:
                try:
                    page = browser.get(post_link)
                    source = BeautifulSoup(page.content, "html.parser")
                    # sub_data returns an indexable collection of article fields
                    _sub_dict = sub_data(source)
                    article_title = _sub_dict[3]
                    publication_date = _sub_dict[4]
                    author = _sub_dict[6]
                    full_text = _sub_dict[1]
                    img_link = post_link + "\n" + _sub_dict[5]
                    topics = ""
                    upload_article(article_title, publication_date, author,
                                   full_text, post_link, url, img_link, cursor,
                                   db, category, topics)
                except Exception:
                    print("\n\n not processed", post_link, "\n\n")
Example #9
from bs4 import BeautifulSoup


def voa_main(browser, url, cursor, db):
    try:
        page = browser.get(url)
        source = BeautifulSoup(page.content, "html.parser")
    except Exception:
        source = ""
    if source != "":
        category = "news"
        articles_lst = source.findAll("div", attrs={"class": "vertical-list__item"})

        for article in articles_lst:
            # initialize so a failed teaser does not reuse the previous article's values
            entry_name, article_url, date_time = "", "", ""
            try:
                entry_name = article.find("h2", attrs={"class": "teaser__title"}).text.strip()
                article_url = "https://www.voanews.com/" + article.find(
                    "a", attrs={"class": "teaser__title-link"})["href"]
                date_time = article.find("div", attrs={"class": "teaser__date"}).text.strip()
            except Exception:
                pass
            try:
                img_link = "https://www.voanews.com/" + article.find("img")["src"]
            except Exception:
                img_link = ""
            author = ""
            try:
                page = browser.get(article_url)
                source = BeautifulSoup(page.content, "html.parser")
            except Exception:
                source = ""
            if source != "":
                # audio pages keep the text in "episode__body", article pages in "article__body"
                try:
                    summary = "\n".join([p.text.strip()
                                         for p in source.find("div", attrs={"class": "episode__body"}).findAll("p")])
                except Exception:
                    try:
                        summary = "\n".join([p.text.strip()
                                             for p in source.find("div", attrs={"class": "article__body"}).findAll("p")])
                    except Exception:
                        summary = ""
                try:
                    author = source.find("div", attrs={"class": "page-header__meta-item"}).findAll("span")[1].text.strip()
                except Exception:
                    pass
                topics = ""
                upload_article(entry_name, date_time, author, summary,
                               article_url, url, img_link, cursor, db,
                               category, topics)
Example #10
from bs4 import BeautifulSoup


def shannews_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content, "html.parser")

        # category links live in the header menu
        category_list = [a["href"] for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-td-demo-header-menu-1" in ul["id"]
                         for a in ul.findAll("a")]
        if len(category_list) > 0:
            category_list = list(set(category_list))
            for cat_url in category_list:
                # derive a category label from the URL path
                category = cat_url.replace(url, "").replace("/", ",").replace(
                    "archives,", "").replace("category,", "")

                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content, "html.parser")

                    ######## pagination ############
                    # "Page 1 of N" spans give the page count
                    pages = [1]
                    pages.extend([int(span.text.split("of ")[-1])
                                  for span in source.findAll("span", attrs={"class": "pages"})])
                    page_count = max(pages)

                    for i in range(page_count + 1):
                        page_url = cat_url + "/page/" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content, "html.parser")
                            article_lst = source.findAll("div", attrs={"class": "td_module_10"})

                            for article in article_lst:
                                article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                                try:
                                    article_title_div = article.find("h3", attrs={"class": "td-module-title"})
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                except Exception:
                                    pass
                                try:
                                    p_div = article.find("div", attrs={"class": "td-module-meta-info"})
                                    author = p_div.find("span", attrs={"class": "td-post-author-name"}).text.strip()
                                    date_time = p_div.find("span", attrs={"class": "td-post-date"}).text.strip()
                                except Exception:
                                    pass
                                try:
                                    _div = article.find("div", attrs={"class": "td-module-thumb"})
                                    img_link = _div.find("img")["src"]
                                except Exception:
                                    pass
                                try:
                                    summary = article.find("div", attrs={"class": "td-excerpt"}).text.strip()
                                except Exception:
                                    pass

                                # skip rows whose link could not be extracted
                                if article_url == "":
                                    continue
                                page = browser.get(article_url)
                                if page.status_code == 200:
                                    source = BeautifulSoup(page.content, "html.parser")
                                    article = source.find("article")
                                    try:
                                        summary_div = article.find("div", attrs={"class": "td-post-content"})
                                        summary += "\n".join([p.text.strip()
                                                              for p in summary_div.findAll("p")])
                                    except Exception:
                                        pass
                                    topics = ""
                                    upload_article(article_title, date_time, author, summary,
                                                   article_url, url, img_link, cursor, db,
                                                   category, topics)
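
The pagination above relies on WordPress themes that render a "Page 1 of N" label; splitting on "of " recovers N. A quick check of the idiom:

from bs4 import BeautifulSoup

html = '<span class="pages">Page 1 of 7</span>'
span = BeautifulSoup(html, "html.parser").find("span", attrs={"class": "pages"})
print(int(span.text.split("of ")[-1]))  # 7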
Example #11
from bs4 import BeautifulSoup


def news_eleven(browser, url, cursor, db):
    category_list = ["https://news-eleven.com/news"]
    count = 0
    for cat_url in category_list:
        count += 1
        try:
            page = browser.get(cat_url)
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            ######## pagination ############
            # only the first two listing pages are crawled; the real count
            # could instead be read from the "?page=" links
            page_count = 1
            article_list = []
            for i in range(page_count + 1):
                page_url = cat_url + "?page=" + str(i)
                try:
                    page = browser.get(page_url)
                    source = BeautifulSoup(page.content, "html.parser")
                except Exception:
                    pass
                article_list.extend([a["href"]
                                     for article in source.findAll("div", attrs={"class": "views-row"})
                                     for a in article.findAll("a")
                                     if a.has_attr('href') and "/article/" not in a["href"]])
            for article_url in list(set(article_list)):
                try:
                    page = browser.get(article_url)
                    source = BeautifulSoup(page.content, "html.parser")
                except Exception:
                    source = ""
                if source != "":
                    article_title, date_time, author, summary, img_link = "", "", "", "", ""
                    try:
                        article_title = source.find("div", attrs={"class": "news-detail-title"}).text.strip()
                    except Exception:
                        pass
                    try:
                        date_time = source.find("span", attrs={"class": "date-display-single"}).text.strip()
                    except Exception:
                        pass
                    try:
                        div_image = source.find("div", attrs={"class": "news-image"})
                        img_link = div_image.find("img")["src"]
                    except Exception:
                        pass
                    try:
                        article_category = source.find("div", attrs={"class": "news-detail-news-category"}).text.strip()
                    except Exception:
                        article_category = ""
                    try:
                        author = source.find("div", attrs={"class": "news-detail-date-author-info-author"}).text.strip()
                    except Exception:
                        pass
                    try:
                        summary_div = source.find("div", attrs={"class": "field-items"})
                        summary += "\n".join([p.text.strip() for p in summary_div.findAll("p")])
                    except Exception:
                        pass

                    topics = ""
                    upload_article(article_title, date_time, author, summary,
                                   article_url, url, img_link, cursor, db,
                                   article_category, topics)
Example #12
from bs4 import BeautifulSoup


def kachinlandnews_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content, "html.parser")

        # category links live in the primary menu; skip the home/about links
        category_list = [a["href"] for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-primary" in ul["id"]
                         for a in ul.findAll("a")
                         if a["href"] not in ['http://kachinlandnews.com',
                                              "http://kachinlandnews.org",
                                              "http://kachinlandnews.com/?page_id=23598"]]
        category_list = list(set(category_list))
        if len(category_list) > 0:
            for cat_url in category_list:
                category = "news"
                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content, "html.parser")

                    ######## pagination ############
                    pages = [1]
                    pages.extend([int(a.text)
                                  for a in source.findAll("a", attrs={"class": "page-numbers"})
                                  if a.text not in ["…", "Next"]])
                    page_count = max(pages)
                    for i in range(page_count + 1):
                        page_url = cat_url + "&paged=" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content, "html.parser")
                            article_lst = source.findAll("article")

                            for article in article_lst:
                                article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""

                                article_title_div = article.find("h3", attrs={"class": "entry-title"})
                                try:
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                    date_time = article.find("span", attrs={"class": "published"}).text.strip()
                                    author = article.find("span", attrs={"class": "author"}).text.strip()
                                    _div = article.find("div", attrs={"class": "entry-thumbnail"})
                                    img_link = _div.find("img")["src"]
                                    summary_div = article.find("div", attrs={"class": "entry-content"})
                                    summary = summary_div.find("p").text.strip()
                                except Exception:
                                    pass
                                # skip rows whose link could not be extracted
                                if article_url == "":
                                    continue
                                page = browser.get(article_url)
                                if page.status_code == 200:
                                    source = BeautifulSoup(page.content, "html.parser")
                                    article = source.find("article")
                                    try:
                                        summary_div = article.find("div", attrs={"class": "entry-content"})
                                        summary += "\n".join([p.text.strip()
                                                              for p in summary_div.findAll("p")])
                                        date_time = article.find("span", attrs={"class": "published"}).text.strip()
                                        author = article.find("span", attrs={"class": "author"}).text.strip()
                                    except Exception:
                                        pass
                                    topics = ""
                                    upload_article(article_title, date_time, author, summary,
                                                   article_url, url, img_link, cursor, db,
                                                   category, topics)
Example #13
import time

from bs4 import BeautifulSoup


def twitter_wrapper(browser, url, cursor, db):
    # "browser" is a Selenium WebDriver here: the page is scrolled to the
    # bottom repeatedly and the rendered HTML is re-parsed on each pass.
    browser.get(url)

    # scrolling down...
    pause = 3
    lastHeight = browser.execute_script("return document.body.scrollHeight")

    sub_url_lst = []
    i = 0

    browser.get_screenshot_as_file("test03_1_" + str(i) + ".jpg")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        newHeight = browser.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight
        i += 1

        # parse the freshly rendered page source (Selenium's get() returns
        # None, so the HTML must come from browser.page_source)
        source = BeautifulSoup(browser.page_source, "html.parser")

        for st in source.findAll('div'):
            try:
                if "content" in st["class"]:
                    sub_url_lst.append(st)
            except Exception:
                pass

        for d in sub_url_lst:
            name, author, post, _datetime, img_link = [], '', '', '', ''
            for d1 in d.findAll('strong'):
                name.append(d1.get_text())
            for d4 in d.findAll('span'):
                try:
                    if "username" in d4["class"]:
                        author = d4.get_text()
                except Exception:
                    pass
            for d2 in d.findAll('p'):
                post = d2.get_text()
            for d3 in d.findAll('small'):
                for d4 in d3.findAll('span'):
                    _datetime = d4.text
            for d5 in d.findAll('div'):
                try:
                    for img in d5.findAll('img'):
                        img_link = str(img["src"]) + "\n"
                        print(img_link)
                except Exception:
                    pass

            _name = name[0] if name else ""
            if _datetime != '' and post != "":
                topics = ""
                upload_article(_name, _datetime, author, post, url, url,
                               img_link, cursor, db, "news", topics)
    return ""
Example #14
import requests
from bs4 import BeautifulSoup
from dateutil import parser


def find_articles(category_list, browser, cursor, db):
    for category_url in category_list:
        articles = []
        try:
            page = requests.get(category_url)
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            try:
                # the real page count is in the "Found N results" span;
                # hard-coded to the first listing pages here
                pages = 2
                for i in range(pages + 1):
                    page = requests.get(category_url + "/page/" + str(i))
                    source = BeautifulSoup(page.content, "html.parser")
                    articles.extend(source.findAll("article"))
            except Exception:
                pass
        count = 0
        for article in list(set(articles)):
            count += 1
            try:
                news_title = [h.text.strip()
                              for h in article.findAll("header", attrs={"class": "article-header"})][0]
            except Exception:
                news_title = ""
            try:
                author = [h.text.strip()
                          for h in article.findAll("span", attrs={"class": "reporter"})][0]
            except Exception:
                author = ""
            try:
                news_category = [h.text.strip()
                                 for h in article.findAll("span", attrs={"class": "category"})][0]
            except Exception:
                news_category = ""
            try:
                summary = [h.text.strip()
                           for h in article.findAll("div", attrs={"class": "entry"})][0]
            except Exception:
                summary = ""
            try:
                image_link = [h["data-src"] for h in article.findAll("figure")][0]
            except Exception:
                image_link = ""
            try:
                article_link = [a["href"]
                                for h in article.findAll("header", attrs={"class": "article-header"})
                                for a in h.findAll("a")][0]
            except Exception:
                article_link = ""
            try:
                page = requests.get(article_link)
                source = BeautifulSoup(page.content, "html.parser")
                article = source.find("article")
                full_text = "\n".join([p.text.strip()
                                       for h in article.findAll("div", attrs={"class": "article-entry"})
                                       for p in h.findAll("p") if not p.has_attr('class')])
            except Exception:
                full_text = ""
            try:
                # the date is the last three words of the article entry text
                date_time = parser.parse(
                    [" ".join(h.text.strip().split(" ")[-3:])
                     for h in article.findAll("div", attrs={"class": "article-entry"})][0],
                    fuzzy=True)
            except Exception:
                date_time = "\n".join([" ".join(p.text.strip().split(" ")[-3:])
                                       for h in article.findAll("div", attrs={"class": "article-entry"})
                                       for p in h.findAll("p", attrs={"class": "date"})])
            try:
                # collected but not uploaded below
                related_articles = "\n".join(list(set(
                    [a["href"]
                     for h in article.findAll("div", attrs={"class": "article-entry"})
                     for p in h.findAll("p") for a in p.findAll("a")])))
            except Exception:
                related_articles = ""
            try:
                topics = [p.text.strip()
                          for p in article.findAll("p", attrs={"class": "article-tags"})][0].replace("Topics: ", "")
            except Exception:
                topics = ""
            upload_article(news_title, date_time, author, full_text,
                           article_link, "https://www.irrawaddy.com/",
                           image_link, cursor, db, news_category, topics)
Example #15
from bs4 import BeautifulSoup


def mmtimes_main(browser, url, cursor, db):
    # find_categories is a site-specific helper defined elsewhere
    category_list, browser = find_categories(url, browser)

    for category_url in category_list:
        try:
            page = browser.get(category_url)
            source = BeautifulSoup(page.content, "html.parser")
        except Exception:
            source = ""
        if source != "":
            article_lst = source.findAll("div", attrs={"class": "views-row"})
            for article in article_lst:
                # query each listing row, not the whole page, so every row
                # yields its own title/URL/date rather than the first one
                try:
                    img_link = article.find("div", attrs={"class": "latest-news-top"}).find("img")["src"]
                except Exception:
                    img_link = ""
                try:
                    article_title = article.find("div", attrs={"class": "news-title"}).text.strip()
                    article_url = url + article.find("div", attrs={"class": "news-title"}).find("a")["href"]
                except Exception:
                    article_title, article_url = "", ""
                try:
                    category = article.find("span", attrs={"class": "news-category"}).text.strip()
                except Exception:
                    category = ""
                try:
                    date_time = article.find("span", attrs={"class": "news-date"}).text.strip()
                except Exception:
                    date_time = ""
                try:
                    page = browser.get(article_url)
                    source = BeautifulSoup(page.content, "html.parser")
                    summary = "\n".join([p.text.strip()
                                         for p in source.find("div", attrs={"class": "field-item"}).findAll("p")])
                    author = source.find("span", attrs={"class": "news-author"}).text.strip()
                except Exception:
                    summary = ""
                    author = ""
                topics = ""
                upload_article(article_title, date_time, author, summary,
                               article_url, url, img_link, cursor, db,
                               category, topics)
Example #16
from bs4 import BeautifulSoup
from dateutil import parser as dparser


def thanlwintimes(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content, "html.parser")
        category_list = [a["href"] for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-cat-menu-1" in ul["id"]
                         for a in ul.findAll("a")]
        if len(category_list) > 0:
            category_list = list(set(category_list))
            for cat_url in category_list:
                category = cat_url.replace(url, "").replace("/", ",").replace(
                    "archives,", "").replace("category,", "")
                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content, "html.parser")
                    ######## pagination ############
                    # "Page 1 of N" spans give the page count
                    pages = [1]
                    pages.extend([int(span.text.split("of ")[-1])
                                  for span in source.findAll("span", attrs={"class": "pages"})])
                    page_count = max(pages)
                    for i in range(page_count + 1):
                        page_url = cat_url + "/page/" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content, "html.parser")
                            article_lst = source.findAll("div", attrs={"class": "td-block-span6"})
                            for article in article_lst:
                                try:
                                    article_title_div = article.find("h3", attrs={"class": "td-module-title"})
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                except Exception:
                                    article_title, article_url = "", ""
                                try:
                                    author = article.find("span", attrs={"class": "td-post-author-name"}).text.strip()
                                    date_time = article.find("span", attrs={"class": "td-post-date"}).text.strip()
                                    date_time = dparser.parse(date_time, fuzzy=True)
                                except Exception:
                                    author, date_time = "", ""
                                try:
                                    _div = article.find("div", attrs={"class": "td-module-image"})
                                    img_link = _div.find("img")["src"] + "\n"
                                except Exception:
                                    img_link = ""
                                summary = ""
                                if article_url != "":
                                    page = browser.get(article_url)
                                    if page.status_code == 200:
                                        source = BeautifulSoup(page.content, "html.parser")
                                        summary = "\n".join([p.text.strip()
                                                             for p in source.findAll("p")])
                                        # collect every image on the article page
                                        img_link = "\n".join([img["src"]
                                                              for img in source.findAll("img")
                                                              if img.has_attr("src")])

                                topics = ""
                                upload_article(article_title, date_time, author, summary,
                                               article_url, url, img_link, cursor, db,
                                               category, topics)
Example #17
from bs4 import BeautifulSoup


def narinjara_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content, "html.parser")
        category = "news"
        ######## pagination ############
        # "Page 1 of N" spans give the page count
        pages = [1]
        pages.extend([int(span.text.split("of ")[-1])
                      for span in source.findAll("span", attrs={"class": "pages"})])
        page_count = max(pages)
        for i in range(page_count + 1):
            page_url = url + "?page=" + str(i)
            page = browser.get(page_url)
            if page.status_code == 200:
                source = BeautifulSoup(page.content, "html.parser")
                article_lst = source.findAll("article", attrs={"class": "entry-item"})
                for article in article_lst:
                    article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                    try:
                        article_title_div = article.find("h2", attrs={"class": "entry-title"})
                        article_title = article_title_div.text.strip()
                        article_url = "https://burmese.narinjara.com" + article_title_div.find("a")["href"]
                    except Exception:
                        pass
                    try:
                        p_div = article.find("ul", attrs={"class": "entry-meta"})
                        date_time = p_div.find("li", attrs={"class": "entry-date"}).text.strip()
                        author = p_div.find("li", attrs={"class": "entry-author"}).text.strip()
                    except Exception:
                        pass
                    try:
                        _div = article.find("div", attrs={"class": "entry-img"})
                        img_link = "https://burmese.narinjara.com" + _div.find("img")["src"]
                    except Exception:
                        pass
                    try:
                        summary_div = article.find("div", attrs={"class": "entry-content"})
                        summary = summary_div.find("p").text.strip()
                    except Exception:
                        pass
                    if article_url != "":
                        page = browser.get(article_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content, "html.parser")
                            try:
                                article = source.find("article")
                                summary_div = article.find("div", attrs={"class": "entry"})
                                summary += "\n".join([p.text.strip()
                                                      for p in summary_div.findAll("p")])
                            except Exception:
                                pass
                    topics = ""
                    upload_article(article_title, date_time, author, summary,
                                   article_url, url, img_link, cursor, db,
                                   category, topics)
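
The try/get/parse pattern that opens nearly every function above could be factored into a single helper; a minimal sketch (the name get_soup is an assumption, not from the original code):

from bs4 import BeautifulSoup


def get_soup(browser, url):
    # Return parsed HTML, or "" on any failure, matching the
    # `if source != "":` convention used throughout these examples.
    try:
        page = browser.get(url)
        if getattr(page, "status_code", 200) != 200:
            return ""
        return BeautifulSoup(page.content, "html.parser")
    except Exception:
        return ""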