def google_news(browser, url, cursor, db, category):
    try:
        page = browser.get(url)
        source = BeautifulSoup(page.content)
    except:
        source = ""
    if source != "":
        article_lst = source.findAll("div", attrs={"class": "NiLAwe"})
        for article in article_lst:
            try:
                article_div = article.find("h3")
                article_title = article_div.text
                weblink = "https://news.google.com" + article_div.find("a")["href"]
                summary = article.find("div", attrs={"class": "Da10Tb"}).text
                date_time = article.find("time")["datetime"].split("T")[0]
            except:
                article_title, weblink, summary, date_time = "", "", "", ""
            try:
                img_link = article.find("img")["src"]
            except:
                img_link = ""
            topics = ""
            upload_article(article_title, date_time, "", summary, weblink,
                           "https://news.google.com", img_link, cursor, db,
                           category, topics)

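# Every scraper in this module funnels into an `upload_article` helper defined
# elsewhere. Its actual body is not shown here; the sketch below is only an
# assumption reconstructed from the call sites (the table and column names are
# hypothetical), kept under a distinct name so it cannot shadow the real helper.
def upload_article_sketch(title, date_time, author, summary, article_url,
                          source_url, img_link, cursor, db, category, topics):
    """Hypothetical sketch: persist one scraped article via a DB-API cursor."""
    cursor.execute(
        "INSERT IGNORE INTO articles "
        "(title, published, author, body, url, source, image, category, topics) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
        (title, str(date_time), author, summary, article_url, source_url,
         img_link, category, topics))
    db.commit()
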
def khonumthung(browser, url, cursor, db):
    from dateutil import parser as dparser  # hoisted out of the article loop
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content)
        for j in range(1, 5):
            cat_url = url + "?cat=" + str(j)
            page = browser.get(cat_url)
            if page.status_code == 200:
                source = BeautifulSoup(page.content)
                # pagination: only the first page is crawled; restore
                # max(pages) below to walk every page of the category
                pages = [1]
                pages.extend([int(a.text)
                              for a in source.findAll("a", attrs={"class": "page-numbers"})
                              if a.text != "Next"])
                page_count = 0  # max(pages)
                for i in range(page_count + 1):
                    page_url = "https://khonumthung.org/?paged=" + str(i) + "&cat=" + str(j)
                    page = browser.get(page_url)
                    if page.status_code == 200:
                        source = BeautifulSoup(page.content)
                        article_lst = source.findAll("div", attrs={"class": "column half b-col"})
                        for article in article_lst:
                            article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                            article_title_div = article.find("h2", attrs={"class": "post-title"})
                            article_title = article_title_div.text.strip()
                            article_url = article_title_div.find("a")["href"]
                            date_time = article.find("time").text.strip()
                            try:
                                _div = article.find("a", attrs={"class": "image-link"})
                                img_link = _div.find("img")["src"]
                            except:
                                pass
                            try:
                                summary_div = article.find("div", attrs={"class": "excerpt"})
                                summary = summary_div.find("p").text.strip()
                            except:
                                pass
                            try:
                                # fetch the full article page for the body text
                                page = browser.get(article_url)
                                if page.status_code == 200:
                                    source = BeautifulSoup(page.content)
                                    article = source.find("article")
                            except:
                                pass
                            try:
                                author = article.find("span", attrs={"class": "reviewer"}).text.strip()
                                summary_div = article.find("div", attrs={"class": "post-content description"})
                                summary += "\n".join([p.text.strip() for p in summary_div.findAll("p")])
                            except:
                                pass
                            date_time = dparser.parse(date_time, fuzzy=True)
                            upload_article(article_title, date_time, author, summary,
                                           article_url, url, img_link, cursor, db, "news", "")

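# The scrapers above and below treat `browser` as a requests-like client whose
# .get(url) returns an object exposing .status_code and .content. A minimal
# sketch of such a wrapper, assuming plain `requests` underneath (the class
# name is hypothetical; twitter_wrapper further down needs a real Selenium
# driver instead):
import requests

class SimpleBrowser:
    """requests.Session wrapper matching the browser.get(...) usage here."""

    def __init__(self, timeout=30):
        self.session = requests.Session()
        self.session.headers["User-Agent"] = "Mozilla/5.0"
        self.timeout = timeout

    def get(self, url):
        # requests.Response already exposes .status_code and .content
        return self.session.get(url, timeout=self.timeout)
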
def articlesourcefun(articles_lst, browser, category, cursor, db):
    url = "https://burma.irrawaddy.com/"
    for article_url in articles_lst:
        try:
            page = browser.get(article_url)
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            article = articlefun(source)
            entry_name, date_time, author, summary, img_link = article_details(article)
            try:
                topics = source.find("p", attrs={"class": "article-tags"}).text.strip()
            except:
                topics = ""  # some articles carry no tag list
            upload_article(entry_name, date_time, author, summary, article_url,
                           url, img_link, cursor, db, category, topics)

def voa_main(browser, url, cursor, db):
    count = 0
    # duplicate entries removed from the original literal; set() keeps it safe
    section_urls = list(set([
        'https://burmese.voanews.com/z/2513', 'https://burmese.voanews.com/z/2517',
        'https://burmese.voanews.com/z/4380', 'https://burmese.voanews.com/z/4381',
        'https://burmese.voanews.com/z/2524', 'https://burmese.voanews.com/z/2512',
        'https://burmese.voanews.com/z/4843', 'https://burmese.voanews.com/z/4251',
        'https://burmese.voanews.com/z/2525', 'https://burmese.voanews.com/z/4406',
        'https://burmese.voanews.com/z/4853', 'https://burmese.voanews.com/z/4385',
        'https://burmese.voanews.com/z/4382', 'https://burmese.voanews.com/z/4863',
        'https://burmese.voanews.com/z/4384', 'https://burmese.voanews.com/z/4860',
        'https://burmese.voanews.com/z/4861', 'https://burmese.voanews.com/z/4862',
        'https://burmese.voanews.com/z/5180', 'https://burmese.voanews.com/z/4511',
        'https://burmese.voanews.com/z/4582', 'https://burmese.voanews.com/z/5011',
    ]))
    for page_url in section_urls:
        val_lst = []
        article_lst = []
        count += 1
        for i in range(2):
            # build the paged URL under a new name so "?p=" is not appended twice
            sub_url = page_url + "?p=" + str(i)
            try:
                page = browser.get(sub_url)
                source = BeautifulSoup(page.content)
            except:
                source = ""
            if source != "":
                try:
                    article_lst.extend([a["href"] for a in source.findAll("a")
                                        if a.has_attr('href') and "/a/" in a["href"]])
                    article_lst = list(set(article_lst))
                    for article_url in article_lst:
                        if "burmese.voanews.com" not in article_url:
                            article_url = "https://burmese.voanews.com" + article_url
                        page = browser.get(article_url)
                        source = BeautifulSoup(page.content)
                        article_title = ",".join([_title.text.strip()
                                                  for _title in source.findAll("h1", attrs={"class": "pg-title"})])
                        date_time = ",".join([_time.text.strip()
                                              for _details in source.findAll("div", attrs={"class": "col-publishing-details"})
                                              for _time in _details.findAll("time")])
                        author = ",".join([a.text.strip()
                                           for a in source.findAll("a", attrs={"class": "links__item-link"})
                                           if a.has_attr('href') and "/author/" in a["href"]])
                        try:
                            summary = "\n".join([p.text.strip()
                                                 for p in source.find("div", attrs={"class": "wsw"}).findAll("p")])
                        except:
                            summary = ""
                        try:
                            img_link = source.find("div", attrs={"class": "thumb"}).find("img")["src"]
                        except:
                            img_link = ""
                        # Burmese-script dates are translated to English
                        if lang_identifier_mm(date_time) == True:
                            date_time = translator.translate(date_time)
                        topics = ""
                        category = "news"
                        upload_article(article_title, date_time, author, summary, article_url,
                                       url, img_link, cursor, db, category, topics)
                except:
                    pass

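# voa_main relies on `lang_identifier_mm` and `translator`, neither of which
# is defined in this file. A plausible sketch of the language check, assuming
# it detects Myanmar-script text by Unicode block (U+1000-U+109F); the real
# helper may work differently:
def lang_identifier_mm_sketch(text):
    """Hypothetical: True if the string contains Myanmar-script characters."""
    return any("\u1000" <= ch <= "\u109f" for ch in text)
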
# note: this name collides with the irrawaddy articlesourcefun above if both
# live in one module
def articlesourcefun(articles_lst, browser, category, cursor, db):
    url = "http://www.7daydaily.com/"
    not_processed = []
    count = 0
    for article_url in articles_lst:
        count += 1
        try:
            page = browser.get(article_url)
            source = BeautifulSoup(page.content)
            entry_name, date_time, author, summary, img_link = article_details(source)
            topics = ""
            upload_article(entry_name, date_time, author, summary, article_url,
                           url, img_link, cursor, db, category, topics)
        except:
            not_processed.append(article_url)

def article_globalnewlightofmyanmar(browser, url, cursor, db):
    # categories are hard-coded; page_categories(source, url) was the dynamic
    # alternative:
    # page = browser.get(url)
    # source = BeautifulSoup(page.content)
    # category_list = page_categories(source, url)
    category_list = [
        'http://www.globalnewlightofmyanmar.com/category/editors-choice/',
        'http://www.globalnewlightofmyanmar.com/category/regional-new/',
        'http://www.globalnewlightofmyanmar.com/category/business/',
        'http://www.globalnewlightofmyanmar.com/category/local-news/',
        'http://www.globalnewlightofmyanmar.com/category/opinion/',
        'http://www.globalnewlightofmyanmar.com/category/national/',
    ]
    for category_url in category_list:
        try:
            page = browser.get(category_url + "page/2")
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            pages = 0  # page_count(source) -- pagination disabled
            for i in range(pages + 1):
                suburl = category_url + "page/" + str(i)
                try:
                    page = browser.get(suburl)
                    source = BeautifulSoup(page.content)
                except:
                    source = ""
                if source != "":
                    article_lst = source.findAll("li", attrs={"class": "post"})
                    for article in article_lst:
                        article_url = article.find("h2", attrs={"class": "cat-grid-title"}).find("a")["href"]
                        article_title = article.find("h2", attrs={"class": "cat-grid-title"}).text.strip()
                        author = article.find("a", attrs={"itemprop": "author"}).text.strip()
                        img_link = article.find("figure", attrs={"class": "post-thumbnail"}).find("img")["src"]
                        full_text = article.find("div", attrs={"class": "entry-content"}).text.strip()
                        try:
                            page = browser.get(article_url)
                            source = BeautifulSoup(page.content)
                            category = source.find("div", attrs={"class": "entry-cat"}).text.strip()
                            full_text = "\n".join([p.text.strip()
                                                   for p in source.find("div", attrs={"class": "entry-content"}).findAll("p")])
                            publication_date = source.find("time", attrs={"class": "entry-date"}).text.strip()
                        except:
                            # category needs a default too, or the call below
                            # raises NameError when the article fetch fails
                            category = ""
                            publication_date = ""
                        topics = ""
                        upload_article(article_title, publication_date, author, full_text,
                                       article_url, url, img_link, cursor, db,
                                       category, topics)

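# The disabled `page_count(source)` call above is assumed to read the last
# page number out of WordPress-style "Page X of Y" pagination, the same
# pattern shannews_main and narinjara_main use inline. A hypothetical sketch:
def page_count_sketch(source):
    """Largest page number found in 'Page X of Y' pagination spans."""
    pages = [1]
    for span in source.findAll("span", attrs={"class": "pages"}):
        try:
            pages.append(int(span.text.split("of ")[-1]))
        except ValueError:
            pass
    return max(pages)
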
def se_main(browser, url, cursor, db):
    pages = 2
    for i in range(pages + 1):
        try:
            page = browser.get(url + "/page/" + str(i))
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            articles_lst = [a["href"]
                            for article in source.findAll("article")
                            for h2 in article.findAll("h2", attrs={"class": "entry-title"})
                            for a in h2.findAll("a")]
            not_processed_articles = []
            for article in articles_lst:
                try:
                    page = browser.get(article)
                    source = BeautifulSoup(page.content)
                    summary, entry_name, date_time, Image, author = sub_data(source)
                    topics = ""
                    upload_article(entry_name, date_time, author, summary, article,
                                   url, Image, cursor, db, "news", topics)
                except:
                    not_processed_articles.append(article)

def article_elevenmyanmar(browser, url, cursor, db):
    page = browser.get(url)
    source = BeautifulSoup(page.content)
    category_list = ["https://elevenmyanmar.com/editorial",
                     "https://elevenmyanmar.com/politics",
                     "https://elevenmyanmar.com/opinion",
                     "https://elevenmyanmar.com/crime",
                     "https://elevenmyanmar.com/business",
                     "https://elevenmyanmar.com/interview",
                     "https://elevenmyanmar.com/economy"]  # page_categories(source, url)
    for category_url in category_list:
        # strip the site root first, as the other scrapers here do; the
        # original stripped url + "," which never matches
        category = category_url.replace(url, "").replace("/", ",").replace(
            "archives,", "").replace("category,", "")
        print("category_url ------", category_url, "\n")
        page = browser.get(category_url)
        source = BeautifulSoup(page.content)
        pages = 2  # page_count(source)
        print(pages, "\n")
        for i in range(int(pages)):
            print("page_number = ", i)
            suburl = category_url + "?page=" + str(i)
            page = browser.get(suburl)
            source = BeautifulSoup(page.content)
            post_link_lst = data_collection_and_tagging(source)
            # print(len(post_link_lst))
            for post_link in post_link_lst:
                try:
                    page = browser.get(post_link)
                    source = BeautifulSoup(page.content)
                    _sub_dict = sub_data(source)
                    article_title = _sub_dict[3]
                    publication_date = _sub_dict[4]
                    author = _sub_dict[6]
                    full_text = _sub_dict[1]
                    img_link = post_link + "\n" + _sub_dict[5]
                    topics = ""
                    upload_article(article_title, publication_date, author, full_text,
                                   post_link, url, img_link, cursor, db,
                                   category, topics)
                except:
                    print("\n\n not processed", post_link, "\n\n")

# note: this name collides with the Burmese-service voa_main above if both
# live in one module
def voa_main(browser, url, cursor, db):
    try:
        page = browser.get(url)
        source = BeautifulSoup(page.content)
    except:
        source = ""
    if source != "":
        category = "news"
        articles_lst = source.findAll("div", attrs={"class": "vertical-list__item"})
        for article in articles_lst:
            entry_name, article_url, date_time = "", "", ""
            try:
                entry_name = article.find("h2", attrs={"class": "teaser__title"}).text.strip()
                article_url = "https://www.voanews.com/" + article.find(
                    "a", attrs={"class": "teaser__title-link"})["href"]
                date_time = article.find("div", attrs={"class": "teaser__date"}).text.strip()
            except:
                pass
            try:
                img_link = "https://www.voanews.com/" + article.find("img")["src"]
            except:
                img_link = ""
            author = ""
            try:
                page = browser.get(article_url)
                source = BeautifulSoup(page.content)
            except:
                source = ""
            if source != "":
                try:
                    summary = "\n".join([p.text.strip()
                                         for p in source.find("div", attrs={"class": "episode__body"}).findAll("p")])
                except:
                    # fall back to the regular article layout; default to ""
                    # so a missing body cannot raise out of the loop
                    try:
                        summary = "\n".join([p.text.strip()
                                             for p in source.find("div", attrs={"class": "article__body"}).findAll("p")])
                    except:
                        summary = ""
                try:
                    author = source.find("div", attrs={"class": "page-header__meta-item"}).findAll("span")[1].text.strip()
                except:
                    pass
                topics = ""
                upload_article(entry_name, date_time, author, summary, article_url,
                               url, img_link, cursor, db, category, topics)

def shannews_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content)
        category_list = [a["href"]
                         for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-td-demo-header-menu-1" in ul["id"]
                         for a in ul.findAll("a")]  # if a["href"] != "/" and a["href"] != "#"
        if len(category_list) > 0:
            category_list = list(set(category_list))
            for cat_url in category_list:
                category = cat_url.replace(url, "").replace("/", ",").replace(
                    "archives,", "").replace("category,", "")
                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content)
                    # pagination: read the last page number from "Page X of Y"
                    pages = [1]
                    pages.extend([int(span.text.split("of ")[-1])
                                  for span in source.findAll("span", attrs={"class": "pages"})])
                    page_count = max(pages)
                    for i in range(page_count + 1):
                        page_url = cat_url + "/page/" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content)
                            article_lst = source.findAll("div", attrs={"class": "td_module_10"})
                            for article in article_lst:
                                article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                                try:
                                    article_title_div = article.find("h3", attrs={"class": "td-module-title"})
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                except:
                                    pass
                                try:
                                    p_div = article.find("div", attrs={"class": "td-module-meta-info"})
                                    author = p_div.find("span", attrs={"class": "td-post-author-name"}).text.strip()
                                    date_time = p_div.find("span", attrs={"class": "td-post-date"}).text.strip()
                                except:
                                    pass
                                try:
                                    _div = article.find("div", attrs={"class": "td-module-thumb"})
                                    img_link = _div.find("img")["src"]
                                except:
                                    pass
                                try:
                                    summary = article.find("div", attrs={"class": "td-excerpt"}).text.strip()
                                except:
                                    pass
                                if article_url != "":  # skip listings with no link
                                    page = browser.get(article_url)
                                    if page.status_code == 200:
                                        source = BeautifulSoup(page.content)
                                        article = source.find("article")
                                        try:
                                            summary_div = article.find("div", attrs={"class": "td-post-content"})
                                            summary += "\n".join([p.text.strip()
                                                                  for p in summary_div.findAll("p")])
                                        except:
                                            pass
                                topics = ""
                                upload_article(article_title, date_time, author, summary,
                                               article_url, url, img_link, cursor, db,
                                               category, topics)

def news_eleven(browser, url, cursor, db):
    category_list = ["https://news-eleven.com/news"]
    count = 0
    for cat_url in category_list:
        count += 1
        val_lst = []
        try:
            page = browser.get(cat_url)
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            # pagination disabled; the dynamic version was:
            # pages = [1]
            # pages.extend([int(a["href"].split("=")[-1]) for a in source.find("a")
            #               if a.has_attr('href') if "page" in a["href"]])
            page_count = 1  # max(pages)
            article_list = []
            for i in range(page_count + 1):
                page_url = cat_url + "?page=" + str(i)
                try:
                    page = browser.get(page_url)
                    source = BeautifulSoup(page.content)
                except:
                    pass
                article_list.extend([a["href"]
                                     for article in source.findAll("div", attrs={"class": "views-row"})
                                     for a in article.findAll("a")
                                     if a.has_attr('href') and "/article/" not in a["href"]])
            for article_url in list(set(article_list)):
                try:
                    page = browser.get(article_url)
                    source = BeautifulSoup(page.content)
                except:
                    source = ""
                if source != "":
                    article_title, date_time, author, summary, img_link = "", "", "", "", ""
                    div_image = None  # default so the img lookup below cannot NameError
                    try:
                        article_title = source.find("div", attrs={"class": "news-detail-title"}).text.strip()
                    except:
                        pass
                    try:
                        date_time = source.find("span", attrs={"class": "date-display-single"}).text.strip()
                    except:
                        pass
                    try:
                        div_image = source.find("div", attrs={"class": "news-image"})
                    except:
                        pass
                    try:
                        article_category = source.find("div", attrs={"class": "news-detail-news-category"}).text.strip()
                    except:
                        article_category = ""
                    try:
                        author = source.find("div", attrs={"class": "news-detail-date-author-info-author"}).text.strip()
                    except:
                        pass
                    try:
                        img_link = div_image.find("img")["src"]
                    except:
                        pass
                    try:
                        summary_div = source.find("div", attrs={"class": "field-items"})
                        summary += "\n".join([p.text.strip() for p in summary_div.findAll("p")])
                    except:
                        pass
                    topics = ""
                    upload_article(article_title, date_time, author, summary, article_url,
                                   url, img_link, cursor, db, article_category, topics)

def kachinlandnews_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content)
        category_list = [a["href"]
                         for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-primary" in ul["id"]
                         for a in ul.findAll("a")
                         if a["href"] not in ['http://kachinlandnews.com',
                                              "http://kachinlandnews.org",
                                              "http://kachinlandnews.com/?page_id=23598"]]
        category_list = list(set(category_list))
        if len(category_list) > 0:
            for cat_url in category_list:
                category = "news"  # cat_url.replace(url, "").replace("/", ",")...
                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content)
                    # pagination: collect the numbered links, skip ellipsis/Next
                    pages = [1]
                    pages.extend([int(span.text)
                                  for span in source.findAll("a", attrs={"class": "page-numbers"})
                                  if span.text not in ["…", "Next"]])
                    page_count = max(pages)
                    for i in range(page_count + 1):
                        page_url = cat_url + "&paged=" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content)
                            article_lst = source.findAll("article")
                            for article in article_lst:
                                article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                                article_title_div = article.find("h3", attrs={"class": "entry-title"})
                                try:
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                    date_time = article.find("span", attrs={"class": "published"}).text.strip()
                                    author = article.find("span", attrs={"class": "author"}).text.strip()
                                    _div = article.find("div", attrs={"class": "entry-thumbnail"})
                                    img_link = _div.find("img")["src"]
                                    summary_div = article.find("div", attrs={"class": "entry-content"})
                                    summary = summary_div.find("p").text.strip()
                                except:
                                    pass
                                if article_url != "":  # skip listings with no link
                                    page = browser.get(article_url)
                                    if page.status_code == 200:
                                        source = BeautifulSoup(page.content)
                                        article = source.find("article")
                                        try:
                                            summary_div = article.find("div", attrs={"class": "entry-content"})
                                            summary += "\n".join([p.text.strip()
                                                                  for p in summary_div.findAll("p")])
                                            date_time = article.find("span", attrs={"class": "published"}).text.strip()
                                            author = article.find("span", attrs={"class": "author"}).text.strip()
                                        except:
                                            pass
                                topics = ""
                                upload_article(article_title, date_time, author, summary,
                                               article_url, url, img_link, cursor, db,
                                               category, topics)

def twitter_wrapper(browser, url, cursor, db):
    # this function needs a Selenium WebDriver, not the requests-style wrapper
    # used by the other scrapers (execute_script / get_screenshot_as_file)
    browser.get(url)
    # scroll to the bottom until the page height stops growing
    pause = 3
    lastHeight = browser.execute_script("return document.body.scrollHeight")
    sub_url_lst = []
    i = 0
    source = ""
    browser.get_screenshot_as_file("test03_1_" + str(i) + ".jpg")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        newHeight = browser.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight
        i += 1
    # parse the fully scrolled DOM; selenium exposes it as page_source, not as
    # a requests-style response object
    source = BeautifulSoup(browser.page_source)
    for st in source.findAll('div'):
        try:
            if "content" in st["class"]:
                sub_url_lst.append(st)
        except:
            pass
    for d in sub_url_lst:
        name, author, post, _datetime, img_link = [], '', '', '', ''
        for d1 in d.findAll('strong'):
            name.append(d1.get_text())
        for d4 in d.findAll('span'):
            try:
                if "username" in d4["class"]:
                    author = d4.get_text()
            except:
                pass
        for d2 in d.findAll('p'):
            post = d2.get_text()
        for d3 in d.findAll('small'):
            for d4 in d3.findAll('span'):
                _datetime = d4.text
        for d5 in d.findAll('div'):
            try:
                for img in d5.findAll('img'):
                    img_link += str(img["src"]) + "\n"  # accumulate every image
                    print(img_link)
            except:
                pass
        _name = name[0] if name else ""  # guard against posts with no <strong>
        if _datetime != '' and post != "":
            topics = ""
            upload_article(_name, _datetime, author, post, url, url, img_link,
                           cursor, db, "news", topics)
    return ""

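# twitter_wrapper is the one function here that requires a Selenium WebDriver.
# A minimal headless setup sketch; chromedriver availability is assumed:
from selenium import webdriver

def make_twitter_browser():
    """Hypothetical: build the headless Chrome driver twitter_wrapper needs."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    return webdriver.Chrome(options=options)
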
def find_articles(category_list, browser, cursor, db):
    # uses requests directly and dateutil's parser; both are assumed imported
    # at module level (import requests; from dateutil import parser)
    for category_url in category_list:
        articles = []
        try:
            page = requests.get(category_url)
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            try:
                # fixed page depth; the dynamic count was:
                # [math.ceil(int(re.findall('\d+', span.text)[0]) / 15)
                #  for span in source.findAll("span", attrs={"class": "count"})
                #  if "Found" in span.text][0]
                pages = 2
                for i in range(pages + 1):
                    page = requests.get(category_url + "/page/" + str(i))
                    source = BeautifulSoup(page.content)
                    articles.extend(source.findAll("article"))
            except:
                pass
            count = 0
            for article in list(set(articles)):
                count += 1
                try:
                    news_title = [h.text.strip()
                                  for h in article.findAll("header", attrs={"class": "article-header"})][0]
                except:
                    news_title = ""
                try:
                    author = [h.text.strip()
                              for h in article.findAll("span", attrs={"class": "reporter"})][0]
                except:
                    author = ""
                try:
                    news_category = [h.text.strip()
                                     for h in article.findAll("span", attrs={"class": "category"})][0]
                except:
                    news_category = ""
                try:
                    summary = [h.text.strip()
                               for h in article.findAll("div", attrs={"class": "entry"})][0]
                except:
                    summary = ""
                try:
                    image_link = [h["data-src"] for h in article.findAll("figure")][0]
                except:
                    image_link = ""
                try:
                    article_link = [a["href"]
                                    for h in article.findAll("header", attrs={"class": "article-header"})
                                    for a in h.findAll("a")][0]
                except:
                    article_link = ""
                try:
                    page = requests.get(article_link)
                    source = BeautifulSoup(page.content)
                    article = source.find("article")
                    full_text = "\n".join([p.text.strip()
                                           for h in article.findAll("div", attrs={"class": "article-entry"})
                                           for p in h.findAll("p")
                                           if not p.has_attr('class')])
                except:
                    full_text = ""
                try:
                    date_time = parser.parse(
                        [" ".join(h.text.strip().split(" ")[-3:])
                         for h in article.findAll("div", attrs={"class": "article-entry"})][0],
                        fuzzy=True)
                except:
                    date_time = "\n".join([" ".join(p.text.strip().split(" ")[-3:])
                                           for h in article.findAll("div", attrs={"class": "article-entry"})
                                           for p in h.findAll("p", attrs={"class": "date"})])
                try:
                    related_articles = "\n".join(list(set(
                        [a["href"]
                         for h in article.findAll("div", attrs={"class": "article-entry"})
                         for p in h.findAll("p")
                         for a in p.findAll("a")])))
                except:
                    related_articles = ""
                try:
                    topics = [p.text.strip()
                              for p in article.findAll("p", attrs={"class": "article-tags"})][0].replace("Topics: ", "")
                except:
                    topics = ""
                upload_article(news_title, date_time, author, full_text, article_link,
                               "https://www.irrawaddy.com/", image_link, cursor, db,
                               news_category, topics)

def mmtimes_main(browser, url, cursor, db):
    category_list, browser = find_categories(url, browser)
    for category_url in category_list:
        try:
            page = browser.get(category_url)
            source = BeautifulSoup(page.content)
        except:
            source = ""
        if source != "":
            article_lst = source.findAll("div", attrs={"class": "views-row"})
            for article in article_lst:
                # per-article fields are read from the article node, not from
                # the page-level soup (which only ever yields the first match)
                try:
                    img_link = article.find("div", attrs={"class": "latest-news-top"}).find("img")["src"]
                except:
                    img_link = ""
                try:
                    article_title = article.find("div", attrs={"class": "news-title"}).text.strip()
                    article_url = url + article.find("div", attrs={"class": "news-title"}).find("a")["href"]
                except:
                    article_title, article_url = "", ""
                try:
                    category = article.find("span", attrs={"class": "news-category"}).text.strip()
                except:
                    category = ""
                try:
                    date_time = article.find("span", attrs={"class": "news-date"}).text.strip()
                except:
                    date_time = ""
                try:
                    # fetch the article page into its own soup so the listing
                    # soup is not clobbered mid-loop
                    page = browser.get(article_url)
                    article_source = BeautifulSoup(page.content)
                    summary = "\n".join([p.text.strip()
                                         for p in article_source.find("div", attrs={"class": "field-item"}).findAll("p")])
                    author = article_source.find("span", attrs={"class": "news-author"}).text.strip()
                except:
                    summary = ""
                    author = ""
                topics = ""
                upload_article(article_title, date_time, author, summary, article_url,
                               url, img_link, cursor, db, category, topics)

def thanlwintimes(browser, url, cursor, db):
    from dateutil import parser as dparser  # hoisted out of the article loop
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content)
        category_list = [a["href"]
                         for ul in source.findAll("ul")
                         if ul.has_attr('id') and "menu-cat-menu-1" in ul["id"]
                         for a in ul.findAll("a")]  # if a["href"] != "/" and a["href"] != "#"
        if len(category_list) > 0:
            category_list = list(set(category_list))
            for cat_url in category_list:
                category = cat_url.replace(url, "").replace("/", ",").replace(
                    "archives,", "").replace("category,", "")
                page = browser.get(cat_url)
                if page.status_code == 200:
                    source = BeautifulSoup(page.content)
                    # pagination: read the last page number from "Page X of Y"
                    pages = [1]
                    pages.extend([int(span.text.split("of ")[-1])
                                  for span in source.findAll("span", attrs={"class": "pages"})])
                    page_count = max(pages)
                    for i in range(page_count + 1):
                        page_url = cat_url + "/page/" + str(i)
                        page = browser.get(page_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content)
                            article_lst = source.findAll("div", attrs={"class": "td-block-span6"})
                            for article in article_lst:
                                try:
                                    article_title_div = article.find("h3", attrs={"class": "td-module-title"})
                                    article_title = article_title_div.text.strip()
                                    article_url = article_title_div.find("a")["href"]
                                except:
                                    article_title, article_url = "", ""
                                try:
                                    author = article.find("span", attrs={"class": "td-post-author-name"}).text.strip()
                                    date_time = article.find("span", attrs={"class": "td-post-date"}).text.strip()
                                    date_time = dparser.parse(date_time, fuzzy=True)
                                except:
                                    author, date_time = "", ""
                                try:
                                    _div = article.find("div", attrs={"class": "td-module-image"})
                                    img_link = _div.find("img")["src"] + "\n"
                                except:
                                    img_link = ""
                                summary = ""
                                if article_url != "":
                                    page = browser.get(article_url)
                                    if page.status_code == 200:
                                        source = BeautifulSoup(page.content)
                                        summary = "\n".join([p.text.strip()
                                                             for p in source.findAll("p")])
                                        # collect article images from the page
                                        # itself; the original referenced an
                                        # undefined summary_div here
                                        img_link += "\n".join([img["src"]
                                                               for img in source.findAll("img")
                                                               if img.has_attr("src")])
                                topics = ""
                                upload_article(article_title, date_time, author, summary,
                                               article_url, url, img_link, cursor, db,
                                               category, topics)

def narinjara_main(browser, url, cursor, db):
    page = browser.get(url)
    if page.status_code == 200:
        source = BeautifulSoup(page.content)
        category = "news"
        # pagination: read the last page number from "Page X of Y"
        pages = [1]
        pages.extend([int(span.text.split("of ")[-1])
                      for span in source.findAll("span", attrs={"class": "pages"})])
        page_count = max(pages)
        for i in range(page_count + 1):
            page_url = url + "?page=" + str(i)
            page = browser.get(page_url)
            if page.status_code == 200:
                source = BeautifulSoup(page.content)
                article_lst = source.findAll("article", attrs={"class": "entry-item"})
                for article in article_lst:
                    article_title, date_time, author, summary, article_url, img_link = "", "", "", "", "", ""
                    try:
                        article_title_div = article.find("h2", attrs={"class": "entry-title"})
                        article_title = article_title_div.text.strip()
                        article_url = "https://burmese.narinjara.com" + article_title_div.find("a")["href"]
                    except:
                        pass
                    try:
                        p_div = article.find("ul", attrs={"class": "entry-meta"})
                        date_time = p_div.find("li", attrs={"class": "entry-date"}).text.strip()
                        author = p_div.find("li", attrs={"class": "entry-author"}).text.strip()
                    except:
                        pass
                    try:
                        _div = article.find("div", attrs={"class": "entry-img"})
                        img_link = "https://burmese.narinjara.com" + _div.find("img")["src"]
                    except:
                        pass
                    try:
                        summary_div = article.find("div", attrs={"class": "entry-content"})
                        summary = summary_div.find("p").text.strip()
                    except:
                        pass
                    if article_url != "":  # skip items with no resolvable link
                        page = browser.get(article_url)
                        if page.status_code == 200:
                            source = BeautifulSoup(page.content)
                            try:
                                article = source.find("article")
                                summary_div = article.find("div", attrs={"class": "entry"})
                                summary += "\n".join([p.text.strip()
                                                      for p in summary_div.findAll("p")])
                            except:
                                pass
                    topics = ""
                    upload_article(article_title, date_time, author, summary,
                                   article_url, url, img_link, cursor, db,
                                   category, topics)

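# A sketch of how these scrapers might be wired together. The MySQL
# credentials, the pymysql choice, and the runner name are assumptions; the
# URLs are the ones the functions above already target.
import pymysql

def run_scrapers_sketch():
    db = pymysql.connect(host="localhost", user="scraper",
                         password="secret", database="news")
    cursor = db.cursor()
    browser = SimpleBrowser()  # requests-style wrapper sketched earlier
    try:
        khonumthung(browser, "https://khonumthung.org/", cursor, db)
        narinjara_main(browser, "https://burmese.narinjara.com", cursor, db)
    finally:
        db.close()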