def getPageHrefs(self, count): searchUrl = "http://www.medipana.com/news/news_list_new.asp?Page=" + str(count) + "&MainKind=A&NewsKind=106&vCount=20&vKind=1&sID=&sWord=" + self.query + "&sDate=" soup = Soup.requests(searchUrl, parser="html.parser") searchBox = soup.find_all('a', class_='import_middle_title1') regex = re.compile("&sWord=(.*)") pageHrefs = ["http://www.medipana.com/" + regex.sub("", x.get('href')[2:]) for x in searchBox] return pageHrefs
def getPage(self, url):
    soup = Soup.requests(url)
    articleID = re.sub(r"\D", "", url)
    articleTitle = soup.find('div', class_='art_top').h2.get_text().strip()
    articleDate = re.sub(r"\D", "", soup.find('ul', class_='art_info').contents[3].get_text())[:8]
    articleTxt = soup.find('div', id='news_body_area').get_text(separator="\n").strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPageHrefs(self, count):
    # Only returns Dong-A Ilbo results??? What about the other outlets???
    searchUrl = "http://news.donga.com/search?p=" + str(count) + "&query=" + self.query + "&check_news=1&more=1&sorting=1&search_date=1&v1=" + self.startDate + "&v2=" + self.endDate + "&range=1"
    soup = Soup.requests(searchUrl)
    searchBox = soup.find_all("div", class_="searchList")
    pageHrefs = [element.div.a['href'] for element in searchBox]
    return pageHrefs
def getPageCount(self): searchUrl = "http://www.hkn24.com/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type=" soup = Soup.phantomjs(searchUrl) pageCount = int( re.sub("\D", "", soup.find('font', color='#333333').get_text())) return math.floor(pageCount / 20)
def getPage(self, url):
    soup = Soup.requests(url)
    if soup.find('div', class_='error'):
        print("Error Page : %s" % (url))
        return (0, "error", "error", 0)
    articleDate = re.sub(r"\D", "", soup.find('div', class_='byline').em.next_sibling.next_sibling.get_text())[:8]
    articleID = re.sub(r"\D", "", url)
    articleTitle = soup.find("", id='article_title').get_text().strip()
    # emphasis marks, sub-captions, and other elements duplicated in the article body
    etc = (soup.find_all('div', class_='ab_subtitle') + soup.find_all('div', class_='ab_related')
           + soup.find_all('span', class_='rt') + soup.find_all('a') + soup.find_all('td', class_='pt_8'))
    photoCaption = soup.find_all('div', class_='html_photo_center') + soup.find_all('p', class_='caption')
    for element in photoCaption + etc:
        element.decompose()
    articleTxt = soup.find('div', id='article_body').get_text(separator="\n").strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPageCount(self): searchUrl = "http://www.khanews.com/news/articleList.html?page=&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type=" soup = Soup.phantomjs(searchUrl) tbodys = soup.find("td", bgcolor="#FFFFFF") pageCount = int( re.sub("\D", "", tbodys.find("font", color="#333333").get_text())) return math.floor(pageCount / 20)
def getPageHrefs(self, count):
    count = (count - 1) * 10
    searchUrl = "http://www.medicaltimes.com/Users4/Search/searchEach.html?nStart=" + str(count) + "&KeyWord=" + self.query + "&searchType=news"
    soup = Soup.requests(searchUrl, parser='html.parser')
    searchBox = soup.find('table', class_="news_view_contents").find_all('a', style="font-size:11pt; color:darkblue")
    regex = re.compile("&nSection=(.*)")
    pageHrefs = [regex.sub("", x.get('href')) for x in searchBox]
    return pageHrefs
def getPageHrefs(self, count): searchUrl = "http://search.joins.com/TotalNews?page=" + str( count ) + "&Keyword=" + self.query + "&StartSearchDate=" + self.startDate + "&EndSearchDate=" + self.endDate + "&SortType=New&SearchCategoryType=TotalNews&PeriodType=DirectInput&ScopeType=All&ServiceCode=&MasterCode=&SourceGroupType=Joongang&ReporterCode=&ImageType=All&JplusType=All&BlogType=All&ImageSearchType=Image&MatchKeyword=" + self.query + "&IncludeKeyword=&ExcluedeKeyword=" soup = Soup.requests(searchUrl) searchBox = soup.find('ul', class_="list_default").find_all('li') pageHrefs = [x.find('a')['href'] for x in searchBox] return pageHrefs
def getPageCount(self): searchUrl = "http://www.nursenews.co.kr/main/search.asp?SearchStr=" + self.query + "&intPage=1" soup = Soup.requests(searchUrl) pageCount = int( re.sub( "\D", "", soup.find_all("img", align="absmiddle")[1].parent.get('href')[-3:])) return pageCount
def getPageHrefs(self, count): searchUrl = "http://www.hkn24.com/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type=" soup = Soup.phantomjs(searchUrl) searchBox = soup.find_all('td', class_='ArtList_Title') pageHrefs = [ "http://www.hkn24.com/news/" + x.a.get('href') for x in searchBox ] return pageHrefs
def getPageHrefs(self, count): searchUrl = "http://www.nursenews.co.kr/main/search.asp?SearchStr=" + self.query + "&intPage=" + str( count) soup = Soup.requests(searchUrl) searchBox = soup.find('ul', class_='ul_board').find_all('li') pageHrefs = [ "http://www.nursenews.co.kr/main" + x.a.get('href')[1:] for x in searchBox ] return pageHrefs
def getPageCount(self): searchUrl = "http://www.medicaltimes.com/Users4/Search/searchEach.html?nStart=1&KeyWord="+self.query+"&searchType=news" soup = Soup.requests(searchUrl,parser='html.parser') try: maxPageCnt=soup.find_all('tr',height='40')[0].contents[1].contents[-1].get('href') except AttributeError: maxPageCnt=soup.find_all('tr',height='40')[0].find_all('a')[-1].get('href') maxPageTag = soup.find_all('img', src="http://image.medicaltimes.com/common/1_57.jpg")[0].parent pageCount=int(re.search("nStart=[0-9]*",maxPageTag['href']).group()[7:]) return math.floor(pageCount/10)
def getPageHrefs(self, count): searchUrl = "http://www.doctorsnews.co.kr/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code=&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level=&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate + "&sc_serial_number=&sc_word=" + self.query + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type=" soupArticle = Soup.phantomjs(searchUrl) articleHrefList = soupArticle.find_all("a", class_="news_list_title") pageHrefs = [ "http://www.doctorsnews.co.kr/news/" + x.get('href') for x in articleHrefList ] return pageHrefs
def getPageCount(self):
    page = '1'
    # Keep following the "next" paging arrow; once it is no longer a link, read the last page number.
    while 1:
        searchUrl = "http://www.medipana.com/news/news_list_new.asp?Page=" + page + "&MainKind=A&NewsKind=106&vCount=20&vKind=1&sID=&sWord=" + self.query + "&sDate="
        soup = Soup.requests(searchUrl, parser='html.parser')
        nextButton = soup.find('img', src='../images/paging_next.gif').parent
        if nextButton.name != 'a':
            lastLink = soup.find_all('td', align='center')[1].find_all('a')[-1]['href']
            page = re.sub(r"\D", "", lastLink)[:2]
            break
        page = re.sub(r"\D", "", nextButton['href'])[:2]
    return int(page)
def getPage(self, url):
    soup = Soup.requests(url)
    photoCaption = soup.find('td', id='articleBody').find_all('table')
    for element in photoCaption:
        element.decompose()
    articleTxt = soup.find('td', id='articleBody').get_text(separator="\n").strip()
    articleDate = re.sub(r"\D", "", soup.find('td', bgcolor="EFEFEF").get_text())[:8]
    articleID = articleDate[2:] + "_" + re.sub(r"\D", "", url)
    articleTitle = soup.find("td", class_="view_t").get_text().strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPageHrefs(self, count): searchUrl = "http://www.khanews.com/news/articleList.html?page=" + str( count ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type=" soup = Soup.phantomjs(searchUrl) searchBox = soup.find("td", bgcolor="#FFFFFF").find_all("font", color="#001DD0") pageHrefs = [ "http://www.khanews.com/news/" + x.parent.get('href') for x in searchBox ] return pageHrefs
def getPage(self, url):
    soup = Soup.requests(url)
    if soup.find('div', id='articleBody') is None:
        # photo-only article: no body text to extract
        articleTxt = "사진 기사"
        articleDate = re.sub(r"\D", "", soup.select("#ND_Warp > table:nth-of-type(2)")[0].get_text())
        articleTitle = soup.select("#ND_Warp > table:nth-of-type(1) > tr > td:nth-of-type(2)")[0].get_text().strip()
    else:
        adTags = soup.find('div', id='articleBody').find_all('table')
        for element in adTags:
            element.decompose()
        articleTxt = soup.find('div', id='articleBody').get_text(separator="\n").strip()
        articleDate = re.sub(r"\D", "", soup.find('div', class_="info").contents[7].get_text())[:8]
        articleTitle = soup.find('span', class_='headline-title').get_text().strip()
    articleID = str(articleDate[2:] + re.sub(r"\D", "", url))
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    soup = Soup.requests(url)
    articleID = re.sub(r"\D", "", url)[2:]
    articleDate = re.sub(r"\D", "", soup.find('div', class_='View_Time').get_text())[:8]
    articleTitle = soup.find('div', class_='View_Title').strong.get_text().strip()
    adTags = soup.find('div', id='CmAdContent').find_all('table')
    for element in adTags:
        element.decompose()
    articleTxt = soup.find('div', id='CmAdContent').get_text(separator="\n").strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    soup = Soup.requests(url, parser='html.parser', encoding="euc-kr")
    if not soup.find('body'):
        print("Error url : %s" % (url))
        return (0, "error", "error", 0)
    articleID = re.search("ID=[0-9]*", url).group()[3:]
    articleTitle = soup.find('td', class_='px21 bk fbd lh15 lt').get_text().strip()
    articleDate = re.sub(r"\D", "", soup.select("#html_head > table:nth-of-type(2) > tr:nth-of-type(2) > td:nth-of-type(1) > table > tr > td > div > table:nth-of-type(1) > tr:nth-of-type(6) > td > font")[0].get_text())[:8]
    photoCaption = soup.find('td', id='NEWS_CONTENT').find_all('table')
    for element in photoCaption:
        element.decompose()
    articleTxt = soup.find('td', id='NEWS_CONTENT').get_text(separator="\n").strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    soup = Soup.requests(url)
    articleID = re.search("idx=[0-9]*", url).group()[4:]
    articleDate = re.search("[12][0-9]{3}-[0-9]{2}-[0-9]{2}", soup.find('div', class_='txt_1').get_text()).group()
    articleDate = re.sub(r"\D", "", articleDate)  # strip the hyphens so the date is yyyymmdd, like the other scrapers
    for x in soup.find('div', id='neyongID').find_all('span'):
        x.decompose()
    articleTxt = soup.find('div', id='neyongID').get_text(separator="\n").strip()
    titleTag = soup.find('div', class_='bx_board').find('div', class_='tit_1')
    articleSubTitle = titleTag.find('div', class_='txt_s')
    articleSubTitle.decompose()
    articleTitle = titleTag.get_text().strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    articleID = url[37:-2]
    articleDate = url[28:36]  # yyyymmdd
    if articleDate.isdigit() is False:
        # date error
        return -1
    articleSoup = Soup.requests(url)
    article = articleSoup.find('div', class_='article_txt')
    for script in article.find_all('script'):
        script.extract()
    adTags = article.find_all('div', class_='article_relation') + article.find_all('span', class_='t')
    photoCaption = (article.find_all('div', class_='articlePhotoC') + article.find_all('div', class_='articlePhotoB')
                    + article.find_all('div', class_='articlePhotoA'))
    notArticleTxt = adTags + photoCaption
    for element in notArticleTxt:
        element.decompose()
    articleTxt = article.get_text(separator="\n").strip()
    articleTitle = articleSoup.find('div', class_='article_title').find(class_='title').get_text().strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    soup = Soup.requests(url, parser='html.parser')
    articleID = re.search("NewsNum=[0-9]*", url).group()[8:]
    articleDate = re.sub(r"\D", "", soup.find_all('a', class_='plan_1_1')[1].get_text())[:8]
    articleTitle = soup.find('div', class_='detailL').get_text().strip()
    # the body div is only identifiable by its inline style (the font name "굴림" is Gulim)
    bodyTag = soup.find('div', style="font:굴림; LINE-HEIGHT: 22px;letter-spacing:0px;text-align:justify; ")
    if bodyTag.find('div') is not None:
        articleTxtList = [div.get_text(separator="\n").strip() for div in bodyTag.find_all('div')]
    else:
        adTags = bodyTag.find_all('table')
        for element in adTags:
            element.decompose()
        articleTxtList = []
    if len(articleTxtList) < 2:
        articleTxt = bodyTag.get_text(separator="\n\n").strip()
    else:
        articleTxt = "\n".join(articleTxtList)
    return (articleID, articleDate, articleTitle, articleTxt)
def getPage(self, url):
    articleID = re.sub(r"\D", "", url)
    soup = Soup.requests(url)
    if soup.find('td', id='articleBody') is None:
        # article with no body text (photo-only)
        articleTitle = soup.find('b', class_='title').get_text()
        articleTxt = "사진 기사"
        articleDate = re.sub(r"\D", "", soup.find('div', class_='info').get_text())[:8]
    else:
        # strip ad tables before extracting the body
        adTags = soup.find('td', id='articleBody').find_all('table')
        for element in adTags:
            element.decompose()
        articleTxt = soup.find('td', id='articleBody').get_text(separator="\n").strip()
        articleDate = re.sub(r"\D", "", soup.find('td', class_="WrtTip").get_text())[0:8]
        articleTitle = soup.find("td", id="font_title").get_text().strip()
    return (articleID, articleDate, articleTitle, articleTxt)
def getPageCount(self): searchUrl = "http://search.joins.com/TotalNews?page=1&Keyword=" + self.query + "&StartSearchDate=" + self.startDate + "&EndSearchDate=" + self.endDate + "&SortType=New&SearchCategoryType=TotalNews&PeriodType=DirectInput&ScopeType=All&ServiceCode=&MasterCode=&SourceGroupType=Joongang&ReporterCode=&ImageType=All&JplusType=All&BlogType=All&ImageSearchType=Image&MatchKeyword=" + self.query + "&IncludeKeyword=&ExcluedeKeyword=" soup = Soup.requests(searchUrl) pageCount = soup.find('span', class_="total_number").string return int(pageCount[2:].split(" ")[0])
def getPageCount(self): searchUrl = "http://www.doctorstimes.com/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word="+self.query+"&sc_word2=&sc_andor=&sc_order_by=E&view_type=" soup = Soup.phantomjs(searchUrl) pageCount = int(re.sub("\D","",soup.select("#article-list > tbody > tr > td > table > tbody > tr:nth-of-type(1) > td > table > tbody > tr > td:nth-of-type(1)")[0].get_text())) return math.floor(pageCount/20)
def getPageHrefs(self, count): searchUrl = "http://www.doctorstimes.com/news/articleList.html?page="+str(count)+"&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word="+self.query+"&sc_word2=&sc_andor=&sc_order_by=E&view_type=" soup = Soup.phantomjs(searchUrl,'html.parser') searchBox = soup.find_all('td',class_='list-titles list-pad-5') pageHrefs = ["http://www.doctorstimes.com/news/"+x.a.get('href') for x in searchBox] return pageHrefs
def getPageHrefs(self, count): searchUrl = "http://medifonews.com/news/search_result.html?search=" + self.query + "&search_mode=&hash=&s_title=&s_writer_name=&s_body=&s_sdate=" + self.startDate + "&s_edate=" + self.endDate + "&page=" + str(count) soup = Soup.requests(searchUrl) searchBox = soup.find('ul', class_='art_list_all').find_all('a') pageHrefs = ["http://medifonews.com/news/" + x.get('href') for x in searchBox] return pageHrefs
def getPageCount(self): searchUrl = "http://www.doctorsnews.co.kr/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code=&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level=&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate + "&sc_serial_number=&sc_word=" + self.query + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type=" soup = Soup.phantomjs(searchUrl) articleCnt = soup.find("tr", height="35").td.get_text() maxArticle = int(re.sub("\D", "", articleCnt)) return math.floor(int(maxArticle / 25))
def getPageCount(self): searchUrl="http://news.donga.com/search?p=1&query="+self.query+"&check_news=1&more=1&sorting=1&search_date=1&v1="+self.startDate+"&v2="+self.endDate+"&range=1" soup = Soup.requests(searchUrl) return math.floor(int(re.sub('\D',"", soup.find('div',class_="searchCont").h2.span.text ))//15)
def getPageCount(self): searchUrl = "http://medifonews.com/news/search_result.html?search=" + self.query + "&search_mode=&hash=&s_title=&s_writer_name=&s_body=&s_sdate=" + self.startDate + "&s_edate=" + self.endDate + "&page=1" soup = Soup.requests(searchUrl) maxPageCnt = int(soup.find('i',class_='t02').get_text()) return math.floor(maxPageCnt / 20)