Example 1
 def getPageHrefs(self, count):
     searchUrl = "http://www.medipana.com/news/news_list_new.asp?Page=" + str(count) + "&MainKind=A&NewsKind=106&vCount=20&vKind=1&sID=&sWord=" + self.query + "&sDate="
     soup = Soup.requests(searchUrl, parser="html.parser")
     searchBox = soup.find_all('a', class_='import_middle_title1')
     regex = re.compile("&sWord=(.*)")
     pageHrefs = ["http://www.medipana.com/" + regex.sub("", x.get('href')[2:]) for x in searchBox]
     return pageHrefs
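All of these snippets call a shared Soup helper that the listing never shows. A minimal sketch of what Soup.requests presumably does, assuming it is a thin wrapper around requests plus BeautifulSoup (the class layout, default parser, and encoding handling are assumptions):

    import requests
    from bs4 import BeautifulSoup

    class Soup:
        @staticmethod
        def requests(url, parser="html.parser", encoding=None):
            # Fetch the page; some of the sites above serve EUC-KR,
            # so an explicit encoding can override requests' guess.
            response = requests.get(url)
            if encoding:
                response.encoding = encoding
            return BeautifulSoup(response.text, parser)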
Example 2
 def getPage(self, url):
     soup = Soup.requests(url)
     articleID = re.sub("\D","",url)
     articleTitle = soup.find('div',class_='art_top').h2.get_text().strip()
     articleDate = re.sub("\D","",soup.find('ul',class_='art_info').contents[3].get_text())[:8]
     articleTxt = soup.find('div',id='news_body_area').get_text(separator="\n").strip()
     return (articleID, articleDate, articleTitle, articleTxt)
Example 3
 def getPageHrefs(self, count):
     # Only searches the Dong-A Ilbo??? What about the other outlets???
     searchUrl = "http://news.donga.com/search?p=" + str(count) + "&query=" + self.query + "&check_news=1&more=1&sorting=1&search_date=1&v1=" + self.startDate + "&v2=" + self.endDate + "&range=1"
     soup = Soup.requests(searchUrl)
     searchBox = soup.find_all("div", class_="searchList")
     pageHrefs = [element.div.a['href'] for element in searchBox]
     return pageHrefs
Example 4
 def getPageCount(self):
     searchUrl = "http://www.hkn24.com/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type="
     soup = Soup.phantomjs(searchUrl)
     pageCount = int(
         re.sub("\D", "",
                soup.find('font', color='#333333').get_text()))
     return math.floor(pageCount / 20)
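List pages that need JavaScript go through Soup.phantomjs instead. A hedged sketch of that half of the helper, assuming it renders the page in a headless browser before parsing; PhantomJS itself is discontinued, so headless Chrome via Selenium stands in here:

    from bs4 import BeautifulSoup
    from selenium import webdriver

    class Soup:
        @staticmethod
        def phantomjs(url, parser="html.parser"):
            # Render the page so JavaScript-generated markup is present,
            # then hand the final DOM to BeautifulSoup.
            options = webdriver.ChromeOptions()
            options.add_argument("--headless")
            driver = webdriver.Chrome(options=options)
            try:
                driver.get(url)
                return BeautifulSoup(driver.page_source, parser)
            finally:
                driver.quit()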
Example 5
 def getPage(self, url):
     soup = Soup.requests(url)
     if soup.find('div', class_='error'):
         print("Error Page : %s" % (url))
         return (0, "error", "error", 0)
     articleDate = re.sub(
         r"\D", "",
         soup.find(
             'div',
             class_='byline').em.next_sibling.next_sibling.get_text())[:8]
     articleID = re.sub(r"\D", "", url)
     articleTitle = soup.find(id='article_title').get_text().strip()
     # emphasis marks, sub captions and other duplicated article content
     etc = soup.find_all('div', class_='ab_subtitle') + soup.find_all(
         'div', class_='ab_related') + soup.find_all(
             'span', class_='rt') + soup.find_all('a') + soup.find_all(
                 'td', class_='pt_8')
     photoCaption = soup.find_all(
         'div', class_='html_photo_center') + soup.find_all(
             'p', class_='caption')
     for element in photoCaption + etc:
         element.decompose()
     articleTxt = soup.find(
         'div', id='article_body').get_text(separator="\n").strip()
     return (articleID, articleDate, articleTitle, articleTxt)
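The pattern in Example 5 recurs in most getPage variants below: find every tag that is not body text, decompose() it so it vanishes from the parse tree, then call get_text() on what remains. A tiny self-contained illustration of the idiom:

    from bs4 import BeautifulSoup

    html = "<div id='article_body'>Body text.<p class='caption'>photo caption</p></div>"
    doc = BeautifulSoup(html, "html.parser")
    for tag in doc.find_all('p', class_='caption'):
        tag.decompose()  # removes the caption from the tree in place
    print(doc.find('div', id='article_body').get_text())  # -> "Body text."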
Example 6
 def getPageCount(self):
     searchUrl = "http://www.khanews.com/news/articleList.html?page=&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type="
     soup = Soup.phantomjs(searchUrl)
     tbodys = soup.find("td", bgcolor="#FFFFFF")
     pageCount = int(
         re.sub("\D", "",
                tbodys.find("font", color="#333333").get_text()))
     return math.floor(pageCount / 20)
Example 7
 def getPageHrefs(self, count):
     count = (count-1)*10
     searchUrl= "http://www.medicaltimes.com/Users4/Search/searchEach.html?nStart="+str(count)+"&KeyWord="+self.query+"&searchType=news"
     soup = Soup.requests(searchUrl,parser='html.parser')
     searchBox=soup.find('table', class_="news_view_contents").find_all('a', style="font-size:11pt; color:darkblue")
     regex= re.compile("&nSection=(.*)")
     pageHrefs=[regex.sub("",x.get('href')) for x in searchBox]
     return pageHrefs
Example 8
 def getPageHrefs(self, count):
     searchUrl = "http://search.joins.com/TotalNews?page=" + str(
         count
     ) + "&Keyword=" + self.query + "&StartSearchDate=" + self.startDate + "&EndSearchDate=" + self.endDate + "&SortType=New&SearchCategoryType=TotalNews&PeriodType=DirectInput&ScopeType=All&ServiceCode=&MasterCode=&SourceGroupType=Joongang&ReporterCode=&ImageType=All&JplusType=All&BlogType=All&ImageSearchType=Image&MatchKeyword=" + self.query + "&IncludeKeyword=&ExcluedeKeyword="
     soup = Soup.requests(searchUrl)
     searchBox = soup.find('ul', class_="list_default").find_all('li')
     pageHrefs = [x.find('a')['href'] for x in searchBox]
     return pageHrefs
Example 9
 def getPageCount(self):
     searchUrl = "http://www.nursenews.co.kr/main/search.asp?SearchStr=" + self.query + "&intPage=1"
     soup = Soup.requests(searchUrl)
     pageCount = int(
         re.sub(
             "\D", "",
             soup.find_all("img",
                           align="absmiddle")[1].parent.get('href')[-3:]))
     return pageCount
Example 10
 def getPageHrefs(self, count):
     searchUrl = "http://www.hkn24.com/news/articleList.html?page=" + str(
         count
     ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&sc_view_code=&view_type="
     soup = Soup.phantomjs(searchUrl)
     searchBox = soup.find_all('td', class_='ArtList_Title')
     pageHrefs = [
         "http://www.hkn24.com/news/" + x.a.get('href') for x in searchBox
     ]
     return pageHrefs
Example 11
 def getPageHrefs(self, count):
     searchUrl = "http://www.nursenews.co.kr/main/search.asp?SearchStr=" + self.query + "&intPage=" + str(
         count)
     soup = Soup.requests(searchUrl)
     searchBox = soup.find('ul', class_='ul_board').find_all('li')
     pageHrefs = [
         "http://www.nursenews.co.kr/main" + x.a.get('href')[1:]
         for x in searchBox
     ]
     return pageHrefs
Example 12
    def getPageCount(self):
        searchUrl = "http://www.medicaltimes.com/Users4/Search/searchEach.html?nStart=1&KeyWord="+self.query+"&searchType=news"
        soup = Soup.requests(searchUrl, parser='html.parser')

        try:
            maxPageCnt = soup.find_all('tr', height='40')[0].contents[1].contents[-1].get('href')
        except AttributeError:
            maxPageCnt = soup.find_all('tr', height='40')[0].find_all('a')[-1].get('href')
        maxPageTag = soup.find_all('img', src="http://image.medicaltimes.com/common/1_57.jpg")[0].parent
        pageCount = int(re.search("nStart=[0-9]*", maxPageTag['href']).group()[7:])
        return math.floor(pageCount / 10)
Example 13
 def getPageHrefs(self, count):
     searchUrl = "http://www.doctorsnews.co.kr/news/articleList.html?page=" + str(
         count
     ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code=&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level=&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate + "&sc_serial_number=&sc_word=" + self.query + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type="
     soupArticle = Soup.phantomjs(searchUrl)
     articleHrefList = soupArticle.find_all("a", class_="news_list_title")
     pageHrefs = [
         "http://www.doctorsnews.co.kr/news/" + x.get('href')
         for x in articleHrefList
     ]
     return pageHrefs
Example 14
 def getPageCount(self):
     page = '1'
     while True:
         searchUrl = "http://www.medipana.com/news/news_list_new.asp?Page=" + page + "&MainKind=A&NewsKind=106&vCount=20&vKind=1&sID=&sWord=" + self.query + "&sDate="
         soup = Soup.requests(searchUrl, parser='html.parser')
         nextButton = soup.find('img', src='../images/paging_next.gif').parent
         if nextButton.name != 'a':
             lastLink = soup.find_all('td', align='center')[1].find_all('a')[-1]['href']
             page = re.sub(r"\D", "", lastLink)[:2]
             break
         page = re.sub(r"\D", "", nextButton['href'])[:2]
     return int(page)
Example 15
 def getPage(self, url):
     soup = Soup.requests(url)
     photoCaption = soup.find('td', id='articleBody').find_all('table')
     for element in photoCaption:
         element.decompose()
     articleTxt = soup.find(
         'td', id='articleBody').get_text(separator="\n").strip()
     articleDate = re.sub("\D", "",
                          soup.find('td', bgcolor="EFEFEF").get_text())[:8]
     articleID = articleDate[2:] + "_" + re.sub("\D", "", url)
     articleTitle = soup.find("td", class_="view_t").get_text().strip()
     return (articleID, articleDate, articleTitle, articleTxt)
Example 16
 def getPageHrefs(self, count):
     searchUrl = "http://www.khanews.com/news/articleList.html?page=" + str(
         count
     ) + "&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=2000.01.01&sc_edate=&sc_serial_number=&sc_word=" + self.query + "&view_type="
     soup = Soup.phantomjs(searchUrl)
     searchBox = soup.find("td",
                           bgcolor="#FFFFFF").find_all("font",
                                                       color="#001DD0")
     pageHrefs = [
         "http://www.khanews.com/news/" + x.parent.get('href')
         for x in searchBox
     ]
     return pageHrefs
Example 17
 def getPage(self, url):
     soup = Soup.requests(url)
     if soup.find('div', id='articleBody') is None:
         articleTxt = "사진 기사"  # "photo article" (no body text)
         articleDate = re.sub(r"\D", "", soup.select("#ND_Warp > table:nth-of-type(2)")[0].get_text())
         articleTitle = soup.select("#ND_Warp > table:nth-of-type(1) > tr > td:nth-of-type(2)")[0].get_text().strip()
     else:
         adTags = soup.find('div', id='articleBody').find_all('table')
         for element in adTags:
             element.decompose()
         articleTxt = soup.find('div', id='articleBody').get_text(separator="\n").strip()
         articleDate = re.sub(r"\D", "", soup.find('div', class_="info").contents[7].get_text())[:8]
         articleTitle = soup.find('span', class_='headline-title').get_text().strip()
     articleID = str(articleDate[2:] + re.sub(r"\D", "", url))
     return (articleID, articleDate, articleTitle, articleTxt)
Example 18
    def getPage(self, url):
        soup = Soup.requests(url)
        articleID = re.sub("\D", "", url)[2:]
        articleDate = re.sub("\D", "",
                             soup.find('div',
                                       class_='View_Time').get_text())[:8]
        articleTitle = soup.find(
            'div', class_='View_Title').strong.get_text().strip()

        adTags = soup.find('div', id='CmAdContent').find_all('table')
        for element in adTags:
            element.decompose()
        articleTxt = soup.find(
            'div', id='CmAdContent').get_text(separator="\n").strip()
        return (articleID, articleDate, articleTitle, articleTxt)
Example 19
    def getPage(self, url):
        soup = Soup.requests(url, parser='html.parser', encoding="euc-kr")
        if not soup.find('body'):
            print("Error url : %s"%(url))
            return (0, "error", "error", 0)

        articleID=re.search("ID=[0-9]*",url).group()[3:]
        articleTitle=soup.find('td',class_='px21 bk fbd lh15 lt').get_text().strip()
        articleDate=re.sub("\D","",soup.select("#html_head > table:nth-of-type(2) > tr:nth-of-type(2) > td:nth-of-type(1) > table > tr > td > div > table:nth-of-type(1) > tr:nth-of-type(6) > td > font")[0].get_text())[:8]

        photoCaption = soup.find('td', id='NEWS_CONTENT').find_all('table')
        for element in photoCaption:
            element.decompose()
        articleTxt=soup.find('td', id='NEWS_CONTENT').get_text(separator="\n").strip()
        return (articleID, articleDate, articleTitle, articleTxt)
Example 20
    def getPage(self, url):
        soup = Soup.requests(url)
        articleID = re.search("idx=[0-9]*", url).group()[4:]
        articleDate = re.search(
            "[12][0-9]{3}-[0-9]{2}-[0-9]{2}",
            soup.find('div', class_='txt_1').get_text()).group()  # need
        articleDate = re.sub("\D", "", articleDate)
        #주석 제거
        for x in soup.find('div', id='neyongID').find_all('span'):
            x.decompose()
        articleTxt = soup.find('div',
                               id='neyongID').get_text(separator="\n").strip()

        titleTag = soup.find('div', class_='bx_board').find('div',
                                                            class_='tit_1')
        articleSubTitle = titleTag.find('div', class_='txt_s')
        articleSubTitle.decompose()
        articleTitle = titleTag.get_text().strip()
        return (articleID, articleDate, articleTitle, articleTxt)
Example 21
    def getPage(self, url):                
        articleID = url[37:-2]
        articleDate = url[28:36] #yyyymmdd
        if not articleDate.isdigit():
            # date error
            return -1
        articleSoup = Soup.requests(url)
        article = articleSoup.find('div', class_='article_txt')
        for script in article.find_all('script'):
            script.extract()

        adTags = article.find_all('div', class_='article_relation') + article.find_all('span', class_='t')
        photoCaption = article.find_all('div', class_='articlePhotoC') + article.find_all('div', class_='articlePhotoB') + article.find_all('div', class_='articlePhotoA')
        notArticleTxt = adTags + photoCaption
        for element in notArticleTxt:
            element.decompose()
        articleTxt = article.get_text(separator="\n").strip()
        articleTitle = articleSoup.find('div', class_='article_title').find(class_='title').get_text().strip()
        return (articleID, articleDate, articleTitle, articleTxt)
Example 22
    def getPage(self, url):
        soup = Soup.requests(url, parser='html.parser')
        articleID = re.search("NewsNum=[0-9]*", url).group()[8:]
        articleDate = re.sub(r"\D", "", soup.find_all('a', class_='plan_1_1')[1].get_text())[:8]
        articleTitle = soup.find('div', class_='detailL').get_text().strip()

        bodyTag = soup.find('div',style="font:굴림; LINE-HEIGHT: 22px;letter-spacing:0px;text-align:justify; ")
        if bodyTag.find('div') is not None:
            articleTxtList = [div.get_text(separator="\n").strip() for div in bodyTag.find_all('div') ]
        else:
            adTags = bodyTag.find_all('table')
            for element in adTags:
                element.decompose()
            articleTxtList = []

        if len(articleTxtList) < 2:
            articleTxt = bodyTag.get_text(separator="\n\n").strip()
        else:
            articleTxt = "\n".join(articleTxtList)
        return (articleID, articleDate, articleTitle, articleTxt)
Example 23
 def getPage(self, url):
     articleID = re.sub("\D", "", url)
     soup = Soup.requests(url)
     if soup.find('td', id='articleBody') is None:
         #본문이 없는 기사
         articleTitle = soup.find('b', class_='title').get_text()
         articleTxt = "사진 기사"
         articleDate = re.sub("\D", "",
                              soup.find('div',
                                        class_='info').get_text())[:8]
     else:
         #광고 제거
         adTags = soup.find('td', id='articleBody').find_all('table')
         for element in adTags:
             element.decompose()
         articleTxt = soup.find(
             'td', id='articleBody').get_text(separator="\n").strip()
         articleDate = re.sub("\D", "",
                              soup.find('td',
                                        class_="WrtTip").get_text())[0:8]
         articleTitle = soup.find("td", id="font_title").get_text().strip()
     return (articleID, articleDate, articleTitle, articleTxt)
Example 24
 def getPageCount(self):
     searchUrl = "http://search.joins.com/TotalNews?page=1&Keyword=" + self.query + "&StartSearchDate=" + self.startDate + "&EndSearchDate=" + self.endDate + "&SortType=New&SearchCategoryType=TotalNews&PeriodType=DirectInput&ScopeType=All&ServiceCode=&MasterCode=&SourceGroupType=Joongang&ReporterCode=&ImageType=All&JplusType=All&BlogType=All&ImageSearchType=Image&MatchKeyword=" + self.query + "&IncludeKeyword=&ExcluedeKeyword="
     soup = Soup.requests(searchUrl)
     pageCount = soup.find('span', class_="total_number").string
     return int(pageCount[2:].split(" ")[0])
Example 25
 def getPageCount(self):
     searchUrl = "http://www.doctorstimes.com/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word="+self.query+"&sc_word2=&sc_andor=&sc_order_by=E&view_type="
     soup = Soup.phantomjs(searchUrl)
     pageCount = int(re.sub(r"\D", "", soup.select("#article-list > tbody > tr > td > table > tbody > tr:nth-of-type(1) > td > table > tbody > tr > td:nth-of-type(1)")[0].get_text()))
     return math.floor(pageCount / 20)
Example 26
 def getPageHrefs(self, count):
     searchUrl = "http://www.doctorstimes.com/news/articleList.html?page="+str(count)+"&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_area=A&sc_level=&sc_article_type=&sc_view_level=&sc_sdate=&sc_edate=&sc_serial_number=&sc_word="+self.query+"&sc_word2=&sc_andor=&sc_order_by=E&view_type="
     soup = Soup.phantomjs(searchUrl,'html.parser')
     searchBox = soup.find_all('td',class_='list-titles list-pad-5')
     pageHrefs = ["http://www.doctorstimes.com/news/"+x.a.get('href') for x in searchBox]
     return pageHrefs
Example 27
 def getPageHrefs(self, count):
     searchUrl = "http://medifonews.com/news/search_result.html?search=" + self.query + "&search_mode=&hash=&s_title=&s_writer_name=&s_body=&s_sdate=" + self.startDate + "&s_edate=" + self.endDate + "&page=" + str(count)
     soup = Soup.requests(searchUrl)
     searchBox = soup.find('ul', class_='art_list_all').find_all('a')
     pageHrefs = ["http://medifonews.com/news/" + x.get('href') for x in searchBox]
     return pageHrefs
Example 28
 def getPageCount(self):
     searchUrl = "http://www.doctorsnews.co.kr/news/articleList.html?page=1&sc_section_code=&sc_sub_section_code=&sc_serial_code=&sc_add_section_code=&sc_add_sub_section_code=&sc_add_serial_code=&sc_area=A&sc_level=&sc_m_level=&sc_article_type=&sc_view_level=&sc_sdate=" + self.startDate + "&sc_edate=" + self.endDate + "&sc_serial_number=&sc_word=" + self.query + "&sc_word2=&sc_andor=OR&sc_order_by=I&view_type="
     soup = Soup.phantomjs(searchUrl)
     articleCnt = soup.find("tr", height="35").td.get_text()
     maxArticle = int(re.sub(r"\D", "", articleCnt))
     return math.floor(maxArticle / 25)
Example 29
 def getPageCount(self):
     searchUrl="http://news.donga.com/search?p=1&query="+self.query+"&check_news=1&more=1&sorting=1&search_date=1&v1="+self.startDate+"&v2="+self.endDate+"&range=1"
     soup = Soup.requests(searchUrl)
     return math.floor(int(re.sub('\D',"", soup.find('div',class_="searchCont").h2.span.text ))//15)
Example 30
 def getPageCount(self):
     searchUrl = "http://medifonews.com/news/search_result.html?search=" + self.query + "&search_mode=&hash=&s_title=&s_writer_name=&s_body=&s_sdate=" + self.startDate + "&s_edate=" + self.endDate + "&page=1"
     soup = Soup.requests(searchUrl)
     maxPageCnt = int(soup.find('i', class_='t02').get_text())
     return math.floor(maxPageCnt / 20)
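Taken together, the three methods form one scraper interface per news site: getPageCount sizes the result list, getPageHrefs collects article URLs from one list page, and getPage extracts a single article. A hedged sketch of a driver loop over any of the classes above (the crawl function and its handling of the error sentinels -1 and (0, "error", "error", 0) are assumptions, not part of the original code):

    def crawl(scraper):
        articles = []
        for page in range(1, scraper.getPageCount() + 1):
            for href in scraper.getPageHrefs(page):
                result = scraper.getPage(href)
                # skip the error sentinels some getPage variants return
                if result == -1 or result[0] == 0:
                    continue
                articles.append(result)
        return articles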