Code Example #1
 def parse(self, response):
     soup = bs(response.text, 'html.parser')
     for i in soup.find_all(class_='post-title entry-title'):
         url = i.find('a').get('href')
         self.logger.info(url)
         yield scrapy.Request(url, callback=self.parse_news)
     if soup.find(class_='blog-pager-older-link'):
         next_url = soup.find('a',
                              class_='blog-pager-older-link').get('href')
         pub_time1 = soup.find_all(
             class_='date-header')[-1].find('span').text
         pub_time = time.strftime(
             "%Y-%m-%d %H:%M:%S",
             datetime.datetime(int(pub_time1.split()[-1]),
                               self.month[pub_time1.split()[1]],
                               int(pub_time1.split()[0])).timetuple())
         self.logger.info(pub_time)
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             self.logger.info(next_url)
             yield scrapy.Request(next_url, callback=self.parse)
         else:
             self.logger.info('time cutoff reached')
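
The snippet assumes a self.month lookup that maps English month names to month numbers; it is not shown in the excerpt. A minimal sketch of that assumption, tracing how a date header such as "25 December 2020" becomes the formatted timestamp:

import datetime
import time

# Hypothetical self.month mapping assumed by the example above.
month = {"January": 1, "February": 2, "March": 3, "April": 4,
         "May": 5, "June": 6, "July": 7, "August": 8,
         "September": 9, "October": 10, "November": 11, "December": 12}

tokens = "25 December 2020".split()  # [day, month name, year]
print(time.strftime(
    "%Y-%m-%d %H:%M:%S",
    datetime.datetime(int(tokens[-1]), month[tokens[1]],
                      int(tokens[0])).timetuple()))  # 2020-12-25 00:00:00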
Code Example #2
File: janoduniya.py Project: gitzdx/crawler
 def parse(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.select('article'):
         # read per-article fields from the article node, not the whole page
         pub_time = Util.format_time2(i.select_one('.updated').text)
         response.meta['title'] = i.select_one('h2.entry-title a').text
         response.meta['pub_time'] = pub_time
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             yield Request(url=i.select_one('a').get('href'),
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('time cutoff reached')
     if flag:
         try:
             nextPage = soup.select_one('.nav-previous a').get('href')
             yield Request(nextPage,
                           meta=response.meta,
                           callback=self.parse)
         except Exception:
             self.logger.info('Next page no more')
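
The cutoff test self.time is None or Util.format_time3(pub_time) >= int(self.time) recurs in nearly every example below. As a sketch only (within_cutoff is a hypothetical helper; Util.format_time3 is the projects' own string-to-timestamp converter), the pattern could be factored out:

def within_cutoff(pub_time, cutoff):
    # Keep crawling when no cutoff is set, or when the article's
    # timestamp is at or after the configured cutoff timestamp.
    if cutoff is None:
        return True
    return Util.format_time3(pub_time) >= int(cutoff)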
Code Example #3
File: pressnote.py Project: ldqsss/crawler
 def parse_essay(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.find_all(
             class_='fbt-col-lg-12 col-md-4 col-xs-6 padding-reset'):
         # read per-article fields from the entry node, not the whole page
         tt = i.select_one('.post-info').text.split()
         pub_time = Util.format_time2(tt[1] + ' ' + tt[0] + ' ' + tt[2])
         response.meta['title'] = i.select_one('.post-content').text
         response.meta['pub_time'] = pub_time
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             yield Request(url=i.select_one('.post-content a').get('href'),
                           meta=response.meta,
                           callback=self.parse_item)
         else:
             flag = False
             self.logger.info('time cutoff reached')
     if flag:
         try:
             if soup.find_all(
                     class_='NavigationButton')[-1].get('value') == 'Next':
                 nextPage = soup.find_all(
                     class_='NavigationButton')[-1].get('onclick').replace(
                         "window.location='", '')[:-2]
                 if re.match('http', nextPage):
                     yield Request(nextPage,
                                   meta=response.meta,
                                   callback=self.parse_essay)
                 else:
                     nextPage = 'https://www.pressnote.in/' + nextPage
                     yield Request(nextPage,
                                   meta=response.meta,
                                   callback=self.parse_essay)
         except Exception:
             self.logger.info('Next page no more')
Code Example #4
File: tempo.py Project: ldqsss/crawler
 def parse_essay(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     for i in soup.select('#container > div')[1:-2]:  # the articles on each page
         url = i.select_one('a').get('href')
         try:
             pub_time = i.select_one('.entryDate').text if i.select_one(
                 '.entryDate').text else i.select_one('.meta_date').text
         except Exception:
             continue
         if self.time is None or Util.format_time3(
                 Util.format_time2(pub_time)) >= int(self.time):
             yield scrapy.Request(url, callback=self.parse_item)
         else:
             flag = False
             self.logger.info('time cutoff reached')
             break
     if flag:
         if soup.select('.pagi-next'):
             yield Request(soup.select('.pagi-next')[0].attrs['href'],
                           callback=self.parse_essay)
         else:
             for i in soup.select('.numbered-pagination a'):
                 yield Request(i.attrs['href'], callback=self.parse_essay)
Code Example #5
File: khulasaa.py Project: ldqsss/crawler
 def parse_category1(self, response):
     soup = BeautifulSoup(response.text, features="lxml")
     news_url = []
     news_content = soup.select(".o-article .k-list-sec .allBox ul li")
     for n in news_content:
         news_url.append(n.find("a").get("href"))
     for url in news_url:
         yield scrapy.Request(url, callback=self.parse_detail)
     next_page = soup.select_one(".o-listing .pagination a").get(
         "href") if soup.select_one(".o-listing .pagination a") else None
     LastTimeStamp = Util.format_time3(
         str(
             khulasaa_time_switch(
                 BeautifulSoup(
                     requests.get(news_url[-1]).text,
                     features="lxml").select_one(
                         ".author-disc .date .author span").text)))
     if next_page:
         if self.time is None or LastTimeStamp >= int(self.time):
             yield scrapy.Request(next_page, callback=self.parse_category1)
         else:
             self.logger.info("$$$time cutoff reached$$$")
     else:
         self.logger.info("$$$reached the last page$$$")
Code Example #6
File: WHO.py Project: ldqsss/crawler
 def parse_item(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     item = DemoItem()
     try:
         pub_time = self._4matTime(soup.select_one('.timestamp').text)
     except Exception:
         pub_time = Util.format_time(0)
     if self.time is None or Util.format_time3(pub_time) > int(
             self.time):  # time cutoff; otherwise every url would be traversed without stopping
         item['pub_time'] = pub_time
         item['title'] = soup.select('.active')[-1].text.strip()
         item['category1'] = response.url.split('/')[3]
         item['category2'] = response.url.split(
             '/')[5] if response.url.split('/')[3] == 'myanmar' else 'news'
         item['body'] = soup.select_one('article').text.strip()
         item['abstract'] = soup.select_one('article').text.strip().split(
             '\n')[0]
         item['images'] = [
             i.get('src') for i in soup.select_one('section').select('img')
         ]
         return item
     else:
         self.logger.info('time cutoff reached')
         self.stopCount += 1
Code Example #7
    def parse_page(self, response):
        soup = bs(response.text, "html.parser")
        if soup.find(class_="post-outer") != None:
            for i in soup.find_all(class_="post-title entry-title"):
                news_url = i.find("a").get("href")
                yield scrapy.Request(news_url,
                                     callback=self.parse_news,
                                     meta=response.meta)

            pub = soup.find_all(class_="published timeago")[-1].text.strip()
            time = parse.quote(
                soup.find_all(class_="published timeago")[-1].get("title"))
            if self.time is None or Util.format_time3(
                    Util.format_time2(pub)) >= int(self.time):
                url = response.meta[
                    "url"] + "?updated-max={}&max-results=8#PageNo={}"
                response.meta["p"] += 1
                yield scrapy.Request(url.format(updated_max, response.meta["p"]),
                                     callback=self.parse_page,
                                     meta=response.meta)
            else:
                self.logger.info('time cutoff reached')
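
Blogger archives paginate through an updated-max query parameter carrying the timestamp of the last post shown, which is why the example URL-quotes the title attribute of the last .published.timeago element. A sketch of building one such page URL (the blog address and timestamp are illustrative):

from urllib import parse

base = "https://example.blogspot.com/search"
last_seen = "2020-12-25T08:00:00+05:30"  # title attr of the last post's timestamp
print(base + "?updated-max={}&max-results=8#PageNo={}".format(
    parse.quote(last_seen), 2))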
Code Example #8
File: tehelkahindi.py Project: Doglen/crawler
    def get_next_page(self, response):
        soup = bs(response.text, "html.parser")
        item = response.meta["item"]
        a_list = soup.find_all("a", class_="td-image-wrap")
        for a in a_list:
            yield scrapy.Request(a.get("href"),
                                 callback=self.get_news_detail,
                                 meta={"item": item})  # 层与层之间通过meta参数传递数据

        # pagination: check once per page, after the article loop
        if self.time is None or Util.format_time3(
                Util.format_time2(
                    soup.find_all(
                        "time", class_="entry-date updated td-module-date")
                    [-1].text)) >= int(self.time):
            next_url = soup.find(
                "div", class_="page-nav td-pb-padding-side"
            ).select("a")[-1].get("href") if soup.find(
                "div", class_="page-nav td-pb-padding-side") else None
            if next_url:
                yield scrapy.Request(next_url,
                                     meta=response.meta,
                                     callback=self.get_next_page)
        else:
            self.logger.info('time cutoff reached')
Code Example #9
File: tv9hindi.py Project: gitzdx/crawler
 def parse_category1(self, response):
     soup = BeautifulSoup(response.text, features="lxml")
     news_url = []
     if re.match(r"\S+/page/\d+$", response.url):
         # https://www.tv9hindi.com/india/page/2
         news_content = soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing").select(
             "li h3 a")
         for li in news_content:
             news_url.append(li.get("href"))
     else:
         top_content = soup.select_one(".newsTop9  .topNewscomp ul").find_all("h3", class_="h3")
         for h3 in top_content:
             news_url.append(h3.find("a").get("href"))
         news_content = soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing").select(
             "li h3 a") if soup.find_all("div", class_="newsTop9")[-1].find("div", class_="col2 ComListing") else None
         for li in news_content:
             news_url.append(li.get("href"))
     for url in news_url:
         yield scrapy.Request(url, callback=self.parse_detail)
     last_timeStamp = Util.format_time3(
         Util.format_time2(
             soup.find_all("div", class_="col2 ComListing")[-1].find_all(
                 "div", class_="catTime flex")[-1].find("span").text.strip()))
     next_page = soup.find("a", class_="next page-numbers").get("href") if soup.find("a",
                                                                                     class_="next page-numbers") else None
     pattern_pingback = '<!--<link rel="pingback" href="https://www.tv9hindi.com/xmlrpc.php">-->'  # 判断是否到达最后一页
     next_soup = BeautifulSoup(requests.get(next_page).text, features="lxml")
     if next_page:
         if self.time is None:
             if int(next_page.rsplit("/", 1)[-1]) <= 50:
                 yield scrapy.Request(next_page, callback=self.parse_category1)
             else:
                 pass
         elif last_timeStamp >= int(self.time):
             yield scrapy.Request(next_page, callback=self.parse_category1)
         else:
             self.logger.info("time cutoff reached")
     else:
         self.logger.info("this category has ended")
Code Example #10
 def parse_2(self, response):
     page_soup = BeautifulSoup(response.text, 'lxml')
     if len(page_soup.select('article.article_content div.tag-content-left > a')):
         news_list = page_soup.select(
             'article.article_content div.tag-content-left > a')  # the link tag of each news item
         for i in news_list:
             news_url = 'https://hindi.mykhel.com' + i.attrs['href']
             yield Request(news_url, callback=self.parse_3)
         next_links = page_soup.select(
             'section div.prev-next-story.clearfix.click-for-more a.next.half_width')
         if next_links and next_links[0].get('href'):
             next_page_url = 'https://hindi.mykhel.com/' + next_links[0].get('href')
             # get the url of the last news item on the next page
             last_time_url = 'https://hindi.mykhel.com' + BeautifulSoup(
                 requests.get(next_page_url).text, 'lxml').select(
                     'article.article_content div.tag-content-left > a')[-1].attrs['href']
             # take the publication time of that last news item
             last_time = time_font(BeautifulSoup(
                 requests.get(last_time_url).text, 'lxml').select(
                     'div.os-breadcrumb div.os-posted-by time')[0].get('datetime'))
             if self.time is None or Util.format_time3(last_time) >= int(
                     self.time):  # cutoff check
                 if len(page_soup.select(
                         'section div.prev-next-story.clearfix.click-for-more a')):
                     yield Request(next_page_url, callback=self.parse_2)
             else:
                 self.logger.info('time cutoff reached')
     elif len(page_soup.select('div.os-sports-m-news.clearfix div.os-more.clearfix a')):
         list_url = 'https://hindi.mykhel.com' + page_soup.select('div.os-sports-m-news.clearfix div.os-more.clearfix a')[0].get('href')
         yield Request(list_url, callback=self.parse_2)
Code Example #11
 def parse_category(self, response):
     soup = BeautifulSoup(response.text, features="lxml")
     category = soup.find(
         'div', class_="box-shadow-block box-shadow-1 text-center"
     ).text if soup.find(
         'div', class_="box-shadow-block box-shadow-1 text-center") else None
     articles = soup.find_all(
         'div', class_="col-md-6 col-sm-6 col-xs-12") if soup.find_all(
             'div', class_="col-md-6 col-sm-6 col-xs-12") else []
     article_hrefs = []
     for article in articles:
         if article.select_one('a').get('href') == "Nation.html":
             article_hrefs.append(article.select('a')[-1].get('href'))
         else:
             article_hrefs.append(article.select_one('a').get('href'))
     for detail_url in article_hrefs:
         # fetch the detail page to read its date; hand BeautifulSoup the raw
         # bytes (.content) rather than the Response object itself
         check_soup = BeautifulSoup(requests.get(detail_url).content, features="lxml")
         if check_soup.select_one('div.date_and_author_container span').text.split(" ")[1]:
             temp_time = check_soup.select_one('div.date_and_author_container span').text.split(" ")[1]
         else:
             temp_time = check_soup.select_one('td.miscinfo').text.split(" ")[1]
         adjusted_time = time_adjustment(temp_time)
         self.logger.info("当前时间:"+adjusted_time+"$$$$$$$$$$$$$$")
         if self.time is None or Util.format_time3(adjusted_time) >= int(self.time):
             yield Request(detail_url, callback=self.parse_detail, meta={'category': category})
         else:
             self.logger.info("时间截止")
             break
Code Example #12
def headlinehindi_time_switch1(time_string):
    # input like: 2020-12-23T17:50:27+05:30
    # returns a timestamp
    time_string = time_string.rsplit("+", 1)[0]
    return Util.format_time3(
        str(datetime.strptime(time_string, "%Y-%m-%dT%H:%M:%S")))
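
Since Python 3.7, datetime.fromisoformat parses the UTC offset directly, so the rsplit("+", 1) trim could be dropped; a sketch of an equivalent (Util.format_time3 stays the project-internal converter):

from datetime import datetime

def headlinehindi_time_switch1_iso(time_string):
    # "2020-12-23T17:50:27+05:30" parses as an offset-aware datetime.
    dt = datetime.fromisoformat(time_string)
    return Util.format_time3(str(dt.replace(tzinfo=None)))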
Code Example #13
File: inextlive.py Project: Chenxuanqi666/crawler
 def parse_1(self, response):
     page_soup = BeautifulSoup(response.text, 'lxml')
     # if the page can be paginated
     if len(
             page_soup.select(
                 'article.topicBox div.pagination.border0 li.last')):
         this_page_url = response.url
         j = 0
         # j limits how many pages are crawled
         while j < 3 and this_page_url:
             this_page_soup = BeautifulSoup(
                 requests.get(this_page_url).text, 'lxml')
             news_url_list = this_page_soup.select('ul.topicList > li > a')
             last_new_url = news_url_list[-1].get('href')
             last_time = time_font(
                 BeautifulSoup(
                     requests.get(last_new_url).text, 'lxml').select(
                         'div.articleHd div.dateInfo span.fl')[0].text)
             if self.time is None or Util.format_time3(last_time) >= int(
                     self.time):
                 for news_url in news_url_list:
                     yield Request(news_url.get('href'),
                                   callback=self.parse_2)
             else:
                 self.logger.info('time cutoff reached')
                 break
             j = j + 1
             # advance via the next-page link of the page just parsed,
             # instead of reusing the first page's link every iteration
             next_links = this_page_soup.select(
                 'article.topicBox div.pagination.border0 li.last > a')
             this_page_url = next_links[0].get('href') if next_links else None
     elif len(
             page_soup.select(
                 'article.topicBox div.newsFJagran div.pagination.border0 ul > li'
             )):
         self.logger.info('special pagination on this page: ' + response.url)
         j = 1
         this_page_url = response.url + '/' + str(j)
         while j < 3:
             this_page_soup = BeautifulSoup(
                 requests.get(this_page_url).text, 'lxml')
             news_url_list = this_page_soup.select('ul.topicList > li > a')
             try:
                 last_new_url = news_url_list[-1].get('href')
             except IndexError:
                 self.logger.info(response.url)
                 break
             last_time = time_font(
                 BeautifulSoup(
                     requests.get(last_new_url).text, 'lxml').select(
                         'div.articleHd div.dateInfo span.fl')[0].text)
             if self.time is None or Util.format_time3(last_time) >= int(
                     self.time):
                 for news_url in news_url_list:
                     yield Request(news_url.get('href'), callback=self.parse_2)
             else:
                 self.logger.info('time cutoff reached')
                 break
             j = j + 1
             this_page_url = response.url + '/' + str(j)
     else:
         news_url_list = page_soup.select('ul.topicList > li > a')
         for news_url in news_url_list:
             last_time = time_font(
                 BeautifulSoup(
                     requests.get(news_url.get('href')).text,
                     'lxml').select('div.articleHd div.dateInfo span.fl')
                 [0].text)
             if self.time is None or Util.format_time3(last_time) >= int(
                     self.time):
                 yield Request(news_url.get('href'), callback=self.parse_2)
Code Example #14
File: doh_spider.py Project: gitzdx/crawler
 def parse_news_list(self, response):
     home_url = 'https://doh.gov.ph/'
     time2 = ''
     soup = BeautifulSoup(response.text, "html.parser")
     news_list = soup.select("div.panel>div>div.view-content>div")
     # the news list
     for news in news_list:
         # publication date and time
         date = news.find("span",
                          class_="field-content content-time").text.strip()
         dtime = " 00:00:00"
         # map the month name and rebuild the date as "YYYY-MM-DD HH:MM:SS"
         month_map = {
             "January": "01", "February": "02", "March": "03",
             "April": "04", "May": "05", "June": "06",
             "July": "07", "August": "08", "September": "09",
             "October": "10", "November": "11", "December": "12"
         }
         pub_time_list = re.split(" |,", date) if date else None
         if pub_time_list and pub_time_list[0] in month_map:
             time2 = (pub_time_list[-1] + "-" + month_map[pub_time_list[0]] +
                      "-" + pub_time_list[1] + dtime)
         response.meta['pub_time'] = time2
         # the article's url
         url = urljoin(home_url, news.find("a").get("href"))
         yield scrapy.Request(url,
                              meta=response.meta,
                              callback=self.parse_news)
     # pagination
     next_page = "https://doh.gov.ph/" + soup.select_one(
         "li.pager-next>a").get("href") if soup.select_one(
             "li.pager-next>a") else None
     if self.time is None or (time2 and
                              Util.format_time3(time2) >= int(self.time)):
         if next_page:
             yield scrapy.Request(next_page,
                                  meta=response.meta,
                                  callback=self.parse_news_list)
     else:
         self.logger.info('time cutoff reached')
Code Example #15
File: philboxing.py Project: gitzdx/crawler
 def parse_news_list(self, response):
     soup = BeautifulSoup(response.text, "html.parser")
     # the first news item on each page
     web = soup.find("td", {
         "valign": "top"
     }).select_one("td>font>a").text.strip() if soup.find(
         "td", {
             "valign": "top"
         }).select_one("td>font>a").text else None
     if web and web == "PhilBoxing.com":
         url = soup.find("td", {
             "valign": "top"
         }).select_one("td>a").get("href") if soup.find(
             "td", {
                 "valign": "top"
             }).select_one("td>a").get("href") else None
         abstract = soup.find("td", {
             "valign": "top"
         }).select_one("td>font.newsblurb").text.strip().split(
             "\r\n\r\n") if soup.find("td", {
                 "valign": "top"
             }).select_one("td>font.newsblurb").text else None
         response.meta["abstract"] = ' '.join(
             abstract) if abstract else None
         if url:
             yield scrapy.Request(url,
                                  meta=response.meta,
                                  callback=self.parse_news)
     # the remaining news items after the first
     table = soup.find("table", {
         "width": "100%",
         "height": "100%"
     }) if soup.find("table", {
         "width": "100%",
         "height": "100%"
     }) else None
     p = table.select("p")[2] if table and table.select("p")[2] else None
     web_list = p.select("p>font>a") if p and p.select("p>font>a") else None
     news_list = p.select("p>a") if p and p.select("p>a") else None
     abstract_list = p.select(
         "p>font.newsblurb") if p and p.select("p>font.newsblurb") else None
     i = 0
     if web_list:
         for web in web_list:
             if web.text.strip() == "PhilBoxing.com":
                 url = news_list[2 *
                                 i].get("href") if news_list and news_list[
                                     2 * i].get("href") else None
                 abstract = abstract_list[i].text.strip().split(
                     "\r\n\r\n"
                 ) if abstract_list and abstract_list[i].text else None
                 response.meta["abstract"] = ' '.join(
                     abstract) if abstract else None
                 if url:
                     yield scrapy.Request(url,
                                          meta=response.meta,
                                          callback=self.parse_news)
                 i += 1
             else:
                 i += 1
     # pagination
     time_list = p.find_all("font", {"size": "2"})[-1].text.split(" ")
     if time_list:
         month_map = {
             "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
             "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
             "Sept": "09", "Oct": "10", "Nov": "11", "Dec": "12"
         }  # this site abbreviates September as "Sept"
         month = month_map.get(time_list[-2])
         time = (time_list[-1] + "-" + month + "-" + time_list[-3] +
                 " 00:00:00") if month else None
         if time and (self.time is None
                      or Util.format_time3(time) >= int(self.time)):
             font_list = soup.select("font.boxertablebody") if soup.select(
                 "font.boxertablebody") else None
             a_list = font_list[-1].select(
                 "a") if font_list and font_list[-1].select("a") else None
             next_page = "http://philboxing.com/news/" + a_list[0].get(
                 "href") if a_list and a_list[0].get("href") else None
             if next_page:
                 yield scrapy.Request(next_page,
                                      meta=response.meta,
                                      callback=self.parse_news_list)
         else:
             self.logger.info('time cutoff reached')
Code Example #16
    def parse(self, response, **kwargs):
        header = {
            'Accept': 'application/json,text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '11',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'PHPSESSID=h2q86fctchauhq3ngeg8cu2ld7',
            'Host': 'www.macaupostdaily.com',
            'Origin': 'https://www.macaupostdaily.com',
            'Referer': 'https://www.macaupostdaily.com/',
            'sec-ch-ua':
            'Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            'sec-ch-ua-mobile': '?0',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        url = 'https://www.macaupostdaily.com/'
        url_list = []
        time_list = []
        title_list = []
        img_list = []

        news_soup = BeautifulSoup(response.text, 'lxml')
        for i in news_soup.find('ul', class_='new_list',
                                id='fu').find_all('li'):
            url_list.append('https://www.macaupostdaily.com' +
                            i.find('a').get('href'))
            time_list.append(
                i.find('div', class_='time').text.strip('\n').strip(' ') +
                ":00")
            title_list.append(i.find('strong').text.strip('\n'))
            img_list.append(url + i.find('img').get('src'))

        request_url = 'https://www.macaupostdaily.com/index.php/Article/news_list'

        # subsequent pages are fetched via POST
        i = 2
        while True:
            data = {'cid': '', 'page': "%d" % i}

            rep = requests.post(url=request_url, data=data,
                                headers=header).json()
            for entry in rep['list']:
                url_list.append("https://www.macaupostdaily.com/article" +
                                entry['id'] + ".html")
                title_list.append(entry['title'])
                time_list.append(entry['time'] + ":00")
                img_list.append('https://www.macaupostdaily.com' + entry['img'])
            for new in range(0, len(url_list)):
                if self.time is None or Util.format_time3(
                        time_list[new]) >= int(self.time):
                    yield Request(url_list[new],
                                  callback=self.parse_2,
                                  meta={
                                      'time': time_list[new],
                                      'title': title_list[new],
                                      'img': img_list[new]
                                  })
            # stop paging once the oldest item on this page is past the cutoff;
            # with no cutoff set, the loop keeps paging as the original did
            if self.time is not None and Util.format_time3(
                    time_list[-1]) < int(self.time):
                break
            url_list = []
            time_list = []
            img_list = []
            title_list = []
            i = i + 1
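
The POST pagination above can be exercised on its own; a minimal sketch of the request the loop issues (endpoint and payload are taken from the example; whether the API still answers without the session cookie is untested):

import requests

rep = requests.post(
    "https://www.macaupostdaily.com/index.php/Article/news_list",
    data={"cid": "", "page": "2"},
    headers={"X-Requested-With": "XMLHttpRequest"}).json()
for entry in rep.get("list", []):
    print(entry["id"], entry["time"], entry["title"])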
Code Example #17
File: nayalook.py Project: gitzdx/crawler
def nayalook_time_switch1_2(time_string):
    # 3 days ago
    # returns a timestamp
    return Util.format_time3(str(Util.format_time2(time_string)))
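
Util.format_time2 is project-internal; judging from call sites like this one, it normalizes relative phrases such as "3 days ago" into an absolute datetime string. A minimal hypothetical stand-in for that single case:

from datetime import datetime, timedelta

def relative_to_datetime(time_string):
    # Hypothetical equivalent of Util.format_time2 for "N days ago" input.
    n = int(time_string.split()[0])
    return str(datetime.now() - timedelta(days=n))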
Code Example #18
File: nayalook.py Project: gitzdx/crawler
def nayalook_time_switch1(time_string):
    # 30/11/2020
    # returns a timestamp
    return Util.format_time3(str(datetime.strptime(time_string, "%d/%m/%Y")))
Code Example #19
 def parse_2(self, response, **kawargs):
     page = BeautifulSoup(response.text, 'lxml')
     category1 = page.find('h1', class_='page-title').text
     if page.find('ul', id='posts-container', class_='posts-items') is not None:
         for i in page.find('ul',
                            id='posts-container',
                            class_='posts-items').find_all(
                                'a', class_='post-thumb'):
             images = i.find('img').get('data-src')
             yield Request(i.attrs['href'],
                           callback=self.parse_3,
                           meta={
                               'images': images,
                               'category1': category1
                           })
     else:
         for i in page.find(
                 'div', class_='masonry-grid-wrapper masonry-with-spaces'
         ).find_all('div', class_='featured-area'):
             images = i.find('img').get('data-src')
             yield Request(i.find('a').get('href'),
                           callback=self.parse_3,
                           meta={
                               'images': images,
                               'category1': category1
                           })
     # check whether there is a next page to crawl
     if page.find('span', class_='last-page first-last-pages') is not None:
         next_page = page.find(
             'span',
             class_='last-page first-last-pages').find('a').attrs['href']
         if page.find('div', class_='year-month') is not None:
             time = (page.find('div', class_='year-month').find('em')
                     .text.strip('-').strip(' ') + ' ' +
                     page.find('div', class_='mag-box-container clearfix')
                     .find_all('div', class_='day-month')[-1].text)
             pub_time = time_font_2(time)
         elif page.find(
                 'div',
                 class_='masonry-grid-wrapper masonry-with-spaces') is not None:
             pub_time = time_font(
                 page.find(
                     'div',
                     class_='masonry-grid-wrapper masonry-with-spaces').
                 find_all('span',
                          class_='date meta-item tie-icon')[-1].text)
         elif page.find('ul', id='posts-container',
                        class_='posts-items') is not None:
             pub_time = time_font(
                 page.find('ul', id='posts-container',
                           class_='posts-items').find_all(
                               'span',
                               class_='date meta-item tie-icon')[-1].text)
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             yield Request(next_page, callback=self.parse_2)
     # this is the second type of second-level listing
     elif page.find('li', class_='the-next-page') is not None:
         next_page = page.find(
             'li', class_='the-next-page').find('a').attrs['href']
         if page.find('div', class_='year-month') is not None:
             time = (page.find('div', class_='year-month').find('em')
                     .text.strip('-').strip(' ') + ' ' +
                     page.find('div', class_='mag-box-container clearfix')
                     .find_all('div', class_='day-month')[-1].text)
             pub_time = time_font_2(time)
         elif page.find(
                 'div',
                 class_='masonry-grid-wrapper masonry-with-spaces') is not None:
             pub_time = time_font(
                 page.find(
                     'div',
                     class_='masonry-grid-wrapper masonry-with-spaces').
                 find_all('span',
                          class_='date meta-item tie-icon')[-1].text)
         elif page.find('ul', id='posts-container',
                        class_='posts-items') is not None:
             pub_time = time_font(
                 page.find('ul', id='posts-container',
                           class_='posts-items').find_all(
                               'span',
                               class_='date meta-item tie-icon')[-1].text)
         # run the cutoff check whichever branch set pub_time, as in the block above
         if self.time is None or Util.format_time3(pub_time) >= int(
                 self.time):
             yield Request(next_page, callback=self.parse_2)
Code Example #20
def nhandan_time_switch1(time_string):
    # 2020年12月25日 星期五
    # returns a timestamp
    time_string = time_string.rsplit(" ", 1)[0]
    return Util.format_time3(str(datetime.strptime(time_string, "%Y年%m月%d日")))
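
strptime matches the CJK characters in "%Y年%m月%d日" literally, so only the trailing weekday needs to be stripped first; a quick illustrative check:

from datetime import datetime

s = "2020年12月25日 星期五".rsplit(" ", 1)[0]  # drop the weekday ("Friday")
print(datetime.strptime(s, "%Y年%m月%d日"))  # 2020-12-25 00:00:00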
Code Example #21
    def parse_news(self, response):
        item = DemoItem()
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]
        response1 = response.text.replace('<br>', ' ')
        soup = BeautifulSoup(response1, "html.parser")
        # time
        temp = soup.select_one("div.title_text") if soup.select_one(
            "div.title_text") else None
        pub_time_list = re.split(
            " |,", temp.select_one("p").text) if temp and temp.select_one(
                "p") else None
        # token layout: day at [-6], month at [-5], year at [-4], HH:MM at [-2]
        month_map = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
            "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
            "Sept": "09", "Oct": "10", "Nov": "11", "Dec": "12"
        }  # this site abbreviates September as "Sept"
        if pub_time_list and pub_time_list[-5] in month_map:
            time = (pub_time_list[-4] + "-" + month_map[pub_time_list[-5]] +
                    "-" + pub_time_list[-6] + " " + pub_time_list[-2] + ":00")
            item["pub_time"] = time

            if self.time is None or Util.format_time3(time) >= int(self.time):
                # title
                item["title"] = temp.find("a").text.strip() if temp.find(
                    "a").text else None
                # abstract and body
                body = []
                temp_list = soup.select_one("div.detail_text").find_all(
                    "p") if soup.select_one("div.detail_text").find_all(
                        "p") else None
                if temp_list:
                    for temp in temp_list:
                        body.append(temp.text.strip())
                    item["abstract"] = body[0]
                    item["body"] = "\n".join(body)
                else:
                    item["abstract"] = None
                    item["body"] = None
                # images
                images = []
                image_list = soup.select("div.article_image") if soup.select(
                    "div.article_image") else None
                if image_list:
                    for image in image_list:
                        images.append(image.find("img").get("src"))
                item["images"] = images
                yield item
            else:
                self.logger.info('time cutoff reached')
Code Example #22
 def parse_eassys(self, response):  # pagination and article-url crawling for the second-level categories
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     if re.match(r'.*photo-gallery.*', response.url):  # photo galleries
         for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
             try:
                 url = 'https://zeenews.india.com' + t.select_one('a').get(
                     'href')
             except Exception:
                 continue
             response.meta['title'] = t.select_one('h3').text
             response.meta['images'] = [t.select_one('img').get('src')]
             response.meta['pub_time'] = t.select_one(
                 '.photo-date').text.strip()
             if self.time is None or Util.format_time3(
                     Util.format_time2(
                         t.select_one('.photo-date').text.strip())) >= int(
                             self.time):
                 yield Request(url,
                               callback=self.parse_item_photo,
                               meta=response.meta)
             else:
                 flag = False
                 self.logger.info('time cutoff reached')
                 break
     elif re.match(r'.*video.*', response.url):  # videos
         for i in soup.find_all(
                 attrs={'class': 'mini-video mini-video-h margin-bt30px'
                        }):  # the initial articles in this listing
             url = 'https://zeenews.india.com' + i.select_one('a').get(
                 'href')
             response.meta['images'] = [i.select_one('img').get('src')]
             response.meta['title'] = i.select_one('h3').text
             response.meta['pub_time'] = i.select_one('.date').text.strip()
             if self.time is None or Util.format_time3(
                     Util.format_time2(
                         i.select_one('span.date').text.strip())) >= int(
                             self.time):
                 yield Request(url,
                               callback=self.parse_item_video,
                               meta=response.meta)
             else:
                 flag = False
                 self.logger.info('time cutoff reached')
                 break
     else:
         for t in soup.find_all(
                 class_='section-article margin-bt30px clearfix'
         ):  # the initial articles in this listing
             url = 'https://zeenews.india.com' + t.select_one('a').get(
                 'href')
             response.meta['title'] = t.select_one('h3.margin-bt10px').text
             tt = t.select_one('span.date').text.strip().split()
             try:
                 pub_time = self.hindi_month[tt[0]] + ' ' + tt[
                     1] + ' ' + tt[2] + ' ' + tt[3] + ' ' + tt[5]
             except Exception:
                 pub_time = t.select_one('span.date').text.strip()
             response.meta['pub_time'] = pub_time
             response.meta['images'] = [t.select_one('img').get('src')]
             if self.time is None or Util.format_time3(
                     Util.format_time2(pub_time)) >= int(self.time):
                 yield Request(url=url,
                               meta=response.meta,
                               callback=self.parse_item)
             else:
                 flag = False
                 self.logger.info('time cutoff reached')
                 break
     if flag:
         try:
             nextPage = 'https://zeenews.india.com/' + soup.find(
                 class_='next last').select_one('a').get('href')
             yield Request(nextPage,
                           callback=self.parse_eassys,
                           meta=response.meta)
         except Exception:
             self.logger.info('Next page no more!')
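
self.hindi_month is assumed to map Hindi month names to English ones so that Util.format_time2 can parse the rebuilt string; a sketch of that assumption (the real spider's keys may differ):

# Hypothetical self.hindi_month mapping assumed by the example above.
hindi_month = {
    "जनवरी": "January", "फरवरी": "February", "मार्च": "March",
    "अप्रैल": "April", "मई": "May", "जून": "June",
    "जुलाई": "July", "अगस्त": "August", "सितंबर": "September",
    "अक्टूबर": "October", "नवंबर": "November", "दिसंबर": "December"
}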
Code Example #23
File: dfa.py Project: Doglen/crawler
 def parse_news_list(self, response):
     soup = BeautifulSoup(response.text, "html.parser")
     news_list = soup.select("tbody>tr") if soup.select("tbody>tr") else []
     time2 = None
     for news in news_list:
         url = "https://dfa.gov.ph" + news.select_one("a").get("href")
         pub_time_list = news.find(
             "td",
             class_="list-date small").text.strip().split(" ") if news.find(
                 "td", class_="list-date small") else None
         # map the month name and rebuild the date as "YYYY-MM-DD 00:00:00"
         month_map = {
             "January": "01", "February": "02", "March": "03",
             "April": "04", "May": "05", "June": "06",
             "July": "07", "August": "08", "September": "09",
             "October": "10", "November": "11", "December": "12"
         }
         if pub_time_list and pub_time_list[1] in month_map:
             time2 = (pub_time_list[2] + "-" + month_map[pub_time_list[1]] +
                      "-" + pub_time_list[0] + " 00:00:00")
         response.meta["pub_time"] = time2
         yield scrapy.Request(url,
                              meta=response.meta,
                              callback=self.parse_news)
     next_page = "https://dfa.gov.ph" + soup.select_one(
         "li.pagination-next>a").get("href") if soup.select_one(
             "li.pagination-next>a") else None
     if self.time is None or (time2 and
                              Util.format_time3(time2) >= int(self.time)):
         if next_page:
             yield scrapy.Request(next_page,
                                  meta=response.meta,
                                  callback=self.parse_news_list)
     else:
         self.logger.info('time cutoff reached')
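
The month tables in examples #14, #15, #21 and #23 could live in one shared helper; a possible consolidation (not from the original projects):

MONTHS = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]

def month_num(name):
    # Accepts full names and the abbreviations these sites use ("Jan", "Sept", ...).
    key = name.rstrip(".")[:3]
    for i, m in enumerate(MONTHS, start=1):
        if m.startswith(key):
            return "%02d" % i
    return None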