def pic_detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    now_year = str(datetime.now().year)
    for data in soup.select(".picList div"):
        # Listing dates wrapped in brackets omit the year; strip the wrapper
        # and prepend the current year before parsing.
        if '(' in data.span.string:
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
        else:
            date_time = data.span.string + ':00'
        pub_timestamp = string_transform_timestamp(date_time)
        # Stop once articles fall outside the crawl window.
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.p.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        title = get_tag_html(news_soup, 'h1')
        tmp_dict['title'] = title
        # Extract the article content from the embedded JS data lines.
        content_list = news_body.split('\n')
        artile_list = list()
        img_list = list()
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        artile = ''
        for em in content_list:
            if '{title:' in em:
                em = em.replace("{title:'", "")
                em = em.replace("',", "")
                artile_list.append(em.strip())
            if 'big_img: ' in em:
                em = em.replace("big_img: '", "")
                em = em.replace("',", "")
                img_title = ''
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(em.strip())
                if status:
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
                    img_list.append([img_title, img_url])
        for a_content in set(artile_list):
            artile += a_content
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['source'] = news
        tmp_dict['pic_mode'] = 1
        self.article_data_list.append(tmp_dict)
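# string_transform_timestamp (imported from the project's utility module) is
# assumed to convert a "YYYY-MM-DD HH:MM:SS" string into a Unix timestamp so it
# can be compared with self.start_timestamp. A minimal standalone sketch of that
# assumption; it is not the project's actual implementation:
import time

def string_transform_timestamp(date_string, fmt='%Y-%m-%d %H:%M:%S'):
    # Parse the formatted string and return seconds since the epoch as a float.
    return time.mktime(time.strptime(date_string, fmt))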
def sina(self, url):
    """
    Crawl Sina Weibo data.
    :param url: URL to crawl data from
    """
    print url
    content = self.get_content(url)
    # Strip the JSONP callback wrapper so the payload can be parsed.
    content = content.replace('try{feedCardJsonpCallback(', '')
    content = content.replace(');}catch(e){};', '')
    content_dict = eval(content)
    data_list = content_dict['result']['data']
    for data in data_list:
        tmp_dict = dict()
        url = data['url'].replace('\\', '')
        ctime = float(data['ctime'])
        # Stop once articles fall outside the crawl window.
        if ctime < self.start_timestamp:
            self.flag = 1
            break
        tmp_dict['ctime'] = ctime
        tmp_dict['source'] = url
        try:
            data_content = self.get_content(url)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(data_content)
        title = get_tag_html(soup, '#main_title')
        tmp_dict['title'] = title.replace('\\', '')
        digest = get_tag_html(soup, '.ellipsis')
        tmp_dict['digest'] = digest
        img_list = list()
        # Extract images.
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        artile = u''
        for img in soup.select("[class~=content] img"):
            img_title = img['alt']
            img_url = img['src']
            # Upload the image to Aliyun OSS.
            status, msg, img_url = upload_img_to_oss2(img_url)
            if status:
                img_list.append([img_title, img_url])
                artile += img_tag.format(img_url=img_url, img_title=img_title)
        # Extract the article content.
        for a in soup.select("[class~=content] p"):
            for string in a.strings:
                artile += u'<p>' + string.strip() + u'</p>'
        # Strip the Sina Entertainment byline and Weibo link markers.
        artile = artile.replace(u'新浪娱乐讯 ', '')
        artile = artile.replace(u'<p>[微博]</p>', '')
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['pic_mode'] = 0
        self.article_data_list.append(tmp_dict)
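# get_tag_html, used throughout these spiders, is assumed to pull the content of
# the first element matching a CSS selector and return it as a string (empty when
# nothing matches). A sketch under that assumption; whether the real helper
# returns plain text or inner HTML is not visible from this section:
def get_tag_html(soup, selector):
    # Return the stripped text of the first matching element, or u'' if none.
    tags = soup.select(selector)
    if not tags:
        return u''
    return tags[0].get_text().strip()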
def detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    for data in soup.select(".box_txt"):
        pub_timestamp = string_transform_timestamp(data.span.string + ':00')
        # Stop once articles fall outside the crawl window.
        if pub_timestamp < self.start_timestamp:
            self.flag = 1
            break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            print traceback.format_exc()
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        title = get_tag_html(news_soup, 'h1')
        tmp_dict['title'] = title
        # Extract the article content.
        artile = ''
        # Extract images.
        img_list = list()
        img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
        for data in news_soup.select("#main_content"):
            img_title = data.span.string if data.span.string else ''
            try:
                img_url = data.p.img['src']
            except Exception as e:
                print traceback.format_exc()
                logger.debug(traceback.format_exc())
                continue
            # Upload the image to Aliyun OSS.
            status, msg, img_url = upload_img_to_oss2(img_url)
            if status:
                img_list.append([img_title, img_url])
                artile += img_tag.format(img_url=img_url, img_title=img_title)
        for a in news_soup.select("#main_content p"):
            for string in a.strings:
                artile += u'<p>' + string.strip() + u'</p>'
        tmp_dict['artile'] = artile
        tmp_dict['img_list'] = img_list
        tmp_dict['source'] = news
        tmp_dict['pic_mode'] = 1
        self.article_data_list.append(tmp_dict)
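# upload_img_to_oss2 is assumed to download an image and re-upload it to Aliyun
# OSS, returning (status, message, new_url) as the callers above expect. A sketch
# under that assumption; the endpoint, bucket name and credentials below are
# placeholders, not values from the project:
import os
import requests
import oss2

_auth = oss2.Auth('<access-key-id>', '<access-key-secret>')
_bucket = oss2.Bucket(_auth, 'http://oss-cn-hangzhou.aliyuncs.com', '<bucket-name>')

def upload_img_to_oss2(img_url):
    try:
        # Download the original image, then push the bytes to OSS under its file name.
        resp = requests.get(img_url, timeout=10)
        resp.raise_for_status()
        key = os.path.basename(img_url.split('?')[0])
        _bucket.put_object(key, resp.content)
        new_url = 'http://<bucket-name>.oss-cn-hangzhou.aliyuncs.com/' + key
        return True, 'ok', new_url
    except Exception as e:
        return False, str(e), img_url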
def pic_main(self):
    for url in self.pic_url_list:
        try:
            content = self.get_content(url)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        soup = BeautifulSoup(content)
        for data in soup.select("#item-list a"):
            tmp_dict = dict()
            news_url = data['href']
            try:
                news_body = self.get_content(news_url)
            except Exception as e:
                logger.debug(traceback.format_exc())
                continue
            news_soup = BeautifulSoup(news_body)
            title = get_tag_html(news_soup, '#contentE h2')
            pub_time = get_tag_html(news_soup, '[class~=timt]')
            # Strip the "日期:" (date) label before parsing.
            pub_time = pub_time.replace(u'日期:', '').strip()
            pub_timestamp = string_transform_timestamp(pub_time + ' 00:00:00')
            # Stop once articles fall outside the crawl window.
            if pub_timestamp < self.start_timestamp:
                self.flag = 1
                break
            tmp_dict['title'] = title
            # Extract the article content.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            # Extract images.
            img_list = list()
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img['alt']
                img_url = img['src'].replace('st', '')
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['source'] = news_url
            self.article_data_list.append(tmp_dict)
    insert_news_to_mysql(self.article_data_list)
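# insert_news_to_mysql is assumed to persist the accumulated article dicts. A
# heavily hedged sketch: the table name, column names and connection settings
# below are guesses based on the dict keys built above, and whether the project
# uses MySQLdb or pymysql is unknown.
import json
import pymysql

def insert_news_to_mysql(article_data_list):
    conn = pymysql.connect(host='127.0.0.1', user='root', password='',
                           db='spider', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            sql = ('INSERT INTO news (title, artile, img_list, source, pic_mode) '
                   'VALUES (%s, %s, %s, %s, %s)')
            for item in article_data_list:
                cursor.execute(sql, (item.get('title'), item.get('artile'),
                                     json.dumps(item.get('img_list', [])),
                                     item.get('source'), item.get('pic_mode', 0)))
        conn.commit()
    finally:
        conn.close()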
def detail_spider(self, url):
    content = self.get_content(url)
    soup = BeautifulSoup(content)
    news_detail_list = list()
    now_year = str(datetime.now().year)
    for data in soup.select("[class~=f14list] li"):
        if data.span:
            # Listing dates omit the year; strip the wrapper characters and
            # prepend the current year before parsing.
            date_time = now_year + '-' + data.span.string[2:-1].replace('/', '-') + ':00'
            date_timestamp = string_transform_timestamp(date_time)
            # Stop once articles fall outside the crawl window.
            if date_timestamp < self.start_timestamp:
                self.flag = 1
                break
        news_detail_list.append(data.a['href'])
    for news in news_detail_list:
        tmp_dict = dict()
        try:
            news_body = self.get_content(news)
        except Exception as e:
            logger.debug(traceback.format_exc())
            continue
        news_soup = BeautifulSoup(news_body)
        if 'pic' not in news:
            # Regular text article page.
            print news
            title = get_tag_html(news_soup, 'h1')
            tmp_dict['title'] = title
            # Extract images.
            img_list = list()
            img_tag = u'<div><img alt="{img_title}" src="{img_url}"><span>{img_title}</span></div>'
            artile = ''
            for img in news_soup.select("#contentText img"):
                img_title = img['alt']
                img_url = img['src']
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
                    artile += img_tag.format(img_url=img_url, img_title=img_title)
            # Extract the article content.
            for a in news_soup.select("#contentText p"):
                for string in a.strings:
                    if '_tvId' not in string:
                        artile += u'<p>' + string.strip() + u'</p>'
            # Strip the Sohu Entertainment byline.
            artile = artile.replace(u'搜狐娱乐讯 ', '')
            tmp_dict['artile'] = artile
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 0
        else:
            # Picture gallery page.
            title = get_tag_html(news_soup, '[class~=ttl]')
            tmp_dict['title'] = title
            # Extract the article content.
            tmp_dict['artile'] = get_tag_html(news_soup, '[class~=explain]')
            # Extract images.
            img_list = list()
            for img in news_soup.select("#picPlayerTab img"):
                img_title = img.get('alt') if img.get('alt') else ''
                img_url = img['src'].replace('st', '')
                # Upload the image to Aliyun OSS.
                status, msg, img_url = upload_img_to_oss2(img_url)
                if status:
                    img_list.append([img_title, img_url])
            tmp_dict['img_list'] = img_list
            tmp_dict['pic_mode'] = 1
        tmp_dict['source'] = news
        self.article_data_list.append(tmp_dict)
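# self.get_content is the spider's fetch method; it is assumed to return the
# response body for a URL and to raise on failure so callers can log and skip
# the item. A standalone sketch of that assumption using requests; the headers
# and timeout are illustrative, not taken from the project:
import requests

def get_content(url):
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    resp.raise_for_status()
    return resp.text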