Example #1
    def parse(self, response):
        if not response.body:
            logger.error(msg='there is no response body, please check it')
            return
        json_object = json.loads(response.body_as_unicode())
        if not json_object:
            logger.error(msg='there is no json object')
            return
        result = json_object.get('result', None)
        if not result:
            return
        item_list = result.get('list')
        if not item_list:
            return

        for tmp_item in item_list:
            item = items.CnbetaspiderItem()
            item['catid'] = tmp_item.get('catid', None)
            item['comments'] = tmp_item.get('comments', None)
            item['counter'] = tmp_item.get('counter', None)
            item['mview'] = tmp_item.get('mview', None)
            item['rate_sum'] = tmp_item.get('rate_sum', None)
            item['source'] = tmp_item.get('source', None)
            item['score'] = tmp_item.get('score', None)
            item['thumb'] = tmp_item.get('thumb', None)
            item['topic'] = tmp_item.get('topic', None)
            item['inputtime'] = tmp_item.get('inputtime', None)
            item['hometext'] = tmp_item.get('hometext', None)
            item['title'] = tmp_item.get('title', None)
            item['url_show'] = 'http://www.cnbeta.com' + tmp_item.get('url_show', '')
            item['crawled_datetime'] = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
            yield item
Example #2
    def parse(self, response):
        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return
        json_object = json.loads(response.body_as_unicode())
        if not json_object:
            logger.error(msg='there is no json object')
            return
        result = json_object.get('result', None)
        if not result:
            return
        item_list = result.get('list')
        if not item_list:
            return

        for tmp_item in item_list:
            item = items.CnbetaspiderItem()
            item['catid'] = tmp_item.get('catid', None)
            item['comments'] = tmp_item.get('comments', None)
            item['counter'] = tmp_item.get('counter', None)
            item['mview'] = tmp_item.get('mview', None)
            item['rate_sum'] = tmp_item.get('rate_sum', None)
            item['source'] = tmp_item.get('source', None)
            item['score'] = tmp_item.get('score', None)
            item['thumb'] = tmp_item.get('thumb', None)
            item['topic'] = tmp_item.get('topic', None)
            item['inputtime'] = tmp_item.get('inputtime', None)
            item['hometext'] = tmp_item.get('hometext', None)
            item['title'] = tmp_item.get('title', None)
            item['url_show'] = 'http://www.cnbeta.com' + tmp_item.get(
                'url_show', '')
            item['crawled_datetime'] = datetime.utcnow().strftime(
                '%Y-%m-%d %H:%M:%S')
            yield item
Example #3
    def parse(self, response):

        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return

        nodes = response.xpath('//div[@class="artlist clearfix"]/DL/DT')
        if not nodes:
            nodes = response.xpath('//div[@class="artlist clearfix"]/dl/dt')

        for node in nodes:
            pubdate = node.xpath('.//span/text()').extract_first()
            pubdate = re.sub('日期:', '', pubdate)
            title = node.xpath('.//a/text()').extract_first()
            url = node.xpath('.//a/@href').extract_first()
            full_url = 'https://www.jb51.net{}'.format(url)
            item = SandboxItem()
            item['pubdate'] = pubdate
            item['url'] = full_url
            item['title'] = title
            item['category'] = self.category
            yield item
Example #4
    def parse(self, response):
        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return

        all_items = Selector(response).xpath('//ul[@class = "item-lists"]/li')

        for i in range(len(all_items)):
            item = items.XianyuItem()
            item['title'] = all_items[i].xpath(
                '//h4[@class ="item-title"]//a[@target = "_blank"]/text()'
            ).extract()[i]
            item['price'] = all_items[i].xpath(
                '//span[@class ="price"]//em/text()').extract()[i]
            item['description'] = all_items[i].xpath(
                '//div[@class = "item-description"]/text()').extract()[i]
            item['pic'] = ("https:" + str(all_items[i].xpath(
                '//div[@class = "item-pic sh-pic120"]//img/@src').extract()[i])
                           ).replace('_120x120', '').strip()
            item['area'] = all_items[i].xpath(
                '//div[@class="seller-location"]/text()').extract()[i]
            item["info"] = "https:" + all_items[i].xpath(
                '//a[@target = "_blank"]/@href').extract()[i]
            yield item
Example #5
    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")

                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            time = issue[0].replace("今天 ", "").replace("\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime("%Y-%m-%d ") + time
                        else:
                            issue_datetime = issue[0].replace("月", "-").replace("日", "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now().strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None

                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    weibo_info["like_count"] = filter(str.isdigit, like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device


                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info
                    # only crawl the first weibo post
                except Exception as e:
                    logger.error(e)
Example #6
    def get_doc_list(self, response):
        """
        Handle pagination, parse doc_id, and return basic case information.
        :param response:
        :return:
        """
        cookie = response.meta['cookie']
        vjkl5 = response.meta['vjkl5']
        Param = response.meta['Param']
        try:
            result = json.loads(json.loads(response.text))
        except BaseException as exc:
            logger.error(exc)
            yield response.request
        else:
            if not response.meta.get('key'):
                format_key_str = result[0]['RunEval'].encode('utf-8')
                key = getkey(format_key_str).encode('utf-8')
                page_count = int(result[0]['Count']) / 20 if int(
                    result[0]['Count']) % 20 == 0 else int(
                        result[0]['Count']) / 20 + 1
                for page in range(2, page_count + 1):
                    data = {
                        'Param': Param,
                        'Index': str(page),
                        'Page': '20',
                        'Order': u'法院层级',
                        'Direction': 'asc',
                        'vl5x': vjkl5
                    }
                    yield scrapy.FormRequest(
                        'http://wenshu.court.gov.cn/List/ListContent',
                        headers={'Cookie': cookie},
                        callback=self.get_doc_list,
                        formdata=data,
                        meta={
                            'cookie': cookie,
                            'vjkl5': vjkl5,
                            'Param': Param,
                            'type_list': [],
                            'key': key
                        })
            else:
                key = response.meta['key']

            for x in result[1:]:
                iid = x[u'文书ID'].encode('utf-8')
                docid = decode_docid(iid, key)
                item = DocInfo()
                item['doc_id'] = docid
                item['doc_name'] = x[u'案件名称']
                item['doc_date'] = x[u'裁判日期']
                yield item
Example #7
 def get_tree_list(self, response):
     """
     Loop over the category counts to find query conditions with fewer than 200 results, then request the case list.
     :param response:
     :return:
     """
     cookie = response.meta['cookie']
     vjkl5 = response.meta['vjkl5']
     try:
         html = json.loads(json.loads(response.text))
     except BaseException as exc:
         logger.error(exc)
         yield response.request
     else:
         for d in html:
             if d['Key'] == response.meta['type_list'][0]:
                 for dd in d['Child']:
                     if not dd['Key'] or not dd['IntValue']:
                         continue
                     Param = response.meta['Param'] + u',{}:{}'.format(
                         d['Key'], dd['Key'])
                     data = {
                         'Param': Param,
                         'Index': '1',
                         'Page': '20',
                         'Order': u'法院层级',
                         'Direction': 'asc',
                         'vl5x': vjkl5
                     }
                     if dd['IntValue'] <= 200 or len(
                             response.meta['type_list']) == 1:
                         yield scrapy.FormRequest(
                             'http://wenshu.court.gov.cn/List/ListContent',
                             headers={'Cookie': cookie},
                             callback=self.get_doc_list,
                             formdata=data,
                             meta={
                                 'cookie': cookie,
                                 'vjkl5': vjkl5,
                                 'Param': Param
                             })
                     else:
                         yield scrapy.FormRequest(
                             'http://wenshu.court.gov.cn/List/TreeContent',
                             headers={'Cookie': cookie},
                             callback=self.get_tree_list,
                             formdata=data,
                             meta={
                                 'cookie': cookie,
                                 'vjkl5': vjkl5,
                                 'Param': Param,
                                 'type_list': response.meta['type_list'][1:]
                             })
Example #8
 def process_social_data(self, item):
     sql = '''
         INSERT INTO tab_social_network (user_id, weibo_count, follows_count, fans_count)
          VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE weibo_count = %s,
          follows_count = %s, fans_count = %s
     '''
     cursor = self.__conn.cursor()
     try:
         cursor.execute(sql, (item["user_id"], item["weibo"], item["follow"],
                              item["fans"], item["weibo"], item["follow"], item["fans"],))
     except Exception as e:
         logger.error("social data insert error %s" % e)
Example #9
 def process_weibo_context(self, item):
     sql = '''
         INSERT INTO tab_context_info (user_id, issue_time, get_time, context, like_count,
         relay_count, comment_count, device) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
     '''
     cursor = self.__conn.cursor()
     try:
         cursor.execute("SET NAMES utf8mb4")
         cursor.execute(sql, (item["user_id"], item["issue_time"], item["get_time"],
                          item["context"], item["like_count"], item["relay_count"],
                          item["comment_count"], item["device"]))
     except Exception as e:
         logger.error("weibo context data insert error %s" % e)
Example #10
    def parse(self, response):
        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return
        html = str(response.body)
        # item = items.jdItem()
        # item['html'] = html
        # yield item

        with open("page.txt", "a") as f:
            f.write(html)
Example #11
    def parse(self, response):
        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return

        all_item_urls = Selector(response).xpath('//div[@class  = "hd"]/a')

        for i in all_item_urls:
            item = items.Top250ItemUrl()
            item['url'] = i.xpath('./@href').extract()[0]

            yield item
Example #12
 def process_weibo_context(self, item):
     sql = '''
         INSERT INTO tab_context_info (user_id, issue_time, get_time, context, like_count,
         relay_count, comment_count, device) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
     '''
     cursor = self.__conn.cursor()
     try:
         cursor.execute("SET NAMES utf8mb4")
         cursor.execute(
             sql, (item["user_id"], item["issue_time"], item["get_time"],
                   item["context"], item["like_count"], item["relay_count"],
                   item["comment_count"], item["device"]))
     except Exception as e:
         logger.error("weibo context data insert error %s" % e)
Example #13
def convert_api(pattarn, tag_name, css_url):
    pattarn = 'uni' + repr(pattarn).strip("'")[-4:]
    pathes = get_all_path()
    # print(pathes)
    if css_url[-10:] not in pathes:
        write_file(css_url, 'w')
    font_url = get_font_url(css_url[-10:], tag_name)
    if font_url[-10:] not in pathes:
        write_file(font_url, 'wb')
    xml_tag_list = font_xml(font_url[-10:])
    # print(pattarn,xml_tag_list)
    if pattarn in xml_tag_list:
        p_index = xml_tag_list.index(pattarn)
        woffs = get_woffs(woff_string)
        print('{}>>{}'.format(pattarn, woffs[p_index]))
        return woffs[p_index]
    else:
        logger.error('parsing failed')
Example #14
 def parse(self, response):
     t1 = time.time()
     html = scrapy.Selector(text=response.text)
     divs = html.css("#content_left  > div .f13 .c-tools::attr(data-tools)")
     for div in divs:
         data_str = div.extract()
         data_dict = json.loads(data_str)
         url = None
         try:
             url = requests.get(data_dict['url'], timeout=5).url
             schame = urllib.parse.urlparse(url).netloc
             sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{schame}',1)"
             self.mysql.excute_sql(sql)
         except Exception as e:
             logger.error(
                 f"requests.get(data_dict['url']).url ===>>> {str(e)}")
     t2 = time.time()
     logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
Example #15
 def process_social_data(self, item):
     sql = '''
         INSERT INTO tab_social_network (user_id, weibo_count, follows_count, fans_count)
          VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE weibo_count = %s,
          follows_count = %s, fans_count = %s
     '''
     cursor = self.__conn.cursor()
     try:
         cursor.execute(sql, (
             item["user_id"],
             item["weibo"],
             item["follow"],
             item["fans"],
             item["weibo"],
             item["follow"],
             item["fans"],
         ))
     except Exception as e:
         logger.error("social data insert error %s" % e)
Example #16
 def process_user_info(self, item):
     sql = '''
         INSERT INTO tab_base_info (user_id, user_name, sex, province, city, birthday, abstract)
         VALUES (%s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE
         user_name = %s, sex = %s, province = %s, city = %s, birthday = %s, abstract = %s
     '''
     cursor = self.__conn.cursor()
     try:
         province = item.get("province", None)
         city = item.get("city", None)
         birthday = item.get("birthday", None)
         abstract = item.get("abstract", None)
         sex = item.get("sex", "-1")
         cursor.execute("SET NAMES utf8mb4")
         cursor.execute(sql, (item["user_id"], item["user_name"], sex, province, city,
             birthday, abstract, item["user_name"], sex, province,
             city, birthday, abstract))
     except Exception as e:
         logger.error("user info data insert error %s" % e)
Example #17
 def process_user_info(self, item):
     sql = '''
         INSERT INTO tab_base_info (user_id, user_name, sex, province, city, birthday, abstract)
         VALUES (%s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE
         user_name = %s, sex = %s, province = %s, city = %s, birthday = %s, abstract = %s
     '''
     cursor = self.__conn.cursor()
     try:
         province = item.get("province", None)
         city = item.get("city", None)
         birthday = item.get("birthday", None)
         abstract = item.get("abstract", None)
         sex = item.get("sex", "-1")
         cursor.execute("SET NAMES utf8mb4")
         cursor.execute(sql,
                        (item["user_id"], item["user_name"], sex, province,
                         city, birthday, abstract, item["user_name"], sex,
                         province, city, birthday, abstract))
     except Exception as e:
         logger.error("user info data insert error %s" % e)
Example #18
    def init(self):
        logger.debug("Initial the database")
        try:
            CREDIT = json.loads(os.environ.get("VCAP_SERVICES"))['cloudantNoSQLDB'][0]['credentials']
            couch = couchdb.Server("https://%s.cloudant.com" %
                                setting.CREDIT['username'])
            couch.resource.credentials = (setting.CREDIT['username'],
                                        setting.CREDIT['password'])
            try:
                db = couch.create(self.db_name)
                logger.debug("Create a new database " + self.db_name)
            except:
                db = couch[self.db_name]
                logger.debug("Using existing database " + self.db_name)

            logger.debug("Create datadase successfully")
            self.__dict__.update(db.__dict__)
        except:
            logger.error('cannot find the credentials, please bind a CloudantDB service')

        return self
Example #19
 def parse(self, response):
     t1 = time.time()
     html = scrapy.Selector(text=response.text)
     divs = html.css("div.results > div")
     for div in divs:
         vrwrap = div.css("div.vrwrap")
         if len(vrwrap) == 0:
             title = "".join(div.css("div.rb h3 a::text").extract())
             url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0]
         else:
             title = "".join(div.css("div.vrwrap h3 a::text").extract())
             url = "https://www.sogou.com" + div.css("div.vrwrap h3 a::attr(href)").extract()[0]
         try:
             _html = scrapy.Selector(text=requests.get(url, verify=False).text)
             url = _html.re("window.location.replace\(\"(.*?)\"\)")[0]
             schame = urllib.parse.urlparse(url).netloc
             sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{schame}',1)"
             self.mysql.excute_sql(sql)
         except Exception as e:
             logger.error(f"requests.get(url).url ===>>> {str(e)}")
         t2 = time.time()
         logger.info(f"executed ===>>> {response.url}, elapsed {str(t2 - t1)}s")
Example #20
    def init(self):
        logger.debug("Initial the database")
        try:
            CREDIT = json.loads(os.environ.get(
                "VCAP_SERVICES"))['cloudantNoSQLDB'][0]['credentials']
            couch = couchdb.Server("https://%s.cloudant.com" %
                                   setting.CREDIT['username'])
            couch.resource.credentials = (setting.CREDIT['username'],
                                          setting.CREDIT['password'])
            try:
                db = couch.create(self.db_name)
                logger.debug("Create a new database " + self.db_name)
            except:
                db = couch[self.db_name]
                logger.debug("Using existing database " + self.db_name)

            logger.debug("Create datadase successfully")
            self.__dict__.update(db.__dict__)
        except:
            logger.error(
                'cannot find the credentials, please bind a CloudantDB service')

        return self
Example #21
    def parse(self, response):
        if not response.body:
            logger.error(
                msg='there is no response body, please check it')
            return

        i = Selector(response)
        item = items.Top250Item()
        item['rank'] = i.xpath(
            '//span[@class = "top250-no"]/text()').extract()[0]
        item['title'] = i.xpath(
            '//span[@property="v:itemreviewed"]/text()').extract()[0]
        item['year'] = i.xpath('//span[@class = "year"]/text()').extract()[0]
        item['area'] = i.xpath(
            '//div[@id = "info"]/br[4]/following-sibling::text()').extract()[1]
        item['rating'] = i.xpath(
            '//strong[@class="ll rating_num"]/text()').extract()[0]
        item['rating_people'] = i.xpath(
            '//span[@property="v:votes"]/text()').extract()[0]
        item['intro'] = i.xpath(
            '//span[@property="v:summary"]/text()').extract()
        item['style'] = i.xpath('//span[@property="v:genre"]/text()').extract()

        yield item
Example #22
 def start_requests(self):
     driver = webdriver.Chrome(
         executable_path="/Users/yuanlang/work/javascript/chromedriver")
     driver.get(
         "https://www.toutiao.com/search/?keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6"
     )
     time.sleep(2)
     for url in self.start_urls:
         for page in range(0, 8):
             driver.get(
                 url=
                 f"{url}&offset={20*page}&timestamp={'%d'%(time.time()*1000)}"
             )
             time.sleep(5)
             html = scrapy.Selector(text=driver.page_source)
             content = html.css("body > pre::text").extract_first()
             data = json.loads(content)["data"]
             for item in data:
                 try:
                     if "article_url" not in item:
                         if "display" not in item:
                             print(item)
                             continue
                         print(item["display"])
                         _url = item["display"]["info"]["url"]
                         title = item["display"]["emphasized"]["title"]
                     else:
                         title = item["abstract"]
                         _url = item["article_url"]
                     schame = urllib.parse.urlparse(_url).netloc
                     sql = f"insert into seed(url,title,site_name,type) values('{_url}','{title}','{schame}',1)"
                     self.mysql.excute_sql(sql)
                 except Exception as e:
                     logger.error(
                         f"insert seed failed ===>>> {str(e)}"
                     )
Example #23
    def parse_weibo_context(self, soup, uid):
        weibo_info = WeiboItem()
        if self.first_flag_home:
            self.first_flag_home = False
            return None
        else:
            contexts = soup.find_all("div", class_="c")
            for item in contexts:
                try:
                    context = item.find("span", class_="ctt")
                    if not context:
                        continue
                    weibo_text = context.text.encode("utf-8", "ignore").replace(" ", "").\
                        replace("\n", "").replace("\xc2\xa0", "").replace("\xF0\x9F\x91\x8A", "").\
                        replace("\xF0\x9F\x91\xBC", "").replace("\xF0\x9F\x8C\xB8\xF0\x9F", "")
                    parent_ele = context.parent.parent
                    like_ele = parent_ele.find(text=re.compile(u"^赞\[\d*\]$"))
                    relay_ele = parent_ele.find(
                        text=re.compile(u"^转发\[\d*\]$"))
                    comment_ele = parent_ele.find(
                        text=re.compile(u"^评论\[\d*\]$"))
                    issue_time_ele = parent_ele.find("span", class_="ct")
                    issue_time = issue_time_ele.text
                    issue_time = issue_time.encode("utf-8")

                    issue = issue_time.split("来自")
                    issue_datetime = ""
                    if len(issue) > 0:
                        if "分钟" in issue[0]:
                            min = filter(str.isdigit, issue[0])
                            t = datetime.datetime.now() - datetime.timedelta(
                                minutes=int(min))
                            issue_datetime = t.strftime("%Y-%m-%d %H:%M:%S")
                        elif "今天" in issue[0]:
                            time = issue[0].replace("今天 ", "").replace(
                                "\xc2\xa0", "")
                            issue_datetime = datetime.datetime.now().strftime(
                                "%Y-%m-%d ") + time
                        else:
                            issue_datetime = issue[0].replace(
                                "月", "-").replace("日",
                                                  "").replace("\xc2\xa0", "")
                            if issue[0].count("-") < 2:
                                issue_datetime = datetime.datetime.now(
                                ).strftime("%Y-") + issue_datetime
                    issue_device = issue[1] if len(issue) > 1 else None

                    weibo_info["context"] = weibo_text
                    weibo_info["user_id"] = uid
                    weibo_info["issue_time"] = issue_datetime.strip()
                    weibo_info["get_time"] = datetime.datetime.now().strftime(
                        "%Y-%m-%d %H:%M:%S")
                    weibo_info["like_count"] = filter(str.isdigit,
                                                      like_ele.encode("utf-8"))
                    weibo_info["relay_count"] = filter(
                        str.isdigit, relay_ele.encode("utf-8"))
                    weibo_info["comment_count"] = filter(
                        str.isdigit, comment_ele.encode("utf-8"))
                    weibo_info["device"] = issue_device

                    # print issue_datetime, issue_device, weibo_text
                    # print like_ele.encode("utf-8"), relay_ele.encode("utf-8"), comment_ele.encode("utf-8")
                    return weibo_info
                    # only crawl the first weibo post
                except Exception as e:
                    logger.error(e)
Example #24
 def parse_error(self, response):
     logger.error("post:%s" % response.url)
Example #25
 def parse_error(self, response):
     logger.error("post:%s" % response.url)
Example #26
def get_char(js):
    all_var = {}
    # obfuscation pattern: no-argument functions that return a constant via if/else
    if_else_no_args_return_constant_function_functions = []
    """
    function zX_() {
            function _z() {
                return '09';
            };
            if (_z() == '09,') {
                return 'zX_';
            } else {
                return _z();
            }
        }
    """
    constant_function_regex4 = re.compile(
        """
        function\s+\w+\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+[\'\"][^\'\"]+[\'\"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"][^\'\"]+[\'\"]\)\s*\{\s*
                return\s*[\'\"][^\'\"]+[\'\"];\s*
            \}\s*else\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*
        \}
        """, re.X)
    l = constant_function_regex4.findall(js)
    for i in l:
        function_name = re.search(
            """
        function\s+(\w+)\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+[\'\"]([^\'\"]+)[\'\"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s*
                return\s*[\'\"]([^\'\"]+)[\'\"];\s*
            \}\s*else\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*
        \}
        """, i, re.X)
        if_else_no_args_return_constant_function_functions.append(
            function_name.groups())
        js = js.replace(i, "")
        # replace throughout the script
        a, b, c, d = function_name.groups()
        all_var["%s()" % a] = d if b == c else b

    # obfuscation pattern: no-argument functions that return the inner function, else a constant
    if_else_no_args_return_function_constant_functions = []
    """
    function wu_() {
            function _w() {
                return 'wu_';
            };
            if (_w() == 'wu__') {
                return _w();
            } else {
                return '5%';
            }
        }
    """
    constant_function_regex5 = re.compile(
        """
        function\s+\w+\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+[\'\"][^\'\"]+[\'\"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"][^\'\"]+[\'\"]\)\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*else\s*\{\s*
                return\s*[\'\"][^\'\"]+[\'\"];\s*
            \}\s*
        \}
        """, re.X)
    l = constant_function_regex5.findall(js)
    for i in l:
        function_name = re.search(
            """
        function\s+(\w+)\(\)\s*\{\s*
            function\s+\w+\(\)\s*\{\s*
                return\s+[\'\"]([^\'\"]+)[\'\"];\s*
            \};\s*
            if\s*\(\w+\(\)\s*==\s*[\'\"]([^\'\"]+)[\'\"]\)\s*\{\s*
                return\s*\w+\(\);\s*
            \}\s*else\s*\{\s*
                return\s*[\'\"]([^\'\"]+)[\'\"];\s*
            \}\s*
        \}
        """, i, re.X)
        if_else_no_args_return_function_constant_functions.append(
            function_name.groups())
        js = js.replace(i, "")
        # replace throughout the script
        a, b, c, d = function_name.groups()
        all_var["%s()" % a] = b if b == c else d

    # var functions whose argument equals the return value
    var_args_equal_value_functions = []
    """
    var ZA_ = function(ZA__) {
            'return ZA_';
            return ZA__;
        };
    """
    constant_function_regex1 = re.compile(
        "var\s+[^=]+=\s*function\(\w+\)\{\s*[\'\"]return\s*\w+\s*[\'\"];\s*return\s+\w+;\s*\};"
    )
    l = constant_function_regex1.findall(js)
    for i in l:
        function_name = re.search("var\s+([^=]+)", i).group(1)
        var_args_equal_value_functions.append(function_name)
        js = js.replace(i, "")
        # replace throughout the script
        a = function_name
        js = re.sub("%s\(([^\)]+)\)" % a, r"\1", js)

    # var no-argument functions that return a constant
    var_no_args_return_constant_functions = []
    """
    var Qh_ = function() {
            'return Qh_';
            return ';';
        };
    """
    constant_function_regex2 = re.compile(
        """
            var\s+[^=]+=\s*function\(\)\{\s*
                [\'\"]return\s*\w+\s*[\'\"];\s*
                return\s+[\'\"][^\'\"]+[\'\"];\s*
                \};
            """, re.X)
    l = constant_function_regex2.findall(js)
    for i in l:
        function_name = re.search(
            """
            var\s+([^=]+)=\s*function\(\)\{\s*
                [\'\"]return\s*\w+\s*[\'\"];\s*
                return\s+[\'\"]([^\'\"]+)[\'\"];\s*
                \};
            """, i, re.X)
        var_no_args_return_constant_functions.append(function_name.groups())
        js = js.replace(i, "")
        # replace throughout the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # no-argument functions that return a constant
    no_args_return_constant_functions = []
    """
    function ZP_() {
            'return ZP_';
            return 'E';
        }
    """
    constant_function_regex3 = re.compile(
        """
            function\s*\w+\(\)\s*\{\s*
                [\'\"]return\s*[^\'\"]+[\'\"];\s*
                return\s*[\'\"][^\'\"]+[\'\"];\s*
            \}\s*
        """, re.X)
    l = constant_function_regex3.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s*(\w+)\(\)\s*\{\s*
                [\'\"]return\s*[^\'\"]+[\'\"];\s*
                return\s*[\'\"]([^\'\"]+)[\'\"];\s*
            \}\s*
        """, i, re.X)
        no_args_return_constant_functions.append(function_name.groups())
        js = js.replace(i, "")
        # replace throughout the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # no-argument functions that return a constant, with no obfuscation code in between
    no_args_return_constant_sample_functions = []
    """
    function do_() {
            return '';
        }
    """
    constant_function_regex3 = re.compile(
        """
            function\s*\w+\(\)\s*\{\s*
                return\s*[\'\"][^\'\"]*[\'\"];\s*
            \}\s*
        """, re.X)
    l = constant_function_regex3.findall(js)
    for i in l:
        function_name = re.search(
            """
            function\s*(\w+)\(\)\s*\{\s*
                return\s*[\'\"]([^\'\"]*)[\'\"];\s*
            \}\s*
        """, i, re.X)
        no_args_return_constant_sample_functions.append(function_name.groups())
        js = js.replace(i, "")
        # replace throughout the script
        a, b = function_name.groups()
        all_var["%s()" % a] = b

    # no-argument constant functions used in string concatenation
    """
    (function() {
                'return sZ_';
                return '1'
            })()
    """
    constant_function_regex6 = re.compile(
        """
            \(function\(\)\s*\{\s*
                [\'\"]return[^\'\"]+[\'\"];\s*
                return\s*[\'\"][^\'\"]*[\'\"];?
            \}\)\(\)
        """, re.X)
    l = constant_function_regex6.findall(js)
    for i in l:
        function_name = re.search(
            """
            \(function\(\)\s*\{\s*
                [\'\"]return[^\'\"]+[\'\"];\s*
                return\s*([\'\"][^\'\"]*[\'\"]);?
            \}\)\(\)
        """, i, re.X)
        js = js.replace(i, function_name.group(1))

    # functions that return their argument, used in string concatenation
    """
    (function(iU__) {
                'return iU_';
                return iU__;
            })('9F')
    """
    constant_function_regex6 = re.compile(
        """
            \(function\(\w+\)\s*\{\s*
                [\'\"]return[^\'\"]+[\'\"];\s*
                return\s*\w+;
            \}\)\([\'\"][^\'\"]*[\'\"]\)
        """, re.X)

    l = constant_function_regex6.findall(js)
    for i in l:
        function_name = re.search(
            """
            \(function\(\w+\)\s*\{\s*
                [\'\"]return[^\'\"]+[\'\"];\s*
                return\s*\w+;
            \}\)\(([\'\"][^\'\"]*[\'\"])\)
        """, i, re.X)
        js = js.replace(i, function_name.group(1))

    # collect all variables
    var_regex = "var\s+(\w+)=(.*?);\s"

    for var_name, var_value in re.findall(var_regex, js):
        var_value = var_value.strip("\'\"").strip()
        if "(" in var_value:
            var_value = ";"
        all_var[var_name] = var_value

    # commented out: this regex could strip essential js statements
    # js = re.sub(var_regex, "", js)

    for var_name, var_value in all_var.items():
        js = js.replace(var_name, var_value)

    js = re.sub("[\s+']", "", js)

    # look for regions dense with percent-encoded bytes such as %E4%B8%AD%E5%80%92%E
    #string_region = re.findall("((?:%\w\w)+)", js)
    string_region = re.findall("((?:%\w\w|[A-Za-z\d])+)", js)
    # deduplicate
    string_region = set(string_region)
    # check whether Chinese characters are present
    chinese_flag = 0
    for string_ in string_region:
        if re.search("%\w\w", string_):
            chinese_flag = 1
    if not chinese_flag:
        # the obfuscated characters may be pure ASCII ... not handled yet
        return []
    string_str = ""
    for string_ in string_region:
        if not re.search("%\w\w", string_):
            continue
        # filter out strings that fail to decode
        try:
            string = unquote(string_)
        except:
            continue
        if len(string_) > len(string_str):
            string_str = string_
    string = unquote(string_str)
    string_list = list(string)
    # search for the index region after the dense string region
    index_m = re.search("([\d,]+(;[\d,]+)+)",
                        js[js.find(string_str) + len(string_str):])
    index_list = index_m.group(1).split(";")

    _word_list = []
    for word_index_list in index_list:
        _word = ""
        if "," in word_index_list:
            word_index_list = word_index_list.split(",")
            word_index_list = [int(x) for x in word_index_list]
        else:
            word_index_list = [int(word_index_list)]

        for word_index in word_index_list:
            try:
                _word += string_list[word_index]
            except IndexError:
                logger.error('IndexError found: {} {} {}'.format(
                    string_list, len(string_list), word_index))
                pass
        _word_list.append(_word)
    return _word_list