Ejemplo n.º 1
0
 def __init__(self):
     """Open a MySQL connection using the module-level ``mysqlconfig``."""
     try:
         connection = pymysql.connect(**mysqlconfig)
     except Exception as err:
         # Same message text as before, built via lazy logging arguments.
         logging.error('Fatal Error :Mysql connect get an Fatal Error : %s',
                       err)
         return
     self.conn = connection
     logging.info('Mysql connect success')
Ejemplo n.º 2
0
 def delete(self, condition=None):
     """Delete documents matching ``condition``; with no condition, all.

     Args:
         condition: Mongo filter dict. None (the default) deletes every
             document, because {} matches everything.

     Returns:
         The driver's removal result, or 0 when the result is falsy or
         the operation raised.
     """
     # The old guard compared ``condition is dict`` (always False, it is
     # an identity check against the type object) and then ignored the
     # filter entirely by calling remove({}): honour the caller's filter.
     condition = {} if condition is None else condition
     num = None  # stays None (-> returns 0) if remove() raises
     try:
         num = self.collection.remove(condition)
     except Exception as e:
         logging.error('"delete" get an Fatal Error {}'.format(e))
     return num if num else 0
Ejemplo n.º 3
0
 def __init__(self):
     """Connect to MongoDB and bind the eshop/reviews_JD6 collection."""
     try:
         self.mconn = pymongo.MongoClient(mongohost, int(mongoport))
         self.db = self.mconn['eshop']
         self.collection = self.db['reviews_JD6']
     except Exception as err:
         # Lazy %-args produce the same log line as the old % operator.
         logging.error(
             'Fatal Error :Mongodb connect get an Fatal Error : %s', err)
     else:
         logging.info('Mongodb connect success!')
Ejemplo n.º 4
0
 def find(self, condition=None):
     """Query the collection; without a condition, return every document.

     Args:
         condition: Mongo filter dict, or None to match everything.

     Returns:
         A list of matching documents, [] for a falsy cursor, or None
         when the query raised (the error is logged).
     """
     try:
         # A None condition means "match everything" (bare find()); the
         # old code duplicated the whole try/except for each branch.
         result = (self.collection.find() if condition is None
                   else self.collection.find(condition))
     except Exception as e:
         # Include the actual exception so failures are diagnosable.
         logging.error('something wrong with geting a record: %s', e)
         return
     return list(result) if result else []
Ejemplo n.º 5
0
    def __init__(self,
                 goodsname='Dell',
                 keyword='P2317H',
                 client='Rosa',
                 tag=None,
                 debug=False):
        """Set up a JD.com product-review crawler for one search keyword.

        Performs an initial search request during construction (network
        I/O) to detect an empty result page and to capture a Referer URL
        for the paginated search requests.

        Args:
            goodsname: Brand prefix; combined with ``keyword`` to search.
            keyword: Product model keyword; also used to match titles.
            client: Label stored alongside every scraped review.
            tag: Free-form tag kept on the instance.
            debug: When True, crawl methods only print their arguments.
        """
        self.debug = debug
        self.tag = tag
        #self.dbmysql = MysqlPipeline()
        self.client = client
        self.goodsname = goodsname
        self.keyword = self.goodsname + ' ' + keyword
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
        }
        self.baseurl = 'https://search.jd.com'
        self.baseurl2 = 'http://item.jd.com'
        self.searchurl = 'http://search.jd.com/Search'
        self.prefix1 = 'Summary_'
        self.prefix2 = 'Main_'  # prefixes of first-level JSON keys

        proxy = get_one_proxy()
        self.proxy = {'http': proxy, 'https': proxy}
        self.keyword2 = keyword  # used to match product titles
        self.pattern = r'\/(\d+?)\.html'  # extracts the goodid from a URL
        self.urlmoduel = 'https://item.jd.com/{}.html'

        # Used to skip shops/items that were already crawled.
        self.setfilter = set()

        data = {
            'keyword': self.keyword,
            'enc': 'utf-8',
            'wq': self.keyword,
            'pvid': '7e5dd963f7084c468d817cf06a3351dc'
        }
        # print(data)
        # lock becomes False when JD reports "nothing found" for the keyword.
        self.lock = True
        try:
            resp = requests.get(self.searchurl,
                                params=data,
                                headers=self.headers,
                                proxies=self.proxy)
            if '汪~没有找到' in resp.content.decode():
                self.lock = False
            # print(resp.status_code)
        except Exception as e:
            print('Fatal error')
            logging.error('Fatal error:' + self.searchurl + 'downloaded fail')
            self.refer = self.searchurl
        else:
            self.refer = resp.url

        self.switch = True  # set to False once search pagination is done
        self.comment_switch = {}  # comment-pagination switches, keyed by goodid
        # print(self.refer)

        # For testing.
        self.test = []

        # Update mode.
        self.maxpage = 5  # in update mode, read at most 5 comment pages
        self.update_status = False  # update mode is off by default
Ejemplo n.º 6
0
    def comment_detail(self,
                       goodid,
                       page=0,
                       callback=None,
                       meta=None,
                       keyword=None,
                       goodsname=None,
                       client=None):
        """Fetch one page of JD reviews for *goodid* and store them in Mongo.

        Sets ``self.comment_switch[goodid] = False`` when there is nothing
        more to paginate: an empty comment page, a malformed JSONP payload,
        or a review already recorded in the Redis dedup set.

        Args:
            goodid: JD product id whose reviews are fetched.
            page: Zero-based comment page number.
            callback, meta: Unused; kept for interface compatibility.
            keyword, goodsname, client: Optional overrides for the values
                captured in __init__.
        """
        if self.debug:
            return print(goodid, page, keyword, goodsname, client)
        url = 'http://club.jd.com/comment/skuProductPageComments.action'
        # Query string for the JSONP comment endpoint. (The old code built
        # an almost identical dict first and immediately overwrote it.)
        data = {
            'callback': 'fetchJSON_comment98vv762',
            'productId': goodid,
            'score': '0',
            'sortType': '6',  # sorted by time
            'page': page,
            'pageSize': '10',  # at most ten comments per page
            'isShadowSku': '0',
            'rid': '0',
            'fold': '1'
        }
        try:
            proxy = get_one_proxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url,
                                params=data,
                                headers=self.headers,
                                proxies=proxies)
        except Exception as e:
            print('{}'.format(e))
            logging.error('Fatal error:' + url + 'downloaded fail')
            return
        cod = resp.encoding
        result = resp.content.decode(cod)
        # Strip the JSONP wrapper: fetchJSON_...({...});
        reT = r'\w+?\((.*?)\);$'
        res = re.search(reT, result, re.S)
        print(res)  # debugging
        if not res:
            self.comment_switch[goodid] = False
            return
        res = json.loads(res.group(1))
        # dict.get never raises (the old try/except around it protected
        # nothing), but "comments" may be missing or None.
        comments = res.get("comments") or []
        if len(comments) == 0:
            self.comment_switch[goodid] = False
            return

        myresult = []  # final record structure
        for i in comments:
            temp = {}
            temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
            temp['size'] = i.get('productSize', None)
            temp['comment_time'] = i.get('creationTime',
                                         None)  # 2015-09-09 11:35:27
            # Prefer the "show order" text for image extraction ...
            temp['content'] = i.get('showOrderComment', {}).get(
                "content", i.get('content'))
            temp['img'] = re.findall(
                r'http\:\/\/img30\.360buyimg\.com\/shaidan\/jfs\/[\w\/]+?\.jpg',
                temp.get('content') or '')  # content may be None
            if len(temp['img']) == 0:
                temp['img'] = None
            # ... then store the plain comment text.
            temp['content'] = i.get('content')
            temp['website'] = 'JD'
            temp['website_url'] = 'http://www.jd.com'
            temp['type'] = self.keyword if keyword is None else keyword
            temp['client'] = self.client if client is None else client
            temp['score'] = i.get('score', None)
            replies = i.get('replies', None)
            temp['replytime'] = None  # reply time; stays None without replies
            if replies:
                temp['replytime'] = replies[0].get('creationTime', None)
            temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('id', '')))
            temp['goodsname'] = (self.goodsname
                                 if goodsname is None else goodsname)
            norepeat = self.md5('{}{}{}'.format(
                goodid, i.get('id', ''),
                temp['replytime'] if temp['replytime'] else 'null'))
            # sadd returns 0 for an already-seen review: stop paginating.
            if not dbredis.sadd("Reviews_norepeat6", norepeat):
                self.comment_switch[goodid] = False

            myresult.append(temp)
        pipelines = MongodbPipeline()
        try:
            pipelines.insert(myresult)
        except Exception as e:
            # Use str.format: the old "+ e +" raised TypeError (str + Exception)
            # inside the except handler, masking the real error.
            logging.error('insert error {} {} page {}'.format(
                self.keyword, e, page))
Ejemplo n.º 7
0
    def __search(self, page=1, callback=None, meta=None):
        '''
		page:第多少个半页
		'''
        # 解析第一部分 第一部分和第二部分可以合并
        # 解析第二部分
        # 处理分页
        if page >= 200:
            return
        refer = self.refer
        url2 = 'https://search.jd.com/s_new.php'
        headers = {}
        headers['Referer'] = refer
        headers.update(self.headers)
        data2 = {
            'keyword': self.keyword,
            'enc': 'utf-8',
            'qrst': '1',
            'rt': '1',
            'stop': '1',
            'vt': '2',
            'wq': self.keyword,
            'page': page,
            's': (page - 1) * 30 + 1,
            'scrolling': 'y',
            'log_id': time.time(),
            'tpl': '1_M',
        }
        # print('测试') #测试时候使
        try:
            proxy = get_one_proxy()
            proxies = {'http': proxy, 'https': proxy}
            resp = requests.get(url2,
                                params=data2,
                                headers=headers,
                                proxies=proxies)
        except Exception as e:
            logging.error('Fatal error:' + url2 + 'downloaded fail')
            return
        # code = resp.encoding
        logging.info('status code : {}'.format(resp.status_code))
        # print(resp.status_code)
        result = resp.text
        # print(result)
        html = etree.HTML(result)
        items = html.xpath(r'//li[@class = "gl-item"]')
        length = len(items)
        if length == 0:
            self.switch = False
        for item in items:
            temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
            # print(temp_url)
            if len(temp_url) > 0:
                _ = re.findall(self.pattern, temp_url[0])
                if len(_) > 0:
                    url = self.urlmoduel.format(_[0])
                    goodid = _[0]
                    # print(url)
                else:
                    continue
                    pass
            else:
                continue

            # 为了数据完整性,此处需要修改

            res = etree.tostring(item)
            cod = chardet.detect(res).get("encoding")
            res = res.decode(cod)
            # kw = self.keyword.split(' ')
            reT = self.keyword2 + '[a-zA-Z]'
            # print(reT)

            res = re.sub(r'<font.+?>', '', res)
            res = re.sub(r'</font>', '', res)
            tres = etree.HTML(res)
            tres = tres.xpath(r'//a/em/text()')  # 获取标题
            if len(tres):
                res = tres[0]
            else:
                print('空')
                continue

            print(res)
            if re.search(reT, res, re.S):
                logging.info('Invalid Match ')
                # print(goodid,'x')
                continue
            if self.keyword2 not in res:
                continue
            if '显示器' not in res:
                continue
            else:
                logging.info('{}'.format(goodid))
                print(res)
                # print(reT)
                # print(goodid,'okay')
                # continue #测试的时候使用
                if goodid in self.setfilter:  #去掉爬过了的网页
                    continue
                else:
                    self.setfilter.add(goodid)

                print(goodid)  #测试
                callback(goodid=goodid, callback=self.comment_detail)
                '''break # 必须删除,调试的时候使用