def __init__(self):
    try:
        # print(mysqlconfig)
        self.conn = pymysql.connect(**mysqlconfig)
    except Exception as e:
        logging.error('Fatal Error: MySQL connect got a fatal error: %s' % e)
    else:
        logging.info('MySQL connect success')
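# A minimal sketch of what `mysqlconfig` is assumed to look like. The real dict
# lives elsewhere in this project; the keys below are just standard
# pymysql.connect() keyword arguments with placeholder values:
#
#   mysqlconfig = {
#       'host': '127.0.0.1',
#       'port': 3306,
#       'user': 'root',
#       'password': 'secret',
#       'db': 'eshop',
#       'charset': 'utf8mb4',
#   }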
def delete(self, condition=None):
    '''Delete documents matching condition; with no condition, every record is removed.'''
    condition = condition if isinstance(condition, dict) else {}
    try:
        result = self.collection.remove(condition)
    except Exception as e:
        logging.error('"delete" got a fatal error: {}'.format(e))
        return 0
    # remove() returns a write-result document such as {'n': 3, 'ok': 1.0}
    return result.get('n', 0) if result else 0
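# Hypothetical usage sketch for delete(); the pipeline variable and the filter
# key below are illustrative, not taken from this file:
#
#   pipeline = MongodbPipeline()
#   pipeline.delete({'score': 1})  # remove only one-star reviews
#   pipeline.delete()              # no condition: empties the collection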
def __init__(self):
    try:
        self.mconn = pymongo.MongoClient(mongohost, int(mongoport))
        self.db = self.mconn['eshop']
        self.collection = self.db['reviews_JD6']
    except Exception as e:
        logging.error('Fatal Error: MongoDB connect got a fatal error: %s' % e)
    else:
        logging.info('MongoDB connect success!')
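# `mongohost` and `mongoport` are assumed to be imported from a project settings
# module; a sketch of the expected shape, with placeholder values:
#
#   mongohost = '127.0.0.1'
#   mongoport = 27017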
def find(self, condition=None):
    '''Return matching documents as a list; with no condition, return everything.'''
    try:
        result = self.collection.find(condition if condition else {})
    except Exception as e:
        logging.error('something went wrong while getting a record: {}'.format(e))
        return []
    return list(result)
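# Hedged usage sketch for find(); the condition is an ordinary pymongo filter
# document, and the field names come from the records built in comment_detail():
#
#   pipeline = MongodbPipeline()
#   all_reviews = pipeline.find()
#   good_jd = pipeline.find({'website': 'JD', 'score': {'$gte': 4}})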
def __init__(self, goodsname='Dell', keyword='P2317H', client='Rosa', tag=None, debug=False):
    self.debug = debug
    self.tag = tag
    # self.dbmysql = MysqlPipeline()
    self.client = client
    self.goodsname = goodsname
    self.keyword = self.goodsname + ' ' + keyword
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'
    }
    self.baseurl = 'https://search.jd.com'
    self.baseurl2 = 'http://item.jd.com'
    self.searchurl = 'http://search.jd.com/Search'
    self.prefix1 = 'Summary_'
    self.prefix2 = 'Main_'  # keys of the first-level JSON
    proxy = get_one_proxy()
    self.proxy = {'http': proxy, 'https': proxy}
    self.keyword2 = keyword  # used for title matching
    self.pattern = r'\/(\d+?)\.html'  # extract the goodid from a URL
    self.urlmoduel = 'https://item.jd.com/{}.html'
    # used to skip shops that have already been crawled
    self.setfilter = set()
    data = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'wq': self.keyword,
        'pvid': '7e5dd963f7084c468d817cf06a3351dc'
    }
    # print(data)
    self.lock = True
    try:
        resp = requests.get(self.searchurl, params=data,
                            headers=self.headers, proxies=self.proxy)
        if '汪~没有找到' in resp.content.decode():  # JD's "nothing found" page marker
            self.lock = False
        # print(resp.status_code)
    except Exception as e:
        print('Fatal error')
        logging.error('Fatal error: ' + self.searchurl + ' download failed')
        self.refer = self.searchurl
    else:
        self.refer = resp.url
    self.switch = True  # set to False once search pagination is exhausted
    self.comment_switch = {}  # per-product comment pagination switch, keyed by goodid
    # print(self.refer)  # debugging
    self.test = []
    # update mode: read at most 5 pages of comments
    self.maxpage = 5
    self.update_status = False  # update mode is off by default
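# A sketch of how this spider is presumably constructed; `JDSpider` is a
# hypothetical name for the class these methods belong to, since it is not
# shown here. The constructor fires the first search request immediately, so a
# live proxy from get_one_proxy() is assumed:
#
#   spider = JDSpider(goodsname='Dell', keyword='P2317H', client='Rosa')
#   if not spider.lock:  # JD returned the "nothing found" page
#       ...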
def comment_detail(self, goodid, page=0, callback=None, meta=None, keyword=None, goodsname=None, client=None):
    if self.debug:
        print(goodid, page, keyword, goodsname, client)
        return
    # url = self.urlmoduel.format(goodid)
    # url = 'http://sclub.jd.com/comment/productPageComments.action'
    url = 'http://club.jd.com/comment/skuProductPageComments.action'
    # parse the detail page and fetch the comment feed
    data = {
        'callback': 'fetchJSON_comment98vv762',
        'productId': goodid,
        'score': '0',
        'sortType': '6',   # sort by time
        'page': page,
        'pageSize': '10',  # at most ten comments per page
        'isShadowSku': '0',
        'rid': '0',
        'fold': '1'
    }
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url, params=data, headers=self.headers, proxies=proxies)
    except Exception as e:
        print('{}'.format(e))
        logging.error('Fatal error: ' + url + ' download failed')
        return
    cod = resp.encoding
    result = resp.content.decode(cod)
    reT = r'\w+?\((.*?)\);$'  # strip the JSONP wrapper
    res = re.search(reT, result, re.S)
    print(res)  # debugging
    if res:
        res = json.loads(res.group(1))
        try:
            comments = res.get("comments")
        except Exception as e:
            logging.error('comment_detail error: {}'.format(e))
            return
        if len(comments) == 0:
            self.comment_switch[goodid] = False
            return
        myresult = []  # the final data structure
        for i in comments:
            temp = {}
            temp['crawltime'] = datetime.utcnow().strftime('%Y-%m-%d')
            temp['size'] = i.get('productSize', None)
            temp['comment_time'] = i.get('creationTime', None)  # e.g. 2015-09-09 11:35:27
            # prefer the follow-up ("show order") text for image extraction; used for plotting later
            temp['content'] = i.get('showOrderComment', {}).get("content", i.get('content'))
            # e.g. http://img30.360buyimg.com/shaidan/jfs/t23899/69/1404782488/83204/3b210e9c/5b5ef8f1N3d24d6b6.jpg
            temp['img'] = re.findall(
                r'http\:\/\/img30\.360buyimg\.com\/shaidan\/jfs\/[\w\/]+?\.jpg',
                temp.get('content', ''))
            if len(temp['img']) == 0:
                temp['img'] = None
            temp['content'] = i.get('content')
            temp['website'] = 'JD'
            temp['website_url'] = 'http://www.jd.com'
            temp['type'] = self.keyword if keyword is None else keyword
            temp['client'] = self.client if client is None else client
            temp['score'] = i.get('score', None)
            replies = i.get('replies', None)
            temp['replytime'] = None  # reply time
            if replies:
                try:
                    temp['replytime'] = replies[0].get('creationTime', None)
                except IndexError:
                    temp['replytime'] = None
            temp['md5_id'] = self.md5('{}{}'.format(goodid, i.get('id', '')))
            temp['goodsname'] = self.goodsname if goodsname is None else goodsname
            norepeat = self.md5('{}{}{}'.format(
                goodid, i.get('id', ''),
                temp['replytime'] if temp['replytime'] else 'null'))
            if not dbredis.sadd("Reviews_norepeat6", norepeat):
                # already seen: stop paging for this product
                self.comment_switch[goodid] = False
                # break
            myresult.append(temp)
        pipelines = MongodbPipeline()
        try:
            # print(myresult)
            pipelines.insert(myresult)
        except Exception as e:
            logging.error('insert error {} {} page {}'.format(self.keyword, e, page))
    else:
        self.comment_switch[goodid] = False
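# The endpoint answers with JSONP (`fetchJSON_comment98vv762({...});`), which is
# why the body is unwrapped with a regex before json.loads(). A self-contained
# sketch of that step:
#
#   import json, re
#   raw = 'fetchJSON_comment98vv762({"comments": []});'
#   m = re.search(r'\w+?\((.*?)\);$', raw, re.S)
#   payload = json.loads(m.group(1)) if m else None  # -> {'comments': []}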
def __search(self, page=1, callback=None, meta=None):
    '''page: index of the half page (JD renders 30 items per half page)'''
    # parse both halves of a result page (they could be merged) and handle pagination
    if page >= 200:
        return
    refer = self.refer
    url2 = 'https://search.jd.com/s_new.php'
    headers = {}
    headers['Referer'] = refer
    headers.update(self.headers)
    data2 = {
        'keyword': self.keyword,
        'enc': 'utf-8',
        'qrst': '1',
        'rt': '1',
        'stop': '1',
        'vt': '2',
        'wq': self.keyword,
        'page': page,
        's': (page - 1) * 30 + 1,
        'scrolling': 'y',
        'log_id': time.time(),
        'tpl': '1_M',
    }
    try:
        proxy = get_one_proxy()
        proxies = {'http': proxy, 'https': proxy}
        resp = requests.get(url2, params=data2, headers=headers, proxies=proxies)
    except Exception as e:
        logging.error('Fatal error: ' + url2 + ' download failed')
        return
    logging.info('status code: {}'.format(resp.status_code))
    result = resp.text
    html = etree.HTML(result)
    items = html.xpath(r'//li[@class = "gl-item"]')
    if len(items) == 0:
        self.switch = False
    for item in items:
        temp_url = item.xpath(r'.//div[@class="p-img"]/a/@href')
        if len(temp_url) > 0:
            _ = re.findall(self.pattern, temp_url[0])
            if len(_) > 0:
                url = self.urlmoduel.format(_[0])
                goodid = _[0]
            else:
                continue
        else:
            continue
        # for data completeness this part may need changes
        res = etree.tostring(item)
        cod = chardet.detect(res).get("encoding")
        res = res.decode(cod)
        reT = self.keyword2 + '[a-zA-Z]'
        res = re.sub(r'<font.+?>', '', res)
        res = re.sub(r'</font>', '', res)
        tres = etree.HTML(res)
        tres = tres.xpath(r'//a/em/text()')  # fetch the title
        if len(tres):
            res = tres[0]
        else:
            print('empty title')
            continue
        print(res)
        if re.search(reT, res, re.S):
            logging.info('Invalid Match')
            continue
        if self.keyword2 not in res:
            continue
        if '显示器' not in res:  # the title must mention "monitor"
            continue
        else:
            logging.info('{}'.format(goodid))
            print(res)
        if goodid in self.setfilter:  # skip products already crawled
            continue
        else:
            self.setfilter.add(goodid)
        print(goodid)  # debugging
        callback(goodid=goodid, callback=self.comment_detail)
        # break  # debugging only; must be removed
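# A hypothetical driver loop for __search; the class name used for Python's
# name mangling (`JDSpider`) and the loop itself are assumptions, since the
# public entry point is not shown here. It pages until the item list comes back
# empty, which flips self.switch off:
#
#   page = 1
#   while spider.switch and page < 200:
#       spider._JDSpider__search(page=page, callback=spider.comment_detail)
#       page += 1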