def parse(self, response): PrintLog.print_start_flag(self.parse.__name__) sel = Selector(response) # pdb.set_trace() # print response.url # print response.body # 取出friendsList ''' 下面的正则表达式要查找和取出字符串‘user_list : [...],’中间的...内容 (?<= # 断言要匹配的文本的前缀开始 user_list : \[ # 查找字符串'user_list : [' ) # 前缀结束 [\s\S]* # 匹配任意文本 (?= # 断言要匹配的文本的后缀开始 \], # 查找字符串'[,' ) # 后缀结束 ''' friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)') yield self.parse_friends_list(friends_list=friends) # 尝试取下一页数据 #pdb.set_trace() PrintLog.print_log("get next page") page_count_str_list = sel.re(r'pageCount :\s*(.*)') if page_count_str_list: m = re.findall(r"\d", page_count_str_list[0]) self.total_page_count = int(m[0]) # print "page_count_num=", self.total_page_count self.page_num += 1 # 下一页码 if self.page_num < self.total_page_count: yield self.request_page(page_idx=self.page_num)
def convert_cookie_string_to_dict(self, str_of_cookie=""): PrintLog.print_start_flag(self.convert_cookie_string_to_dict.__name__) str0 = re.sub(r'\s', "", str_of_cookie) datadict = {} for str1 in str0.split(';'): # print str1 key, value = str1.split('=', 1) datadict[key] = value # print datadict return datadict
def parse_friends_list(self, friends_list=""): PrintLog.print_start_flag(self.parse_friends_list.__name__) # change to <type 'str'> from <type 'unicode'> utf8str = friends_list[0].encode("utf-8").strip() ''' utf8str is: {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]}, {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]} ''' item = WeixinUsersItem() item['friends_list'] = utf8str return item
def process_item(self, item, spider): PrintLog.print_start_flag(self.process_item.__name__) #pdb.set_trace() ''' item['friends_list'] is: {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]}, {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]} ''' friends_str = item['friends_list'] # 下面的正则表达式要查找和取出字符串‘{...}’ friends_list= re.findall(r'{[\s\S]*?}', friends_str) for s in friends_list: # 改为json字符串格式 s = re.sub(r'\bid\b\b', "\"id\"", s) s = re.sub(r'\bnick_name\b', "\"nick_name\"", s) s = re.sub(r'\bremark_name\b', "\"remark_name\"", s) s = re.sub(r'\bcreate_time\b', "\"create_time\"", s) s = re.sub(r'\bgroup_id\b', "\"group_id\"", s) b = json.loads(s) # print b["nick_name"], b["remark_name"] self.sheet.write(self.row, self.column, b["nick_name"]) self.sheet.write(self.row, self.column+1, b["remark_name"]) self.row += 1 return item
def process_item(self, item, spider): PrintLog.print_start_flag(self.process_item.__name__) #pdb.set_trace() ''' item['friends_list'] is: {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]}, {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]} ''' friends_str = item['friends_list'] # 下面的正则表达式要查找和取出字符串‘{...}’ friends_list = re.findall(r'{[\s\S]*?}', friends_str) for s in friends_list: # 改为json字符串格式 s = re.sub(r'\bid\b\b', "\"id\"", s) s = re.sub(r'\bnick_name\b', "\"nick_name\"", s) s = re.sub(r'\bremark_name\b', "\"remark_name\"", s) s = re.sub(r'\bcreate_time\b', "\"create_time\"", s) s = re.sub(r'\bgroup_id\b', "\"group_id\"", s) b = json.loads(s) # print b["nick_name"], b["remark_name"] self.sheet.write(self.row, self.column, b["nick_name"]) self.sheet.write(self.row, self.column + 1, b["remark_name"]) self.row += 1 return item
class WeixinUsersPipeline(object): PrintLog.print_log(__name__) def __init__(self): self.book = xlwt.Workbook(encoding='utf-8') self.sheet = self.book.add_sheet('sheet1', cell_overwrite_ok=True) self.row = 0 self.column = 0 self.sheet.write(self.row, self.column, "nick_name") self.sheet.write(self.row, self.column + 1, "remark_name") self.row += 1 def close_spider(self, spider): self.book.save(WeixinCfg.saved_file) def process_item(self, item, spider): PrintLog.print_start_flag(self.process_item.__name__) #pdb.set_trace() ''' item['friends_list'] is: {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]}, {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]} ''' friends_str = item['friends_list'] # 下面的正则表达式要查找和取出字符串‘{...}’ friends_list = re.findall(r'{[\s\S]*?}', friends_str) for s in friends_list: # 改为json字符串格式 s = re.sub(r'\bid\b\b', "\"id\"", s) s = re.sub(r'\bnick_name\b', "\"nick_name\"", s) s = re.sub(r'\bremark_name\b', "\"remark_name\"", s) s = re.sub(r'\bcreate_time\b', "\"create_time\"", s) s = re.sub(r'\bgroup_id\b', "\"group_id\"", s) b = json.loads(s) # print b["nick_name"], b["remark_name"] self.sheet.write(self.row, self.column, b["nick_name"]) self.sheet.write(self.row, self.column + 1, b["remark_name"]) self.row += 1 return item
def start_requests(self): PrintLog.print_start_flag(self.start_requests.__name__) self.cookie_dict = self.convert_cookie_string_to_dict(WeixinCfg.cookie_string) return [self.request_page(page_idx=self.page_num)]
def start_requests(self): PrintLog.print_start_flag(self.start_requests.__name__) self.cookie_dict = self.convert_cookie_string_to_dict( WeixinCfg.cookie_string) return [self.request_page(page_idx=self.page_num)]
class ContactSpider(CrawlSpider): PrintLog.print_log(__name__) name = "contact" allowed_domains = ["weixin.qq.com"] total_page_count = 1 # 记录总共多少页 page_num = 0 # 从第0页开始抓取 cookie_dict = {} contact_manage_page_prefix = 'https://mp.weixin.qq.com/cgi-bin/contactmanage?t=user/index&type=0&lang=zh_CN' contact_manage_page_pagesize = '&pagesize=' contact_manage_page_idx = '&pageidx=' contact_manage_page_token = '&token=' def start_requests(self): PrintLog.print_start_flag(self.start_requests.__name__) self.cookie_dict = self.convert_cookie_string_to_dict( WeixinCfg.cookie_string) return [self.request_page(page_idx=self.page_num)] def request_page(self, page_idx=0): # 组合url page_url = self.contact_manage_page_prefix + \ self.contact_manage_page_pagesize + WeixinCfg.page_size + \ self.contact_manage_page_idx + str(page_idx) + \ self.contact_manage_page_token + WeixinCfg.page_token # print page_url return Request(url=page_url, cookies=self.cookie_dict, callback=self.parse) def parse(self, response): PrintLog.print_start_flag(self.parse.__name__) sel = Selector(response) # pdb.set_trace() # print response.url # print response.body # 取出friendsList ''' 下面的正则表达式要查找和取出字符串‘user_list : [...],’中间的...内容 (?<= # 断言要匹配的文本的前缀开始 user_list : \[ # 查找字符串'user_list : [' ) # 前缀结束 [\s\S]* # 匹配任意文本 (?= # 断言要匹配的文本的后缀开始 \], # 查找字符串'[,' ) # 后缀结束 ''' friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)') yield self.parse_friends_list(friends_list=friends) # 尝试取下一页数据 #pdb.set_trace() PrintLog.print_log("get next page") page_count_str_list = sel.re(r'pageCount :\s*(.*)') if page_count_str_list: m = re.findall(r"\d", page_count_str_list[0]) self.total_page_count = int(m[0]) # print "page_count_num=", self.total_page_count self.page_num += 1 # 下一页码 if self.page_num < self.total_page_count: yield self.request_page(page_idx=self.page_num) # 取出friends_list.contacts的值作为item,交由pipelines处理 def parse_friends_list(self, friends_list=""): PrintLog.print_start_flag(self.parse_friends_list.__name__) # change to <type 'str'> from <type 'unicode'> utf8str = friends_list[0].encode("utf-8").strip() ''' utf8str is: {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]}, {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]} ''' item = WeixinUsersItem() item['friends_list'] = utf8str return item def convert_cookie_string_to_dict(self, str_of_cookie=""): PrintLog.print_start_flag(self.convert_cookie_string_to_dict.__name__) str0 = re.sub(r'\s', "", str_of_cookie) datadict = {} for str1 in str0.split(';'): # print str1 key, value = str1.split('=', 1) datadict[key] = value # print datadict return datadict