def parse(self, response):
    """Yield the friends-list item for this page, then the next-page request.

    The friends data is the text found between ``user_list : [`` and ``],``
    in the response.  When a ``pageCount`` value is present it is stored in
    ``self.total_page_count``, ``self.page_num`` is incremented, and a
    request for that page is yielded while it is below the total.
    """
    PrintLog.print_start_flag(self.parse.__name__)

    sel = Selector(response)

    # Pattern breakdown:
    #   (?<=user_list : \[)   lookbehind: must be preceded by 'user_list : ['
    #   [\s\S]*               any characters, newlines included (greedy)
    #   (?=\],)               lookahead: must be followed by '],'
    friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)')
    if friends:
        yield self.parse_friends_list(friends_list=friends)
    else:
        # parse_friends_list indexes element 0, so an empty match list would
        # raise IndexError; report it and carry on instead.
        PrintLog.print_log("user_list not found in response")

    # Try to fetch the next page.
    PrintLog.print_log("get next page")
    page_count_str_list = sel.re(r'pageCount :\s*(.*)')
    if page_count_str_list:
        # Match the whole run of digits: matching one digit at a time and
        # taking the first hit truncated e.g. "12" to 1.
        digits = re.search(r"\d+", page_count_str_list[0])
        if digits:
            self.total_page_count = int(digits.group())
            self.page_num += 1  # next page index
            if self.page_num < self.total_page_count:
                yield self.request_page(page_idx=self.page_num)
Beispiel #2
0
    def parse(self, response):
        """Yield the friends-list item for this page, then the next-page request.

        The friends data is the text found between ``user_list : [`` and
        ``],`` in the response.  When a ``pageCount`` value is present it is
        stored in ``self.total_page_count``, ``self.page_num`` is
        incremented, and a request for that page is yielded while it is
        below the total.
        """
        PrintLog.print_start_flag(self.parse.__name__)

        sel = Selector(response)

        # Pattern breakdown:
        #   (?<=user_list : \[)   lookbehind: must be preceded by 'user_list : ['
        #   [\s\S]*               any characters, newlines included (greedy)
        #   (?=\],)               lookahead: must be followed by '],'
        friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)')
        if friends:
            yield self.parse_friends_list(friends_list=friends)
        else:
            # parse_friends_list indexes element 0, so an empty match list
            # would raise IndexError; report it and carry on instead.
            PrintLog.print_log("user_list not found in response")

        # Try to fetch the next page.
        PrintLog.print_log("get next page")
        page_count_str_list = sel.re(r'pageCount :\s*(.*)')
        if page_count_str_list:
            # Match the whole run of digits: matching one digit at a time and
            # taking the first hit truncated e.g. "12" to 1.
            digits = re.search(r"\d+", page_count_str_list[0])
            if digits:
                self.total_page_count = int(digits.group())
                self.page_num += 1  # next page index
                if self.page_num < self.total_page_count:
                    yield self.request_page(page_idx=self.page_num)
 def convert_cookie_string_to_dict(self, str_of_cookie=""):
     """Convert a ``name=value; name=value`` cookie string into a dict.

     All whitespace is removed before splitting on ';'.  Empty segments
     (from an empty input or a trailing ';') are skipped, and a segment
     without '=' maps its whole text to an empty string.

     :param str_of_cookie: raw cookie header text.
     :return: dict mapping cookie names to values.
     """
     PrintLog.print_start_flag(self.convert_cookie_string_to_dict.__name__)
     str0 = re.sub(r'\s', "", str_of_cookie)
     datadict = {}
     for str1 in str0.split(';'):
         if not str1:
             # "".split(';') and a trailing ';' both produce an empty piece.
             continue
         # partition never fails to unpack, unlike split('=', 1) on a
         # segment that has no '='.
         key, _, value = str1.partition('=')
         datadict[key] = value
     return datadict
Beispiel #4
0
 def convert_cookie_string_to_dict(self, str_of_cookie=""):
     """Convert a ``name=value; name=value`` cookie string into a dict.

     All whitespace is removed before splitting on ';'.  Empty segments
     (from an empty input or a trailing ';') are skipped, and a segment
     without '=' maps its whole text to an empty string.

     :param str_of_cookie: raw cookie header text.
     :return: dict mapping cookie names to values.
     """
     PrintLog.print_start_flag(self.convert_cookie_string_to_dict.__name__)
     str0 = re.sub(r'\s', "", str_of_cookie)
     datadict = {}
     for str1 in str0.split(';'):
         if not str1:
             # "".split(';') and a trailing ';' both produce an empty piece.
             continue
         # partition never fails to unpack, unlike split('=', 1) on a
         # segment that has no '='.
         key, _, value = str1.partition('=')
         datadict[key] = value
     return datadict
    def parse_friends_list(self, friends_list=""):
        """Wrap the first matched friends-list text in a ``WeixinUsersItem``.

        Element 0 of ``friends_list`` is encoded as UTF-8, stripped of
        surrounding whitespace and stored under the ``'friends_list'`` key.
        The stored text looks like::

            {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]},
            {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]}
        """
        PrintLog.print_start_flag(self.parse_friends_list.__name__)

        first_match = friends_list[0]
        # Encoding turns the unicode match into a byte string (Python 2 str).
        payload = first_match.encode("utf-8").strip()

        record = WeixinUsersItem()
        record['friends_list'] = payload
        return record
Beispiel #6
0
    def parse_friends_list(self, friends_list=""):
        """Wrap the first matched friends-list text in a ``WeixinUsersItem``.

        Element 0 of ``friends_list`` is encoded as UTF-8, stripped of
        surrounding whitespace and stored under the ``'friends_list'`` key.
        The stored text looks like::

            {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]},
            {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]}
        """
        PrintLog.print_start_flag(self.parse_friends_list.__name__)

        first_match = friends_list[0]
        # Encoding turns the unicode match into a byte string (Python 2 str).
        payload = first_match.encode("utf-8").strip()

        record = WeixinUsersItem()
        record['friends_list'] = payload
        return record
Beispiel #7
0
 def process_item(self, item, spider):
     """Write each friend's nick name and remark name to the sheet.

     ``item['friends_list']`` holds JavaScript-style object literals with
     unquoted keys, for example::

         {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]},
         {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]}

     Each object is converted to JSON by quoting its bare keys, parsed, and
     written as one row starting at ``self.row`` / ``self.column``;
     ``self.row`` advances by one per friend.

     :param item: mapping with a ``'friends_list'`` entry.
     :param spider: unused here.
     :return: the unchanged ``item``.
     """
     PrintLog.print_start_flag(self.process_item.__name__)
     friends_str = item['friends_list']

     def quote_key(match):
         # Group 1 is only set for a bare key; a quoted string literal is
         # matched by the first alternative and must be returned untouched.
         key = match.group(1)
         if key is None:
             return match.group(0)
         return '"%s"' % key

     # One '{...}' object per match.  Quoted strings are consumed as a unit
     # so a '}' inside a value does not end the object early.
     friends_list = re.findall(
         r'\{(?:"(?:\\.|[^"\\])*"|[^{}"])*\}', friends_str)
     for s in friends_list:
         # Convert to JSON: quote every bare identifier followed by ':'.
         # Skipping over string literals keeps words such as "id" inside a
         # nickname from being rewritten.
         s = re.sub(
             r'"(?:\\.|[^"\\])*"|\b([A-Za-z_]\w*)(?=\s*:)', quote_key, s)
         b = json.loads(s)
         self.sheet.write(self.row, self.column, b["nick_name"])
         self.sheet.write(self.row, self.column + 1, b["remark_name"])
         self.row += 1
     return item
Beispiel #8
0
 def process_item(self, item, spider):
     """Write each friend's nick name and remark name to the sheet.

     ``item['friends_list']`` holds JavaScript-style object literals with
     unquoted keys, for example::

         {id:"xxxxxx",nick_name:"yingchao1",remark_name:"",group_id:[]},
         {id:"xxxxxx",nick_name:"yingchao2",remark_name:"",group_id:[]}

     Each object is converted to JSON by quoting its bare keys, parsed, and
     written as one row starting at ``self.row`` / ``self.column``;
     ``self.row`` advances by one per friend.

     :param item: mapping with a ``'friends_list'`` entry.
     :param spider: unused here.
     :return: the unchanged ``item``.
     """
     PrintLog.print_start_flag(self.process_item.__name__)
     friends_str = item['friends_list']

     def quote_key(match):
         # Group 1 is only set for a bare key; a quoted string literal is
         # matched by the first alternative and must be returned untouched.
         key = match.group(1)
         if key is None:
             return match.group(0)
         return '"%s"' % key

     # One '{...}' object per match.  Quoted strings are consumed as a unit
     # so a '}' inside a value does not end the object early.
     friends_list = re.findall(
         r'\{(?:"(?:\\.|[^"\\])*"|[^{}"])*\}', friends_str)
     for s in friends_list:
         # Convert to JSON: quote every bare identifier followed by ':'.
         # Skipping over string literals keeps words such as "id" inside a
         # nickname from being rewritten.
         s = re.sub(
             r'"(?:\\.|[^"\\])*"|\b([A-Za-z_]\w*)(?=\s*:)', quote_key, s)
         b = json.loads(s)
         self.sheet.write(self.row, self.column, b["nick_name"])
         self.sheet.write(self.row, self.column + 1, b["remark_name"])
         self.row += 1
     return item
 def start_requests(self):
     """Store the configured cookies and return the first page request.

     ``WeixinCfg.cookie_string`` is converted to a dict and kept in
     ``self.cookie_dict``; the result is a one-element list holding the
     request for page ``self.page_num``.
     """
     PrintLog.print_start_flag(self.start_requests.__name__)
     cookie_text = WeixinCfg.cookie_string
     self.cookie_dict = self.convert_cookie_string_to_dict(cookie_text)
     first_request = self.request_page(page_idx=self.page_num)
     return [first_request]
Beispiel #10
0
 def start_requests(self):
     """Store the configured cookies and return the first page request.

     ``WeixinCfg.cookie_string`` is converted to a dict and kept in
     ``self.cookie_dict``; the result is a one-element list holding the
     request for page ``self.page_num``.
     """
     PrintLog.print_start_flag(self.start_requests.__name__)
     cookie_text = WeixinCfg.cookie_string
     self.cookie_dict = self.convert_cookie_string_to_dict(cookie_text)
     first_request = self.request_page(page_idx=self.page_num)
     return [first_request]