def send_tags(self, method, info, tags, sk):
    """Post user-supplied tags to last.fm.

    method -- one of "album.addtags", "artist.addtags" or "track.addtags"
    info   -- mapping holding 'Artist' and, for the album/track methods,
              'Album' or 'Track'
    tags   -- comma delimited string of tags (last.fm accepts at most 10)
    sk     -- session key, sent as the "sk" parameter

    Nothing is returned; the server response is not inspected."""
    # Parameters every addtags method requires.
    payload = dict(method=method,
                   tags=tags,
                   api_key=self.api_key,
                   sk=sk,
                   artist=info['Artist'])

    # album.addtags and track.addtags each need one additional field.
    extra_field = {
        "album.addtags": ("album", "Album"),
        "track.addtags": ("track", "Track"),
    }.get(method)
    if extra_field is not None:
        field_name, info_key = extra_field
        payload[field_name] = info[info_key]

    # create_api_sig is given all the other parameters, so it is added last.
    payload['api_sig'] = self.create_api_sig(payload)
    HttpRequest(self.url, urllib.urlencode(payload)).connect()
def send_tags(self, method, info, tags, sk):
    """Send tags for an artist, album or track to last.fm.

    method is the API method name: album.addtags, artist.addtags or
    track.addtags.
    info is a mapping with the 'Artist' entry plus 'Album' (album.addtags)
    or 'Track' (track.addtags).
    tags is a comma delimited list of no more than 10 tags.
    sk is the session key.

    The response of the request is discarded."""
    # Common parameters: method, tags, api_key, sk and artist.
    post_fields = {
        "method": method,
        "tags": tags,
        "api_key": self.api_key,
        "sk": sk,
    }
    post_fields["artist"] = info['Artist']

    # Method specific parameter.
    if method == "album.addtags":
        post_fields.update(album=info['Album'])
    elif method == "track.addtags":
        post_fields.update(track=info['Track'])

    # Sign the request once all other fields are in place.
    post_fields.update(api_sig=self.create_api_sig(post_fields))

    encoded_body = urllib.urlencode(post_fields)
    request = HttpRequest(self.url, encoded_body)
    request.connect()
def _send_post(self, post_values):
    """Submit post_values to the submission url.

    Returns True after adding self.del_ids to self.deletion_ids when the
    request succeeds; otherwise reports the server message through
    self.parent.write_info and returns False."""
    request = HttpRequest(url=self.submission_url,
                          data=post_values,
                          timeout=10)
    ok, lines = request.connect()
    if not ok:
        error_text = (_("There was an error sending data to last.fm:") +
                      "\n" + "\n".join(lines))
        self.parent.write_info(error_text)
        return False
    self.deletion_ids.extend(self.del_ids)
    return True
def main():
    """Run the face-detection loop until ESC is pressed or the camera fails.

    Starts the HttpRequest worker and the Hue thread, then for every frame
    read from the module-level ``cap``:

    * detects faces with ``face_cascade``;
    * switches the Hue state to MeetingStart/MeetingEnd depending on
      whether any face was found;
    * queues a StatusReq (room, ISO timestamp, face count) when faces are
      present;
    * draws the detections and shows the frame.

    The capture device, the preview windows and the request worker are
    released in a ``finally`` block so they are cleaned up even when a
    frame cannot be processed.
    """
    req = HttpRequest()
    req.start()
    hthread = hue.HueThread(ip="192.168.10.2")
    hthread.start()
    try:
        while True:
            # get image
            ret, img = cap.read()
            if not ret:
                # No frame was grabbed (device unplugged / stream ended);
                # img is not usable, so stop instead of crashing in cvtColor.
                print('failed to read a frame from the capture device')
                break
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # face detect
            faces = face_cascade.detectMultiScale(gray,
                                                  scaleFactor=1.3,
                                                  minNeighbors=5)
            human_num = len(faces)

            if human_num > 0:
                hthread.changeState(hue.MeetingStart())
                # Named "timestamp" so the time module name is not shadowed.
                timestamp = datetime.now().isoformat()
                print(timestamp, ' human_num : ', human_num)
                req.add(
                    StatusReq(room=room_Name,
                              timestamp=timestamp,
                              occupied=human_num))
            else:
                hthread.changeState(hue.MeetingEnd())

            # draw a rectangle around every detected face
            for x, y, w, h in faces:
                cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 2)

            # show img
            cv2.imshow('video image', img)

            # wait key; 27 is the ESC key, which ends the loop
            key = cv2.waitKey(10)
            if key == 27:
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
        req.stop()
        req.join()
def handshake(self):
    """Perform the handshake request and store the session details.

    A new timestamp and authentication code are generated, and the encoded
    query is appended to self.url (self.url itself is extended on every
    call).  When the request succeeds, the second and fourth response
    entries are stored as self.session_id and self.submission_url.

    Returns a tuple (first response entry, message produced by
    handshake_response for that entry)."""
    self.timestamp = self.create_timestamp()
    self.authentication_code = self.create_authentication_code()
    self.url = self.url + (r"/?" + self.encode_url())

    request = HttpRequest(url=self.url, timeout=10)
    ok, lines = request.connect()
    if ok:
        self.session_id = lines[1]
        self.submission_url = lines[3]

    status_message = request.handshake_response(lines[0])
    return lines[0], status_message
def get_popular_tags(self, method, info_dict):
    """Request the top tags of an artist or a track.

    method is either artist.gettoptags or track.gettoptags; info_dict must
    contain 'Artist' and, for track.gettoptags, 'Track'.

    Returns the result of parse_xml_doc for the "name" elements."""
    # artist and api_key are always sent; track only for track.gettoptags.
    params = {
        "method": method,
        "artist": info_dict['Artist'],
        "api_key": self.api_key,
    }
    if method == "track.gettoptags":
        params['track'] = info_dict['Track']

    query_string = urllib.urlencode(params)
    request = HttpRequest(self.url + "?" + query_string)
    return self.parse_xml_doc(request.connect(xml=True), "name")
def get_user_top_tags(self, username, limit=15):
    """Request a user's top tags with the user.gettoptags method.

    username -- the last.fm user name
    limit    -- value sent as the "limit" parameter (default 15)

    Returns the result of parse_xml_doc for the "name" elements."""
    query = {
        "method": "user.gettoptags",
        "user": username,
        "limit": limit,
        "api_key": self.api_key,
    }
    query_string = urllib.urlencode(query)
    request = HttpRequest(self.url + "?" + query_string)
    xml_doc = request.connect(xml=True)
    return self.parse_xml_doc(xml_doc, "name")
def get_user_top_tags(self, username, limit=15):
    """Return the top tags of a last.fm user.

    Calls user.gettoptags with the user name, the requested limit
    (15 unless given) and the API key, then hands the XML reply to
    parse_xml_doc to collect the "name" elements."""
    parameters = dict(method="user.gettoptags",
                      user=username,
                      limit=limit,
                      api_key=self.api_key)
    url = "?".join((self.url, urllib.urlencode(parameters)))
    reply = HttpRequest(url).connect(xml=True)
    return self.parse_xml_doc(reply, "name")
def set_user_image(self):
    """Show the user's last.fm image in the "user_thumb" widget.

    The image url is read from the user.getinfo reply.  Nothing happens
    when the reply has no image.  An image already stored under
    self.HOME_DIR is loaded from disk (requested size 100 x 40); otherwise
    it is fetched with HttpRequest.retrieve, which is handed the widget."""
    webservice = webservices.LastfmWebService()
    info_url = ("http://ws.audioscrobbler.com/2.0/"
                "?method=user.getinfo&user=%s&api_key=%s" %
                (self.username, webservice.api_key))
    user_info = HttpRequest(url=info_url, timeout=10).connect(xml=True)

    image_url = webservice.parse_xml(user_info, "image")
    if image_url is None:
        return

    # Local copy lives in HOME_DIR under the image's base name.
    local_path = self.HOME_DIR + os.path.basename(image_url)
    if os.path.exists(local_path):
        pixbuf = gtk.gdk.pixbuf_new_from_file_at_size(local_path, 100, 40)
        self.tree.get_widget("user_thumb").set_from_pixbuf(pixbuf)
    else:
        downloader = HttpRequest(image_url)
        downloader.retrieve(image_url, local_path,
                            self.tree.get_widget("user_thumb"))
def _send_post(self, post_values):
    """Submit post_values, re-handshaking when the session is rejected.

    On success self.del_ids is added to self.deletion_ids and True is
    returned.  When the first reply entry is "BADSESSION" a new handshake
    is made and the submission retried as long as the handshake answers
    "OK" and no more than three bad sessions were counted.  Every other
    failure is reported through self.parent.write_info and returns False."""
    request = HttpRequest(url=self.submission_url,
                          data=post_values,
                          timeout=10)
    ok, lines = request.connect()
    if ok:
        self.deletion_ids.extend(self.del_ids)
        return True

    if lines[0] == "BADSESSION":
        self.bad_session_count += 1
        handshake_status, handshake_message = self.handshake()
        if handshake_status == "OK" and self.bad_session_count <= 3:
            return self._send_post(post_values)
        self.parent.write_info(_("Error during handshake."))

    self.parent.write_info(
        _("There was an error sending data to last.fm:") + "\n" +
        "\n".join(lines))
    return False
def get_popular_tags(self, method, info_dict):
    """Fetch popular tags; method is artist.gettoptags or track.gettoptags.

    The artist name and the API key are always sent, the track name only
    for track.gettoptags.  The XML reply is passed to parse_xml_doc, whose
    result for the "name" elements is returned."""
    query = dict(method=method,
                 artist=info_dict['Artist'],
                 api_key=self.api_key)
    wants_track = method == "track.gettoptags"
    if wants_track:
        query.update(track=info_dict['Track'])

    url = "?".join((self.url, urllib.urlencode(query)))
    reply = HttpRequest(url).connect(xml=True)
    return self.parse_xml_doc(reply, "name")
def __init__(self, client, address):
    """
    Handles the request and sends a response to the client.

    The request is parsed from ``client`` and the after-parsing hooks are
    run.  Unless a hook stops the handling or the request cannot be parsed
    (status 400), the request is dispatched to the API handler when its URI
    is the API URI or lies below it, and to the app handler otherwise.  An
    unknown HTTP method is answered with status 400.

    Args:
        client (socket.socket): The client of the request.
        address (tuple(str, int)): The client address and port, for logging purposes.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of the
    # file that had lost its indentation.  It assumes the API/app handlers
    # finish the response themselves and that __end_handling() is only
    # called here on the error/stop paths -- confirm against the original.
    self.__client = client
    self.__address = address
    self.__response = HttpResponse()
    self.__request = None
    self.__close_connection = True
    stop_handling_request = False
    try:
        """ First parses the HTTP request and, if there are hooks to call after parsing them, calls them. """
        self.__request = HttpRequest(client)
        self.__after_parsing()
    except StopHandlingRequestException:
        """ If there is any reason to stop the regular execution of the request handling, the after parsing hooks have to raise a `StopHandlingRequestException`. See `after_parsing_request` method for more information. """
        stop_handling_request = True
    except HttpRequestParseErrorException:
        """ If the request cannot be parsed, it returns a 400 HTTP error code to the client. """
        stop_handling_request = True
        self.__response.status = 400
    if not stop_handling_request:
        request_uri = self.__request.request_uri
        request_method = self.__request.method
        """ Checks if the request has a valid HTTP method, if not, it returns a 400 HTTP error code to the client. """
        if request_method in [
                "GET", "POST", "HEAD", "PUT", "DELETE", "TRACE", "OPTIONS",
                "CONNECT", "PATCH"
        ]:
            """ Checks if the request is for the API or the app and handles it accordingly. """
            # Exact match or a path below the API URI counts as an API call.
            if request_uri == self.__API_URI or request_uri.startswith(
                    self.__API_URI + "/"):
                self.__handle_api_request()
            else:
                self.__handle_app_request()
        else:
            self.__response.status = 400
            self.__end_handling()
    else:
        self.__end_handling()
def _send_post(self, post_values):
    """Send post_values to self.submission_url.

    Returns True when the submission is accepted (self.del_ids is then
    appended to self.deletion_ids).  A "BADSESSION" reply increments
    self.bad_session_count and triggers a new handshake; the submission is
    repeated while the handshake returns "OK" and the counter is at most 3.
    In every other case the error is written with self.parent.write_info
    and False is returned."""
    submission = HttpRequest(url=self.submission_url,
                             data=post_values,
                             timeout=10)
    accepted, reply = submission.connect()

    if not accepted:
        if reply[0] == "BADSESSION":
            self.bad_session_count += 1
            server_response, message = self.handshake()
            may_retry = (server_response == "OK"
                         and self.bad_session_count <= 3)
            if may_retry:
                return self._send_post(post_values)
            self.parent.write_info(_("Error during handshake."))

        header = _("There was an error sending data to last.fm:")
        self.parent.write_info(header + "\n" + "\n".join(reply))
        return False

    self.deletion_ids.extend(self.del_ids)
    return True
def handshake(self):
    """Handshake with the server, retrying on a non-"OK" status.

    Each call counts as one attempt.  The request url is rebuilt from
    self.base_url, and the message for the first reply entry is printed.
    When the request succeeds the session id and submission url are taken
    from the reply; a status other than "OK" triggers another handshake
    while at most three attempts were made, after which
    ("FAILED", <message>) is returned.  An "OK" status resets the attempt
    counter.

    Returns a tuple (first reply entry, message from handshake_response)."""
    self.handshake_attempts += 1
    self.timestamp = self.create_timestamp()
    self.authentication_code = self.create_authentication_code()
    self.url = self.base_url + r"/?" + self.encode_url()

    request = HttpRequest(url=self.url, timeout=10)
    ok, reply = request.connect()
    print(request.handshake_response(reply[0]))

    if ok:
        self.session_id = reply[1]
        status = reply[0]
        self.submission_url = reply[3]
        if status == "OK":
            self.handshake_attempts = 0
        elif self.handshake_attempts <= 3:
            print("Handshake error attempt %d of 3" % self.handshake_attempts)
            return self.handshake()
        else:
            return "FAILED", "Failed to make a handshake with Last.fm"

    status_message = request.handshake_response(reply[0])
    return reply[0], status_message
def set_user_image(self):
    """Load the user's last.fm picture into the "user_thumb" widget.

    Asks user.getinfo for the image url and returns early when there is
    none.  If no file with the image's base name exists in self.HOME_DIR
    the image is retrieved through HttpRequest.retrieve (which receives the
    widget); otherwise the stored file is loaded with a requested size of
    100 x 40 and set on the widget."""
    url = "http://ws.audioscrobbler.com/2.0/?method=user.getinfo&user=%s&api_key=%s"
    webservice = webservices.LastfmWebService()
    info_request = HttpRequest(url=url % (self.username, webservice.api_key),
                               timeout=10)
    image_url = webservice.parse_xml(info_request.connect(xml=True), "image")
    if image_url is None:
        return

    cached_file = self.HOME_DIR + os.path.basename(image_url)
    already_cached = os.path.exists(cached_file)
    if not already_cached:
        image_request = HttpRequest(image_url)
        image_request.retrieve(image_url, cached_file,
                               self.tree.get_widget("user_thumb"))
        return

    thumbnail = gtk.gdk.pixbuf_new_from_file_at_size(cached_file, 100, 40)
    self.tree.get_widget("user_thumb").set_from_pixbuf(thumbnail)
def crawl(self, response):
    """Print the response, then yield one request per mgtv listing page.

    Every yielded HttpRequest uses self.crawl1 as its callback."""
    print(response)
    # First page of each listing, grouped by channel id (3, 2, 50, 1).
    listing_pages = (
        'https://list.mgtv.com/3/176--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/3/175--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/3/177--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/3/178--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/3/43--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/3/44--------a1-c2-1--a1-.html?channelId=3',
        'https://list.mgtv.com/2/a1-10--------c2-1---.html?channelId=2',
        'https://list.mgtv.com/2/a1-12--------c2-1---.html?channelId=2',
        'https://list.mgtv.com/2/a1-11--------c2-1---.html?channelId=2',
        'https://list.mgtv.com/2/a1-193--------c2-1---.html?channelId=2',
        'https://list.mgtv.com/50/a1-52--------c2-1---.html?channelId=50',
        'https://list.mgtv.com/50/a1-53--------c2-1---.html?channelId=50',
        'https://list.mgtv.com/1/a1-1--------c2-1---.html?channelId=1',
        'https://list.mgtv.com/1/a1-2--------c2-1---.html?channelId=1',
    )
    for page_url in listing_pages:
        next_request = HttpRequest(url=page_url, callback=self.crawl1)
        yield next_request
def handshake(self):
    """Make a handshake request and return (status, message).

    self.handshake_attempts is incremented on every call and reset to 0
    once the status is "OK".  For a successful request with another status
    the handshake is repeated until more than three attempts were made;
    then ("FAILED", "Failed to make a handshake with Last.fm") is
    returned.  The session id and submission url are read from the reply
    of a successful request.  The message belonging to the first reply
    entry is printed for every attempt."""
    self.handshake_attempts += 1
    self.timestamp = self.create_timestamp()
    self.authentication_code = self.create_authentication_code()
    self.url = self.base_url + r"/?" + self.encode_url()

    req = HttpRequest(url=self.url, timeout=10)
    success, response = req.connect()
    print(req.handshake_response(response[0]))

    if success:
        self.session_id = response[1]
        status = response[0]
        self.submission_url = response[3]

        if status != "OK":
            out_of_attempts = not (self.handshake_attempts <= 3)
            if out_of_attempts:
                return "FAILED", "Failed to make a handshake with Last.fm"
            print("Handshake error attempt %d of 3" % self.handshake_attempts)
            return self.handshake()

        self.handshake_attempts = 0

    msg = req.handshake_response(response[0])
    return response[0], msg
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one hi-pda forum post and store it when it is new.

    The post page is requested, its summary is taken from the
    ``description`` meta tag and its time from the "edited at" note inside
    the summary or, failing that, from the "posted at" element of the page.
    A post whose time is later than ``time_last_time`` is inserted into the
    ``posts`` table.

    Args:
        post_url: post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: value stored in the ``post_type`` column.
        post_name: post title, stored in the ``post_title`` column.
        time_last_time: timestamp of the previous crawl; only posts with a
            strictly greater timestamp are stored.

    Returns:
        The ``Post`` object built for the inserted row, or ``None`` when the
        summary or the time cannot be extracted or the post is not newer
        than the previous crawl.

    Raises:
        ValueError: if the extracted time is not in ``%Y-%m-%d %H:%M`` form.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    # Request the post page by its URL.
    post_request = HttpRequest(post_full_url, None, post_headers)
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError as e:
        logging.error('Decode post response content failed.')
        logging.exception(e)
        # The patterns below are str patterns and cannot search bytes, so
        # fall back to a lossy decode instead of keeping the raw bytes.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # Parse the post summary out of the page's description meta tag.
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        # Without a summary there is nothing to store.
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the "last edited" time embedded in the summary; otherwise use
    # the publication time found in the page body.
    result_update_time = re.search(r'''于 (.*) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        result_create_time = re.search(
            r'''<em id=".+">发表于 (.+)</em>''', post_resp_content)
        if result_create_time is None:
            # No time at all: strptime(None) would raise, so give up here.
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    post_create_time_datetime = datetime.strptime(post_create_time,
                                                  '%Y-%m-%d %H:%M')
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts later than the previous crawl are targets of this crawl.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.' %
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with a parameterized insert; connection and cursor
    # are closed even when the insert fails.
    conn = mysql.connector.connect(user=db_user,
                                   password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [
                    post_id, post_type, post_name, 'hipda', post_content,
                    post_full_url, post_create_time_stamp,
                    post_create_time_stamp
                ])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause for a second after each stored post (presumably to throttle the
    # crawl -- confirm with the caller).
    time.sleep(1)
    return post
def format_crawl(cls):
    """Instantiate cls with its start_urls and yield the initial requests.

    One HttpRequest is yielded per start url, each with the new instance's
    crawl method as callback."""
    spider = cls(cls.start_urls)
    for start_url in spider.start_urls:
        first_request = HttpRequest(url=start_url, callback=spider.crawl)
        yield first_request
class PreRequest:
    """Runs a request according to the configured mode.

    ``request`` reads the ``mode`` setting and dispatches to ``upload``,
    ``normal`` or, for ``verifygame``, to ``verify_1`` / ``verify_2``
    depending on the configured step.  The last response is kept in
    ``self.reps``.
    """

    # Headers used by both verify steps.
    # NOTE(review): the Authorization token is hard-coded in the source;
    # consider loading it from the configuration instead.
    _VERIFY_HEADERS = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Authorization":
        "MAuth-870fc3d727723a7410d2c0fa15154072cdb9300e9a54f89e09b9a27d32f852a44fd07348ae46208298303f281bcdc9a9079fa0b79310115038b4071b44edbe42-MAuth",
        "X-APP-ID": "20"
    }

    def __init__(self):
        """Create the HttpRequest object used for every call."""
        self.httpRequest = HttpRequest()

    def request(self, data):
        """Dispatch ``data`` to the handler of the configured mode.

        Unknown modes and unknown verifygame steps are ignored.
        """
        mode = config.get_config("mode", "mode")
        if mode == "upload":
            self.upload(data)
        elif mode == "normal":
            self.normal(data)
        elif mode == "verifygame":
            step = config.get_config("verifygame", "step")
            if step == "1":
                self.verify_1(data)
            elif step == "2":
                self.verify_2(data)

    def normal(self, jsondata):
        """Send every request described in ``jsondata``.

        Each value must provide "url" (appended to the configured host),
        "method" and "data".  "POST" is sent with ``post``; every other
        method is sent with ``get``.
        """
        Pylog.info("".join(jsondata.keys()) + ": 开始测试")
        base_url = "http://" + config.get_config("normal", "url")
        for case in jsondata.values():
            url = base_url + case["url"]
            method = case["method"]
            data = case["data"]
            if method == "POST":
                self.reps = self.httpRequest.post(url=url, data=data)
            else:
                self.reps = self.httpRequest.get(url=url, data=data)
            Pylog.debug("Request Body:" + str(self.reps.request.body))
            Pylog.debug("Response:" + self.reps.text)

    def upload(self, filename=None):
        """Upload the picture ``filename`` and record its id in a CSV file.

        The CSV file name is built from the "savefile" setting and the part
        of the "src" setting after ``E:/OtherFile/``.  A row
        ``[picture name, picid]`` is appended to it.  Errors raised during
        the upload or while writing are logged, not propagated.
        """
        Pylog.info(filename + ": 开始上载")
        try:
            src = config.get_config("upload", "src")
            savefile = config.get_config("upload", "savefile")
            savename = re.findall('E:/OtherFile/(.*)', src, re.S)[0]
            idfile = savefile + savename + '.csv'
            url = "http://img.will888.cn/photo/upload"
            # Close the picture once it has been sent.
            with open(filename, 'rb') as picture:
                self.reps = self.httpRequest.upload(url=url,
                                                    files={'pic': picture})
            Pylog.debug("Response:" + self.reps.text)
            pic = json.loads(self.reps.content)
            # Write to CSV.
            with open(idfile, 'a', encoding='utf8', newline='') as csvfile:
                writer = csv.writer(csvfile)
                picId = pic["picid"]
                picname = re.findall(src + '/(.*)', filename, re.S)
                writer.writerow([picname[0], picId])
        except Exception as e:
            Pylog.error(e)

    def verify_1(self, data):
        """Start the game with id ``data`` and record the returned game URL.

        Posts ``{"id": data}`` to the game start endpoint and appends
        ``[data, game url]`` to ``<urlfile>/<game>.csv``; the URL is the
        form ``action`` attribute found in the response text.

        Raises:
            IndexError: if the response contains no matching form action.
        """
        game = config.get_config("verifygame", "game")
        urlfile = config.get_config("verifygame", "urlfile")
        Pylog.info(data)
        datas = {"id": data}
        # Hand over a copy so the shared class-level dict is never mutated.
        self.httpRequest.headers = dict(self._VERIFY_HEADERS)
        url = "http://" + config.get_config(
            "normal", "url") + "/v1/config/kd/game/start"
        self.reps = self.httpRequest.post(url=url, data=datas)
        Pylog.debug("Request Body:" + str(self.reps.request.body))
        Pylog.debug("Response:" + self.reps.text)
        with open(urlfile + '/' + game + '.csv',
                  'a',
                  encoding='utf8',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            # Regular expression extracting the form's action URL.
            gameurl = re.findall("action='(.*)'> </form>", self.reps.text,
                                 re.S)
            writer.writerow([data, gameurl[0]])

    def verify_2(self, data):
        """Request the game URL stored under the first key of ``data``."""
        game_id = list(data.keys())[0]
        Pylog.info(game_id)
        game_url = data[game_id]
        # Hand over a copy so the shared class-level dict is never mutated.
        self.httpRequest.headers = dict(self._VERIFY_HEADERS)
        self.reps = self.httpRequest.get(url=game_url)
        Pylog.debug("Request Body:" + str(self.reps.request.body))
def __init__(self):
    """Create the HttpRequest object and keep it as self.httpRequest."""
    self.httpRequest = HttpRequest()
def getRequest(self):
    """Return a urllib2.Request whose url carries self.params as query.

    The parameters are encoded with HttpRequest.urlencode and appended to
    self.url after a "?"."""
    query_string = HttpRequest.urlencode(self.params)
    full_url = "%s?%s" % (self.url, query_string)
    return urllib2.Request(full_url)
else: get_post_time_last_time = datetime.strptime( get_post_time_last_time, '%Y-%m-%d %H:%M:%S') get_post_time_last_time_stamp = get_post_time_last_time.timestamp() logging.info('************last time: %s************' % get_post_time_last_time) #using cookieJar & HTTPCookieProcessor to automatically handle cookies cj = http.cookiejar.CookieJar() opener = urllib.request.build_opener( urllib.request.HTTPCookieProcessor(cj)) urllib.request.install_opener(opener) pda_url = 'http://www.hi-pda.com/' pda_request = HttpRequest(pda_url) pda_request.send_request() pda_resp_content = pda_request.get_resp_content() formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/' formhash_request = HttpRequest(formhash_url, None, {'Host': 'www.hi-pda.com'}) formhash_request.send_request() formhash_resp_content = formhash_request.get_resp_content() try: formhash_resp_content = formhash_resp_content.decode('gbk') except UnicodeDecodeError as e: logging.error('Decode formhash response content failed.') logging.exception(e) # print( formhash_resp_content )
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one hi-pda forum post and store it when it is new.

    The post page is requested, its summary is taken from the
    ``description`` meta tag and its time from the "edited at" note inside
    the summary or, failing that, from the "posted at" element of the page.
    A post whose time is later than ``time_last_time`` is inserted into the
    ``posts`` table.

    Args:
        post_url: post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: value stored in the ``post_type`` column.
        post_name: post title, stored in the ``post_title`` column.
        time_last_time: timestamp of the previous crawl; only posts with a
            strictly greater timestamp are stored.

    Returns:
        The ``Post`` object built for the inserted row, or ``None`` when the
        request times out, the summary or the time cannot be extracted, or
        the post is not newer than the previous crawl.

    Raises:
        ValueError: if the extracted time is not in ``%Y-%m-%d %H:%M`` form.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    post_request = HttpRequest(post_full_url, None, post_headers)
    try:
        post_request.send_request()
    except TimeoutError:
        logging.warning(" Request url[%s] failed. " % post_full_url)
        # There is no response to parse after a timeout.
        return None

    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError as e:
        logging.error('Decode post response content failed.')
        logging.exception(e)
        # The patterns below are str patterns and cannot search bytes, so
        # fall back to a lossy decode instead of keeping the raw bytes.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # The summary lives in the description meta tag, e.g.
    # <meta name="description" content=" Hi!PDA ... 于 2015-12-1 22:59 编辑 ... - Board" />
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        # Without a summary there is nothing to store.
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the "last edited" time embedded in the summary; otherwise use
    # the publication time found in the page body.
    result_update_time = re.search(r'''于 (.*) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        result_create_time = re.search(
            r'''<em id=".+">发表于 (.+)</em>''', post_resp_content)
        if result_create_time is None:
            # No time at all: strptime(None) would raise, so give up here.
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    post_create_time_datetime = datetime.strptime(post_create_time,
                                                  '%Y-%m-%d %H:%M')
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts later than the previous crawl are stored.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.' %
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with a parameterized insert; connection and cursor
    # are closed even when the insert fails.
    conn = mysql.connector.connect(user=db_user,
                                   password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [
                    post_id, post_type, post_name, 'hipda', post_content,
                    post_full_url, post_create_time_stamp,
                    post_create_time_stamp
                ])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause for a second after each stored post (presumably to throttle the
    # crawl -- confirm with the caller).
    time.sleep(1)
    return post
def getRequest(self):
    """Return a urllib2.Request for self.url with self.params as its data.

    The parameters are encoded with HttpRequest.urlencode and passed as the
    request's data argument."""
    encoded_params = HttpRequest.urlencode(self.params)
    return urllib2.Request(self.url, encoded_params)
# Turn the stored "last crawl" marker into a timestamp.  An empty marker is
# replaced by 0; otherwise it is parsed as 'YYYY-mm-dd HH:MM:SS'.
# NOTE(review): the nesting was reconstructed from a copy that had lost its
# indentation; the logging call is assumed to follow the if/else -- confirm.
if get_post_time_last_time == '':
    get_post_time_last_time = 0
    get_post_time_last_time_stamp = 0
else:
    get_post_time_last_time = datetime.strptime( get_post_time_last_time, '%Y-%m-%d %H:%M:%S' )
    get_post_time_last_time_stamp = get_post_time_last_time.timestamp()
logging.info( '************last time: %s************' % get_post_time_last_time )

# Use CookieJar & HTTPCookieProcessor to handle cookies automatically; the
# opener is installed globally for urllib.request.
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)

# Request the site's front page first.
pda_url = 'http://www.hi-pda.com/'
pda_request = HttpRequest( pda_url )
pda_request.send_request()
pda_resp_content = pda_request.get_resp_content()

# Request the login page; its GBK-encoded content is decoded below.
formhash_url = 'http://www.hi-pda.com/forum/logging.php?action=login&referer=http%3A//www.hi-pda.com/forum/'
formhash_request = HttpRequest( formhash_url, None, { 'Host' : 'www.hi-pda.com' } )
formhash_request.send_request()
formhash_resp_content = formhash_request.get_resp_content()
try:
    formhash_resp_content = formhash_resp_content.decode('gbk')
except UnicodeDecodeError as e:
    # The content stays undecoded (bytes) when decoding fails.
    logging.error( 'Decode formhash response content failed.' )
    logging.exception( e )
# print( formhash_resp_content )
# Example of the hidden form field found in the login page:
# <input type="hidden" name="formhash" value="2f68efff" />
def get_post_content_and_time(post_url, post_type, post_name, time_last_time):
    """Fetch one hi-pda forum post and store it when it is new.

    The post page is requested, its summary is taken from the
    ``description`` meta tag and its time from the "edited at" note inside
    the summary or, failing that, from the "posted at" element of the page.
    A post whose time is later than ``time_last_time`` is inserted into the
    ``posts`` table.

    Args:
        post_url: post URL relative to ``http://www.hi-pda.com/forum/``.
        post_type: value stored in the ``post_type`` column.
        post_name: post title, stored in the ``post_title`` column.
        time_last_time: timestamp of the previous crawl; only posts with a
            strictly greater timestamp are stored.

    Returns:
        The ``Post`` object built for the inserted row, or ``None`` when the
        summary or the time cannot be extracted or the post is not newer
        than the previous crawl.

    Raises:
        ValueError: if the extracted time is not in ``%Y-%m-%d %H:%M`` form.
    """
    post_full_url = 'http://www.hi-pda.com/forum/' + post_url
    post_headers = {
        'Referer': 'http://www.hi-pda.com/forum/',
        'Host': 'www.hi-pda.com',
    }
    logging.info('Get post[%s] by url[%s].', post_name, post_url)

    # Request the post page by its URL.
    post_request = HttpRequest(post_full_url, None, post_headers)
    post_request.send_request()
    post_resp_content = post_request.get_resp_content()
    try:
        post_resp_content = post_resp_content.decode('gbk')
    except UnicodeDecodeError as e:
        logging.error('Decode post response content failed.')
        logging.exception(e)
        # The patterns below are str patterns and cannot search bytes, so
        # fall back to a lossy decode instead of keeping the raw bytes.
        post_resp_content = post_resp_content.decode('gbk', errors='replace')

    # Parse the post summary out of the page's description meta tag.
    result_content = re.search(
        r'''<meta name="description" content="(.*)" />''', post_resp_content)
    if result_content is None:
        # Without a summary there is nothing to store.
        logging.warning('Request failed.')
        logging.warning('Get post conetent failed.')
        return None
    post_content = result_content.group(1)

    # Prefer the "last edited" time embedded in the summary; otherwise use
    # the publication time found in the page body.
    result_update_time = re.search(r'''于 (.*) 编辑''', post_content)
    if result_update_time is not None:
        post_create_time = result_update_time.group(1)
    else:
        result_create_time = re.search(
            r'''<em id=".+">发表于 (.+)</em>''', post_resp_content)
        if result_create_time is None:
            # No time at all: strptime(None) would raise, so give up here.
            logging.warning('Get post time failed.')
            return None
        post_create_time = result_create_time.group(1)

    post_create_time_datetime = datetime.strptime(post_create_time,
                                                  '%Y-%m-%d %H:%M')
    post_create_time_stamp = post_create_time_datetime.timestamp()

    # Only posts later than the previous crawl are targets of this crawl.
    if post_create_time_stamp <= time_last_time:
        logging.info('Post time[%s] is not after last time.' %
                     post_create_time_datetime)
        return None

    logging.info('post_type:%s', post_type)
    logging.info('post_name:%s', post_name)
    logging.info('post_url:%s', post_full_url)
    logging.info('post_create_time:%s', post_create_time)
    logging.info('post_content:%s', post_content)

    post_id = next_id()
    post = Post(id=post_id,
                post_type=post_type,
                post_title=post_name,
                post_owner='hipda',
                post_content=post_content,
                post_link=post_full_url,
                post_time=post_create_time)

    # The row is written with a parameterized insert; connection and cursor
    # are closed even when the insert fails.
    conn = mysql.connector.connect(user=db_user,
                                   password=db_passwd,
                                   database=db_name)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                'insert into posts (id, post_type, post_title, post_owner, post_content, post_link, post_time, created_at ) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                [
                    post_id, post_type, post_name, 'hipda', post_content,
                    post_full_url, post_create_time_stamp,
                    post_create_time_stamp
                ])
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    # Pause for a second after each stored post (presumably to throttle the
    # crawl -- confirm with the caller).
    time.sleep(1)
    return post
def getRequest(self):
    """Build the urllib2.Request for this call.

    self.params is encoded with HttpRequest.urlencode and supplied as the
    data of a request to self.url."""
    target_url = self.url
    body = HttpRequest.urlencode(self.params)
    request = urllib2.Request(target_url, body)
    return request