def get_court_data(self):
    url = 'http://www.gpai.net/sf/courtList.do'
    html = UrlUtil.get_html(url)
    # Each match is a (court_id, court_name, auction_count) tuple.
    court_data_list = re.findall(
        re.compile(
            r'<a href="\S*?court\.do\?id=(\d+)" target="_blank">(\S*?)</a>\s*<span class="iconfont-sf">\((\d+)',
            re.S),
        html.decode('utf-8'))
    return court_data_list
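# The regex above yields one tuple per court; a quick sketch of consuming
# the result (the loop variable names are illustrative, not from the source):
#
#   for court_id, court_name, auction_count in self.get_court_data():
#       print court_id, court_name, auction_count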
def spider_auction_list_and_insert(self, url, court_id, category_id, status_id,
                                   mysql_instance, table_name):
    item_html = UrlUtil.get_html_with_proxy(url, False)
    url_partial_list = re.findall(
        re.compile(r'"//sf-item\.taobao\.com/sf_item/(\S+\.htm)'),
        item_html.decode('gbk'))
    for url_partial in url_partial_list:
        url = 'https://sf-item.taobao.com/sf_item/' + url_partial
        auction_json = self.get_auction_json(url, court_id, category_id, status_id)
        mysql_instance.upsert_auction(auction_json, table_name)
    return len(url_partial_list)
def spider_auction_and_insert(self, url, court_id, category_id, status_id,
                              mysql_instance):
    item_html = UrlUtil.get_html_with_proxy(url)
    # Capture the query string (including the leading '?') in the second
    # group so it can be re-appended to the canonical item2.do URL below.
    url_list = re.findall(
        re.compile(r'<a href="(\S*?)item2\.do(\?\S*?)"><img'),
        item_html.decode('utf8'))
    for urls in url_list:
        url = 'http://www.gpai.net/sf/item2.do' + urls[1]
        auction_json = self.get_auction_json(url, court_id, category_id, status_id)
        mysql_instance.upsert_auction(auction_json)
def converUrl(self, url):
    try:
        u = UrlUtil(url)
        # URLs without a file suffix are treated as HTML pages; the local
        # file name is built from the MD5 of the host plus the MD5 of the path.
        if u.suffix == '':
            new_url = self.getMd5(u.host) + '_' + self.getMd5(u.path) + '.html'
        else:
            new_url = self.getMd5(u.host) + '_' + self.getMd5(u.path) + u.suffix
        return new_url
    except Exception:
        return ''
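# A minimal usage sketch for converUrl, assuming Util wraps a URL and
# exposes getMd5/converUrl as the call sites elsewhere in this collection
# suggest (the example URLs are illustrative):
#
#   util = Util('http://www.newyx.net/zq/wuxiadanji/')
#   util.converUrl('http://www.newyx.net/zq/wuxiadanji/')
#   # -> '<md5(host)>_<md5(path)>.html', since the URL has no file suffix;
#   # 'http://img.newyx.net/tj/201702/28/798fac1681.jpg' would keep '.jpg'.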
def __getSongs__(self):
    print "----getSongs----"
    url = self.__urlList__.pop()
    if self.__usedList__.count(url) == 0:
        self.__usedList__.append(url)
        html = UrlUtil.get_content(url)
        if html is None:
            # Fetch failed: retry with the next queued URL. Python 2 has no
            # "yield from", so delegate to the sub-generator explicitly
            # instead of yielding the generator object itself.
            for item in self.__getSongs__():
                yield item
        else:
            soup = BeautifulSoup(html, 'html5lib')
            print url
            if self.TYPE == self.__CLASSIFY__:
                yield self.__getClassifyMusic__(soup)
    return
def get_auction_json(self, url, court_id, category_id, status_id):
    auction_json = {}
    html = UrlUtil.get_html_with_proxy(url, False)
    soup = BeautifulSoup(html, 'html.parser', from_encoding='gbk')
    auction_json['AuctionModel'] = ""
    auction_json['AuctionType'] = ""
    auction_json['SellingPeriod'] = ""
    auction_json['AuctionTimes'] = soup.find('td', class_='delay-td').find_all('span')[1].text[1:]
    auction_json['OnlineCycle'] = soup.find('span', class_='pay-mark').text
    auction_json['DelayCycle'] = soup.find('td', class_='delay-td').text.replace('\n', '').strip()
    auction_json['CashDeposit'] = ""
    auction_json['PaymentAdvance'] = ""
    # The summary table (tbody#J_HoverShow) lays the key figures out in a
    # fixed cell order; auction_type/auction_cycle/prior_buyer are located
    # here as well but are not copied into the result below.
    top_info = soup.find('tbody', id='J_HoverShow')
    tds = top_info.find_all('td')
    start_price_span = tds[0].find_all('span')[2]
    increment_span = tds[1].find_all('span')[2]
    auction_type_span = tds[2].find_all('span')[1].span
    cash_deposit_span = tds[3].find_all('span')[1].span
    auction_cycle_span = tds[4].find_all('span')[1].span
    prior_buyer_span = tds[5].find_all('span')[1]
    access_price_span = tds[6].find_all('span')[1].span
    self.assign_auction_property(auction_json, 'StartPrice', start_price_span, True)
    self.assign_auction_property(auction_json, 'FareIncrease', increment_span, True)
    self.assign_auction_property(auction_json, 'CashDeposit', cash_deposit_span, True)
    self.assign_auction_property(auction_json, 'AccessPrice', access_price_span, True)
    auction_json['Title'] = soup.find('h1').text.replace(u"\u2022", u" ").replace(u"\xa0", u" ").strip()
    auction_json['CurrentPrice'] = soup.find('span', class_='pm-current-price').text.replace(',', '').strip()
    auction_json['CorporateAgent'] = soup.find('span', class_='item-announcement').text.strip()
    auction_json['Phone'] = soup.find('div', class_='contact-unit').find('p', class_='contact-line').find('span', class_='c-text').text
    auction_json['BiddingRecord'] = soup.find('span', class_='current-bid-user').text.strip() if soup.find('span', class_='current-bid-user') else ''
    auction_json['SetReminders'] = soup.find('span', class_='pm-reminder').find('em').text if soup.find('span', class_='pm-reminder') else 0
    auction_json['Onlookers'] = soup.find('span', class_='pm-surround').find('em').text if soup.find('span', class_='pm-surround') else 0
    auction_json['Enrollment'] = soup.find('em', class_='J_Applyer').text
    auction_json['Url'] = url
    auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # Slice the numeric item id out of URLs shaped like
    # https://sf-item.taobao.com/sf_item/<id>.htm
    auction_json['AuctionId'] = url[35:-4]
    auction_json['CourtId'] = court_id
    auction_json['CategoryId'] = category_id
    auction_json['StatusId'] = status_id
    return auction_json
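# assign_auction_property is called above but not shown in this snippet.
# A minimal sketch of what the span-based variant might look like; the
# boolean "numeric" flag and the comma-stripping are assumptions inferred
# from the call sites, not the confirmed implementation:
def assign_auction_property(self, auction_json, key, span, is_numeric=False):
    if span is None:
        auction_json[key] = ""
        return
    value = span.text.strip()
    if is_numeric:
        # Figures such as prices arrive as "12,345.00"; drop the commas.
        value = value.replace(',', '')
    auction_json[key] = value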
def get_all_request_url_history_url(user):
    """
    Get this user's request history.
    :return: a list of tuples [(title, raw_url, tran_url, request_number), (), ...]
    """
    request_url_list = RequestHistory.objects.filter(
        user=user).order_by('-request_number')[:15]
    result = []
    for request_url in request_url_list:
        raw_url = request_url.url_info
        tran_url = UrlUtil.get_tran_url(raw_url)
        title = request_url.url.title
        request_number = request_url.request_number
        result.append((title, raw_url, tran_url, request_number))
    return result
def run():
    url = client.pop()
    if url is not None:
        urlUtil = UrlUtil(url)
        python_shell = ''
        if url.find('/games/') >= 1:
            # Pages under /games/ are the game download section.
            is_game_html = re.findall(r"\b/\d*\.htm\b", url)
            if len(is_game_html) >= 1:
                # Game download page.
                python_shell = "nohup python ./GameDown.py " + url + " www_newyx_net > /dev/null &"
            else:
                python_shell = "nohup python ./Content.py " + url + " www_newyx_net > /dev/null &"
        elif len(re.findall(r"/[^\s]+\.(jpg|gif|png|bmp)", url)) >= 1:
            python_shell = "nohup python ./ImageDown.py " + url + " www_newyx_net > /dev/null &"
        else:
            python_shell = "nohup python ./Content.py " + url + " www_newyx_net > /dev/null &"
        print python_shell
        os.popen(python_shell)
    else:
        print 'url is null'
def get_user_id(self, url):
    page = UrlUtil.get_html_with_proxy(url, False)
    user_id_part = re.findall(
        re.compile(r'<input type="hidden" name="userId" value="(\d+)"'),
        page.decode('gbk'))
    return user_id_part[0]
def get_total_count(self, url):
    page = UrlUtil.get_html_with_proxy(url, False)
    total_counts = re.findall(re.compile(r'<em class="count">(\d+)</em>'),
                              page.decode('gbk'))
    return int(total_counts[0])
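# A small usage sketch: get_total_count is typically paired with a fixed
# page size to walk every list page. The page size and the '%d' URL
# template below are illustrative assumptions, not taken from the source.
def iter_list_page_urls(self, page_url_template, page_size=20):
    total = self.get_total_count(page_url_template % 1)
    page_count = (total + page_size - 1) // page_size  # ceiling division
    for page_no in range(1, page_count + 1):
        yield page_url_template % page_no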
def get_auction_json(self, url, court_id, category_id, status_id):
    auction_json = {}
    html = UrlUtil.get_html_with_proxy(url)
    et = etree.HTML(html)
    auction_model_div = et.xpath(
        '//div[@class="d-m-tb"]/table[1]/tr[1]/td[1]/text()')
    auction_json['AuctionModel'] = ""
    if len(auction_model_div) != 0:
        auction_model = auction_model_div[0]
        # The label prefix is longer on some pages; skip it accordingly.
        if len(auction_model) > 7:
            auction_json['AuctionModel'] = auction_model[7:]
        else:
            auction_json['AuctionModel'] = auction_model[5:]
    auction_json['SellingPeriod'] = ""
    auction_json['AuctionTimes'] = ""
    auction_times_div = et.xpath(
        '//div[@class="d-m-tb"]/table[1]/tr[1]/td[2]/text()')
    if len(auction_times_div) != 0:
        auction_times = auction_times_div[0]
        # '变卖' (sell-off) listings report a selling period instead of an
        # auction round count.
        if auction_json['AuctionModel'].encode('utf-8') == '变卖':
            auction_json['SellingPeriod'] = auction_times[4:]
        else:
            auction_json['AuctionTimes'] = auction_times[5:]
    self.assign_auction_property_et(
        auction_json, 'AuctionType', et,
        '//div[@class="d-m-tb"]/table[1]/tr[1]/td[3]/text()', 5)
    online_cycle_div = et.xpath(
        '//div[@class="d-m-tb"]/table[1]/tr[2]/td[1]/text()')
    auction_json['OnlineCycle'] = ""
    if len(online_cycle_div) != 0:
        online_cycle = online_cycle_div[0]
        if len(online_cycle) > 8:
            auction_json['OnlineCycle'] = online_cycle[6:]
        else:
            auction_json['OnlineCycle'] = online_cycle[4:]
    self.assign_auction_property_et(
        auction_json, 'DelayCycle', et,
        '//div[@class="d-m-tb"]/table[1]/tr[2]/td[2]/text()', 5)
    self.assign_auction_property(auction_json, 'FareIncrease', html,
                                 r'<span id="Price_Step">(.*?)</span>', True)
    self.assign_auction_property(auction_json, 'StartPrice', html,
                                 r'<span id="Price_Start">(.*?)</span>', True)
    auction_json['CashDeposit'] = ""
    auction_json['PaymentAdvance'] = ""
    if auction_json['AuctionModel'].encode('utf-8') == '变卖':
        payment_advance_div = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()')
        cash_deposit_div = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[3]/td[3]/text()')
        if len(payment_advance_div) != 0:
            payment_advance = payment_advance_div[0]
            cash_deposit = cash_deposit_div[0]
            auction_json['CashDeposit'] = cash_deposit[4:].replace(",", "")
            auction_json['PaymentAdvance'] = payment_advance[6:].replace(",", "")
    else:
        cash_deposit_div = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()')
        if len(cash_deposit_div) != 0:
            cash_deposit = cash_deposit_div[0]
            auction_json['CashDeposit'] = cash_deposit[4:].replace(",", "")
    access_price_div = et.xpath(
        '//div[@class="d-m-tb"]/table[1]/tr[4]/td[1]/text()')
    auction_json['AccessPrice'] = ""
    if len(access_price_div) != 0:
        access_price = access_price_div[0]
        auction_json['AccessPrice'] = access_price[4:].replace(",", "").replace(" ", "")
    self.assign_auction_property(auction_json, 'Title', html,
                                 r'class="d-m-title"><b>(.*?)</b>', True)
    self.assign_auction_property_et(
        auction_json, 'Enrollment', et,
        '//div[@class="peoples-infos"]/span[1]/b[1]/text()')
    self.assign_auction_property_et(
        auction_json, 'SetReminders', et,
        '//div[@class="peoples-infos"]/span[2]/b[1]/text()')
    self.assign_auction_property_et(
        auction_json, 'Onlookers', et,
        '//div[@class="peoples-infos"]/span[3]/b[1]/text()')
    self.assign_auction_property(auction_json, 'CourtName', html,
                                 r"<td nowrap class='pr7'>(.*?)</td>", False, 5)
    self.assign_auction_property(auction_json, 'CorporateAgent', html,
                                 r"<td valign='top'>(.*?)</td>", False, 4)
    self.assign_auction_property(auction_json, 'Phone', html,
                                 r"<td colspan='2'>(.*?)</td>", False, 5)
    self.assign_auction_property(auction_json, 'BiddingRecord', html,
                                 r"id='html_Bid_Shu'>(.*?)</span>", True)
    self.assign_auction_property(auction_json, 'CurrentPrice', html,
                                 r"<b class='price-red'>(.*?)</b>", True)
    auction_json['Url'] = url
    auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime(time.time()))
    # Everything after position 44 of the item URL is the auction id.
    auction_json['AuctionId'] = url[44:]
    auction_json['CourtId'] = court_id
    auction_json['CategoryId'] = category_id
    auction_json['StatusId'] = status_id
    return auction_json
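# Neither assign_auction_property nor assign_auction_property_et is shown
# in this snippet. Minimal sketches consistent with the call sites above;
# the parameter meanings (comma-stripping flag, match index, label prefix
# length) are inferred, not confirmed:
def assign_auction_property(self, auction_json, key, html, regex,
                            strip_commas=False, idx=0):
    # Pull the idx-th regex match out of the raw page HTML.
    matches = re.findall(re.compile(regex, re.S), html.decode('utf8'))
    value = matches[idx].strip() if len(matches) > idx else ""
    if strip_commas:
        value = value.replace(',', '')
    auction_json[key] = value

def assign_auction_property_et(self, auction_json, key, et, xpath, prefix_len=0):
    # Read an xpath text node and strip a fixed-length label prefix.
    nodes = et.xpath(xpath)
    auction_json[key] = nodes[0][prefix_len:].strip() if len(nodes) != 0 else ""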
# -*- coding: UTF-8 -*-
# -*- author: kexiaohong -*-
import UrlUtil
import SongUtil
import ConstData
from bs4 import BeautifulSoup

if __name__ == '__main__':
    html = UrlUtil.get_content(ConstData.baseBaiduUrl)
    if html is not None:
        soup = BeautifulSoup(html, 'html5lib')
        SongUtil.getindexsongs(soup)
        SongUtil.gettypesongs(soup)
from client import *

# url = 'http://www.newyx.net/zq/wuxiadanji/'
# host = 'www_newyx_net'
url = ''
host = ''

if __name__ == '__main__':
    url = sys.argv[1]
    host = sys.argv[2]
    if url != '':
        mq = lg_mq_client("127.0.0.1")
        f = FileUtil(host)
        f.initPath()  # Initialize the directory structure
        u = Util(url)
        urlinfo = UrlUtil(url)
        soup = u.getSoup()
        info = u.getInfo(soup)
        main_md5 = u.getMd5(urlinfo.host)
        minor_md5 = u.getMd5(urlinfo.path)
        f.saveInfo(main_md5 + "_" + minor_md5, info)
        # Filter out the top navigation menu.
        u.filterColumn(soup)
        a_list = u.filterHtmlByA(soup, urlinfo)
        img_list = u.filterHtmlByImg(soup, urlinfo, f)
        for a in a_list:
            mq.push(a)
import urllib
from UrlUtil import *
from Util import *
from FileUtil import *
import HTMLParser
import logging
import logging.config

logging.config.fileConfig("../config/logging.conf")
logger = logging.getLogger()

# url = 'http://img.newyx.net/tj/201702/28/798fac1681.jpg'
# host = 'www_newyx_net'
url = ''
host = ''

if __name__ == '__main__':
    url = sys.argv[1]
    host = sys.argv[2]
    if url != '':
        u = UrlUtil(url)
        util = Util(url)
        file = FileUtil(host)
        main_md5 = util.getMd5(u.host)
        fall_md5 = util.getMd5(u.path_suffix)
        # Save the image under its md5-based local file name.
        urllib.urlretrieve(url, file.image + util.converUrl(url))
        logger.info('success=>' + url)
    else:
        logger.error('err=>no image path given' + url)
def get_total_count(self, url):
    page = UrlUtil.get_html_with_proxy(url, False)
    total_counts = re.findall(re.compile(r'<label>(.*?)</label>'),
                              page.decode('utf8'))
    return int(total_counts[0])
def change_url(pre_url):
    # Static assets and archives are passed through untouched; everything
    # else is rewritten to its translated page URL.
    un_change_patterns = ['css', 'png', 'js', 'ico', 'tgz', 'zip',
                          'rar', 'pdf', 'gif', 'git']
    for un_change_pattern in un_change_patterns:
        if pre_url.endswith(un_change_pattern):
            return pre_url
    return UrlUtil.get_tran_page_url(pre_url)
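# Usage sketch (the example URLs are illustrative, not from the source):
#
#   change_url('http://example.com/static/site.css')
#   # -> returned unchanged, since it ends with 'css'
#   change_url('http://example.com/article/1.html')
#   # -> whatever UrlUtil.get_tran_page_url maps it to
#
# Note that endswith() compares bare suffixes, so a URL ending in 'mycss'
# would also be passed through; matching '.css' instead would be stricter.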
def get_court_auction_count(self, court_auction_count_url):
    court_auction_count_json = UrlUtil.get_json(court_auction_count_url)
    return court_auction_count_json['data']
def get_court_data(self, court_list_url, court_item_regex):
    html = UrlUtil.get_html(court_list_url)
    court_data_list = re.findall(re.compile(court_item_regex, re.S),
                                 html.decode('utf8'))
    return court_data_list
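# Usage sketch: this is the parameterised form of get_court_data from the
# top of this collection; the caller supplies the listing URL and the item
# regex. The values below mirror that hard-coded variant:
#
#   court_data = self.get_court_data(
#       'http://www.gpai.net/sf/courtList.do',
#       r'<a href="\S*?court\.do\?id=(\d+)" target="_blank">(\S*?)</a>'
#       r'\s*<span class="iconfont-sf">\((\d+)')
#   # -> [(court_id, court_name, auction_count), ...]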