Example #1
 def get_court_data(self):
     url = 'http://www.gpai.net/sf/courtList.do'
     html = UrlUtil.get_html(url)
     court_data_list = re.findall(
         re.compile(
             r'<a href="\S*?court\.do\?id=(\d+)" target="_blank">(\S*?)</a>\s*<span class="iconfont-sf">\((\d+)',
             re.S), html.decode('utf-8'))
     return court_data_list
 def spider_auction_list_and_insert(self, url, court_id, category_id, status_id, mysql_instance, table_name):
     item_html = UrlUtil.get_html_with_proxy(url, False)
     url_partial_list = re.findall(re.compile(r'"\/\/sf-item\.taobao\.com\/sf_item\/(\S+\.htm)'), item_html.decode('gbk'))
     for url_partial in url_partial_list:
         url = 'https://sf-item.taobao.com/sf_item/' + url_partial
         auction_json = self.get_auction_json(url, court_id, category_id, status_id)
         mysql_instance.upsert_auction(auction_json, table_name)
     return len(url_partial_list)
 def spider_auction_and_insert(self, url, court_id, category_id, status_id,
                               mysql_instance):
     item_html = UrlUtil.get_html_with_proxy(url)
     url_list = re.findall(
         re.compile(r'<a href="(\S*?)item2\.do(\S*?)"><img'),
         item_html.decode('utf8'))
     for urls in url_list:
         url = 'http://www.gpai.net/sf/item2.do' + urls[1]
         auction_json = self.get_auction_json(url, court_id, category_id,
                                              status_id)
         mysql_instance.upsert_auction(auction_json)
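All of the examples on this page call into a UrlUtil helper module that is not shown. A minimal sketch of what its fetch helpers might look like, built on requests; every name, signature, and the proxy address below is inferred from the call sites above and is an assumption, not the real implementation:

# Hypothetical sketch of the UrlUtil fetch helpers assumed above.
import requests

PROXIES = {'http': 'http://127.0.0.1:8080'}  # placeholder proxy address

def get_html(url):
    # Return the raw body bytes; callers decode themselves
    # (utf-8 for www.gpai.net, gbk for sf-item.taobao.com).
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.content

def get_html_with_proxy(url, use_proxy=True):
    # The boolean flag is assumed to toggle proxy use, matching the
    # bare `False` second argument seen in the call sites.
    response = requests.get(url, proxies=PROXIES if use_proxy else None, timeout=10)
    response.raise_for_status()
    return response.content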
Example #4
 def converUrl(self, url):
     try:
         u = UrlUtil(url)
         # File name: md5(host)_md5(path) plus the original suffix,
         # defaulting to .html when the URL has none.
         suffix = u.suffix if u.suffix != '' else '.html'
         return self.getMd5(u.host) + '_' + self.getMd5(u.path) + suffix
     except Exception:
         return ''
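converUrl maps a URL to a flat local file name: md5(host)_md5(path) plus the original suffix. A plausible sketch of the getMd5 helper it relies on (an assumption; the real helper is not shown):

import hashlib

def getMd5(self, text):
    # Hypothetical: hex digest of the UTF-8 encoded input, so
    # converUrl yields names like '<md5(host)>_<md5(path)>.html'.
    return hashlib.md5(text.encode('utf-8')).hexdigest()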
Example #5
 def __getSongs__(self):
     print("----getSongs----")
     url = self.__urlList__.pop()
     if self.__usedList__.count(url) == 0:
         self.__usedList__.append(url)
         html = UrlUtil.get_content(url)
         if html is None:
             # Fetch failed: delegate to a fresh recursive call so its items
             # are yielded directly rather than a nested generator object.
             yield from self.__getSongs__()
         else:
             soup = BeautifulSoup(html, 'html5lib')
             print(url)
             if self.TYPE == self.__CLASSIFY__:
                 yield self.__getClassifyMusic__(soup)
     return
Example #6
    def get_auction_json(self, url, court_id, category_id, status_id):
        auction_json = {}
        html = UrlUtil.get_html_with_proxy(url, False)
        soup = BeautifulSoup(html, 'html.parser', from_encoding='gbk')

        auction_json['AuctionModel'] = ""
        auction_json['AuctionType'] = ""
        auction_json['SellingPeriod'] = ""
        # The delay-td cell holds both the auction round and the delay cycle.
        auction_json['AuctionTimes'] = soup.find('td', class_='delay-td').find_all('span')[1].text[1:]
        auction_json['OnlineCycle'] = soup.find('span', class_='pay-mark').text
        auction_json['DelayCycle'] = soup.find('td', class_='delay-td').text.replace('\n', '').strip()

        auction_json['CashDeposit'] = ""
        auction_json['PaymentAdvance'] = ""

        # J_HoverShow is the summary table at the top of the item page.
        top_info = soup.find('tbody', id='J_HoverShow')
        tds = top_info.find_all('td')
        start_price_span = tds[0].find_all('span')[2]
        increment_span = tds[1].find_all('span')[2]
        auction_type_span = tds[2].find_all('span')[1].span
        cash_deposit_span = tds[3].find_all('span')[1].span
        auction_cycle_span = tds[4].find_all('span')[1].span
        prior_buyer_span = tds[5].find_all('span')[1]
        access_price_span = tds[6].find_all('span')[1].span

        self.assign_auction_property(auction_json, 'StartPrice', start_price_span, True)
        self.assign_auction_property(auction_json, 'FareIncrease', increment_span, True)
        self.assign_auction_property(auction_json, 'CashDeposit', cash_deposit_span, True)
        self.assign_auction_property(auction_json, 'AccessPrice', access_price_span, True)

        auction_json['Title'] = soup.find('h1').text.replace(u"\u2022", u" ").replace(u"\xa0", u" ").strip()
        auction_json['CurrentPrice'] = soup.find('span', class_='pm-current-price').text.replace(',', '').strip()
        auction_json['CorporateAgent'] = soup.find('span', class_='item-announcement').text.strip()
        auction_json['Phone'] = soup.find('div', class_='contact-unit').find('p', class_='contact-line').find('span', class_='c-text').text
        auction_json['BiddingRecord'] = soup.find('span', class_='current-bid-user').text.strip() if soup.find('span', class_='current-bid-user') else ''
        auction_json['SetReminders'] = soup.find('span', class_='pm-reminder').find('em').text if soup.find('span', class_='pm-reminder') else 0
        auction_json['Onlookers'] = soup.find('span', class_='pm-surround').find('em').text if soup.find('span', class_='pm-surround') else 0
        auction_json['Enrollment'] = soup.find('em', class_='J_Applyer').text

        auction_json['Url'] = url
        auction_json['datetime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
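        # 35 = len('https://sf-item.taobao.com/sf_item/'); -4 strips '.htm'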
        auction_json['AuctionId'] = url[35:-4]
        auction_json['CourtId'] = court_id
        auction_json['CategoryId'] = category_id
        auction_json['StatusId'] = status_id
        return auction_json
Example #7
def get_all_request_url_history_url(user):
    """
    获取此用户的访问历史记录
    :return: 返回一个列表  [(tie, raw_url, tran_url, request_number),(),...]
    """
    request_url_list = RequestHistory.objects.filter(user=user).order_by('-request_number')[:15]
    result = []
    for request_url in request_url_list:
        raw_url = request_url.url_info
        tran_url = UrlUtil.get_tran_url(raw_url)
        title = request_url.url.title
        request_number = request_url.request_number
        result.append((title, raw_url, tran_url, request_number))

    return result
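A hedged usage sketch: wiring the helper into a Django view. The view name and template are assumptions, not part of the original project:

from django.shortcuts import render

def history_view(request):
    # Hypothetical view; the helper returns up to 15 tuples of
    # (title, raw_url, tran_url, request_number), busiest URLs first.
    history = get_all_request_url_history_url(request.user)
    return render(request, 'history.html', {'history': history})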
Example #9
def run():
    url = client.pop()
    if url is not None:
        urlUtil = UrlUtil(url)  # parsed URL info
        python_shell = ''
        if url.find('/games/') >= 1:  # /games/ holds the game download section
            is_game_html = re.findall(r"\b/\d*\.htm\b", url)
            # game download page
            if len(is_game_html) >= 1:
                python_shell = "nohup python ./GameDown.py " + url + " www_newyx_net > /dev/null &"
            else:
                python_shell = "nohup python ./Content.py " + url + " www_newyx_net > /dev/null &"
        elif len(re.findall(r"/[^\s]+\.(jpg|gif|png|bmp)", url)) >= 1:
            python_shell = "nohup python ./ImageDown.py " + url + " www_newyx_net > /dev/null &"
        else:
            python_shell = "nohup python ./Content.py " + url + " www_newyx_net > /dev/null &"
        print(python_shell)
        os.popen(python_shell)
    else:
        print('url is null')
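os.popen with a hand-assembled shell string is fragile, since the URL is interpolated unquoted. A sketch of the same dispatch using subprocess with an argument list, offered as an assumed-equivalent alternative rather than the author's original approach:

import subprocess

def launch_worker(script, url, host='www_newyx_net'):
    # Hypothetical replacement for the os.popen call above: run the
    # worker in the background without a shell, so no quoting is needed.
    subprocess.Popen(
        ['python', script, url, host],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )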
 def get_user_id(self, url):
     page = UrlUtil.get_html_with_proxy(url, False)
     user_id_part = re.findall(re.compile(r'<input type="hidden" name="userId" value="(\d+)"'), page.decode('gbk'))
     return user_id_part[0]
 def get_total_count(self, url):
     page = UrlUtil.get_html_with_proxy(url, False)
     total_counts = re.findall(re.compile(r'<em class="count">(\d+)</em>'), page.decode('gbk'))
     return int(total_counts[0])
    def get_auction_json(self, url, court_id, category_id, status_id):
        auction_json = {}
        html = UrlUtil.get_html_with_proxy(url)
        et = etree.HTML(html)
        auction_model_div = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[1]/td[1]/text()')
        auction_json['AuctionModel'] = ""
        if len(auction_model_div) != 0:
            auction_model = auction_model_div[0]
            # Strip the label prefix; its length varies with the page variant.
            if len(auction_model) > 7:
                auction_json['AuctionModel'] = auction_model[7:]
            else:
                auction_json['AuctionModel'] = auction_model[5:]
        auction_json['SellingPeriod'] = ""
        auction_json['AuctionTimes'] = ""
        auctionTimes = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[1]/td[2]/text()')
        if len(auctionTimes) != 0:
            auction_times = auctionTimes[0]
            if auction_json['AuctionModel'] == '变卖':  # sell-off listing
                auction_json['SellingPeriod'] = auction_times[4:]
            else:
                auction_json['AuctionTimes'] = auction_times[5:]

        self.assign_auction_property_et(
            auction_json, 'AuctionType', et,
            '//div[@class="d-m-tb"]/table[1]/tr[1]/td[3]/text()', 5)

        onlineCycle = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[2]/td[1]/text()')
        auction_json['OnlineCycle'] = ""
        if len(onlineCycle) != 0:
            online_cycle = onlineCycle[0]
            if len(online_cycle) > 8:
                auction_json['OnlineCycle'] = online_cycle[6:]
            else:
                auction_json['OnlineCycle'] = online_cycle[4:]

        self.assign_auction_property_et(
            auction_json, 'DelayCycle', et,
            '//div[@class="d-m-tb"]/table[1]/tr[2]/td[2]/text()', 5)
        self.assign_auction_property(auction_json, 'FareIncrease', html,
                                     r'<span id="Price_Step">(.*?)</span>',
                                     True)
        self.assign_auction_property(auction_json, 'StartPrice', html,
                                     r'<span id="Price_Start">(.*?)</span>',
                                     True)

        auction_json['CashDeposit'] = ""
        auction_json['PaymentAdvance'] = ""
        if auction_json['AuctionModel'] == '变卖':  # sell-off listing
            paymentAdvance = et.xpath(
                '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()')
            cashDeposit = et.xpath(
                '//div[@class="d-m-tb"]/table[1]/tr[3]/td[3]/text()')
            if len(paymentAdvance) != 0:
                payment_advance = paymentAdvance[0]
                cash_deposit = cashDeposit[0]
                auction_json['CashDeposit'] = cash_deposit[4:].replace(",", "")
                auction_json['PaymentAdvance'] = payment_advance[6:].replace(",", "")
        else:
            cashDeposit = et.xpath(
                '//div[@class="d-m-tb"]/table[1]/tr[3]/td[2]/text()')
            if len(cashDeposit) != 0:
                cash_deposit = cashDeposit[0]
                auction_json['CashDeposit'] = cash_deposit[4:].replace(",", "")

        accessPrice = et.xpath(
            '//div[@class="d-m-tb"]/table[1]/tr[4]/td[1]/text()')
        auction_json['AccessPrice'] = ""
        if len(accessPrice) != 0:
            access_price = accessPrice[0]
            # Strip the label, thousands separators, and stray tabs.
            auction_json['AccessPrice'] = access_price[4:].replace(",", "").replace("\t", "")

        self.assign_auction_property(auction_json, 'Title', html,
                                     r'class="d-m-title"><b>(.*?)</b>', True)
        self.assign_auction_property_et(
            auction_json, 'Enrollment', et,
            '//div[@class="peoples-infos"]/span[1]/b[1]/text()')
        self.assign_auction_property_et(
            auction_json, 'SetReminders', et,
            '//div[@class="peoples-infos"]/span[2]/b[1]/text()')
        self.assign_auction_property_et(
            auction_json, 'Onlookers', et,
            '//div[@class="peoples-infos"]/span[3]/b[1]/text()')
        self.assign_auction_property(auction_json, 'CourtName', html,
                                     r"<td nowrap class='pr7'>(.*?)</td>",
                                     False, 5)
        self.assign_auction_property(auction_json, 'CorporateAgent', html,
                                     r"<td valign='top'>(.*?)</td>", False, 4)
        self.assign_auction_property(auction_json, 'Phone', html,
                                     r"<td colspan='2'>(.*?)</td>", False, 5)
        self.assign_auction_property(auction_json, 'BiddingRecord', html,
                                     r"id='html_Bid_Shu'>(.*?)</span>", True)
        self.assign_auction_property(auction_json, 'CurrentPrice', html,
                                     r"<b class='price-red'>(.*?)</b>", True)

        auction_json['Url'] = url
        auction_json['datetime'] = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        auction_json['AuctionId'] = url[44:]
        auction_json['CourtId'] = court_id
        auction_json['CategoryId'] = category_id
        auction_json['StatusId'] = status_id
        return auction_json
Example #13
# -*- coding: UTF-8 -*-
# -*- author: kexiaohong -*-
import UrlUtil
import SongUtil
import ConstData
from bs4 import BeautifulSoup

if __name__ == '__main__':
    html = UrlUtil.get_content(ConstData.baseBaiduUrl)
    if html is not None:
        soup = BeautifulSoup(html, 'html5lib')
        SongUtil.getindexsongs(soup)
        SongUtil.gettypesongs(soup)

Example #14
import sys

from client import *

# url = 'http://www.newyx.net/zq/wuxiadanji/'
# host = 'www_newyx_net'
url = ''
host = ''
if __name__ == '__main__':
    url = sys.argv[1]
    host = sys.argv[2]

if url != '':
    mq = lg_mq_client("127.0.0.1")
    f = FileUtil(host)
    f.initPath()  # initialise the directory structure
    u = Util(url)
    urlinfo = UrlUtil(url)
    soup = u.getSoup()
    info = u.getInfo(soup)
    main_md5 = u.getMd5(urlinfo.host)
    minor_md5 = u.getMd5(urlinfo.path)
    f.saveInfo(main_md5 + "_" + minor_md5, info)

    # filter out the top navigation menu
    u.filterColumn(soup)

    a_list = u.filterHtmlByA(soup, urlinfo)
    img_list = u.filterHtmlByImg(soup, urlinfo, f)

    for a in a_list:
        mq.push(a)
Example #15
import sys
import urllib.request
from UrlUtil import *
from Util import *
from FileUtil import *
import logging
import logging.config

logging.config.fileConfig("../config/logging.conf")
logger = logging.getLogger()
# url = 'http://img.newyx.net/tj/201702/28/798fac1681.jpg'
# host = 'www_newyx_net'

url = ''
host = ''
if __name__ == '__main__':
    url = sys.argv[1]
    host = sys.argv[2]

if url != '':
    u = UrlUtil(url)
    util = Util(url)
    file = FileUtil(host)
    main_md5 = util.getMd5(u.host)
    fall_md5 = util.getMd5(u.path_suffix)
    urllib.request.urlretrieve(url, file.image + util.converUrl(url))
    logger.info('success=>' + url)
else:
    logger.error('err=>no image path' + url)
 def get_total_count(self, url):
     page = UrlUtil.get_html_with_proxy(url, False)
     total_counts = re.findall(re.compile(r'<label>(.*?)</label>'),
                               page.decode('utf8'))
     return int(total_counts[0])
Example #17
def change_url(pre_url):
    # Static assets keep their original URL; only page URLs are rewritten.
    un_change_patterns = ('css', 'png', 'js', 'ico', 'tgz', 'zip', 'rar', 'pdf', 'gif', 'git')
    if pre_url.endswith(un_change_patterns):
        return pre_url
    return UrlUtil.get_tran_page_url(pre_url)
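Usage sketch with illustrative URLs; UrlUtil.get_tran_page_url is assumed to rewrite page URLs to their translated form:

change_url('http://example.com/logo.png')   # static asset: returned unchanged
change_url('http://example.com/article/1')  # page URL: rewritten by UrlUtil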
Example #18
 def get_court_auction_count(self, court_auction_count_url):
     court_auction_count_json = UrlUtil.get_json(court_auction_count_url)
     return court_auction_count_json['data']
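A minimal sketch of the UrlUtil.get_json helper this assumes: fetch the URL and parse the body as JSON (the real implementation is not shown):

import requests

def get_json(url):
    # Hypothetical: GET the endpoint and decode the JSON body, e.g. the
    # court auction count response read above via ['data'].
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.json()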
Example #19
 def get_court_data(self, court_list_url, court_item_regex):
     html = UrlUtil.get_html(court_list_url)
     court_data_list = re.findall(re.compile(court_item_regex, re.S),
                                  html.decode('utf8'))
     return court_data_list
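A usage sketch reusing the court-list URL and item regex from Example #1; `spider` stands in for an instance of the defining class and is hypothetical:

courts = spider.get_court_data(
    'http://www.gpai.net/sf/courtList.do',
    r'<a href="\S*?court\.do\?id=(\d+)" target="_blank">(\S*?)</a>\s*<span class="iconfont-sf">\((\d+)',
)
# -> [(court_id, court_name, auction_count), ...]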