Example #1
    def fetch_versions(self):
        if self.versions:
            return

        result = get(self.url)
        tables = result('.tabel95')
        self.title = tables.find('.titulo').contents()[0].strip()

        # every second table (skipping the first two and the last) describes one release
        for i, table in enumerate(tables[2:-1:2]):
            trs = query(table)('tr')

            release = trs.find('.NewsTitle').text().partition(',')[0]
            release = re.sub('version ', '', release, 0, re.I)

            infos = trs.next().find('.newsDate').eq(0).text()
            infos = re.sub('(?:should)? works? with ', '', infos, 0, re.I)

            for tr in trs[2:]:
                tr = query(tr)
                language = tr('.language')
                if not language:
                    continue

                completeness = language.next().text().partition(' ')[0]
                language = language.text()
                download = tr('a[href*=updated]') or tr('a[href*=original]')
                if not download:
                    continue
                hearing_impaired = \
                    bool(tr.next().find('img[title="Hearing Impaired"]'))
                download = download.attr.href
                self.add_version(download, language, release, infos,
                                 completeness, hearing_impaired)
Example #2
    def run(self):
        while 1:
            print('Starting crawl...')

            page_goods = []
            url = 'http://list.showjoy.com/search/?page={page}'.format(page=self.page)
            html = query(Sp(url).get('text'))
            containers = html('.brick-cover')

            # work out the maximum page on the first pass
            if self.max_count + self.max_page == 0:
                self.max_count = int(html('.highlight').text())
                self.max_page = int(self.max_count / 20) + 1
                print('Maximum page: ' + str(self.max_page))

            for g in containers:
                g = query(g)
                url = g('.brick-pic').eq(0).attr('href')
                img = g('.brick-pic').eq(0)('img').eq(0).attr('src')
                brand = {
                    'name': g('.brand').eq(0)('img').eq(0).attr('alt'),
                    'img': g('.brand').eq(0)('img').eq(0).attr('src')
                }
                title = g('.brick-title').text().strip()

                # convert the price text to a float
                price = g('.price').text().strip()
                price = float(re.findall(r'¥(\d+\.\d+)', price)[0])

                # convert the sales text to an int
                sales = g('.sales').text().strip()
                sales = int(re.findall(r'最近成交(\d+)笔', sales)[0])

                page_goods.append({
                    'url': url,
                    'img': img,
                    'brand': brand,
                    'title': title,
                    'price': price,
                    'sales': sales
                })

                print('{title}, price: {price}, sales: {sales}'.format(title=title, price=price, sales=sales))

            print('Page {page}: {count} items'.format(page=self.page, count=len(page_goods)))
            self.goods += page_goods
            self.page += 1
            if self.page > self.max_page:
                self.end_time = time.time()
                break

        print('Finished fetching all items. Pages: {max_page}, items: {count}, elapsed: {times}s'.format(
            max_page=self.max_page,
            count=len(self.goods),
            times=self.end_time - self.start_time))
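
The run() loop above reads several attributes (page, max_count, max_page, goods, start_time) that must exist before the loop starts. A minimal, hypothetical initializer sketch using only those names:

    def __init__(self):
        # hypothetical initializer; the attribute names are taken from run() above
        self.page = 1           # current results page
        self.max_count = 0      # total item count, read from the first page
        self.max_page = 0       # total page count, derived from max_count
        self.goods = []         # accumulated items
        self.start_time = time.time()
        self.end_time = None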
Example #3
 def __extractWeight(html):
     weight = {}
     try:
         query_site_info = query(html)('div.siteinfo')
         html_font = query_site_info('font')
         keys = ['baidu_weight', 'key_count', 'net_flow', 'place']
         for i in range(html_font.size()):
             weight[keys[i]] = query(html_font[i]).text()
     except Exception as e:
         print(e)
     return weight
Example #4
 def parse_next_page_href(html):
     next_page_list = []
     try:
         query_page_a_s = query(html)('#nav a.fl')
         for item in query_page_a_s:
             next_page_list.append(
                 'https://www.google.com.hk' + query(item).attr('href'))
     except Exception as e:
         print('DataParser: parse next page info failed')
         print(e)
     return next_page_list
Example #5
 def __extract_img_info(item):
     item_json = {}
     try:
         query_img_div = query(item)
         query_a_img = query_img_div('.r a')
         query_summary_span = query_img_div('.s')
         #print 'query_summary_span:', query_summary_span
         #print 'query_summary_span', query_summary_span.html().decode('iso-8859-1')
         if query_a_img is not None:
             img_title = query_a_img.text()
             img_web_url = query_a_img.attr('href')
             img_thumnail_src = query_img_div(
                 'img').attr('src')
             # print query_a_img('div.s')
             img_original_src = query_img_div(
                 'div.s')('a').attr('href')
             img_original_src = urlparse.parse_qs(
                 urlparse.urlparse(img_original_src).query, True)['imgurl'][0]
             item_json['img_web_url'] = img_web_url
             item_json['img_title'] = img_title
             item_json['img_thumnail_src'] = img_thumnail_src
             item_json['img_original_src'] = img_original_src
             item_json['img_weight'] = 1
             item_json['img_datetime'] = '2015.05.01'
             # print img_title
             # print img_web_url
             # print img_original_src
             # print img_thumnail_src
     except Exception as e:
         # print 'extract item json failed'
         print(e)
     return item_json
Example #6
    def process_goods(self, html):
        """
        清理获取到的商品数据,返回商品列表

        :param ret:
        :return:
        """
        goods = []
        d_goods = html('.gl-item')

        # parse each product entry
        for d_good in d_goods:
            d_good = query(d_good)
            good = {
                'id': d_good.attr('data-sku'),
                'title': d_good('.p-name > a').text(),
                'price': d_good('.p-price > strong > i').text(),
                'url': 'https:' + d_good('div.p-img > a').attr('href')
            }

            # product cover image; fall back to src when the lazy-load attribute is missing
            icon = d_good('.p-img > a > img').attr('source-data-lazy-img')

            if not icon or icon == 'done':
                icon = d_good('.p-img > a > img').attr('src')
            good.setdefault('icon', 'https:' + icon)

            goods.append(good)

        return goods
Example #7
def parse_item(i):
    d = {}
    q = query(i)
    
    # data-latitude/-longitude may be missing, so guard before converting
    lat = q.attr('data-latitude') or ''
    lng = q.attr('data-longitude') or ''
    d['lat'] = float(lat) if len(lat) > 1 else ''
    d['lng'] = float(lng) if len(lng) > 1 else ''
    d['postdata'] = q.find('.itemdate').text()
    d['price'] = q.find('.itemph').text().split(' ')[0]
    d['title'] = q.find('a').text()
    d['link'] = q.find('a').attr('href')
    
    # print d['title']
    
    d['district'] = ''
    district = q.find('.itempn').find('font').text()
    if district:
        for dis in districts.keys():
            if dis in district:
                d['district'] = districts[dis]
                break
    
    # resolve the district from a lat/lng bounding box when coordinates are available
    if d['lat'] != '' and d['lng'] != '':
        for k in validationDistrict.keys():
            r = validationDistrict[k]
            if r[0] > d['lat'] > r[2] and r[1] < d['lng'] < r[3]:
                d['district'] = districts[k]
                # print '%s %s - %s [%s]' % (k, d['price'], d['title'], d['link'])

    return d if d['district'] != '' else None
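
parse_item() depends on two module-level dicts, districts and validationDistrict, that are not part of this excerpt. A sketch of the shapes the code appears to expect; the names and coordinates below are made up:

# hypothetical data, only to illustrate the expected shapes
districts = {'Mission': 'mission', 'SOMA': 'soma'}  # name fragment -> normalized id
validationDistrict = {
    # key -> (north_lat, west_lng, south_lat, east_lng) bounding box
    'Mission': (37.77, -122.43, 37.74, -122.40),
}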
Example #8
    def fetch_versions(self):
        if self.versions:
            return

        result = self._page or session.get(self.url)
        tables = result('.tabel95')
        self.title = tables.find('.titulo').contents()[0].strip()

        for i, table in enumerate(tables[2:-1:2]):
            trs = query(table)('tr')

            release = encode(trs.find('.NewsTitle').text().partition(',')[0])
            release = re.sub('version ', '', release, 0, re.I)

            infos = encode(trs.next().find('.newsDate').eq(0).text())
            infos = re.sub('(?:should)? works? with ', '', infos, 0, re.I)

            for tr in trs[2:]:
                tr = query(tr)
                language = tr('.language')
                if not language:
                    continue

                completeness = encode(language.next().text().partition(' ')[0])
                language = encode(language.text())
                download = tr('a[href*=updated]') or tr('a[href*=original]')
                if not download:
                    continue

                hearing_impaired = \
                    bool(tr.next().find('img[title="Hearing Impaired"]'))
                url = encode(download.attr.href)
                favorite = tr('a[href*="saveFavorite"]')[0].attrib['href']
                id, language_id, version = \
                    re.search(r'(\d+),(\d+),(\d+)', favorite).groups()

                self.add_version(
                    id=id,
                    language_id=language_id,
                    version=version,
                    url=url,
                    language=language,
                    release=release,
                    infos=infos,
                    completeness=completeness,
                    hearing_impaired=hearing_impaired,
                )
Example #9
    def get_max_page(self):
        """
        获取商品搜索最大页

        :return:
        """
        url = 'http://search.jd.com/Search?keyword=' + self.keyword
        max_page = query(spider(url).get('text')).html('#J_topPage > span > i').text()
        print('获取最大页:' + max_page)
        return max_page
Example #10
def get_cards():
    cards = []
    page = 1

    start_time = time()

    while 1:
        params = {
            'resourceType': 1,  # 0 = software ranking, 1 = game ranking
            'page': page
        }
        ret = spider('http://www.wandoujia.com/api/top/more', params=params, fields='json')
        json_data = ret.get('json')
        content = unpack_dict(json_data, 'data.content')

        # content is empty once past the last page
        if not content:
            end_time = time()
            print('Crawl finished. Pages: {page}, games: {count}, elapsed: {times}s'.format(
                page=page - 1, count=len(cards), times=end_time - start_time))
            return cards

        document = query(content)
        cards_dom = document('.card')
        for card_dom in cards_dom:
            # game name, download count, icon, download URL
            # the download URL requires the Wandoujia client to be installed
            card_dom = query(card_dom)
            download_btn = card_dom('a.i-source.install-btn')

            name = download_btn.attr('data-name')
            downloads = download_btn.attr('data-install')
            icon = download_btn.attr('data-app-icon')
            url = download_btn.attr('href')

            cards.append({
                'name': name,
                'downloads': downloads,
                'icon': icon,
                'url': url
            })

        print('Finished page {page}, total so far: {count}'.format(page=page, count=len(cards)))
        page += 1
Example #11
 def parse_img_info(html):
     print('parse_img_info()')
     result_list = []
     try:
         query_img_div_s = query(html)('.rc')
         if query_img_div_s is None:
             return result_list
         for item in query_img_div_s:
             result_list.append(DataParser.__extract_img_info(item))
     except Exception as e:
         print('DataParser: parse img info failed')
         print(e)
     return result_list
Example #12
def main(sc):
    q = query(findapturl)
    print('starting: %s' % q('p').eq(0).find('.itemph').text())
    
    # pyquery exposes the current node as `this` inside filter/each callbacks
    geo = q('*[data-latitude]').filter(lambda i: query(this).attr('data-latitude') != '')
    district = q('p').filter(lambda i: query(this).find('.itempn').each(in_district))
    
    results = []
    resultNodes = geo + district
    for item in resultNodes:
        obj = parse_item(item)
        if obj:
            results.append( obj )
    
    f = open(''.join([os.path.expanduser('~'),'/.craigslist.json']), 'r+')
    for item in results:
        if len(item['link']) > 1 and item['link'] not in data:
            print(item['price'])
            try:
                cost = int(item['price'].replace('$',''))
            except:
                continue
            
            if cost < maxSMSRent:
                shortened = requests.get('http://is.gd/create.php?format=simple&url=%s' % (item['link']))
                # notify( '%s [%s] %s %s' % (item['price'], item['district'], shortened.text, item['title']) )
                
                page = requests.get(item['link'])
                email(page.text, item['link'], '*****@*****.**')
                # email(page.text, item['link'], '*****@*****.**')
                
            item['added'] = datetime.datetime.now()
            data[item['link']] = item
    
    
    f = open(''.join(crapy_JSON_db), 'w')
    f.write(json.dumps(data, indent=4, default=date_encoder))
    f.close()
    
    sc.enter(sleeptime, 1, main, (sc,))
Example #13
    def get_page_goods(self, page):
        page = page * 2 - 1  # each JD search results page is loaded with two requests
        url = 'http://search.jd.com/s_new.php'

        """ 获取上半部分商品 """
        params = {
            "keyword": self.keyword,
            "page": page,
            "click": "0",
            "enc": "utf-8",
            "qrst": "1",
            "rt": "1",
            "s": "110",
            "stop": "1",
            "vt": "2"
        }
        html = query(spider(url, params=params).get('text'))
        goods = self.process_goods(html)

        """ 获取下半部分商品 """
        params = {
            "keyword": self.keyword,
            "show_items": ','.join([g.get('id') for g in goods]),
            "enc": "utf-8",
            "page": "2",
            "log_id": "1510505434.63851",
            "qrst": "1",
            "rt": "1",
            "s": "28",
            "scrolling": "y",
            "stop": "1",
            "tpl": "2_M",
            "vt": "2"
        }
        html = query(spider(url, params=params).get('text'))
        goods.extend(self.process_goods(html))

        print('Page {page}: {count} products'.format(page=page, count=len(goods)))
        return goods
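
Combined with get_max_page() from Example #9, the whole result set could be walked page by page. A rough sketch, assuming both methods live on the same class; the method name fetch_all_goods is hypothetical:

    def fetch_all_goods(self):
        # hypothetical driver: get_max_page() is Example #9, get_page_goods() is Example #13
        goods = []
        for page in range(1, int(self.get_max_page()) + 1):
            goods.extend(self.get_page_goods(page))
        return goods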
Example #14
    def get_recommend_goods(self, page, keyword):
        """
        获取(左侧)商品精选

        :param page:
        :param keyword:
        :return:
        """
        goods = []

        url = 'http://x.jd.com/Search'
        params = {
            "page": page,
            "keyword": keyword,
            "_": time.time() * 1000,
            "adType": "7",
            "ad_ids": "291:20",
            "area": "1",
            "callback": "jQuery5618052",
            "enc": "utf-8",
            "xtest": "new_search"
        }

        ret = spider(url, fields='json', params=params, debug=True)
        if ret.get('code') == 200:
            # parse the product data and store it in a normalized form
            json_data = ret.get('json')
            json_goods = json_data.get('291', [])
            for json_good in json_goods:
                good = {
                    'id': json_good.get('sku_id'),  # the ID is used to fetch the product's reviews
                    'title': query(json_good.get('ad_title')).text(),
                    'icon': 'http://img1.360buyimg.com/n1/' + json_good.get('image_url'),
                    'price': json_good.get('pc_price'),
                    'url': json_good.get('click_url')
                }
                # fetch the product's reviews
                good.setdefault('comments', self.get_comments(good.get('id')))
                goods.append(good)
                print('Fetched product: ' + good.get('title'))
            else:  # for/else: runs once the loop finishes (there is no break above)
                print('Page {page} finished'.format(page=page))
        else:
            print('Error fetching products: ' + str(ret.get('err')))

        return goods
Example #15
def get_one_page(url: str):
    """
    将 $url 对应的文章保存为 html 和 md
    虽然最后有清洗的步骤,最后格式还是需要手动调整(比如强迫症的我)
    """
    assert url.strip(), "Invalid url!"

    headers = {
        "user-agent": Faker().chrome(),
    }
    if url[-1] == "/":
        url = url[:-1]
    page_id = url.split("/")[-1]

    if not os.path.exists(f"files/{page_id}.html"):
        print(f"Retrieving {url}...")
        r = requests.get(url, headers=headers)
        doc = query(r.text)
        print(f"Saving the article to files/{page_id}.html...")
        with open(f"files/{page_id}.html", "w", encoding="utf-8") as f:
            f.write(doc("article").html())
    print("Converting html to md...")
    os.system(f"pandoc -o files/{page_id}.md files/{page_id}.html")
    print("Cleaning the md...")
    with open(f"files/{page_id}.md") as f:
        lines = f.readlines()

    trash_dots_lines = []
    for idx, line in enumerate(lines):
        if line.startswith(":::"):
            trash_dots_lines.append(idx)
        if re.search(r"{#.*?}", line):
            lines[idx] = re.sub(r"{#.*?}", "", line)

    for x in trash_dots_lines[::-1]:
        lines.pop(x)

    text = re.sub(r"{\..*?}", "", "".join(lines), flags=re.S)
    while re.search(r"{\..*?}", text, flags=re.S):
        text = re.sub(r"{\..*?}", "", text, flags=re.S)
    text = re.sub(r"<div>.*?</div>", "", text, flags=re.S)
    text = text.replace("\\\n", "").replace("\n\n\n",
                                            "\n\n").replace("\n\n\n", "\n\n")
    with open(f"files/{page_id}.md", "w", encoding="utf-8") as f:
        f.write(text)
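
A hedged usage sketch: the URL is made up, and the function assumes a writable files/ directory and pandoc on the PATH:

# hypothetical call; the URL is only an example
os.makedirs("files", exist_ok=True)
get_one_page("https://example.com/posts/12345")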
Example #16

class Users(pymodel.BaseModel):
    def __init__(self):
        self.name = UserInfo()


class Employees(pymodel.BaseModel):
    def __init__(self):
        import datetime
        self.set_model_name("employees")
        self.firstname = str
        self.lastname = str
        self.bithdate = datetime.datetime


employees = Employees()
qr = pyquery.query(employees.get_model_name())
qr.project(employees.firstname, employees.lastname)
print(qr.pipeline)
# user=Users()
# c=pydoc.document.nam + pydoc.document.x>("1+{0}+3",15)
# print c
# pyaggregatebuilders.Match(pydoc.FilterFiels)
# c=pyaggregatebuilders.Project(
#     pydoc.document.fullName<<("concat(firstName,'{0}',lastName)","xxx")
#
# )
# print c
# c=(pyfuncs.regex(pydoc.document.name,"12"))
# print c
Example #17
import pymongo
from pymongo import MongoClient
import pyquery
import pyfuncs
import pydocs
Fields = pydocs.Fields()
# print pyfuncs.toDouble(X.name)

# fields=pydoc.Fields()
# x=pyfuncs.cmp(fields.amount,fields.name)==0
# print isinstance(x,pydoc.Fields)
# # c=x.__owner__
# print x.__tree__
cnn = MongoClient(host="localhost", port=27017)
db = cnn.get_database("hrm")
db.authenticate(name="root", password="******")
qr = pyquery.query(db, "test.coll001")
qr = qr.where(pyfuncs.regex(Fields.fx, "^312313$"))

# qr.project({
#     Fields.Users.username:1,
#     Fields.Users.fullName:pyfuncs.concat(Fields.Users.firstName, " ",Fields.Users.lastname)
# })

# qr=qr+2
#     #.set(x=1,y=2)
# import pprint
# items=list(qr.objects)
import pprint
x = list(qr.objects)
pprint.pprint(list(qr.items))
Example #18
def get_page_content(page):
    items = []

    domain = 'https://www.qiushibaike.com'

    # collect links to the content detail pages
    url = 'https://www.qiushibaike.com/8hr/page/{0}/'.format(page)
    ret = spider(url)
    text = ret.get('text')
    index_document = query(text)

    articles = index_document('.article')
    for article in articles:
        href = query(article)('a.contentHerf').attr('href')
        items.append({'url': domain + href})
    print('Page {page}: {count} links found'.format(page=page, count=len(items)))
    if len(items) == 0:
        print(ret)

    for index, item in enumerate(items):
        for i in range(0, 2):
            # visit the detail page to get the content and comments
            text = spider(item['url']).get('text')
            document = query(text)
            # content
            content = document('#single-next-link > div').text()
            if not content:
                print('Fetch failed, retrying. Progress: {index}/{maxlength}'.format(
                    index=index + 1, maxlength=len(items)))
                continue

            # content image
            img_href = document('#single-next-link > div.thumb > img').attr(
                'src') or ''
            if img_href:
                img_href = 'https:' + img_href

            # comments
            comments = []
            comments_dom = document('.comment-block > div.replay > span.body')
            for span in comments_dom:
                comments.append(query(span).text())

            item.update({
                'content': content,
                'img_href': img_href,
                'comments': comments
            })
            print('Page {page}, progress: {index}/{maxlength}'.format(
                page=page, index=index + 1, maxlength=len(items)))
            break

    print('Page {page} finished'.format(page=page))

    if page == 1:
        max_page = int(
            index_document(
                '#content-left > ul > li:nth-child(7) > a > span').text())
        print('Maximum page: ' + str(max_page))
        return max_page, items

    return items
Example #19
def get(url, raw=False, **params):
    # resolve the URL against the previous request and send it as the Referer
    global last_url
    url = urljoin(last_url, url)
    request = requests.get(url, headers={'Referer': last_url}, params=params)
    last_url = url
    return request.content if raw else query(request.content)
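
A small usage sketch (the start page is an assumption). Because last_url is updated on every call, relative links are resolved against the previous request and it is sent as the Referer:

# hypothetical session built on the helper above
last_url = 'https://example.com/'
doc = get('shows/')                  # fetches https://example.com/shows/
links = [a.attrib['href'] for a in doc('a')]
if links:
    raw = get(links[0], raw=True)    # raw bytes instead of a parsed document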