Example #1
def crawl(url):
    tree = lt.fetch(url)
    imgs = tree.css('dl.list-left dd')[:-1]
    for img in imgs:
        link = img.css('a::attr(href)').extract_first()
        bango = link.split('/')[-1][:-5]  # [:-5] drops the '.html' suffix, leaving the gallery id
        detail = lt.fetch(link, headers=headers)
        max_page = detail.css('.content-page .page-ch::text').re_first(r'\d+')
        img_urls = [
            f'http://img1.mm131.me/pic/{bango}/{n}.jpg'
            for n in range(1, int(max_page) + 1)
        ]
        lt.async_save_imgs(img_urls, headers=headers, random_name=True)
Example #2
def crawl(url):
    tree = lt.fetch(url)
    imgs = tree.cssselect('dl.list-left dd')[:-1]
    for img in imgs:
        link = img.cssselect('a')[0].get('href')
        bango = link.split('/')[-1][:-5]
        detail = lt.fetch(link, headers=headers)
        pagination = detail.cssselect('.content-page .page-ch')[0].text
        max_page = int(re.findall(r'\d+', pagination)[0])
        img_urls = [
            f'http://img1.mm131.me/pic/{bango}/{n}.jpg'
            for n in range(1, max_page + 1)
        ]
        lt.async_save_imgs(img_urls, headers=headers, random_name=True)
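Examples #1 and #2 scrape the same listing with the two tree APIs that recur throughout this page: parsel-style .css() selectors with extract_first/re_first, and plain lxml .cssselect() combined with the re module. A side-by-side sketch of the equivalent lookups, reusing the variable names from the two examples above:

# Equivalent lookups in the two styles (variables as in Examples #1 and #2).
# parsel-style selector chain:
link = img.css('a::attr(href)').extract_first()
page = detail.css('.content-page .page-ch::text').re_first(r'\d+')
# raw lxml tree with a manual regex:
link = img.cssselect('a')[0].get('href')
page = re.findall(r'\d+', detail.cssselect('.content-page .page-ch')[0].text)[0]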
Example #3
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.note-list li')
    for item in items:
        title = item.css('.content a.title::text').extract_first()
        author = item.css('a.nickname::text').extract_first()
        source = f"{domain}{item.css('.content a.title::attr(href)').extract_first()}"
        vote = max(map(int, item.css('.meta span').re(r'\d+')))
        site = 'jianshu'
        date = datetime.utcnow()
        view = 0
        comment = 0
        try:
            comment = int(item.css('.meta a::text').re_first(r'\d+'))
        except TypeError:
            pass
        collect = 0
        row = (title, author, source, vote, site, date, view, comment, collect)
        print(row)
        try:
            cursor.execute(
                'INSERT INTO `article` (`title`, `author`, `source`, `vote`, `site`, '
                '`date`, `view`, `comment`, `collect`) '
                'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)',
                row)
            connection.commit()
        except Exception as e:
            print(e)
Example #4
def test_get_img_name():
    tree = lt.fetch(f'{domain}/post')
    img = tree.css('a.directlink::attr(href)').extract_first()
    name = lt.get_img_name(img)
    random_name = lt.get_img_name(img, random_name=True)
    assert '%' not in name
    assert random_name != name
Example #5
def crawl(url):
    tree = lt.fetch(url)
    items = tree.cssselect('table tr')[1:]
    for item in items:
        schema = item.cssselect('td')[-5].text.lower()  # protocol column, indexed from the row's end
        ip = item.cssselect('td')[1].text
        port = item.cssselect('td')[2].text
        proxy = f'{schema}://{ip}:{port}'
        print(proxy)
        proxies.append(proxy)
Example #6
def test_save_img():
    tree = lt.fetch(f'{domain}/post')
    img = tree.css('a.directlink::attr(href)').extract_first()
    name = lt.get_img_name(img)
    lt.save_img(img)
    with open(name, 'rb') as f:
        img_data = f.read()
    assert isinstance(img_data, bytes) and len(img_data) > 100
    assert not lt.save_img(broken_domain)
    os.remove(name)
Example #7
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#waterfall .item')
    for item in items:
        data = {}
        data['name'] = item.css('img::attr(title)').extract_first()
        data['cover'] = item.css('img::attr(src)').extract_first()
        data['link'] = item.css('.movie-box::attr(href)').extract_first()
        # 'date' matches literal <date> elements: the first holds the bango, the second the release date
        data['bango'] = item.css('date::text').extract_first()
        data['date'] = item.css('date::text').extract()[1]
        yield data
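Example #7 above (like #8, #9, #10, #12, #22, and #24 further down) is a generator: it yields one dict per scraped item instead of persisting rows itself. A minimal consumer sketch, with a hypothetical start_urls list standing in for the real task list:

# Hypothetical driver for the generator-style crawl functions on this page.
start_urls = [f'https://example.com/page/{n}' for n in range(1, 4)]
for url in start_urls:
    for data in crawl(url):  # crawl yields one dict per scraped item
        print(data)          # or col.insert_one(data) / total.append(data)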
Example #8
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('.list-view .item')
    for item in items:
        data = {}
        data['title'] = item.css('a::text').extract_first().strip()
        data['url'] = item.css('a::attr(href)').extract_first().strip()
        intro = item.css('span.intro::text').extract_first()
        data['date'] = intro[:10]
        data['intro'] = intro
        yield data
Example #9
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('li.shot-thumbnail')
    for item in items:
        data = {}
        data['title'] = item.css('a strong::text').extract_first()
        data['url'] = f"{domain}{item.css('a::attr(href)').extract_first()}"
        data['author'] = item.css('.display-name::text').extract_first()
        data['fav'] = int(item.css('span.toggle-fav::text').extract_first().strip())
        data['comment'] = int(item.css('li.cmnt span::text').extract_first().strip())
        yield data
Example #10
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul#browserItemList li.item')
    for item in items:
        data = {}
        data['title'] = item.css('h3 a.l::text').extract_first()
        info = item.css('p.info::text').extract_first().strip()
        date = info.split('/')[1].strip() if '/' in info else info
        data['date'] = format_date(date)
        data['url'] = f"{domain}{item.css('h3 a.l::attr(href)').extract_first()}"
        yield data
Example #11
def crawl(url):
    time.sleep(1)
    tree = lt.fetch(url)
    items = tree.css('#TopicsNode .cell')
    for item in items:
        data = {}
        data['title'] = item.css('span.item_title a::text').extract_first()
        data['author'] = item.css('span.small.fade strong a::text').extract_first()
        data['source'] = f"{domain}{item.css('span.item_title a::attr(href)').extract_first()}"
        reply = item.css('a.count_livid::text').extract_first()
        data['reply'] = int(reply) if reply else 0
        total.append(data)
Example #12
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('.type-post')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first()
        data['url'] = item.css('h2 a::attr(href)').extract_first()
        # turn dates like '2018年1月2日' into '2018-1-2' ([:-1] drops the trailing '-')
        data['date'] = re.sub(
            r'年|月|日', '-',
            item.css('small span.date::text').extract_first())[:-1]
        data['view'] = int(item.css('small::text').re(r'\d+')[0])
        yield data
Example #13
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.car-monthlisting li')
    total = []
    for item in items:
        data = {}
        data['name'] = item.css('a::text').extract_first()
        data['url'] = item.css('a::attr(href)').extract_first()
        data['comments'] = int(item.css('span::text').re_first(r'(\d+)'))
        pprint(data)
        total.append(data)
    return total
Example #14
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.subject-list li.subject-item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first().strip()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['pub'] = item.css('.pub::text').extract_first().strip()
        data['rating'] = float(
            item.css('span.rating_nums::text').extract_first())
        data['comments'] = int(item.css('span.pl').re_first(r'\d+'))
        pprint(data)
        col.insert_one(data)
Example #15
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('#inner_mid_col article')
    for item in items:
        # the last note line reads like 'Language: English'; keep English titles only
        if item.css('p.note::text').extract()[-1][10:] == 'English':
            data = {}
            data['name'] = item.css('p.title a::text').extract_first().strip()
            data['link'] = item.css('p.title a::attr(href)').extract_first().strip()
            data['author'] = item.css('p.note::text').extract_first()[3:]  # [3:] drops the 3-char prefix (e.g. 'By ')
            data['publisher'] = item.css('p.publisher::text').extract_first()[11:]  # [11:] drops 'Publisher: '
            data['year'] = int(''.join(item.css('p.date2::text').extract_first().split())[-4:])  # last 4 chars = year
            pprint(data)
            col.insert_one(data)
Example #16
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul#browserItemList li.item')
    for item in items:
        data = {}
        data['title'] = item.css('h3 a.l::text').extract_first()
        info = item.css('p.info::text').extract_first().strip()
        data['date'] = format_date(info.split('/')[1].strip())
        data['url'] = f"{domain}{item.css('h3 a.l::attr(href)').extract_first()}"
        pprint(data)
        total.append(data)
Example #17
def crawl(url):
    tree = lt.fetch(url)
    data = dict()
    fields = tree.cssselect('ul.terminal-ul li')
    cols = ['salary', 'place', 'date', 'nature',
            'experience', 'degree', 'amount', 'category']
    for i, col in enumerate(cols):
        data[col] = fields[i].cssselect('strong')[0].text
    data['salary'] = ''.join(data['salary'].split())  # collapse whitespace inside the salary range
    del data['place']
    # date and category live deeper in the markup than the generic loop reaches
    data['date'] = fields[2].cssselect('strong #span4freshdate')[0].text
    data['category'] = fields[7].cssselect('strong a')[0].text
    detail = tree.cssselect('.tab-inner-cont p')
    data['detail'] = ''.join([p.text for p in detail if p.text]).strip()
    pprint(data)
    total.append(data)
Example #18
def crawl(url):
    try:
        tree = lt.fetch(url)
        items = tree.css('ul.note-list li')
        for item in items:
            data = {}
            data['title'] = item.css('.content a.title::text').extract_first()
            data['author'] = item.css('a.nickname::text').extract_first()
            data['source'] = f"{domain}{item.css('.content a.title::attr(href)').extract_first()}"
            data['vote'] = max(map(int, item.css('.meta span').re(r'\d+')))
            pprint(data)
            total.append(data)
    except Exception as e:
        print(e)
Example #19
def crawl(url):
    tree = lt.fetch(url)
    items = tree.cssselect('.question-summary')
    for item in items:
        data = dict()
        question = item.cssselect('a.question-hyperlink')[0]
        data['question'] = question.text
        data['link'] = domain + question.get('href')
        data['votes'] = int(item.cssselect('.vote-count-post strong')[0].text)
        data['answers'] = int(item.cssselect('.status strong')[0].text)
        # the views title reads like '1,234 views': drop the 6-char suffix and the commas
        data['views'] = int(''.join(
            item.cssselect('.views')[0].get('title')[:-6].split(',')))
        data['timestamp'] = item.cssselect('.relativetime')[0].get('title')
        pprint(data)
        col.insert_one(data)
Example #20
def crawl(url):
    tree = lt.fetch(url)
    posts = tree.cssselect('.cg')
    for post in posts:
        data = dict()
        title = post.cssselect('h1 a')[0]
        data['name'] = title.text
        data['url'] = domain + title.get('href')
        data['artist'] = post.cssselect('.artist-list')[0].text
        dj_content = post.cssselect('.dj-content .dj-desc')[0]
        td = dj_content.cssselect('tr td')
        data['series'] = td[1].text.strip() or 'N/A'
        data['type'] = td[3].cssselect('a')[0].text.strip()
        data['language'] = td[5].text.strip()
        data['tags'] = ', '.join([tag.text for tag in td[7].cssselect('.relatedtags ul li a')]) or 'N/A'
        data['date'] = post.cssselect('.dj-content p.cg-date')[0].text
        pprint(data)
Example #21
def crawl(url):
    tree = lt.fetch(url)
    items = tree.cssselect('ul.bigimg li')
    for item in items:
        data = dict()
        data['title'] = item.cssselect('a')[0].get('title').strip()
        data['detail'] = item.cssselect('a')[0].get('href')
        data['price'] = float(item.cssselect('p.price .search_now_price')[0].text[1:])
        data['author'] = item.cssselect('p.search_book_author a')[0].get('title')
        data['date'] = item.cssselect('p.search_book_author span')[1].text.strip()[1:]
        data['press'] = item.cssselect('p.search_book_author a')[-1].text
        data['comments'] = int(item.cssselect('p.search_star_line a')[0].text[:-3])
        pprint(data)
Example #22
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('ul.subject-list li.subject-item')
    for item in items:
        data = {}
        data['title'] = item.css('h2 a::text').extract_first().strip()
        data['link'] = item.css('h2 a::attr(href)').extract_first()
        data['pub'] = item.css('.pub::text').extract_first().strip()
        try:
            data['rating'] = float(
                item.css('span.rating_nums::text').extract_first())
        except Exception:
            data['rating'] = 0.0
        try:
            data['comments'] = int(item.css('span.pl').re_first(r'\d+'))
        except Exception:
            data['comments'] = 0
        yield data
Example #23
def get_total(categories):
    total = []
    for category in categories:
        tree = lt.fetch(category)
        links = tree.css('a::attr(href)').re(r'.*?/blog/\d+/\d+/.*')
        hints = tree.css('span.hint::text').extract()
        # each hint looks like '(comments@date)': strip the wrappers, split on '@'
        hints = [hint[1:-1].split('@') for hint in hints]
        titles = [
            title for title in tree.css('li a::text').extract()
            if title not in categories_texts
        ]
        data = [{
            'title': title,
            'url': link,
            'comments': int(hint[0]),
            'date': hint[1]
        } for title, link, hint in zip(titles, links, hints)]
        pprint(data)
        total.extend(data)
    return total
Example #24
def crawl(url):
    tree = lt.fetch(url)
    items = tree.css('table.n_worklist tr')
    for item in items:
        data = {}
        data['name'] = item.css('.work_name a::text').extract_first()
        if not data['name']:  # skip rows with no work name before any further parsing
            continue
        data['link'] = item.css('.work_name a::attr(href)').extract_first()
        data['maker'] = item.css('dd.maker_name a::text').extract_first()
        try:
            # strip thousands separators before casting the price
            data['price'] = int(''.join(
                item.css('span.work_price::text').extract_first().split(',')))
            data['rate'] = int(item.css('.star_rating::text').re_first(r'\d+'))
            data['review'] = int(item.css('.work_review a::text').re_first(r'\d+'))
        except Exception as e:
            print(e)
            data['price'] = 0
            data['rate'] = 0
            data['review'] = 0
        yield data
Example #25
def crawl(url):
    tree = lt.fetch(url, use_cookies=True, headers=headers)
    time.sleep(0.5)
    items = tree.css('.wrap')
    for item in items:
        data = {}
        data['views'] = item.css('span.views var::text').extract_first()
        data['rating'] = int(item.css('.value::text').extract_first()[:-1])  # drop the trailing '%'
        viewKey = item.css('a::attr(href)').extract_first().split('=')[-1]
        video = requests.get(f'https://{domain}/embed/{viewKey}',
                             cookies=cookies,
                             headers=headers).text
        flashvars = re.findall('var flashvars =(.*?),\n', video)[0]
        info = json.loads(flashvars)
        data['title'] = info.get('video_title')
        data['duration'] = info.get('video_duration')
        data['image'] = info.get('image_url')
        data['link'] = info.get('link_url')
        data['quality_480p'] = info.get('quality_480p')
        pprint(data)
        col.insert_one(data)
Example #26
def test_get_img_info():
    tree = lt.fetch(f'{domain}/post')
    img = tree.cssselect('a.directlink')[0]
    url, name = lt.get_img_info(img)
    assert url == img.get('href') and '%' not in name
Example #27
def test_fetch():
    tree = lt.fetch(f'{domain}/post')
    imgs = tree.cssselect('a.directlink')
    assert len(imgs) > 0
Example #28
def test_fetch():
    tree = lt.fetch(f'{domain}/post')
    imgs = tree.css('a.directlink::attr(href)').extract()
    assert len(imgs) > 0 and isinstance(imgs[0], str)
    assert not lt.fetch(broken_domain)
Example #29
def crawl(url):
    tree = lt.fetch(url)
    imgs = tree.cssselect('a.directlink')
    lt.async_save_imgs(imgs)
Example #30
def crawl(url):
    tree = lt.fetch(url)
    imgs = tree.cssselect('img.lazy')
    imgs = [img.get('data-original') for img in imgs]
    lt.save_imgs(imgs)
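All of these snippets are excerpts and lean on module-level context that the extraction dropped: import looter as lt (plus re, time, json, os, requests, pprint, and datetime where used), request settings such as domain, headers, and cookies, and sinks such as total, proxies, a MongoDB collection col, or a MySQL connection/cursor pair. A sketch of that scaffolding, with placeholder values rather than anything from the original projects:

# Hypothetical module-level scaffolding assumed by the excerpts above.
import looter as lt

domain = 'https://example.com'           # placeholder target site for building task URLs
headers = {'User-Agent': 'Mozilla/5.0'}  # passed to lt.fetch where the examples need it
total = []                               # shared sink several crawlers append to

if __name__ == '__main__':
    tasklist = [f'{domain}/page/{n}' for n in range(1, 10)]
    for task in tasklist:
        crawl(task)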