Example #1
def main():
    # Python 2 idiom: reset the default string encoding so the non-ASCII
    # (UTF-8) arguments below do not raise UnicodeDecodeError
    reload(sys)
    sys.setdefaultencoding('utf8')
    # search keyword and city ('杭州' = Hangzhou)
    spider = Spider('python', '杭州')
    spider.setSalay(5.9, 16, 10.9, 31.0)
    # companies to exclude from the results
    spider.addShieldCompany('畅唐网络')
    spider.addShieldCompany('中国亿教亿学网')
    # keywords the posting must contain
    spider.addContainText('C++')
    spider.addContainText('c++')
    #spider.addContainText('爬虫')
    spider.analyse()
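
Example #1 is an excerpt from a Python 2 script: `sys` must already be imported at module level, and `Spider` comes from the surrounding project. A minimal sketch of the preamble such a script would need; the import path of `Spider` is an assumption, not the project's real layout:

# -*- coding: utf-8 -*-
import sys

from spider import Spider  # hypothetical module path; the real project defines Spider elsewhere


if __name__ == '__main__':
    main()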
Example #2
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # read the URL list
    file = open('msglist.json')
    text = file.read()
    file.close()
    urls = json.loads(text)

    urls_visited = []
    if os.path.exists('visited.txt'):
        file = open('visited.txt', 'r')
        for line in file:
            urls_visited.append(line.rstrip())
        file.close()

    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print 'visited', url
            continue

        urlmap[url] = title
        queue.put(url)

    # start crawling the queued URLs
    file = open('visited.txt', 'a')
    while not queue.empty():
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print "analyse ", url
        logging.info('now analyse %s', url)
        images = Spider.analyse()
   
        queue.task_done()

        visited.add(url)

        save(images, urlmap[url])

        file.write(url+'\n')
        file.flush()

    file.close()
    print 'finished'
    logging.info('finished')
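
Examples #2 and #3 both rely on module-level state that is not shown here: a URL queue, a `visited` set, and logging configuration, in addition to the project-specific `Spider`, `create_dir`, `save`, `ROOT`, `DEFAULT_HEADERS`, and `DEFAULT_TIMEOUT`. A minimal Python 2 sketch of the standard-library part of that scaffolding; the log file name is an assumption:

# -*- coding: utf-8 -*-
import json
import logging
import os
import Queue  # Python 2 module name; renamed to queue in Python 3

queue = Queue.Queue()  # URLs waiting to be crawled
visited = set()        # URLs already processed in this run

logging.basicConfig(filename='spider.log', level=logging.INFO)  # 'spider.log' is an assumed name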
Example #3
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    queue.put(URL)

    # start
    while not queue.empty():
        url = queue.get()
        print "crawl ", url
        logging.info('now crawl %s', url)
        html = Spider.crawl(url)
        images = Spider.analyse(html)
        links = Spider.analyse_links(html)

        queue.task_done()

        visited.add(url)

        save(images)

        # new urls
        for link in links:
            if link not in visited and link.startswith('http://pp.163.com/'):

                exist = False
                for ignore in IGNORES:
                    match = re.search(re.compile(ignore), link)
                    if match:
                        #logging.info("exclude %s", link)
                        exist = True
                        break

                if not exist:
                    queue.put(link)

    print 'done'
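
The nested exclusion loop at the end of Example #3 (the `exist` flag plus `break`) can be expressed more compactly with `any()`. A sketch, assuming `IGNORES` is a list of regular-expression strings as the `re.compile` calls suggest; `is_ignored` is a hypothetical helper, not part of the original project:

import re

def is_ignored(link):
    # True if the link matches any exclusion pattern in IGNORES
    return any(re.search(pattern, link) for pattern in IGNORES)

With such a helper, the link loop reduces to `if not is_ignored(link): queue.put(link)`.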