Example #1
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # read the list of URLs to crawl
    with open('msglist.json') as f:
        urls = json.load(f)

    urls_visited = []
    if os.path.exists('visited.txt'):
        with open('visited.txt', 'r') as f:
            for line in f:
                urls_visited.append(line.rstrip())

    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print('visited', url)
            continue

        urlmap[url] = title
        queue.put(url)

    # start 
    file = open('visited.txt', 'a')
    while not queue.empty():
        url = queue.get()
        print("crawl", url)
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print("analyse", url)
        logging.info('now analyse %s', url)
        images = Spider.analyse()
   
        queue.task_done()

        visited.add(url)

        save(images, urlmap[url])

        file.write(url+'\n')
        file.flush()

    file.close()
    print('finished')
    logging.info('finished')
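The module-level names that Example #1 (and the similar Example #7 below) rely on are not shown. A minimal sketch of the setup they appear to assume, where the constant values, logging configuration, and create_dir helper are illustrative rather than taken from the original project:

import json
import logging
import os
import re
from queue import Queue

ROOT = 'download'                                # hypothetical project directory
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}  # illustrative request headers
DEFAULT_TIMEOUT = 10                             # illustrative timeout in seconds
IGNORES = []                                     # regex patterns for links to skip (Example #7)
URL = 'http://pp.163.com/'                       # illustrative seed URL for Example #7

queue = Queue()    # shared URL queue consumed by the crawl loops
visited = set()    # URLs that have already been crawled

logging.basicConfig(level=logging.INFO)

def create_dir(path):
    # create the target directory if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)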
Example #2
def get_history():
    """Download historical vessel schedules."""

    src_dir = get_src_dir_path(__file__)
    data_dir = src_dir.parent / "data"

    start_date = datetime.date(year=2020, month=1, day=1)
    end_date = datetime.date(year=2020, month=1, day=10)
    time_delta = datetime.timedelta(days=1)

    file_paths = []
    jocasta = Spider()
    while start_date <= end_date:
        time.sleep(random.randrange(0, 5, 1))
        file_path = jocasta.crawl(data_dir, date=start_date)
        file_paths.append(file_path)
        start_date += time_delta
        
    for file_path in file_paths:
        if file_path.endswith('.xlsx'):
            print("unhandled xlsx file!")

        elif file_path.endswith('.pdf'):
            tables = Wrangler.parse_pdf(file_path)

        else:
            raise ValueError(f"Unexpected file type: {file_path}")

    return None
Example #3
def main():
    url = "https://www.readmorejoy.com"

    # sys.argv[0] is the script name itself, so a single URL argument gives len(args) == 2
    args = sys.argv[0:]

    # Execute function depending on arguments
    if len(args) == 1:
        print_help()
    elif len(args) == 2:
        url = args[1]
        print("start web crawling")
        spider = Spider(url)
        spider.crawl(url)
        print("web crawling done")
    else:
        print_help()
Example #4
def update(data_dir):
    print("Crawling...")
    url = "https://civilization.fandom.com/wiki/Leaders_(Civ6)"
    response = Spider.crawl(url)

    print("Processing html...")
    records = Wrangler.process_html(response, data_dir)

    return records
Example #5
def run():
    req_data = request.get_json()

    email = req_data.get('email')
    password = req_data.get('password')

    url_list = get_urls()

    if email and password and url_list:
        spidy = Spider()
        url_list = spidy.modify_urls(url_list)
        spidy.crawl(url_list=url_list, email=email, password=password)
        return ''' Done '''
    else:
        return ''' ERROR in email or password or while fetching urls '''
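Example #5 reads the request body with request.get_json(), which points to a Flask view function. A minimal sketch of how such a handler could be exposed, assuming Flask; the '/crawl' route, HTTP method, and app setup are illustrative and not part of the original code:

from flask import Flask

app = Flask(__name__)

# Register the run() handler from Example #5 under a hypothetical POST route.
app.add_url_rule('/crawl', view_func=run, methods=['POST'])

if __name__ == '__main__':
    app.run()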
Example #6
def main():
    # Fetch the arguments; the first element in sys.argv is this Python file itself, so ignore it
    args = sys.argv[1:]

    # Execute function depending on arguments
    if len(args) == 1:
        if args[0] == "-test":  # test
            test = Test()
            test.test_all()
        elif args[0] == "-help":  # help
            print_help()
        else:
            print_help(True)
    elif len(args) == 2:
        if args[0] == "-c":  # crawl
            url = args[1]
            print("[crawler.py] start crawling")
            spider = Spider()
            spider.crawl(url)
            print("[crawler.py] crawling done")
        else:
            print_help(True)
    else:
        print_help(True)
Example #7
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    queue.put(URL)

    # start
    while not queue.empty():
        url = queue.get()
        print("crawl", url)
        logging.info('now crawl %s', url)
        html = Spider.crawl(url)
        images = Spider.analyse(html)
        links = Spider.analyse_links(html)

        queue.task_done()

        visited.add(url)

        save(images)

        # new urls
        for link in links:
            if (link not in visited) and link.startswith('http://pp.163.com/'):

                excluded = False
                for ignore in IGNORES:
                    if re.search(ignore, link):
                        # logging.info("exclude %s", link)
                        excluded = True
                        break

                if not excluded:
                    queue.put(link)

    print('done')
Example #8
class Helper(object):

    def __init__(self, url, cook):
        self.spider = Spider(url, cook)
        self.content = ''
        self.mail_helper = Mailhelper()


    def refresh(self):
        """Re-fetch the page content."""
        self.content = self.spider.crawl()


    def is_new_msg(self):
        """Check whether the crawled content is a new post."""
        if os.path.exists('weibo.txt'):
            with open('weibo.txt', 'r') as fi:
                # split off the last line (the post timestamp) so an old post
                # is not mistaken for a new one
                txt_content = '\n'.join(fi.read().split('\n')[:-1])
                new_content = '\n'.join(self.content.split('\n')[:-1])
                # compare values, not identity ('is' was a bug here)
                return new_content != txt_content
        else:
            return True


    def send_mail(self):
        """Send the new content to the configured mailbox."""
        to_list = ['*****@*****.**']   # recipient addresses
        sub = u'微博更新'               # mail subject ("Weibo update")
        if self.mail_helper.send_mail(to_list, sub, self.content):
            print('Mail sent successfully!')
        else:
            print('Failed to send mail!')
Example #9
    shortcut = Student(2013217413, '123456789012', 'XC')
    c = pymongo.MongoClient()

    db = c['hfut']
    init_db(db)

    # Initialize the job pools.
    # Well-chosen database pool and buffer sizes make better use of the available bandwidth.
    # The maximum number of concurrent database records is db_pool_size * batch_size.
    # A request pool larger than 20 easily triggers server errors and missing results.
    job_manager = JobManager(pool_size=20)
    db_manager = DatabaseManager(db, batch_size=80)

    j = Spider(shortcut, job_manager, db_manager)

    j.crawl()

    # def patch():
    #     for i in range(21, 31):
    #         term = '%03d' % i
    #         yield term, None, '_'
    #         # for args in j.iter_teaching_class(term, course_name='_'):
    #         #     yield args
    #
    #
    # jobs = (patch, j.iter_teaching_class, j.sync_students)
    # job_manager.jobs = jobs
    #
    # logger.info('Crawl start!'.center(72, '='))
    # job_manager.start()
    # logger.info('Jobs are all dispatched. Waiting for database requests handling.')
Example #10
	def startCrawl(self):
		spider = Spider(self.userList)
		spider.crawl(self.hasProcessed)
Example #11
log_it("TEMP DIR",temp_dire)

if os.path.exists(temp_dire):
    shutil.rmtree(temp_dire)
distutils.dir_util.copy_tree(src_dir,temp_dire)
owd = os.getcwd()

log_it("LOG","Crawling started")
spider = Spider(temp_dire)
log_it("LOG","Crawling done")
# spider.crawl()

log_it("LOG","Compileing pages started")
posts_data=[]
for post_folder in spider.crawl():
    with open(os.path.join(post_folder, "__pub.lish")) as f:
        config = json.load(f)
    t_date = time.strptime(config['date'], "%Y-%m-%d")
    posts_data.append({
        'title': config['name'].replace('-', ' '),
        'url'  : post_folder[len(temp_dire)+1:],
        'year' : time.strftime("%Y", t_date),
        'day'  : time.strftime("%d", t_date),
        'month': time.strftime("%b", t_date),
        'date' : t_date
    })
    compiler = Compilers[config['type']]
    owd = os.getcwd()
    os.chdir(post_folder)
    compiler.compile(config['file'])
    os.chdir(owd)
Example #12
    conn = MySQLdb.connect(host=config.db_host,
                           user=config.db_user,
                           passwd=config.db_password,
                           db=config.db_database,
                           charset='utf8')
    cursor = conn.cursor()
    cursor.execute(
        'select configValue from t_spider_config where configKey=%s',
        (arg_config.get(sys.argv[1]), ))
    config_values = [row[0] for row in cursor.fetchall()]
    if sys.argv[1] == 'paper':
        spider_paper = Spider('paper')
        for search_exp in config_values:
            reqs = parser.paper_page_parser(search_exp)[:500]
            for req in reqs:
                spider_paper.add_request(req)
        spider_paper.crawl()

    if sys.argv[1] == 'news':
        spider_news = Spider('news')
        for seed_url in config_values:
            spider_news.add_request(
                Request(arg=seed_url, parser=parser.news_parser))
        spider_news.crawl()

    if sys.argv[1] == 'patent':
        spider_patent = Spider('patent')
        for search_exp in config_values:
            spider_patent.add_request(
                Request(arg=search_exp, parser=parser.patent_parser))
        spider_patent.crawl()
Example #13
def work():
    while True:
        url = queue.get()
        Spider.crawl(threading.current_thread().name, url)
        queue.task_done()
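Example #13 shows only the worker body. A minimal sketch of how such workers are usually launched, assuming queue is a shared queue.Queue instance and that Spider.crawl takes a thread name and a URL as in the example; the worker count and the start_workers helper are illustrative:

import threading
from queue import Queue

queue = Queue()

def start_workers(seed_urls, num_workers=4):
    # enqueue the seed URLs, then start daemon threads running work() from Example #13
    for url in seed_urls:
        queue.put(url)
    for _ in range(num_workers):
        threading.Thread(target=work, daemon=True).start()
    # block until every queued URL has been marked done via task_done()
    queue.join()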
Example #14
    conn = MySQLdb.connect(host=config.db_host,
                           user=config.db_user,
                           passwd=config.db_password,
                           db=config.db_database,
                           charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select configValue from t_spider_config where configKey=%s',
                   (arg_config.get(sys.argv[1]),))
    config_values = [row[0] for row in cursor.fetchall()]
    if sys.argv[1] == 'paper':
        spider_paper = Spider('paper')
        for search_exp in config_values:
            reqs = parser.paper_page_parser(search_exp)[:500]
            for req in reqs:
                spider_paper.add_request(req)
        spider_paper.crawl()

    if sys.argv[1] == 'news':
        spider_news = Spider('news')
        for seed_url in config_values:
            spider_news.add_request(Request(arg=seed_url,
                                    parser=parser.news_parser))
        spider_news.crawl()

    if sys.argv[1] == 'patent':
        spider_patent = Spider('patent')
        for search_exp in config_values:
            spider_patent.add_request(Request(arg=search_exp,
                                      parser=parser.patent_parser))
        spider_patent.crawl()
Example #15
def test2():
    sp = Spider('reddit.com')
    sp.crawl_robots()
    sp.test()
    sp.crawl()
Example #16
def crawl_t():
    waiting_list_count = len(Spider.waiting_list)
    while len(Spider.waiting_list) > 0:
        # print("No Of Links Waiting To be Further Crawled: " + str(len(Spider.waiting_list)))
        url = Spider.waiting_list.pop()
        Spider.crawl("Spider", url)
Example #17
def search(username):
    print(username)
    spider = Spider()
    spider.crawl(username)
    return render_template('detail.html')