Example #1
 def getMovieHtml(
     self, filename, name, j, end, z
     ):  # Read the movie detail-page URLs from filename and crawl each one; name is the starting URL, j numbers the saved files, end is the ending URL
     file = open(filename, 'r', encoding="UTF-8")
     i = 0
     flag = False
     for url in file:
         if i % 2 == 0:  # even-numbered lines contain the URLs; the alternating lines are just printed
             m = re.search(r'h.*', url)
             if m.group() == name:
                 flag = True
             if flag:
                 html = Spider().getHtml(m.group())
                 if html:
                     Spider().saveHtml(html, dizhi[z] + str(j) + ".html")  # dizhi: presumably a module-level list of output directories
                     print(str(m) + " :" + str(j) + ".html已存储~")
                     j += 1
                     if j % 90 == 0:
                         time.sleep(120)  # pause every 90 saved pages so requests are not sent too frequently
             if m.group() == end:
                 return
         else:
             print(url.strip())
         i += 1
Example #2
def going(url):
    mail = email()
    text = wordsDeal()
    spider1 = Spider(url, 'test')
    hrefs = spider1.hrefFor2018()
    for item in hrefs:
        try:
            content = spider1.contentOfArtical(item['href'])
            if content['content'] != "contents":
                query = "insert into Christian(href,title, content, sent) values ( '" + text.sqlEscape(
                    item['href']) + "','" + text.sqlEscape(
                        item['title']) + "','" + text.sqlEscape(
                            content['content']
                        ) + "','" + config.notSend + "');"
                sqlQuery(query)
                print "YES ", item['title']
                # mail.sendAuto(content['title'], content['content'] + '<p>' + Chinese + '</p>')
            else:
                print "NO ", item['title']
        except BaseException as error:
            query = "insert into SpiderExcept(href,except) values ( '" + text.sqlEscape(
                item['href']) + "','" + text.sqlEscape(str(error)) + "');"
            sqlQuery(query)
    del spider1
    del text
    del mail
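
A note on the insert above: the example escapes values by hand (text.sqlEscape) and splices them into the SQL string. Most Python database drivers can bind the values instead, which avoids both escaping bugs and injection. The sketch below is a minimal, hypothetical illustration using the standard sqlite3 module; the table layout is copied from the example, everything else (the database file name, the helper name) is assumed.

# Hypothetical sketch: the same insert with bound parameters instead of string concatenation.
import sqlite3

def save_article(href, title, content, sent):
    conn = sqlite3.connect('spider.db')  # assumed database file
    try:
        conn.execute(
            "insert into Christian(href, title, content, sent) values (?, ?, ?, ?)",
            (href, title, content, sent))
        conn.commit()
    finally:
        conn.close()
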
Example #3
def main():
    # Decide which site's configuration to use based on the command-line arguments
    try:
        website = sys.argv[1]
        url = sys.argv[2]
    except Exception as e:
        print "please choose one website"
        exit()

    # Instantiate the site configuration
    dic = {
        "qidian": Qidian,
        "heiyan": Heiyan,
    }
    config = dic[website]()

    # Get the key information
    handler = Spider(config.title, config.content, config.next)

    chapters = config.getList(url)

    book = open("text.txt", "w")

    for item in chapters:
        print "正在下载->", item["title"]
        content = handler.getContent(item["href"])

        book.writelines(item["title"] + "\n")
        book.writelines(content["content"] + "\n")
Example #4
	def startCB(self):
		# File that stores the downloaded content
		file = open(self.filePath, "w")

		# Crawling rules
		titleKlass = {"class": "j_chapterName"}
		contentKlass = {"class": "j_readContent"}
		nextKlass = {"id": "j_chapterNext"}

		page = self.entryUrl.get()
		# Start crawling
		spider = Spider(titleKlass, contentKlass, nextKlass)

		if page == "" or self.filePath == "":
			tkMessageBox.showerror("woolson", "小说名称或链接未填写!")
		else:
			# Loop, fetching the next chapter each time
			while page != "":
				result = spider.getContent(page)

				try:
					page = result["nextUrl"]
					file.write(result["title"] + "\n")
					file.write(result["content"] + "\n\n")

					print "正在写入->" + result["title"]
				except Exception as e:
					page = ""
					print "结束", result["error"]
Example #5
def parse_listing(categories):

    postman = Postman.init()

    for category in categories:
        print(category)
        spider = Spider(url=category["url"])

        with open("page.html", "a") as f:
            f.write(spider.get_page())

        soup = BeautifulSoup(spider.get_page(), "html.parser")
        soup_a = soup.find_all("a", class_="item _item")
        products = [a["href"] for a in soup_a]

        print(products)

        for num, url in enumerate(products):
            parse_detail(category, url, postman)
            print(category, num, len(products))
            print("sleep 3")
            time.sleep(1)
            print("sleep 2")
            time.sleep(1)
            print("sleep 1")
            time.sleep(1)

        break  # only the first category is processed
Example #6
def initialization():
    # get a list of user ids as tasks
    uid_list = get_tasks(TASK_NUM)
    # get a list of users as crawling accounts
    user_list = get_accounts(ACCOUNT_NUM)
    spider = Spider(user_list)

    return spider, uid_list, user_list
Example #7
    def create_threads(self):
        """Create, start and add threads to a list. Threads run an instance of Spider.
        The amount of threads created depends on the amount of cores found in the system."""

        for i in range(1, multiprocessing.cpu_count()):
            name = "Thread-%s" % i
            thread = Spider(name, self.queue, self.result)
            thread.start()
            threads.append(thread)  # 'threads' is assumed to be a list defined elsewhere (e.g. on the enclosing class)
Example #8
    def spider_store_details(self):
        stores = self.query_store()
        for store in stores:
            time.sleep(1)
            try:
                contacts = None
                # Fetch the contact-info page
                spider_time = time.time()
                self.logger.info('开始爬取:' + store[1])
                result_contacts = Spider().spider_URL(
                    url=store[1], is_proxies=True)
                self.logger.info('爬取耗时:' + str(time.time() - spider_time))

                # Parse the contact-info page
                interpreting_time = time.time()
                self.logger.info('开始解析:' + store[1])
                contacts = Interpreter().interpreting_contact_info(
                    result_contacts)
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
                # Retry: fetch the contact-info page from the company page
                if not contacts:
                    spider_time = time.time()
                    store[1] += '/shop/company.html'
                    self.logger.info('开始爬取:' + store[1])
                    result_contacts = Spider().spider_URL(
                        url=store[1], is_proxies=True)
                    self.logger.info('爬取耗时:' + str(time.time() - spider_time))

                    # Parse the contact-info page
                    interpreting_time = time.time()
                    self.logger.info('开始解析:' + store[1])
                    contacts = Interpreter().interpreting_contact_info(
                        result_contacts)
                    self.logger.info('解析耗时:' +
                                     str(time.time() - interpreting_time))

                # Update the contact info
                self.update_contacts(store[0], contacts)
            except BaseException:
                self.logger.error('爬取或更新联系方式出错:' + traceback.format_exc())
                continue
Example #9
def run(key):
    url = set_url(host, key)
    Cookies()
    spider = Spider(url)
    html = spider.spider(BASEHEADERS)
    if not verify(html):
        BASEHEADERS["Cookie"] = BASEHEADERS["Cookie"] + Cookies.cookie_str(
            ["acw_tc", "PHPSESSID"])
        proxieser.proxies()
    parser = HtmlParser(html)
    data = parser.parser("fund")

    print(data)
Example #10
def main():
    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/citations?view_op=search_authors&hl=en&mauthors=label:complex_systems'

    ### p_key and n_key
    p_key = []
    n_key = []

    ### Google Scholar Crawler, Class Spider
    myCrawler = Spider(start_url, p_key, n_key, page=5)

    results = myCrawler.crawl()

    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
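
The results pickled above can be loaded back in a later session with pickle.load; a minimal companion sketch (the file name matches the example, the structure of results depends on what Spider.crawl returns):

import pickle

with open('result.pickle', 'rb') as f:
    results = pickle.load(f)
print(type(results))
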
Example #11
def course():

    post_format = {"username": "", "password": ""}
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']

        user = Spider(username, password)
        user.login()
        if user.login_status:
            info = user.modify_data()
            return jsonify(info)
        else:
            return "登录失败"
    else:
        return render_template("index.html", format=post_format)
Example #12
def grade():

    post_format = {"username": "", "password": ""}
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']

        grade = Spider(username, password)
        grade.login()
        if grade.login_status:
            info = grade.modify_grade()
            print(info)
            return jsonify(info)
        else:
            return "登录失败"
    else:
        return render_template("index.html", format=post_format)
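
Examples #11 and #12 are Flask view functions whose route decorators are not shown. Assuming the course() view above is registered at /course and the app runs on the default development port, the endpoint could be exercised from the client side like this (a sketch, not part of the original code):

# Hypothetical client call; the /course path and port 5000 are assumptions.
import requests

resp = requests.post('http://127.0.0.1:5000/course',
                     data={'username': 'someuser', 'password': 'somepassword'})
print(resp.status_code, resp.text)
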
Example #13
    def spider_product(self, category_2nd, start, end):
        # Parse the product-list pages (page1~page100)
        for i in range(start, end):
            try:
                # Pause before crawling the next page
                time.sleep(1)
                # Fetch the current page's 60 products in three batches
                product_list = []
                for j in range(3):
                    req = category_2nd[
                        1] + '&ap=A&t=1&afadprenum=0&af=1' + '&ee=' + str(
                            i) + '&afadbeg=' + str(60 * (i - 1) + (j * 20) + 1)
                    # Crawl and parse
                    spider_time = time.time()
                    self.logger.info('开始爬取:' + req)
                    result_product_list = Spider().spider_URL(url=req,
                                                              is_proxies=True)

                    self.logger.info('爬取耗时:' + str(time.time() - spider_time))

                    interpreting_time = time.time()
                    self.logger.info('开始解析:' + req)
                    product_list.extend(
                        Interpreter().interpreting_product_list(
                            result_product_list))
                    self.logger.info('解析耗时:' +
                                     str(time.time() - interpreting_time))
                '''
                First crawl all the product-list info, then fetch each product's details and company info one by one
                '''
                for product in product_list:
                    # Save the company info
                    contact = {}
                    contact['公司名'] = product.get('company')
                    contact['公司主页'] = product.get('homepage')
                    store_id = self.save_contacts(contact)
                    # Save the product
                    self.save_product(product, category_2nd[0], store_id)
            except BaseException:
                self.logger.error('爬取或解析' + req + '出错:' +
                                  traceback.format_exc())
                continue
Example #14
    def spider_product_details(self):
        products = self.query_products()
        for product in products:
            time.sleep(1)
            try:
                # Fetch the product detail page
                spider_time = time.time()
                self.logger.info('开始爬取:' + product[1])
                result_products = Spider().spider_URL(
                    url=product[1], is_proxies=True)
                self.logger.info('爬取耗时:' + str(time.time() - spider_time))

                # Parse the product detail page
                interpreting_time = time.time()
                self.logger.info('开始解析:' + product[1])
                details = Interpreter().interpreting_product_details(
                    result_products)
                self.logger.info('解析耗时:' +
                                 str(time.time() - interpreting_time))
                '''                  
                # If the product details need to be fetched again
                product_bcid = details.get('bcid')
                if product_bcid:
                    interpreting_time = time.time()
                    xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                    result_product_introduce = Spider().spider_URL(
                        url=xss_filter + product_bcid)
                    self.logger.info('开始解析:' + xss_filter + product_bcid)
                    details['desc'] = Interpreter(
                    ).interpreting_product_details_desc(
                        result_product_introduce)
                    # Assemble the product details
                    details = Interpreter().assemble_product_details(details)
                    self.logger.info('解析耗时:' +
                                     str(time.time() - interpreting_time)) '''
                # Update the product details
                self.update_products(product[0], details)
            except BaseException:
                self.logger.error('爬取或更新产品详情出错:' + traceback.format_exc())
                continue
Example #15
def main():
    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/scholar?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5'

    ### p_key and n_key
    p_key = [
        'wdrc', 'dynamic range compression', 'hearing aid', 'speech',
        'noise cancellation', 'noise reduction', 'feedback cancellation',
        'sound', 'hearing loss'
    ]
    n_key = [
        'imagery', 'image', 'visual', 'video', 'optic', 'opto', 'quantum',
        'photon'
    ]

    ### Google Scholar Crawler, Class Spider
    myCrawler = Spider(start_url, p_key, n_key, page=5)

    results = myCrawler.crawl()

    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #16
    def spider_job():
        try:
            spider = Spider.Spider()
            spider.spider(categoryId=1)
            spider.spider(categoryId=2)
            spider.spider(categoryId=3)
            spider.spider(categoryId=4)
            spider.spider(categoryId=5)

            print("success")

            time.sleep(5)

        except Exception as err:

            # Error alert: send an email
            errStr = ",".join(err.args)
            myEmail = MyEmail.MyEmail()
            myEmail.tag = "新发地商品数据爬去异常"
            myEmail.to_list = ["*****@*****.**"]
            myEmail.content = errStr
            myEmail.send()
            print(errStr)
Example #17
def main(batchSize=25,
         threads=8,
         timeout=3.0,
         maxDepth=1,
         limitPerSpider=200,
         webpagesLimit=1016,
         initialize=False,
         recursive=False,
         allLinks=False,
         mode='normal'):

    colorsCombinations = [(color, style) for style in styles
                          for color in colors]

    db = Database()

    if (initialize):
        db.Initialize()
        return 0

    toVisit = []

    if mode == 'explore':
        recursive = False
        allLinks = True
        toVisit = list(
            db.notVisited.find({'baseDomain': True}, {
                'url': 1,
                'depth': 1
            }))
    elif mode == 'in-depth':
        recursive = True
        allLinks = False
        toVisit = list(
            db.notVisited.find({'baseDomain': False}, {
                'url': 1,
                'depth': 1
            }))
        knownDomains = set(db.GetKnownDomains())
        toVisit = list(
            filter(
                lambda element: GetBaseDomain(element['url']) in knownDomains,
                toVisit))
    else:
        toVisit = list(
            db.notVisited.find({}, {
                'url': 1,
                'depth': 1
            }).limit(webpagesLimit))

    threads = len(toVisit) if len(toVisit) < threads else threads

    print(f'{Fore.BLUE}Webpages to visit: {len(toVisit)}{Style.RESET_ALL}')
    print(f'{Fore.BLUE}Threads: {threads}{Style.RESET_ALL}')

    if threads == 0:
        print(f'{Fore.BLUE}Nothing to do...{Style.RESET_ALL}')
        return 0

    webpagesPerSpider = int(math.floor(len(toVisit) / threads))

    webpagesPerSpider = limitPerSpider if webpagesPerSpider > limitPerSpider else webpagesPerSpider

    chunks = [
        toVisit[i:i + webpagesPerSpider]
        for i in range(0, len(toVisit), webpagesPerSpider)
    ]

    Spider.allLinks = allLinks
    Spider.batchSize = batchSize
    Spider.limit = limitPerSpider
    Spider.maxDepth = maxDepth
    Spider.recursive = recursive
    Spider.timeout = timeout

    print(f'{Fore.BLUE}Spiders: {len(chunks)}{Style.RESET_ALL}')

    spiders = []
    for i in range(len(chunks)):
        spiderColors = colorsCombinations[i % len(colorsCombinations)]
        spider = Spider(str(i), spiderColors[0], spiderColors[1])
        spider.toVisit = chunks[i]
        spiders.append(spider)

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=threads) as executor:

        futures = [executor.submit(spider.Search) for spider in spiders]

        for future in futures:
            future.result()
Example #18
# -*- coding: utf-8 -*-
import os
from flask import Flask, request, Response
from flask_uploads import UploadSet, configure_uploads, IMAGES, patch_request_class
import json
from IRNet import IRNet
from Spider import Spider

app = Flask(__name__)
app.config['UPLOADED_PHOTOS_DEST'] = os.getcwd() + '/upload'

net = IRNet()
net.load_model()
net.predict('1.jpg')
spider = Spider()

photos = UploadSet('photos', IMAGES)
configure_uploads(app, photos)
patch_request_class(app)  # set maximum file size, default is 16MB

#net = IRNet()
#net.load_model()

html = '''
    <!DOCTYPE html>
    <title>Upload File</title>
    <h1>图片上传</h1>
    <form method=post enctype=multipart/form-data>
         <input type=file name=photo>
         <input type=submit value=上传>
    </form>
Example #19
from Spider import Spider
from Query import Query
import sys

arguments = sys.argv
if arguments[1] == "crawl":
    spider = Spider("https://en.wikipedia.org/")
    spider.crawl()
elif arguments[1] == "query":
    query = Query(arguments[2])
    query.query()
# # query.multiWordQuery(["action","design"])
Example #20
from Course import Course
from Spider import Spider
import sys

if __name__ == '__main__':
    course = Course(term=sys.argv[1], CRN=sys.argv[2])
    header = {
        'User-Agent':
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0'
    }
    cookie = "JSESSIONID=3AC5E57A46288ADDCFC379DD705B8DFB; _ga=GA1.2.1751626557.1561253718; rollup=GA1.2.2143599923.1561254105; subdirectory=GA1.2.1907215284.1561254106; subdomain=GA1.2.1953509159.1561254106; _mkto_trk=id:558-EBH-425&token:_mch-neu.edu-1561254106843-66694; _hjid=f7bd8518-0e00-4b57-b491-5115c949e1f6; _sp_id.cb6f=219c05ce-514e-4010-a0eb-21cba0580c12.1561254106.4.1565081306.1564370721.6ac23e35-e12c-4820-8032-deff60463f54; _gid=GA1.2.1060020417.1574032199; nubanner-cookie=3676250523.36895.0000; IDMSESSID=B450B8320592C4EDD35CCFAB19035112A99279B3FAB0DB15988A4C35356A1ED4E3F335459F9D7568EC15FB5E07D48BF085D6D41FB53655C6A844236EC2477175"
    spider = Spider(course, cookie, header)
    spider.setTimeGap(5)
    spider.start()
Example #21
from util.MysqlManager import MysqlManager
from Spider import Spider
import logging

logging.basicConfig(filename='./log/20181022.txt', level=logging.INFO)

# def crawl(url, brand, class_):
#     print(os.getpid(), url, brand, class_)
#     spider = Spider(url, brand, class_)
#     spider.work()

if __name__ == "__main__":
    records = MysqlManager().fetch_all_source()
    for record in records:
        url = record.get("url")
        brand = record.get("brand")
        class_ = record.get("class")
        # p = Process(target=crawl, args=(url, brand, class_))
        # p.start()
        # time.sleep(2)
        spider = Spider(url, brand, class_)
        spider.work()
Example #22
    def interpreting_product_details(self, html_doc):
        try:
            # Holds the parsed product info
            product = {}
            doc = BeautifulSoup(html_doc, 'html5lib')
            # Product images
            img = []
            # Product image gallery (multiple images)
            product_li = doc.find_all('li', class_='tab-trigger')
            if product_li:
                for li in product_li:
                    product_a = li.find('a',
                                        attrs={
                                            "data-useractivelogs":
                                            "UserBehavior_detail_smallphoto"
                                        })
                    if product_a:
                        # Get the large image
                        product_img = product_a.find('img')
                        product_img_src = product_img.get('src')
                        # Get the original image
                        last_index = product_img_src.rfind('..')
                        img.append(product_img_src[:last_index])
            # If the product has no image gallery, fall back to the single default large image
            else:
                product_img_div = doc.find('div', class_='vertical-img')
                if product_img_div:
                    product_img = product_img_div.find(
                        'a',
                        attrs={
                            "data-useractivelogs":
                            "UserBehavior_detail_bigphoto"
                        })
                    product_img_hrefs = product_img.get('hrefs')
                    if not product_img_hrefs:
                        product_img_src = product_img.find('img').get('src')
                        # Get the original image
                        last_index = product_img_src.rfind('..')
                        product_img_hrefs = product_img_src[:last_index]
                    img.append(product_img_hrefs)
            product['imgs'] = img
            '''
            The product details come in two different layouts, so each needs its own parsing
            '''
            # The entire product-details block
            pdetail = doc.find('div',
                               id='pdetail',
                               class_='proDetailCon tab_content_event_class')
            if pdetail is not None:
                # Get the product's unique identifier
                product_bcid = doc.find('input', id='bcid').get('value')
                detail_bot = pdetail.find('div', class_='detailBot')
                detail_bot.decompose()
                introduce = pdetail.find('div', id='introduce')
                if product_bcid:
                    product['bcid'] = product_bcid
                    xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                    result_product_introduce = Spider().spider_URL(
                        url=xss_filter + product_bcid)

                    product_introduce = self.interpreting_product_details_desc(
                        result_product_introduce)
                    introduce.replace_with(
                        BeautifulSoup(product_introduce, 'html.parser'))
            else:
                pdetail = doc.find('div',
                                   id='pdetail',
                                   class_='pdetail tab_content_event_class')
                if pdetail is not None:
                    # Basic parameters
                    vopy = pdetail.find('div', class_="d-vopy")
                    # Remove the image divs from the basic-parameter list
                    vopyImgBoxs = vopy.find_all('div', class_='d-vopyImgBox')
                    for vopyImgBox in vopyImgBoxs:
                        vopyImgBox.decompose()
                    # Remove the 'similar products' spans from the basic-parameter list
                    span = pdetail.find_all(
                        'span', class_='same-parameter-commodity-hook')
                    for s in span:
                        s.decompose()
                    # The detailed-description div
                    d_xi_b = pdetail.find('div', class_='d-xi-b').find('div')
                    detail_imgs = d_xi_b.find_all('img')
                    if detail_imgs:
                        for img in detail_imgs:
                            del img['onerror']
                            del img['onload']
                    # Text nodes inside the detailed description (excluding tags)
                    content_text = d_xi_b.find_all(text=True, recursive=False)
                    if content_text:
                        for text in content_text:
                            # Replace with '' to strip the '慧聪网' branding
                            text.replace_with('')
            style = '''<style>
                            #introduce {font-size: 14px;}
                            table {border-collapse: collapse;border-spacing: 0;}
                            p {margin: 0;}
                            .dvop-title {line-height: 30px;font-size: 14px;color: rgb(51, 51, 51);padding-bottom: 10px;}
                            .dvop-title h4 {font-weight: normal;}
                            .d-vopy table {width: 100%;float: left;font-size: 12px;margin-bottom: 18px;border-left: 1px solid rgb(237, 237, 237);border-top: 1px solid rgb(237, 237, 237);}
                            .d-vopy th {width: 200px;background-color: rgb(245, 245, 245);text-align: center;font-weight: normal;min-height: 34px;line-height: 34px;border-right: 1px solid rgb(237, 237, 237);border-bottom: 1px solid rgb(237, 237, 237);padding: 0px;}
                            .d-vopy td {border-right: 1px solid #ededed;border-bottom: 1px solid #ededed;vertical-align: top;}
                            .d-vopy td {padding-left: 20px;line-height: 34px;}
                            .d-vopy th h4 {font-size: 12px;color: rgb(51, 51, 51);margin: 0px;}
                            .d-vopyList {overflow: hidden;}
                            .d-vopyList {line-height: 34px;padding-left: 20px;}
                            .d-vopyList p {float: left;}
                            .d-vopyList p {padding-right: 20px;width: 500px;line-height: 24px;padding: 5px 0;}
                            .d-xi-b {padding: 10px 0px;font-size: 12px;}
                        </style> '''
            product['details'] = style + pdetail.prettify()
        except AttributeError:
            self.logger.error('对象没有这个属性:' + traceback.format_exc())
        except KeyError:
            self.logger.error('映射中没有这个键:' + traceback.format_exc())
        except BaseException:
            self.logger.error('解析产品详情出错:' + traceback.format_exc())
        return product
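
Example #22 relies on three BeautifulSoup operations: find/find_all with an attrs filter, decompose() to delete unwanted nodes, and replace_with() to swap a node for new markup. A tiny self-contained illustration of those calls on toy HTML (not the real HC360 page):

from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<div id="pdetail"><div class="detailBot">junk</div>'
    '<div id="introduce">old</div></div>', 'html.parser')
doc.find('div', class_='detailBot').decompose()        # remove the node entirely
doc.find('div', id='introduce').replace_with(
    BeautifulSoup('<p>new content</p>', 'html.parser'))  # swap in new markup
print(doc.prettify())
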
Example #23
import threading
from queue import Queue
from Spider import Spider
from domain import *
from WebCrawler import *


Project_Name = "The WebCrawler"
Home_Page = "http://codechannels.com/channel/thenewboston/"
Domain_Name = get_full_domain_name(Home_Page)
Queue_File = Project_Name + '_queue.txt'
Crawled_File = Project_Name + '_crawled.txt'
Number_Of_Threads = 8
queue = Queue()
Spider(Project_Name, Home_Page, Domain_Name)


# Create worker threads
# They die when main exits
def create_workers():
    for _ in range(Number_Of_Threads):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()

# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawling(threading.current_thread().name, url)
        queue.task_done()
Example #24
            with open('error/error.txt', 'a+') as f:
                f.write('error/error_server 76')
                f.write(str(e)+'\n')
            print('1010')
            break
    manage.shutdown()
    spider_main.save()

if __name__ == "__main__":

    pickle = os.listdir('pickle/')
    print('当前的已保存搜索文件:', pickle)
    name = input('输入搜索代号:')
    path = name + '.pickle'
    used_path = name + '_used.pickle'
    spider_main = Spider(name, used_path)
    if path not in pickle:
        start = time.time()
        url = 'https://www.bilibili.com/index/rank/all-30-3.json'
        
        
        try:
            spider_main.crawl(url, path)
        except Exception as e:
            with open('error/error.txt', 'a+') as f:
                f.write('94'+str(e) + '\n')
                
        end = time.time()
        times = int(end - start)
        if times > 60:
            mins = times//60
Example #25
 def StartSpider(self, name):
     spider = Spider(name)
     spider.start()
Example #26
import threading
from queue import Queue
from Spider import Spider
from domain import *
from general import *

PROJECT_NAME = 'testingOne'
HOMEPAGE = 'https://thenewboston.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 4
queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads (they die when main exits)
# _ is used because we just want to loop a fixed number of times
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


#Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
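
Example #26 (like #23) only shows the worker side of the thread/queue pattern; something still has to put URLs on the queue and wait for them. A minimal sketch of that missing half is below. It assumes a file_to_set helper in the imported general module that returns the URLs currently stored in QUEUE_FILE; if the helper has a different name, substitute accordingly.

# Sketch of the driver side (assumed helper noted above).
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()  # block until every queued URL has been processed by a worker

def crawl():
    queued = file_to_set(QUEUE_FILE)
    if len(queued) > 0:
        print(str(len(queued)) + ' links in the queue')
        create_jobs()

create_workers()
crawl()
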
Example #27
def library():
    lib = Spider()
    visit = lib.read_library()
    return jsonify(visit)
Example #28
shieldDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/shields.txt"
shieldDamageDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/shieldDamage.txt"
campfireDir = "D:/Program Files/Streamlabs Chatbot/Services/Twitch/flame.txt"

attackers = [Vine(60, 1.0, 5, 1.0, 20, 120), # dpm of 5
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Vine(60, 1.0, 5, 1.0, 20, 120),
             Spider(60, 1.0, 15, 1.0, 100, 240), # dpm of 15
             Spider(60, 1.0, 15, 1.0, 100, 240),
             Spider(60, 1.0, 15, 1.0, 100, 240),
             Spider(60, 1.0, 15, 1.0, 100, 240),
             Spider(60, 1.0, 15, 1.0, 100, 240),
             Spider(60, 1.0, 15, 1.0, 100, 240),
             ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300), # dpm of 30
             ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300),
             ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300),
             ShadowBoundBear(120, 1.0, 60, 1.0, 300, 300),
             Beast(120, 1.0, 70, 1.0, 100, 300), # dpm of 35, increases over time
             Beast(120, 1.0, 70, 1.0, 100, 300),
             Colossus(60, 5.0, 700, 1.0, 2000, 1800), # dpm of 140, increases over time
             Colossus(60, 5.0, 700, 1.0, 2000, 1800),
             Dragon(300, 1.0, 1000, 1.0, 2000, 3600), # dpm of 200. Reward increases over time, difficult to kill.
             Ashvine(60, 1.0, 30, 1.0, 60, 50), # dpm of 30. Increases over time, harder to kill over time, reward increases over time.
Example #29
from Hero import Hero  # assumed import; Hero() is used below but was not imported in the original
from Battle import Battle  # assumed import; Battle() is used below but was not imported in the original
from Goblin import Goblin
from Store import Store
from Wizard import Wizard
from Spider import Spider
from Snake import Snake
from Medic import Medic
from Shadow import Shadow
from Zombie import Zombie

if __name__ == "__main__":
    hero = Hero()
    enemies = [
        Goblin(),
        Wizard(),
        Medic(),
        Shadow(),
        Zombie(),
        Spider(),
        Snake()
    ]
    battle_engine = Battle()
    shopping_engine = Store()

    for enemy in enemies:
        hero_won = battle_engine.do_battle(hero, enemy)
        if not hero_won:
            print("YOU LOSE!")
            exit(0)
        shopping_engine.do_shopping(hero)

    print("YOU WIN!")
Example #30
    BASE_URL = input('Enter The website URL:\t')
    if re.match(regex, BASE_URL) is not None:
        RESPONSE = urlopen(BASE_URL).getcode()
        if RESPONSE != 200:
            WRONG = True
            print("WRONG URL")
        else:
            break
    else:
        WRONG = True
        print("WRONG URL")

DOMAIN = get_domain_name(BASE_URL)
SEARCH_WORD = input('Enter the search text, if there is none press enter:\t')

Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD)
while True:
    if len(Spider.wait_list) <= 0:
        break
    BASE_URL = Spider.wait_list.pop()
    Spider.wait_list.add(BASE_URL)
    Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID),
           SEARCH_WORD)
    SPIDER_ID += 1

URLS_GATHERED = len(Spider.crawled)
print('\n' + "Finished Crawling.\n" + "Number of URLs Gathered:\t" +
      str(URLS_GATHERED))

if SEARCH_WORD != '':
    print("\nSearch Results:\nThe Search Word Found in These URLS:\n")