Example #1
def parse_listing(categories):

    postman = Postman.init()

    for category in categories:
        print(category)
        spider = Spider(url=category["url"])

        with open("page.html", "a") as f:
            f.write(spider.get_page())

        soup = BeautifulSoup(spider.get_page(), "html.parser")
        soup_a = soup.find_all("a", class_="item _item")
        products = [a["href"] for a in soup_a]

        print(products)

        for num, url in enumerate(products):
            parse_detail(category, url, postman)
            print(category, num, len(products))
            print("sleep 3")
            time.sleep(1)
            print("sleep 2")
            time.sleep(1)
            print("sleep 1")
            time.sleep(1)

        break
Example #2
def test_cls_Spider_scapy__print_OnScapy():
    site = "http://www.hao123.com/"
    max_size = 100
    ptns = producePtns()
    onscapy = print_OnScapy()
    spider = Spider(site,ptns,max_size=max_size,onScapy=onscapy)
    spider.scapy()
Example #3
	def startCB(self):
		# File that the scraped content is written to
		file = open(self.filePath, "w")

		# Scraping rules
		titleKlass = {"class": "j_chapterName"}
		contentKlass = {"class": "j_readContent"}
		nextKlass = {"id": "j_chapterNext"}

		page = self.entryUrl.get()
		# Start crawling
		spider = Spider(titleKlass, contentKlass, nextKlass)

		if page == "" or self.filePath == "":
			tkMessageBox.showerror("woolson", "小说名称或链接未填写!")
		else:
			# 循环抓取下一章
			while page != "":
				result = spider.getContent(page)

				try:
					page = result["nextUrl"]
					file.write(result["title"] + "\n")
					file.write(result["content"] + "\n\n")

					print "正在写入->" + result["title"]
				except Exception as e:
					page = ""
					print "结束", result["error"]
Example #4
def going(url):
    mail = email()
    text = wordsDeal()
    spider1 = Spider(url, 'test')
    hrefs = spider1.hrefFor2018()
    for item in hrefs:
        try:
            content = spider1.contentOfArtical(item['href'])
            if content['content'] != "contents":
                query = "insert into Christian(href,title, content, sent) values ( '" + text.sqlEscape(
                    item['href']) + "','" + text.sqlEscape(
                        item['title']) + "','" + text.sqlEscape(
                            content['content']
                        ) + "','" + config.notSend + "');"
                sqlQuery(query)
                print "YES ", item['title']
                # mail.sendAuto(content['title'], content['content'] + '<p>' + Chinese + '</p>')
        except BaseException as error:
            query = "insert into SpiderExcept(href,except) values ( '" + text.sqlEscape(
                item['href']) + "','" + text.sqlEscape(str(error)) + "');"
            sqlQuery(query)
        else:
            print "NO ", item['title']
    del spider1
    del text
    del mail
Example #5
def main():
    # Choose the site config class based on the command-line arguments
    try:
        website = sys.argv[1]
        url = sys.argv[2]
    except Exception as e:
        print "please choose one website"
        exit()

    # Instantiate the chosen config
    dic = {
        "qidian": Qidian,
        "heiyan": Heiyan,
    }
    config = dic[website]()

    # Get the key information
    handler = Spider(config.title, config.content, config.next)

    chapters = config.getList(url)

    book = open("text.txt", "w")

    for item in chapters:
        print "正在下载->", item["title"]
        content = handler.getContent(item["href"])

        book.writelines(item["title"] + "\n")
        book.writelines(content["content"] + "\n")
Example #6
 def getMovieHtml(
     self, filename, name, j, end, z
 ):  # Read the detail-page URLs stored in filename and crawl each one; name is the starting URL, j numbers the saved files, end is the last URL to fetch
     file = open(filename, 'r', encoding="UTF-8")
     i = 0
     flag = False
     for url in file:
         if i % 2 == 0:
             m = re.search(r'h.*', url)
             if m.group() == name:
                 flag = True
             if flag:
                 html = Spider().getHtml(m.group())
                 if html:
                     Spider().saveHtml(html, dizhi[z] + str(j) + ".html")
                     print(m.group() + " : " + str(j) + ".html saved")
                     j += 1
             if m.group() == end:
                 return
         else:
             print(url.strip())
         i += 1
     if j % 90 == 0:
         time.sleep(120)  # Pause after a batch of pages so requests are not too frequent
Example #7
def test_cls_Spider_scapy__logfile_OnScapy():
    site = "http://www.hao123.com/"
    max_size = 100
    logfile = "test_result/logfile"
    ptns = producePtns()
    onscapy = logfile_OnScapy(name=logfile)
    spider = Spider(site,ptns,max_size=max_size,onScapy=onscapy)
    spider.scapy()
Example #8
def work():
    while True:
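        # Pull URLs from the shared queue and crawl them; the bare except silently drops any failure.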
        try:
            url = queue.get()
            Spider.crawlPlayer(url, threading.current_thread().name)
            queue.task_done()
        except:
            pass
Example #9
def yyetsFinder():
    urls = ['http://yyets.com/showresource-juji-1103.html'  # 2 BROKE GIRlS
            ,'http://yyets.com/showresource-juji-1088.html' # HOMELAND
            ,'http://yyets.com/showresource-juji-1007.html' # MENTALIST
            ,'http://yyets.com/showresource-juji-974.html'  # NEW GIRL
            ] 
    spider = Spider(urls)
    spider.start()
Example #10
    def create_threads(self):
        """Create, start and add threads to a list. Threads run an instance of Spider.
        The amount of threads created depends on the amount of cores found in the system."""

        for i in range(1, multiprocessing.cpu_count()):
            name = "Thread-%s" % i
            thread = Spider(name, self.queue, self.result)
            thread.start()
            threads.append(thread)
Example #11
def run(key):
    url = set_url(host, key)
    Cookies()
    spider = Spider(url)
    html = spider.spider(BASEHEADERS)
    if not verify(html):
        BASEHEADERS["Cookie"] = BASEHEADERS["Cookie"] + Cookies.cookie_str(
            ["acw_tc", "PHPSESSID"])
        proxieser.proxies()
    parser = HtmlParser(html)
    data = parser.parser("fund")

    print(data)
Example #12
def main():
    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/citations?view_op=search_authors&hl=en&mauthors=label:complex_systems'

    ### p_key and n_key
    p_key = []
    n_key = []
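    # Both keyword lists are left empty here; in Example #20 below they appear to act as include / exclude filters.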

    ### Google Scholar Crawler, Class Spider
    myCrawler = Spider(start_url, p_key, n_key, page=5)

    results = myCrawler.crawl()

    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #13
def course():

    post_format = {"username": "", "password": ""}
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']

        user = Spider(username, password)
        user.login()
        if user.login_status:
            info = user.modify_data()
            return jsonify(info)
        else:
            return "登录失败"
    else:
        return render_template("index.html", format=post_format)
Example #14
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "svh", ["help", "output="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err)) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)
    for o, a in opts:
        if o == "-s":
            s = Spider()
            s.run()
        elif o == "-v":
            voter = Voter()
            voter.run()
        else:
            assert False, "unhandled option"
Example #15
def initialization():
    # get a list of user ids as tasks
    uid_list = get_tasks(TASK_NUM)
    # get a list of users as crawling accounts
    user_list = get_accounts(ACCOUNT_NUM)
    spider = Spider(user_list)

    return spider, uid_list, user_list
Example #16
def ydyFinder():
    # everything for login
    username = '******'
    password = '******'
    auth_data = {'username': username,
                 'password': password,
                 'formhash':'592862ac'}
    login_page = 'http://bbs.sfile2012.com/logging.php?action=login&loginsubmit=yes'
    auth = FormAuth(auth_data, login_page)
    
    # good hunting
    urls = ['http://bbs.sfile2012.com/viewthread.php?tid=351496&extra=page%3D1', # BONES
            'http://bbs.sfile2012.com/viewthread.php?tid=348582&extra=page%3D1', # HOUSE
            'http://bbs.sfile2012.com/viewthread.php?tid=348117&extra=page%3D1', # MENTALIST
            ]
    spider = Spider(urls, auth)
    spider.start()
Example #17
def grade():

    post_format = {"username": "", "password": ""}
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']

        grade = Spider(username, password)
        grade.login()
        if grade.login_status:
            info = grade.modify_grade()
            print(info)
            return jsonify(info)
        else:
            return "登录失败"
    else:
        return render_template("index.html", format=post_format)
Example #18
 def __init__(self, url, **kwargs):
     self.results = defaultdict(list)
     self.maxdepth = 2
     self.URLHandler = URLHandler()
     self.candidates = set()
     self.url = url
     self.baseurl = self.URLHandler.get_provider(self.url)
     self.spider = Spider(self.url, **kwargs)
     self.mysoup = BeautifulSoup(self.spider.request.text)
Example #19
    def spider_store_details(self):
        stores = self.query_store()
        for store in stores:
            time.sleep(1)
            try:
                contacts = None
                # Crawl the contact-info page
                spider_time = time.time()
                self.logger.info('Start crawling: ' + store[1])
                result_contacts = Spider().spider_URL(
                    url=store[1], is_proxies=True)
                self.logger.info('Crawl took: ' + str(time.time() - spider_time))

                # Parse the contact-info page
                interpreting_time = time.time()
                self.logger.info('Start parsing: ' + store[1])
                contacts = Interpreter().interpreting_contact_info(
                    result_contacts)
                self.logger.info('Parsing took: ' +
                                 str(time.time() - interpreting_time))
                # If no contacts were found, fall back to the shop's company page
                if not contacts:
                    spider_time = time.time()
                    store[1] += '/shop/company.html'
                    self.logger.info('Start crawling: ' + store[1])
                    result_contacts = Spider().spider_URL(
                        url=store[1], is_proxies=True)
                    self.logger.info('Crawl took: ' + str(time.time() - spider_time))

                    # Parse the contact-info page
                    interpreting_time = time.time()
                    self.logger.info('Start parsing: ' + store[1])
                    contacts = Interpreter().interpreting_contact_info(
                        result_contacts)
                    self.logger.info('Parsing took: ' +
                                     str(time.time() - interpreting_time))

                # Update the contact info
                self.update_contacts(store[0], contacts)
            except BaseException:
                self.logger.error('Error crawling or updating contact info: ' + traceback.format_exc())
                continue
Example #20
def main():
    ### The start page's URL
    start_url = 'https://scholar.google.com.tw/scholar?q=frequency+lowering+algorithm&hl=zh-TW&as_sdt=0,5'

    ### p_key and n_key
    p_key = [
        'wdrc', 'dynamic range compression', 'hearing aid', 'speech',
        'noise cancellation', 'noise reduction', 'feedback cancellation',
        'sound', 'hearing loss'
    ]
    n_key = [
        'imagery', 'image', 'visual', 'video', 'optic', 'opto', 'quantum',
        'photon'
    ]

    ### Google Scholar Crawler, Class Spider
    myCrawler = Spider(start_url, p_key, n_key, page=5)

    results = myCrawler.crawl()

    with open('result.pickle', 'wb') as f:
        pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #21
 def run(self):
     global condition, products, urls
     while True:
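         # Producer: when the shared URL pool drops to 20 or fewer, crawl one more URL and notify the consumer; otherwise wait.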
         if condition.acquire():
             if urls.url_size() <= 20:
                 urls.add_new_url(Spider.crawl_url(self.keyword, products))
                 products += 1
                 print("Producer(%s):deliver one, now products:%s" %
                       (self.name, urls.url_size()))
                 condition.notify()
                 pass
             else:
                 condition.wait()
             # stop
             if products >= 100:
                 break
             condition.release()
Example #22
    def spider_product(self, category_2nd, start, end):
        # Parse product-list pages from start to end
        for i in range(start, end):
            try:
                # Pause before crawling the next page
                time.sleep(1)
                # Fetch the current page's 60 products in three batches of 20
                product_list = []
                for j in range(3):
                    req = category_2nd[
                        1] + '&ap=A&t=1&afadprenum=0&af=1' + '&ee=' + str(
                            i) + '&afadbeg=' + str(60 * (i - 1) + (j * 20) + 1)
                    # Crawl and parse
                    spider_time = time.time()
                    self.logger.info('Start crawling: ' + req)
                    result_product_list = Spider().spider_URL(url=req,
                                                              is_proxies=True)

                    self.logger.info('Crawl took: ' + str(time.time() - spider_time))

                    interpreting_time = time.time()
                    self.logger.info('Start parsing: ' + req)
                    product_list.extend(
                        Interpreter().interpreting_product_list(
                            result_product_list))
                    self.logger.info('Parsing took: ' +
                                     str(time.time() - interpreting_time))
                '''
                Crawl all the product-list entries first, then fetch each
                product's details and company info one by one.
                '''
                for product in product_list:
                    # Save the company info
                    contact = {}
                    contact['公司名'] = product.get('company')  # company name
                    contact['公司主页'] = product.get('homepage')  # company homepage
                    store_id = self.save_contacts(contact)
                    # Save the product
                    self.save_product(product, category_2nd[0], store_id)
            except BaseException:
                self.logger.error('Error crawling or parsing ' + req + ': ' +
                                  traceback.format_exc())
                continue
Example #23
    def spider_product_details(self):
        products = self.query_products()
        for product in products:
            time.sleep(1)
            try:
                # Crawl the product detail page
                spider_time = time.time()
                self.logger.info('Start crawling: ' + product[1])
                result_products = Spider().spider_URL(
                    url=product[1], is_proxies=True)
                self.logger.info('Crawl took: ' + str(time.time() - spider_time))

                # Parse the product detail page
                interpreting_time = time.time()
                self.logger.info('Start parsing: ' + product[1])
                details = Interpreter().interpreting_product_details(
                    result_products)
                self.logger.info('Parsing took: ' +
                                 str(time.time() - interpreting_time))
                '''
                # If the product details need a second crawl
                product_bcid = details.get('bcid')
                if product_bcid:
                    interpreting_time = time.time()
                    xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                    result_product_introduce = Spider().spider_URL(
                        url=xss_filter + product_bcid)
                    self.logger.info('Start parsing: ' + xss_filter + product_bcid)
                    details['desc'] = Interpreter(
                    ).interpreting_product_details_desc(
                        result_product_introduce)
                    # Assemble the product details
                    details = Interpreter().assemble_product_details(details)
                    self.logger.info('Parsing took: ' +
                                     str(time.time() - interpreting_time))
                '''
                # Update the product details
                self.update_products(product[0], details)
            except BaseException:
                self.logger.error('Error crawling or updating product details: ' + traceback.format_exc())
                continue
Example #24
    def spider_job():
        try:
            spider = Spider.Spider()
            spider.spider(categoryId=1)
            spider.spider(categoryId=2)
            spider.spider(categoryId=3)
            spider.spider(categoryId=4)
            spider.spider(categoryId=5)

            print("success")

            time.sleep(5)

        except Exception as err:

            # Error alert: send a notification e-mail
            errStr = ",".join(err.args)
            myEmail = MyEmail.MyEmail()
            myEmail.tag = "Xinfadi product data crawl error"
            myEmail.to_list = ["*****@*****.**"]
            myEmail.content = errStr
            myEmail.send()
            print(errStr)
Example #25
#!/usr/bin/python
# -*- coding:utf-8 -*-
import re

from Spider import Spider

import sys
reload(sys)
sys.setdefaultencoding('utf8')


errors = []

spider = Spider()

# Crawl the data
errors.extend(spider.crawljobs())

# Parse and store in the database
errors.extend(spider.insert_jobs())

# Normalize some job fields (salary)
errors.extend(spider.analyze_jobs())

spider.adapt_job_city()

# Associate keywords with jobs
errors.extend(spider.associate_key_and_job())

# Print the errors
for e in errors:
Example #26
from util.MysqlManager import MysqlManager
from Spider import Spider
import logging

logging.basicConfig(filename='./log/20181022.txt', level=logging.INFO)

# def crawl(url, brand, class_):
#     print(os.getpid(), url, brand, class_)
#     spider = Spider(url, brand, class_)
#     spider.work()

if __name__ == "__main__":
    records = MysqlManager().fetch_all_source()
    for record in records:
        url = record.get("url")
        brand = record.get("brand")
        class_ = record.get("class")
        # p = Process(target=crawl, args=(url, brand, class_))
        # p.start()
        # time.sleep(2)
        spider = Spider(url, brand, class_)
        spider.work()
Example #27
def work():
    while True:
        url=queue.get()
        Spider.crawling(threading.current_thread().name,url)
        queue.task_done()
Example #28
import asyncio
import aiohttp

from Spider import Request
from Spider import Spider
import PageParse
import argparse

ARGS = argparse.ArgumentParser(description="caoliu spider")
ARGS.add_argument("--pages", action='store', type=int,
                  default=1, help='Limit page to spider')
ARGS.add_argument("--max_tries", action='store', type=int,
                  default=30, help='Limit retries on network errors')
ARGS.add_argument("--root_dir", action='store',
                  default='./download', help='directory store picture and torrent')
ARGS.add_argument("--max_tasks", action='store', type=int,
                  default=20, help='Limit concurrent connections')

ROOT_DIR = "/media/mosaic/软件/git-myspider/cl_spider/source/"

args = ARGS.parse_args()

loop = asyncio.get_event_loop()
spider = Spider(max_tries=args.max_tries, max_tasks=args.max_tasks)
PageParse.start(spider, 1, args.pages+1,  root_dir=args.root_dir)
loop.run_until_complete(spider.spider())
spider.close()
loop.stop()
loop.run_forever()
loop.close()

Example #29
    def interpreting_product_details(self, html_doc):
        try:
            # Dict holding the parsed product info
            product = {}
            doc = BeautifulSoup(html_doc, 'html5lib')
            # Product images
            img = []
            # The product has multiple images (thumbnail list)
            product_li = doc.find_all('li', class_='tab-trigger')
            if product_li:
                for li in product_li:
                    product_a = li.find('a',
                                        attrs={
                                            "data-useractivelogs":
                                            "UserBehavior_detail_smallphoto"
                                        })
                    if product_a:
                        # Get the large image
                        product_img = product_a.find('img')
                        product_img_src = product_img.get('src')
                        # Get the original (full-size) image
                        last_index = product_img_src.rfind('..')
                        img.append(product_img_src[:last_index])
            # When the product has no image list, fall back to the single default large image
            else:
                product_img_div = doc.find('div', class_='vertical-img')
                if product_img_div:
                    product_img = product_img_div.find(
                        'a',
                        attrs={
                            "data-useractivelogs":
                            "UserBehavior_detail_bigphoto"
                        })
                    product_img_hrefs = product_img.get('hrefs')
                    if not product_img_hrefs:
                        product_img_src = product_img.find('img').get('src')
                        # Get the original (full-size) image
                        last_index = product_img_src.rfind('..')
                        product_img_hrefs = product_img_src[:last_index]
                    img.append(product_img_hrefs)
            product['imgs'] = img
            '''
            The product details come in two layouts, so each needs its own parsing.
            '''
            # The full product-details block
            pdetail = doc.find('div',
                               id='pdetail',
                               class_='proDetailCon tab_content_event_class')
            if pdetail is not None:
                # Get the product's unique id (bcid)
                product_bcid = doc.find('input', id='bcid').get('value')
                detail_bot = pdetail.find('div', class_='detailBot')
                detail_bot.decompose()
                introduce = pdetail.find('div', id='introduce')
                if product_bcid:
                    product['bcid'] = product_bcid
                    xss_filter = 'http://wsdetail.b2b.hc360.com/XssFilter?callback=jQuery&bcid='
                    result_product_introduce = Spider().spider_URL(
                        url=xss_filter + product_bcid)

                    product_introduce = self.interpreting_product_details_desc(
                        result_product_introduce)
                    introduce.replace_with(
                        BeautifulSoup(product_introduce, 'html.parser'))
            else:
                pdetail = doc.find('div',
                                   id='pdetail',
                                   class_='pdetail tab_content_event_class')
                if pdetail is not None:
                    # Basic parameters
                    vopy = pdetail.find('div', class_="d-vopy")
                    # Strip image divs from the basic-parameter list
                    vopyImgBoxs = vopy.find_all('div', class_='d-vopyImgBox')
                    for vopyImgBox in vopyImgBoxs:
                        vopyImgBox.decompose()
                    # Strip the 'similar products' spans from the basic-parameter list
                    span = pdetail.find_all(
                        'span', class_='same-parameter-commodity-hook')
                    for s in span:
                        s.decompose()
                    # Detailed-description div
                    d_xi_b = pdetail.find('div', class_='d-xi-b').find('div')
                    detail_imgs = d_xi_b.find_all('img')
                    if detail_imgs:
                        for img in detail_imgs:
                            del img['onerror']
                            del img['onload']
                    # Bare text nodes inside the detailed description (excluding tags)
                    content_text = d_xi_b.find_all(text=True, recursive=False)
                    if content_text:
                        for text in content_text:
                            # Replace with '' to strip the '慧聪网' (hc360) branding text
                            text.replace_with('')
            style = '''<style>
                            #introduce {font-size: 14px;}
                            table {border-collapse: collapse;border-spacing: 0;}
                            p {margin: 0;}
                            .dvop-title {line-height: 30px;font-size: 14px;color: rgb(51, 51, 51);padding-bottom: 10px;}
                            .dvop-title h4 {font-weight: normal;}
                            .d-vopy table {width: 100%;float: left;font-size: 12px;margin-bottom: 18px;border-left: 1px solid rgb(237, 237, 237);border-top: 1px solid rgb(237, 237, 237);}
                            .d-vopy th {width: 200px;background-color: rgb(245, 245, 245);text-align: center;font-weight: normal;min-height: 34px;line-height: 34px;border-right: 1px solid rgb(237, 237, 237);border-bottom: 1px solid rgb(237, 237, 237);padding: 0px;}
                            .d-vopy td {border-right: 1px solid #ededed;border-bottom: 1px solid #ededed;vertical-align: top;}
                            .d-vopy td {padding-left: 20px;line-height: 34px;}
                            .d-vopy th h4 {font-size: 12px;color: rgb(51, 51, 51);margin: 0px;}
                            .d-vopyList {overflow: hidden;}
                            .d-vopyList {line-height: 34px;padding-left: 20px;}
                            .d-vopyList p {float: left;}
                            .d-vopyList p {padding-right: 20px;width: 500px;line-height: 24px;padding: 5px 0;}
                            .d-xi-b {padding: 10px 0px;font-size: 12px;}
                        </style> '''
            product['details'] = style + pdetail.prettify()
        except AttributeError:
            self.logger.error('Object has no such attribute: ' + traceback.format_exc())
        except KeyError:
            self.logger.error('Mapping has no such key: ' + traceback.format_exc())
        except BaseException:
            self.logger.error('Error parsing product details: ' + traceback.format_exc())
        return product
Example #30
import threading
from queue import Queue
from Spider import Spider
from domain import *
from WebCrawler import *


Project_Name="The WebCrawler"
Home_Page="http://codechannels.com/channel/thenewboston/"
Domain_Name=get_full_domain_name(Home_Page)
Queue_File=Project_Name+'_queue.txt'
Crawled_File=Project_Name+'_crawled.txt'
Number_Of_Threads=8
queue=Queue()
Spider(Project_Name,Home_Page,Domain_Name)


#create worker threads
#they die when main exits
def create_workers():
    for _ in range(Number_Of_Threads):
        t=threading.Thread(target=work)
        t.daemon=True
        t.start()

#do the next job in the queue
def work():
    while True:
        url=queue.get()
        Spider.crawling(threading.current_thread().name,url)
        queue.task_done()
Example #31
 def StartSpider(self, name):
     spider = Spider(name)
     spider.start()
Example #32
from Spider import Spider
from save_as_opml import save_to_opml

if __name__ == "__main__":
    spider = Spider()
    content_list = spider.run()
    save_to_opml(content_list, spider.tag_set, spider.name_list)
Example #33
#!/usr/bin/python
# -*- coding:utf-8 -*-
from Spider import Spider

# Entry point
spider = Spider()

fans = spider.get_my_fans()
for fan in fans:
    spider.user_crawl(fan.user_id)
    spider.status_crawl(fan.user_id)


followers = spider.get_my_follower()
for follower in followers:
    spider.user_crawl(follower.user_id)
    spider.status_crawl(follower.user_id)
Example #34
import threading
from queue import Queue
from Spider import Spider
from domain import *
from general import *

PROJECT_NAME = 'testingOne'
HOMEPAGE = 'https://thenewboston.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 4
queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


#Create worker threads (they die when main exits)
#_ is used because we just want to loop a fixed number of times
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


#Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()
Example #35
# -*- Mode: Python; coding: utf-8; indent-tabs-mode: t; c-basic-offset: 4; tab-width: 4 -*- 
#
# main.py
# Copyleft 2014 Yuzo(PillowSky) <*****@*****.**>
# Compatible with python3 and pypy
#
# requires PyQuery, which depends on cssselect, so it is packed into the project to ensure it runs normally on other computers

import os.path
from datetime import datetime
from Spider import Spider

print("==Welcome to search engine for ZOJ==")
if(os.path.isfile("data.db")):
	spider = Spider()
	print("Local database is update on %s" % datetime.fromtimestamp(os.path.getmtime("data.db")))
	print("Database have %s problems stored now" % spider.getItemsCount())
else:
	print("Local database hasn't build")
	spider = Spider()

print("\nUsage:")
print("[1] update the local database in serial")
print("[2] update the local database in parallel")
print("[3] search problems about Matrix")
print("[4] search generic problems")
	
choice = raw_input()
if(choice == str(1)) :
	spider.serialFetchAllProblems()
Example #36
 def __init__(self, config):
     Spider.__init__(self, config)
Example #37
    BASE_URL = input('Enter The website URL:\t')
    if re.match(regex, BASE_URL) is not None:
        RESPONSE = urlopen(BASE_URL).getcode()
        if RESPONSE != 200:
            WRONG = True
            print("WRONG URL")
        else:
            break
    else:
        WRONG = True
        print("WRONG URL")

DOMAIN = get_domain_name(BASE_URL)
SEARCH_WORD = input('Enter the search text, if there is none press enter:\t')

Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID), SEARCH_WORD)
while True:
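    # Keep creating Spider instances for URLs on the shared wait_list until it is drained.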
    if len(Spider.wait_list) <= 0:
        break
    BASE_URL = Spider.wait_list.pop()
    Spider.wait_list.add(BASE_URL)
    Spider(PROJECT_NAME, BASE_URL, DOMAIN, 'Spider' + str(SPIDER_ID),
           SEARCH_WORD)
    SPIDER_ID += 1

URLS_GATHERED = len(Spider.crawled)
print('\n' + "Finished Crawling.\n" + "Number of URLs Gathered:\t" +
      str(URLS_GATHERED))

if SEARCH_WORD != '':
    print("\nSearch Results:\nThe Search Word Found in These URLS:\n")
Example #38
# -*- coding: utf-8 -*-
import os
from flask import Flask, request, Response
from flask_uploads import UploadSet, configure_uploads, IMAGES, patch_request_class
import json
from IRNet import IRNet
from Spider import Spider

app = Flask(__name__)
app.config['UPLOADED_PHOTOS_DEST'] = os.getcwd() + '/upload'

net = IRNet()
net.load_model()
net.predict('1.jpg')
spider = Spider()

photos = UploadSet('photos', IMAGES)
configure_uploads(app, photos)
patch_request_class(app)  # set maximum file size, default is 16MB

#net = IRNet()
#net.load_model()

html = '''
    <!DOCTYPE html>
    <title>Upload File</title>
    <h1>Image Upload</h1>
    <form method=post enctype=multipart/form-data>
         <input type=file name=photo>
         <input type=submit value=Upload>
    </form>
Example #39
            with open('error/error.txt', 'a+') as f:
                f.write('error/error_server 76')
                f.write(str(e)+'\n')
            print('1010')
            break
    manage.shutdown()
    spider_main.save()

if __name__ == "__main__":

    pickle = os.listdir('pickle/')
    print('Currently saved search files:', pickle)
    name = input('Enter a search id: ')
    path = name + '.pickle'
    used_path = name + '_used.pickle'
    spider_main = Spider(name, used_path)
    if path not in pickle:
        start = time.time()
        url = 'https://www.bilibili.com/index/rank/all-30-3.json'
        
        
        try:
            spider_main.crawl(url, path)
        except Exception as e:
            with open('error/error.txt', 'a+') as f:
                f.write('94'+str(e) + '\n')
                
        end = time.time()
        times = int(end - start)
        if times > 60:
            mins = times//60
Example #40
from Spider import Spider
from Query import Query
import sys

arguments = sys.argv
if arguments[1] == "crawl":
    spider = Spider("https://en.wikipedia.org/")
    spider.crawl()
elif arguments[1] == "query":
    query = Query(arguments[2])
    query.query()
# # query.multiWordQuery(["action","design"])
Example #41
def work():
	while True:
		url = queue.get()
		Spider.crawl_page(threading.current_thread().name, url)
		queue.task_done()
Example #42
 def __init__(self, config):
     Spider.__init__(self, config)
     self.logger = LogUtil.Logging.getLogger()
Example #43
 def run(self):
     spider = Spider(self.thread_name, self.city_name)
     spider.getData()
Example #44
# NOTE: Hero and Battle are used below; their module names are assumed to follow the same one-class-per-module pattern
from Hero import Hero
from Battle import Battle
from Goblin import Goblin
from Store import Store
from Wizard import Wizard
from Spider import Spider
from Snake import Snake
from Medic import Medic
from Shadow import Shadow
from Zombie import Zombie

if __name__ == "__main__":
    hero = Hero()
    enemies = [
        Goblin(),
        Wizard(),
        Medic(),
        Shadow(),
        Zombie(),
        Spider(),
        Snake()
    ]
    battle_engine = Battle()
    shopping_engine = Store()

    for enemy in enemies:
        hero_won = battle_engine.do_battle(hero, enemy)
        if not hero_won:
            print("YOU LOSE!")
            exit(0)
        shopping_engine.do_shopping(hero)

    print("YOU WIN!")
Example #45
class Yaff(object):
    def __init__(self, url, **kwargs):
        self.results = defaultdict(list)
        self.maxdepth = 2
        self.URLHandler = URLHandler()
        self.candidates = set()
        self.url = url
        self.baseurl = self.URLHandler.get_provider(self.url)
        self.spider = Spider(self.url, **kwargs)
        self.mysoup = BeautifulSoup(self.spider.request.text)

    def getnormalfeeds(self):
        tags = self.mysoup.findAll(['link', 'a'],
                                   {"type": ['application/rss+xml', 'application/atom+xml',
                                             "application/x.atom+xml",
                                             "text/xml", "application/xhtml+xml"]})
        for tag in tags:
            url = URLHandler.get_full_urls(self.baseurl, tag['href'])
            self.results[url].append(Result(title=tag.get('title', ''),
                                            feedtype=tag.get('type', '')))
        return self

    def gethiddenfeeds(self):
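        # Follow links whose href looks feed-like (contains 'feed', 'rss', 'atom', or 'xml'),
        # up to maxdepth passes, recording any response whose content type identifies a feed.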
        for i in range(self.maxdepth):
            self._getcandidatetags()
            for candidate in self.candidates:
                try:
                    self.spider.make_request(candidate)
                    self.mysoup = BeautifulSoup(self.spider.request.text)
                except ValueError as e:
                    print(e)
                    continue
                if self.isfeed():
                    self.results[self.spider.request.url].append(Result(
                        title=self.mysoup.find('title').text,
                        feedtype=self.spider.contenttype))

                self.getnormalfeeds()
        return self

    def getrootrss(self):
        self.spider.make_request(self.url + '/rss')
        if self.isfeed():
            self.mysoup = BeautifulSoup(self.spider.request.text)
            url = URLHandler.get_full_urls(self.baseurl, self.url + '/rss')
            self.results[url].append(Result(
                title=self.mysoup.find('title').text,
                feedtype=self.spider.contenttype))
        return self

    def _getcandidatetags(self):
        tags = self.mysoup.findAll('a')
        feedstrings = ['feed', 'rss', 'atom', 'xml']
        for tag in tags:
            try:
                if any(fstring in tag['href'] for fstring in feedstrings):
                    self.candidates.add(URLHandler.get_full_urls(self.baseurl, tag['href']))
            except:
                continue

    def isfeed(self):
        if 'xml' in self.spider.contenttype or 'atom' in self.spider.contenttype:
            return True
        return False