Example #1
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    # read the list of URLs to crawl
    with open('msglist.json') as f:
        urls = json.load(f)

    urls_visited = []
    if os.path.exists('visited.txt'):
        with open('visited.txt', 'r') as f:
            for line in f:
                urls_visited.append(line.rstrip())

    urlmap = {}
    for item in urls:
        title = item['title']
        url = item['url']
        if url in urls_visited:
            print('visited', url)
            continue

        urlmap[url] = title
        queue.put(url)

    # start 
    file = open('visited.txt', 'a')
    while not queue.empty():
        url = queue.get()
        print("crawl", url)
        logging.info('now crawl %s', url)
        Spider.crawl(url)
        print("analyse", url)
        logging.info('now analyse %s', url)
        images = Spider.analyse()
   
        queue.task_done()

        visited.add(url)

        save(images, urlmap[url])

        file.write(url+'\n')
        file.flush()

    file.close()
    print('finished')
    logging.info('finished')
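The module-level names that Example #1 (and the similar Example #7 below) rely on are not shown. A minimal sketch of the setup they appear to assume, where the constant values, logging configuration, and create_dir helper are illustrative rather than taken from the original project:

import json
import logging
import os
import re
from queue import Queue

ROOT = 'download'                                # hypothetical project directory
DEFAULT_HEADERS = {'User-Agent': 'Mozilla/5.0'}  # illustrative request headers
DEFAULT_TIMEOUT = 10                             # illustrative timeout in seconds
IGNORES = []                                     # regex patterns for links to skip (Example #7)
URL = 'http://pp.163.com/'                       # illustrative seed URL for Example #7

queue = Queue()    # shared URL queue consumed by the crawl loops
visited = set()    # URLs that have already been crawled

logging.basicConfig(level=logging.INFO)

def create_dir(path):
    # create the target directory if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)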
Example #2
def get_history():
    """Download historical vessel schedules."""

    src_dir = get_src_dir_path(__file__)
    data_dir = src_dir.parent / "data"

    start_date = datetime.date(year=2020, month=1, day=1)
    end_date = datetime.date(year=2020, month=1, day=10)
    time_delta = datetime.timedelta(days=1)

    file_paths = []
    jocasta = Spider()
    while start_date <= end_date:
        time.sleep(random.randrange(0, 5, 1))
        file_path = jocasta.crawl(data_dir, date=start_date)
        file_paths.append(file_path)
        start_date += time_delta
        
    for file_path in file_paths:
        if file_path.endswith('.xlsx'):
            print("unhandled xlsx file!")

        elif file_path.endswith('.pdf'):
            tables = Wrangler.parse_pdf(file_path)

        else:
            raise ValueError(f"Unexpected file type: {file_path}")

    return None
Example #3
def main():
    url = "https://www.readmorejoy.com"

    # sys.argv[0] is the script name itself, so a single URL argument gives len(args) == 2
    args = sys.argv[0:]

    # Execute function depending on arguments
    if len(args) == 1:
        print_help()
    elif len(args) == 2:
        url = args[1]
        print("start web crawling")
        spider = Spider(url)
        spider.crawl(url)
        print("web crawling done")
    else:
        print_help()
Example #4
def update(data_dir):
    print("Crawling...")
    url = "https://civilization.fandom.com/wiki/Leaders_(Civ6)"
    response = Spider.crawl(url)

    print("Processing html...")
    records = Wrangler.process_html(response, data_dir)

    return records
Example #5
def run():
    req_data = request.get_json()

    email = req_data.get('email')
    password = req_data.get('password')

    url_list = get_urls()

    if email and password and url_list:
        spidy = Spider()
        url_list = spidy.modify_urls(url_list)
        spidy.crawl(url_list=url_list, email=email, password=password)
        return ''' Done '''
    else:
        return ''' ERROR in email or password or while fetching urls '''
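Example #5 reads the request body with request.get_json(), which points to a Flask view function. A minimal sketch of how such a handler could be exposed, assuming Flask; the '/crawl' route, HTTP method, and app setup are illustrative and not part of the original code:

from flask import Flask

app = Flask(__name__)

# Register the run() handler from Example #5 under a hypothetical POST route.
app.add_url_rule('/crawl', view_func=run, methods=['POST'])

if __name__ == '__main__':
    app.run()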
Example #6
def main():
    # Fetch the arguments; the first element in sys.argv is this Python file itself, so ignore it
    args = sys.argv[1:]

    # Execute function depending on arguments
    if len(args) == 1:
        if args[0] == "-test":  # test
            test = Test()
            test.test_all()
        elif args[0] == "-help":  # help
            print_help()
        else:
            print_help(True)
    elif len(args) == 2:
        if args[0] == "-c":  # crawl
            url = args[1]
            print("[crawler.py] start crawling")
            spider = Spider()
            spider.crawl(url)
            print("[crawler.py] crawling done")
        else:
            print_help(True)
    else:
        print_help(True)
Example #7
def main():
    # project dir
    create_dir(ROOT)

    Spider(DEFAULT_HEADERS, DEFAULT_TIMEOUT)

    queue.put(URL)

    # start
    while not queue.empty():
        url = queue.get()
        print("crawl", url)
        logging.info('now crawl %s', url)
        html = Spider.crawl(url)
        images = Spider.analyse(html)
        links = Spider.analyse_links(html)

        queue.task_done()

        visited.add(url)

        save(images)

        # new urls
        for link in links:
            if (link not in visited) and link.startswith('http://pp.163.com/'):

                excluded = False
                for ignore in IGNORES:
                    if re.search(ignore, link):
                        # logging.info("exclude %s", link)
                        excluded = True
                        break

                if not excluded:
                    queue.put(link)

    print('done')
Example #8
class Helper(object):

    def __init__(self, url, cook):
        self.spider = Spider(url, cook)
        self.content = ''
        self.mail_helper = Mailhelper()


    def refresh(self):
        """Re-fetch the page content."""
        self.content = self.spider.crawl()


    def is_new_msg(self):
        """Check whether the crawled content is a new post."""
        if os.path.exists('weibo.txt'):
            with open('weibo.txt', 'r') as fi:
                # split off the last line (the post timestamp) so an old post
                # is not mistaken for a new one
                txt_content = '\n'.join(fi.read().split('\n')[:-1])
                new_content = '\n'.join(self.content.split('\n')[:-1])
                # compare values, not identity ('is' was a bug here)
                return new_content != txt_content
        else:
            return True


    def send_mail(self):
        """Send the new content to the configured mailbox."""
        to_list = ['*****@*****.**']   # recipient addresses
        sub = u'微博更新'               # mail subject ("Weibo update")
        if self.mail_helper.send_mail(to_list, sub, self.content):
            print('Mail sent successfully!')
        else:
            print('Failed to send mail!')
Example #9
    shortcut = Student(2013217413, '123456789012', 'XC')
    c = pymongo.MongoClient()

    db = c['hfut']
    init_db(db)

    # Initialize the job pools.
    # Well-chosen database pool and buffer sizes make better use of the available bandwidth.
    # The maximum number of concurrent database records is db_pool_size * batch_size.
    # A request pool larger than 20 easily triggers server errors and missing results.
    job_manager = JobManager(pool_size=20)
    db_manager = DatabaseManager(db, batch_size=80)

    j = Spider(shortcut, job_manager, db_manager)

    j.crawl()

    # def patch():
    #     for i in range(21, 31):
    #         term = '%03d' % i
    #         yield term, None, '_'
    #         # for args in j.iter_teaching_class(term, course_name='_'):
    #         #     yield args
    #
    #
    # jobs = (patch, j.iter_teaching_class, j.sync_students)
    # job_manager.jobs = jobs
    #
    # logger.info('Crawl start!'.center(72, '='))
    # job_manager.start()
    # logger.info('Jobs are all dispatched. Waiting for database requests handling.')
Example #10
	def startCrawl(self):
		spider = Spider(self.userList)
		spider.crawl(self.hasProcessed)
Example #11
log_it("TEMP DIR",temp_dire)

if os.path.exists(temp_dire):
    shutil.rmtree(temp_dire)
distutils.dir_util.copy_tree(src_dir,temp_dire)
owd = os.getcwd()

log_it("LOG","Crawling started")
spider = Spider(temp_dire)
log_it("LOG","Crawling done")
# spider.crawl()

log_it("LOG","Compileing pages started")
posts_data=[]
for post_folder in spider.crawl():
    with open(os.path.join(post_folder, "__pub.lish")) as f:
        config = json.load(f)
    t_date = time.strptime(config['date'], "%Y-%m-%d")
    posts_data.append({
        'title': config['name'].replace('-', ' '),
        'url'  : post_folder[len(temp_dire)+1:],
        'year' : time.strftime("%Y", t_date),
        'day'  : time.strftime("%d", t_date),
        'month': time.strftime("%b", t_date),
        'date' : t_date
    })
    compiler = Compilers[config['type']]
    owd = os.getcwd()
    os.chdir(post_folder)
    compiler.compile(config['file'])
    os.chdir(owd)
Example #12
    conn = MySQLdb.connect(host=config.db_host,
                           user=config.db_user,
                           passwd=config.db_password,
                           db=config.db_database,
                           charset='utf8')
    cursor = conn.cursor()
    cursor.execute(
        'select configValue from t_spider_config where configKey=%s',
        (arg_config.get(sys.argv[1]), ))
    config_values = [row[0] for row in cursor.fetchall()]
    if sys.argv[1] == 'paper':
        spider_paper = Spider('paper')
        for search_exp in config_values:
            reqs = parser.paper_page_parser(search_exp)[:500]
            for req in reqs:
                spider_paper.add_request(req)
        spider_paper.crawl()

    if sys.argv[1] == 'news':
        spider_news = Spider('news')
        for seed_url in config_values:
            spider_news.add_request(
                Request(arg=seed_url, parser=parser.news_parser))
        spider_news.crawl()

    if sys.argv[1] == 'patent':
        spider_patent = Spider('patent')
        for search_exp in config_values:
            spider_patent.add_request(
                Request(arg=search_exp, parser=parser.patent_parser))
        spider_patent.crawl()
Example #13
def work():
    while True:
        url = queue.get()
        Spider.crawl(threading.current_thread().name, url)
        queue.task_done()
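Example #13 shows only the worker body. A minimal sketch of how such workers are usually launched, assuming queue is a shared queue.Queue instance and that Spider.crawl takes a thread name and a URL as in the example; the worker count and the start_workers helper are illustrative:

import threading
from queue import Queue

queue = Queue()

def start_workers(seed_urls, num_workers=4):
    # enqueue the seed URLs, then start daemon threads running work() from Example #13
    for url in seed_urls:
        queue.put(url)
    for _ in range(num_workers):
        threading.Thread(target=work, daemon=True).start()
    # block until every queued URL has been marked done via task_done()
    queue.join()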
Example #14
    conn = MySQLdb.connect(host=config.db_host,
                           user=config.db_user,
                           passwd=config.db_password,
                           db=config.db_database,
                           charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select configValue from t_spider_config where configKey=%s',
                   (arg_config.get(sys.argv[1]),))
    config_values = [row[0] for row in cursor.fetchall()]
    if sys.argv[1] == 'paper':
        spider_paper = Spider('paper')
        for search_exp in config_values:
            reqs = parser.paper_page_parser(search_exp)[:500]
            for req in reqs:
                spider_paper.add_request(req)
        spider_paper.crawl()

    if sys.argv[1] == 'news':
        spider_news = Spider('news')
        for seed_url in config_values:
            spider_news.add_request(Request(arg=seed_url,
                                    parser=parser.news_parser))
        spider_news.crawl()

    if sys.argv[1] == 'patent':
        spider_patent = Spider('patent')
        for search_exp in config_values:
            spider_patent.add_request(Request(arg=search_exp,
                                      parser=parser.patent_parser))
        spider_patent.crawl()
Example #15
def test2():
    sp = Spider('reddit.com')
    sp.crawl_robots()
    sp.test()
    sp.crawl()
Example #16
def crawl_t():
    waiting_list_count = len(Spider.waiting_list)
    while len(Spider.waiting_list) > 0:
        # print("No Of Links Waiting To be Further Crawled: " + str(len(Spider.waiting_list)))
        url = Spider.waiting_list.pop()
        Spider.crawl("Spider", url)
Example #17
def search(username):
    print(username)
    spider = Spider()
    spider.crawl(username)
    return render_template('detail.html')