Example #1
def crawling(agent, proxy, redirect, timeout, url, cookie):
    # Thin wrapper: construct a Crawler with the given options and run it via process().
    return crawler.Crawler(agent=agent,
                           proxy=proxy,
                           redirect=redirect,
                           timeout=timeout,
                           url=url,
                           cookie=cookie).process()
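
A minimal call sketch for the wrapper above, assuming redirect takes a boolean and timeout a number of seconds; every value here is a placeholder rather than something taken from the original project:

# Hypothetical invocation of crawling(); agent, URL, and timeout are illustrative only.
result = crawling(agent='Mozilla/5.0',
                  proxy=None,
                  redirect=True,
                  timeout=10,
                  url='http://example.com',
                  cookie=None)
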
Example #2
def main():
    site_crawler = crawler.Crawler(DOMAIN)
    collection = site_crawler.crawl()
    print(collection.get_len())
    data_saver = saver.DatabaseWorker()
    for url, content in collection.pages_content():
        nodes = parser.get_elements(content, REGULARS)
        data_saver.save_item(url, nodes)
        save_to_db(url, nodes)  # module-level helper defined elsewhere in the source
Example #3
def index(request):
    try:
        indexed_url = request.GET['request']
        if indexed_url:
            c = crawler.Crawler(indexed_url, 0)  # TODO: add a setting to change the depth
            c.crawl()
    except Exception:
        # No 'request' parameter, or the crawl failed; render the page as-is.
        pass
    return render(request, 'index.html', {})
Example #4
def index(request):
    t = None
    indexed_url = request.GET.get('request', None)
    if indexed_url:
        ts = time.time()
        with open('search/static/settings/settings.json', 'r') as f:
            depth = int(json.loads(f.read()).get("depth"))
        c = crawler.Crawler(indexed_url, depth)
        c.crawl()
        t = time.time() - ts
    return render(request, 'index.html', {'time': t})
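
The view above only reads a numeric "depth" key from the settings file, so a minimal settings file can be produced as below; the depth value 2, and the assumption that the target directory already exists, are purely illustrative:

import json

# Write a minimal settings file for the view above; the depth value is an example.
with open('search/static/settings/settings.json', 'w') as f:
    json.dump({"depth": 2}, f)
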
Example #5
from crawler import crawler
from crawler.naver_news_parser import NaverNewsParser
from crawler.naver_blog_parser import NaverBlogParser
from crawler.naver_cafe_parser import NaverCafeParser
from crawler.naver_realtime_parser import NaverRealtimeParser
from crawler.aagag_mirror_parser import AagagMirrorParser

# Korean labels: 네이버 = Naver, 뉴스 = news, 블로그 = blog, 카페 = cafe,
# 실시간검색 = realtime search, 커뮤니티 = community.
naver_news_crawler = crawler.Crawler('네이버뉴스', NaverNewsParser(), '네이버', '뉴스')
naver_blog_crawler = crawler.Crawler('네이버블로그', NaverBlogParser(), '네이버', '블로그')
naver_cafe_crawler = crawler.Crawler('네이버카페', NaverCafeParser(), '네이버', '카페')
naver_realtime_crawler = crawler.Crawler('네이버실시간검색', NaverRealtimeParser(),
                                         '네이버', '실시간검색')

aagag_mirror_parser = crawler.Crawler('커뮤니티AAGAG', AagagMirrorParser(), '커뮤니티',
                                      'AAGAG')
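
If these module-level instances are looked up by source elsewhere in the project, a small registry makes that dispatch explicit; the mapping below is a sketch added here, not code from the original module, and the English keys are arbitrary:

# Hypothetical registry of the crawlers defined above, keyed by an English alias.
CRAWLERS = {
    'naver_news': naver_news_crawler,
    'naver_blog': naver_blog_crawler,
    'naver_cafe': naver_cafe_crawler,
    'naver_realtime': naver_realtime_crawler,
    'aagag_mirror': aagag_mirror_parser,
}
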
Example #6
    def crawler(self, a, p, r, t, u, c):
        # Same wrapper as Example #1, but as a method with terse parameter names:
        # agent, proxy, redirect, timeout, url, cookie.
        return crawler.Crawler(agent=a, proxy=p, redirect=r,
                               timeout=t, url=u, cookie=c).process()
Example #7
from utils import cons
from crawler import crawler
import threading
from datetime import datetime

cr = crawler.Crawler()


def main_crawler(url):
    cr.get_restaurant_content(url)


def main():
    now = datetime.now()  # start timing
    print(now)
    thread = []
    url_list = []
    dic_cat = cr.get_all_cat_url_from_db(cons.CITIES['shenzhen'])
    # Crawl each category URL sequentially; a threaded variant is left commented out below.
    for item in dic_cat:
        url = cons.DIAN_PING_URL + str(item['url'])
        main_crawler(url)
    #     next = False
    #     for url_str in url_list:
    #         if url_str == url:
    #             next = True
    #             break
    #     if next == False:
    #         print('Now to get -------- ' + url)
    #         t = threading.Thread(target=main_crawler,
    #                              args=(url,))
    #     thread.append(t)
Example #8
from datetime import datetime
from socketIO_client_nexus import SocketIO, LoggingNamespace
import crawler.crawler as crawler
import database.database as database


def onEvent(event, *args):
    # Relay crawler progress/finish events to the Node.js side over socket.io.
    if event == 'finish':
        socketIO.emit('finish', args)
    elif event == 'progress':
        socketIO.emit('progress', args)


def onNodeCommand(_parameters):
    # Dispatch commands received from the server to the crawler instance.
    print(_parameters)
    if _parameters['type'] == "database":
        m_crawler.output()
    elif _parameters['type'] == "start_crawler":
        m_crawler.Start(_parameters['args'])
    elif _parameters['type'] == "cancel_crawler":
        m_crawler.Stop()


# Wire the crawler to its event delegate and database, register this process
# with the socket.io server as "crawler", and block waiting for commands.
m_crawler = crawler.Crawler()
m_crawler.setEventDelegate(onEvent)
m_crawler.setDatabase(database.Database())
socketIO = SocketIO('127.0.0.1', 3000, LoggingNamespace)
socketIO.on('command', onNodeCommand)
socketIO.emit("whoamI", "crawler")
socketIO.wait()
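
For context, onNodeCommand above only reads the 'type' and 'args' keys, so a command emitted by the server presumably looks like the payload below; the shape is inferred from the handler, and the argument value is a placeholder:

# Inferred example payload for the 'command' event; only 'type' and 'args'
# are used by onNodeCommand, and the URL here is purely illustrative.
example_command = {
    'type': 'start_crawler',
    'args': 'http://example.com',
}
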
Example #9
    def __init__(self):
        # Worker wiring: Redis connection, crawler, database handle, and a named logger.
        self.redis = config.redis_server
        self.crawler = crawler.Crawler()
        self.database = database.Database()
        self.logger = logger.create_logger('worker')