def do(self):
        crawler = Crawler(self.url)
        crawler.run()
        links = crawler.get_all_links()

        git_urls = {}
        # Try to get the git clone URL from the web viewer
        git_clone_domain = re.match('.*?(.*?)gitweb.*?', self.url).group(1)
        git_clone_url = '{0}{1}/'.format(git_clone_domain, 'git')

        for link in links:
            # URLs have the format http://git.lxde.org/gitweb/?p=lxde/lxqt-config-randr.git;a=tree
            match = re.match('.*?p=(.*?);.*?', link)
            if match:
                name = match.group(1)
                git_urls[name] = git_clone_url + name

        # The Gerrit project has a single fileset assigned (this)
        # We empty the fileset and add dynamically the ones referenced by Gerrit
        self.project.filesets = []

        for fileset_name, url in git_urls.items():
            fileset = GitFileSet(self.project_name, fileset_name, url, '')
            fileset.set_pattern('.*?ca.po')
            logging.debug("Gitweb adding {0}-{1}".format(self.project_name, fileset_name))
            self.project.add(fileset)

        self.project.do()
    def main(self):
        try:
            with open('linkdb.pickle', 'rb') as f:
                return pickle.load(f)

        except IOError:
            sys.setrecursionlimit(10000)
            cw = Crawler()
            data = cw.main()
            soup = BeautifulSoup(data)
            dtable = soup.findAll('table')[1]
            drows = dtable.findAll('tr')
            j = 1
            pdata = []
            while j < len(drows):
                # cells of the current row (j starts at 1 to skip the header row)
                drele = drows[j].findAll('td')
                k = 1
                while k < len(drele):
                    pdict = dict()
                    try:
                        pdict['name'] = drele[k].find('a').contents[0]
                        pdict['link'] = 'http://en.wikipedia.org' + drele[k].find('a')['href']
                    except (AttributeError, KeyError, TypeError):
                        # cell has no usable link
                        k += 1
                        continue
                    #print pdict
                    pdata.append(pdict)
                    k += 1
                j += 1
            pickle.dump(pdata, open('linkdb.pickle', 'wb'))
            return pdata
def baseFC(crawlParams):
    seedURLs = crawlParams['seedURLs']
    t = [(-1,p,-1,"") for p in seedURLs]
    priorityQueue = PriorityQueue(t)
    
    crawlParams["priorityQueue"]=priorityQueue
    mytfidf = TFIDF()
    
    mytfidf.buildModel(crawlParams['model'],crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer']=mytfidf
    
    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()

    '''
    f = open("base-logData.txt","w")
    furl = open("base-Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1].encode("utf-8")+","+str(p.estimatedScore)+"\n")
        ftext = open("base-webpages/"+str(p.pageId) + ".txt", "w")
        ftext.write(p.text.encode("utf-8"))
        ftext.close()
    f.close()
    furl.close()
    bres = evaluator.evaluateFC(crawler.relevantPages)
    writeEvaluation(bres,"base-evaluateData.txt")    
    print sum(bres)
    print len(bres)
    '''
    return crawler.relevantPages
Example #4
class TestCrawling(unittest.TestCase):

    def setUp(self):
        self.crawler =Crawler('crawlerIndex.db')
        #self.crawler.dropIndexTables()
        #self.crawler.createIndexTables()
    
    def test_crawling(self):
        categories = list()
        #categories.append(Category('Administracion/Oficina', ['http://www.computrabajo.com.ar/bt-ofr-SC000-1.htm']))
        categories.append(Category('Arte/Diseno/Medios',      ['http://www.computrabajo.com.ar/bt-ofr-SC001-1.htm']))
        '''
        categories.append(Category('Cientifico/Investigacion',['http://www.computrabajo.com.ar/bt-ofr-SC002-1.htm']))
        categories.append(Category('Informatica/Telecom',     ['http://www.computrabajo.com.ar/bt-ofr-SC003-1.htm']))
        categories.append(Category('Direccion/Gerencia',      ['http://www.computrabajo.com.ar/bt-ofr-SC004-1.htm']))
        categories.append(Category('Economia/Contabilidad',   ['http://www.computrabajo.com.ar/bt-ofr-SC005-1.htm']))
        categories.append(Category('Educacion/Universidad',   ['http://www.computrabajo.com.ar/bt-ofr-SC006-1.htm']))
        categories.append(Category('Hosteleria/Turismo',      ['http://www.computrabajo.com.ar/bt-ofr-SC007-1.htm']))
        categories.append(Category('Ingenieria/Tecnico',      ['http://www.computrabajo.com.ar/bt-ofr-SC008-1.htm']))
        categories.append(Category('Legal/Asesoria',          ['http://www.computrabajo.com.ar/bt-ofr-SC009-1.htm']))
        categories.append(Category('Medicina/Salud',          ['http://www.computrabajo.com.ar/bt-ofr-SC010-1.htm']))
        categories.append(Category('Recursos Humanos',        ['http://www.computrabajo.com.ar/bt-ofr-SC011-1.htm']))
        categories.append(Category('Otros',                   ['http://www.computrabajo.com.ar/bt-ofr-SC012-1.htm']))
        '''
        
        pagelist = set()
        for category in categories:
            print category
            for url in category.getUrls(): 
                pagelist.add(Page(url,'/bt-ofr-','/bt-ofrd-',category))
                self.crawler.crawl(pagelist,5)
    def run(self):

        self.logger.info("the spider has been running!")

        #create a global thread num
        for num in range(len(self.spiders)):
            self.queue.put(num)
        try:
            for spider in self.spiders:
                crawler = Crawler(spider, self.queue)
                crawler.start()
            self.queue.join()
        except:
            self.logger.error("spider cannot run.")
        finally:
            seed_num = self.database.db['seed'].count()
            textfile = PROJECT_ROOT + '/log/spider.log'
            self.logger.info("now your seeds num is %s." % seed_num)
            try:
                fp = open(textfile, 'rb')
                content = util.tail(fp)
                fp.close()
                sub = 'bt-share-log-%s' % datetime.now()
                send_mail(['*****@*****.**', ], sub, content)
            except:
                self.logger.error(traceback.format_exc())
def eventFC(scorer, url_scorer,options):
#     seedUrls = ["http://www.cnn.com/2013/09/27/world/africa/kenya-mall-attack/index.html",
#                 "http://www.youtube.com/watch?v=oU9Oop892BQ",
#                 "http://ifrc.org/en/news-and-media/press-releases/africa/kenya/kenya-red-cross-society-continues-to-provide-vital-support-to-victims-and-families-of-the-westgate-shopping-mall-attack/"               
#                 ]
    #keywords = ['demonstrations','protest','elections','egypt','revolution','uprising','arab','spring','tunisia','libya','military']
    
    t = [(-1,p,-1) for p in options['seeds']]
    #t = [(-1,Url(p)) for p in seedUrls]
    priorityQueue = PriorityQueue(t)
    
    crawler = Crawler(priorityQueue,scorer,options)
    crawler.set_url_scorer(url_scorer)
    crawler.enhanced_crawl()
    print crawler.relevantPagesCount
    print crawler.pagesCount
    
    f = open("harverstRatioData.txt","w")
    for r,p in crawler.harvestRatioData:
        f.write(str(r) + "," + str(p) + "\n")
    f.close()
    
    f = open("logData.txt","w")
    furl = open("Output-URLs.txt","w")
    for p in crawler.relevantPages:
        f.write(str(p.pageId) + "," + str(p.pageUrl[2]) + "\n")
        furl.write(p.pageUrl[1]+"\n")
    f.close()
    
    furl.close()    
Example #7
    def post(self):
        seedURL = cgi.escape(self.request.get('seed'))
        depth = cgi.escape(self.request.get('depth'))
        keyword = cgi.escape(self.request.get('keyword'))

        if seedURL == None or seedURL.strip() == "":
            self.response.out.write("Fill in the seed URL, fool!")
            return
        if depth == None or depth.strip() == "":
            self.response.out.write("Fill in the search depth, fool!")
            return
        if keyword == None or keyword.strip() == "":
            self.response.out.write("Fill in the search keyword, fool!")
            return
        try:
            depth = int(depth)
        except:
            self.response.out.write("Depth should be a number, fool!")
            return
            
        crawler = Crawler(seedURL, depth, keyword)
        urls = crawler.crawl()

        self.response.out.write('<table class="results">')
        self.response.out.write('<tr><th>URL</th><th>Level</th></tr>')
        for (url, level) in urls:
            self.response.out.write('<tr>')
            self.response.out.write('<td><a class="link" href="%s">%s</a></td>' % (url, url))
            self.response.out.write('<td><span class="level">%s</span></td>' % level)
            self.response.out.write('</tr>')
        self.response.out.write('</table>')
    def do(self):
        crawler = Crawler(self.url)
        crawler.run()
        links = crawler.get_all_links()
        self._download_links(links, self.temp_dir)

        self.build()
Example #9
def crawl(c = None, seed = []):
	if c == None:
		c = Crawler(
			seed = seed, # your seed urls here 
			default_crawl_delay = 20, 
			obey_robots_txt = True,
			document_fetchers = 15,
			robots_txt_fetchers = 5) #start at least this many celery workers
	
	try:
		# start crawling, with this tasks specific termination criteria and 
		# a save period of 20 seconds
		c.crawl(
			termination_checker = example_task_termination_checker,
			save_frequency = timedelta(seconds = 20))
		
	finally:
		
		# if we were killed or finished, suspend crawl state to file.
		# revive the crawl with resume from crawler.py to explore results
		print "\nSuspended crawl to " + c.suspend()
		
		# print some statistics
		print "Downloaded bytes: " + str(cstats.downloaded_bytes(c))
		print "Discovered links: " + str(cstats.discovered_links(c))
		print "Discovered domains: " + str(cstats.discovered_domains(c))
		print "Runtime: " + str(cstats.runtime(c)) + " seconds"
		maxref = cstats.most_prolific_referer(c)
		# utf-8 printing problem in domain?
		print "Most prolific referrer was " + maxref["name"] + " with an average of " + str(maxref["avg_links_per_page"]) + " outgoing links per page."+"\n"
 def get_docs(self):
     crwl = Crawler()
     for page in self.pagelist:
         if page != '#' and page != 'mailto:[email protected]' and page is not None:
             if crwl.get_page(page) != True:
                 continue
             soup = crwl.return_soup()
             content = soup.find("div", {"class": "article-text"})
             if content is not None:
                 # remove boilerplate blocks before extracting the article text
                 for attrs in ({'id': 'articleKeywords'}, {'id': 'addshare'},
                               {'class': 'rel-block-sec'}, {'class': 'photo-caption'},
                               {'class': 'related-column'}):
                     div = content.find('div', attrs)
                     if div is not None:
                         div.decompose()
                 # drop inline scripts, then collapse whitespace
                 for s in content('script'):
                     s.extract()
                 text = content.text
                 text = re.sub('[\n]+', ' ', text)
                 text = re.sub('[ ]+', ' ', text)
                 text = text.strip()
                 if len(text) <= 10:
                     self.error_pagelist.append(page)
                 else:
                     self.final_docs.append(text)
Example #11
def main():
    # You can provide whatever query you like e.g. 'Barrack Obama', 'isis', 'mongodb'
    # example_query = 'isis'


    # Searching for 'isis' keyword and getting relevant snippets related to this keyword.
    # search_results_snippets = Quora.get_snippets_by_query(example_query)

    # Saving list of snippets obtained by query: 'isis' under 'snippets' collection in 'quora' db
    # db.snippets.insert({example_query: search_results_snippets})

    connection_str = 'mongodb://localhost:27017/'
    quora_db = 'quora'

    # Creating crawler object with limited crawling depth
    crawler = Crawler(connection_str, quora_db, maxdepth=2)
    seed = 'What-is-terrorism'
    # Starting crawling
    # crawler.crawl_by_question(seed)

    # The guy who originally asked seed question
    user = '******'
    # Crawling by user
    # crawler.crawl_by_user(user)

    crawler.crawl_questions_and_answers()
Example #12
class CrawlerTestCase(unittest.TestCase):
    '''
    Testing the functionality of the crawler
    '''
    def setUp(self):
        '''
        Define a data dir and create the crawler.
        '''
        self.crawler = Crawler('data')

    def test_download_content(self):
        '''
        Download content from TEST_URL and check against known
        content for said url.
        '''
        content = self.crawler.download_content(TEST_URL)
        assert 'projects' in content
        assert 'resume' in content

    def test_crawl(self):
        '''
        Download the content from a number of urls and save them
        to data directory
        '''
        self.crawler.crawl(TECH_URLS, 'technology')
        self.crawler.crawl(COOKING_URLS, 'cooking')
        assert 'technology' in os.listdir(self.crawler.data_dir)
        assert 'cooking'    in os.listdir(self.crawler.data_dir)

    def tearDown(self):
        pass
Example #13
 def test_generate_node_urls(self):
     c = Crawler(d)
     c.crawl_nodes_api(page_limit=1)
     try:
         c.generate_node_urls()
     except:
         self.fail("crawler.generate_node_urls() failed")
Example #14
def reset():
    global call_reset_last
    time_since_last_call = time.time() - call_reset_last
    if time_since_last_call >= call_reset_timeout:
        Crawler.reset()
        call_reset_last = time.time()
        time_since_last_call = 0
    return "%i000" % (call_reset_timeout - time_since_last_call)
Example #15
def flush():
    global call_flush_last
    time_since_last_call = time.time() - call_flush_last
    if time_since_last_call >= call_flush_timeout:
        Crawler.flush()
        call_flush_last = time.time()
        time_since_last_call = 0
    return "%i000" % (call_flush_timeout - time_since_last_call)
Example #16
 def test_scrape_url(self):
     c = Crawler(d)
     try:
         c._scrape_pages(['http://google.com', 'http://google.com/'])
         f = open('google.com/index.html')
         f.close()
     except:
         self.fail("page didn't save / get scraped at all")
Example #17
 def test_profile_urls_updated_by_crawl(self):
     c = Crawler(d)
     l1 = c.user_urls.copy()
     c.crawl_users_api(page_limit=1)
     l2 = c.user_urls.copy()
     self.assertEqual(len(l1), 0)
     self.assertGreater(len(l2), len(l1))
     self.assertNotEqual(l1, l2)
Example #18
 def test_institutions_urls_updated_by_crawl(self):
     c = Crawler(d)
     l1 = c.institution_urls.copy()
     c.crawl_institutions_api(page_limit=1)
     l2 = c.institution_urls.copy()
     self.assertEqual(len(l1), 1)
     self.assertGreater(len(l2), len(l1))
     self.assertNotEqual(l1, l2)
Example #19
 def test_registration_urls_updated_by_crawl(self):
     c = Crawler(d)
     l1 = c.registration_url_tuples.copy()
     c.crawl_registrations_api(page_limit=1)
     l2 = c.registration_url_tuples.copy()
     self.assertEqual(len(l1), 0)
     self.assertGreater(len(l2), len(l1))
     self.assertNotEqual(l1, l2)
    def test_http_counts_as_internal_link(self):
        self.svc.get('requests')._expect("https://example.com", 200, '<a href="http://example.com/insecure">click here</a>')
        self.svc.get('requests')._expect("http://example.com/insecure", 200, '<different><stuff>')

        crawler = Crawler(self.svc, "https://example.com")
        siteMap = crawler.map()
        self.assertEqual({"https://example.com":{"assets":[], "links":["http://example.com/insecure"]},
                          "http://example.com/insecure": {"assets": [], "links": []}}, siteMap)
Example #21
	def post(self):
		seed = self.request.get('seed')
		maxpages = int(self.request.get('maxpages'))
		maxdepth = int(self.request.get('maxdepth'))
		rest = int(self.request.get('rest'))
		my_crawler = Crawler(seed, maxpages, maxdepth, rest)
		my_crawler.crawl_web()
		my_crawler.compute_ranks()
    def test_relative_links_are_captured(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="foobar/">click here</a>')
        self.svc.get('requests')._expect("http://example.com/foobar/", 200, '')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["foobar/"]},
                          "http://example.com/foobar/": {"assets": [], "links": []}}, siteMap)
    def test_disallowed_urls_are_not_fetched(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="http://example.com/admin">click here</a>')
        self.svc.get('RobotFileParser')._disallowed_urls['http://example.com/admin'] = True

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["http://example.com/admin"]},
                         "http://example.com/admin": {"error": "Disallowed by robots.txt"}}, siteMap)
    def test_query_params_are_captured(self):
        self.svc.get('requests')._expect("http://example.com", 200, '<a href="/?foo=bar">click here</a>')
        self.svc.get('requests')._expect("http://example.com/?foo=bar", 200, '<different><stuff>')

        crawler = Crawler(self.svc, "http://example.com")
        siteMap = crawler.map()
        self.assertEqual({"http://example.com":{"assets":[], "links":["/?foo=bar"]},
                          "http://example.com/?foo=bar": {"assets": [], "links": []}}, siteMap)
class GetMookClass:
    def __init__(self):
        self.c = Crawler()
        self.db = DBHelper('localhost', 'root', '', 'test', 3306)
        self.lorder = int(time.time())
    def usage(self):
        print '''
            -h print this message
            -e everyday run, just check first page and find new classes
            -a all refresh, check all the pages and add new classes
        '''
    def run(self):
        if len(sys.argv) == 1:
            self.usage()
        else:
            try:
                opts, args = getopt.getopt(sys.argv[1:], "hea")
                for op, value in opts:
                    if op == "-h":
                        self.usage()
                    elif op == "-e":
                        self.startCrawl()
                    elif op == "-a":
                        self.startCrawl(1)
                    else:
                        self.usage()
            except:
                self.usage()
    def startCrawl(self, all=0):
        self.c.login("http://www.imooc.com/course/list", "http://www.imooc.com/user/login")
        if(all):
            index = 1
            while(self.crawlSinglePage(index)):
                index = index + 1
        else:
            self.crawlSinglePage(1)
    def crawlSinglePage(self, pageId):
        url = 'http://www.imooc.com/course/list?page=%d' % pageId
        classes = self.c.getClasses(url)
        if(len(classes) == 0):
            return 0
        else:
            for cls in classes:
                dbcls = self.db.selectClassByMid(cls.mid)
                if(not dbcls):
                    cls.lorder = self.lorder
                    cid = self.db.insertClass(cls)
                    self.refreshTitles(cid, cls.mid)
            return 1
    def refreshTitles(self, cid, mid):
        titles = self.c.getTitles(cid, mid)
        pid = 0
        for title in titles:
            if(title.mid == 0):
                pid = self.db.insertTitle(title)
            else:
                title.pid = pid
                self.db.insertTitle(title)
    def do(self):
        self.create_tmp_directory()

        crawler = Crawler(self.url)
        links = crawler.run()
        self._download_links(links, self.temp_dir)

        self.build()
        self.remove_tmp_directory()
Example #27
def test_main_function():
    url = "http://7gogo.jp/talks/YtykfykuJfMT"

    my_tester = Crawler()
    talk_id, username = get_talk_id(url)

    loop = asyncio.get_event_loop()
    task = asyncio.ensure_future(my_tester.run(talk_id, username, 1417268169))
    loop.run_until_complete(task)
Example #28
def get_events():
    from crawler import Crawler
    crawler = Crawler()

    print "[main] getting news"
    crawler.get_top_news()

    print "[main] getting festivals"
    crawler.get_festivals()
Example #29
def cache_fill_loop():
    global sources
    while True:  # fill cache up to min_cache_imgs
        if Crawler.info()["images"] < min_cache_imgs_before_refill:
            while Crawler.info()["images"] < min_cache_imgs:
                random.choice(sources).crawl()

        # sleep for non-invasive threading ;)
        time.sleep(1.337)
 def __init__(self):
     crwl=Crawler()
     crwl.get_pagelist()
     self.pagelist=crwl.return_pagelist()
     self.soup=crwl.return_soup()
     self.articledb=ArticleDb('localhost',27017)
     self.articledb.init_backend('testdb','testcol')
     self.final_docs=[]
     self.error_pagelist=[]
Example #31
 def run(self, ent_number=0):
     """爬取的主函数
     """
     return Crawler.run(self, ent_number)
Example #32
#!/usr/bin/env python
from flask import Flask, jsonify, request
from crawler import Crawler

app = Flask(__name__)
crawler = Crawler()

app.debug = True


@app.route('/')
def index():
    return 'Hello, World!'


@app.route('/search')
def search():
    keyword = request.args.get('w')
    if keyword:
        res = crawler.search(keyword)
    else:
        res = {'message': 'No keyword sent'}
    return jsonify(res)


if __name__ == '__main__':
    app.run(port=80)
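# A minimal usage sketch for the /search endpoint above (assumptions only: the
# app is running locally on port 80 as in app.run(port=80), and crawler.search
# returns a JSON-serializable result):
#
#   curl 'http://localhost/search?w=python'   -> jsonify(crawler.search('python'))
#   curl 'http://localhost/search'            -> {"message": "No keyword sent"}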
Example #33
from crawler import Crawler

crawler = Crawler()

flipkart_url = "https://www.flipkart.com/search?q=iphone%207&as=on&as-show=on&otracker=start&as-pos=2_q_iph"
amazon_url = "http://www.amazon.in/s/ref=nb_sb_ss_i_4_6?url=search-alias%3Daps&field-keywords=iphone+7&sprefix=iphone%2Caps%2C284&crid=ZNKHKONNIHBA"

crawler.auto_crawl(flipkart_url)
Example #34
        if minID[0][0] is not None:
            minID = minID[0][0]

            start = minID
            end = start + 5

            while True:

                websiteQuery = "SELECT websiteURL, websiteID FROM website WHERE websiteID >= '{0}' AND websiteID < '{1}'".format(
                    start, end)
                websites = d.executeSelectQuery(websiteQuery)

                if websites:
                    for website in websites:

                        # print(website[1])
                        # print(website[0])
                        c = Crawler(d)
                        c.crawl(website[0])
                        del c

                    start = start + 5
                    end = start + 5

                    maxID = d.executeSelectQuery(maxIdQuery)
                    if maxID[0][0] is not None:
                        maxID = maxID[0][0]
                        if start > maxID:
                            start = minID
                            end = start + 5
Example #35
if __name__ == '__main__':

    from crawler import Crawler

    crawler = Crawler()

    crawler.to_csv(
        crawler.extract_product_info(
            crawler.filter_urls(
                crawler.crawl("https://www.epocacosmeticos.com.br/"))))
def setup_crawler():
    global crawler

    # create the crawler from loaded constants
    print('setting up crawler ...')
    crawler = Crawler(session, api_id, api_hash, rabbitmq_channel)
Example #37
                "Login do studenckiego maila(wpisz razem z @stud...): ")
            login_data["password_m"] = input("haslo do mail: ")
            login_data["notification_address"] = input(
                "Na jaki mail wysylac powiadomienia?: ")
            if input("Czy dane sa poprawne[T/N]: ") == "T":
                accept = True

        with open(filename, 'wb') as f:
            pickle.dump(login_data, f, protocol=pickle.HIGHEST_PROTOCOL)

    return login_data


if __name__ == "__main__":
    login_data = load_configuration()
    c = Crawler(login_data.get("login_d"), login_data.get("password_d"))
    local_path = pth.dirname(pth.abspath(__file__))
    old_marks_file_name = local_path + "/old_marks.html"

    tmp_marks = c.getMarksInHtmlTable()

    # TODO: utf-8 should be default
    marks = str(tmp_marks.encode('utf-8'))
    try:
        f = open(old_marks_file_name, 'r')
        old_marks = f.read()
        if old_marks != marks:
            send(login_data.get("login_m"),
                 login_data.get("notification_address"),
                 login_data.get("password_m"), marks)
            f.close()
Example #38
# -*- coding: utf-8 -*-
import json
import time
from crawler import Crawler
from teleBot import TeleBot
from datetime import date

# Modules Setting
crawler = Crawler()
teleBot = TeleBot()

# Start!
while True:
    # measure the time and the run count
    today, now = date.today(), time.strftime('%H%M%S')
    print("*" * 15, "Starting at {} {}:{}~".format(today, now[:2], now[2:4]),
          "*" * 15)

    # Crawl and Send telegram message
    past_data = crawler.load_past_data('crawled_data.json')
    new_data = crawler.crawl_data()
    teleBot.send_message(past_data, new_data)

    # Save new_info
    teleBot.update_and_save_data(past_data, new_data, 'crawled_data.json')

    print("Finished")
    time.sleep(600)
Example #39
def index():
    crawler_ = Crawler(database_url)

    print('started')

    crawler_.indexar()
def read_excel_file(excel_file: str) -> int:
    new_excel_file = "new_" + excel_file
    isbn: str = ""
    isbn_code: str = ""
    description: str = ""
    method = Method.GET
    headers = {"Accept-Encoding": "gzip, deflate", "User-Agent": "Mozillla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36", "Accept": "*/*", "Connection": "Keep-Alive"}
    timeout = 10
    encoding: Optional[str] = None
    description_col_num = 28

    config = Config()
    if not config:
        logger.error("can't read configuration")
        sys.exit(-1)
    collection_conf = config.get_collection_configs()
    url_prefix = collection_conf["url_prefix"]
    encoding = collection_conf["encoding"]
    logger.debug("url_prefix=%s" % url_prefix)

    workbook = xlrd.open_workbook(excel_file)
    worksheet1 = workbook.sheet_by_index(0)
    num_rows = worksheet1.nrows

    new_workbook = xlwt.Workbook()
    new_worksheet = new_workbook.add_sheet("Sheet1", cell_overwrite_ok=True)

    crawler = Crawler(method, headers, timeout, encoding)

    for row_num in range(num_rows):
        do_crawl = True
        do_extract = False

        row = worksheet1.row_values(row_num)
        isbn = str(row[0])

        try:
            isbn_code = convert_isbn(isbn)
        except ValueError as e:
            do_crawl = False
        logger.debug("isbn=%s" % isbn_code)

        if do_crawl:
            url = url_prefix + isbn_code
            logger.debug("url=%s" % url)

            html = crawler.run(url)
            #logger.debug("html=%s" % html)

            # ISBN -> bid
            state = 0
            for line in html.split('\n'):
                if state == 0:
                    m = re.search(r'<ul class="basic" id="searchBiblioList"', line)
                    if m:
                        state = 1
                elif state == 1:
                    m = re.search(r'<a href="(?P<url>http://book.naver.com/[^"]+)"', line)
                    if m:
                        url = m.group("url")
                        logger.debug(url)
                        html = crawler.run(url)
                        do_extract = True
                        if not html:
                            logger.warning("can't get response from '%s'" % url)
                            sys.exit(-1)
                        break

            if do_extract:
                row[description_col_num] = extract_element(html)
                logger.debug("len=%d" % len(row[description_col_num]))
                #logger.debug("row[description_col_num]=%s" % row[description_col_num])
                with open("test.%d.html" % row_num, "w") as outfile:
                    outfile.write(row[description_col_num])
                    outfile.write("\n")

        for col_num in range(len(row)):
            new_worksheet.write(row_num, col_num, row[col_num])

        # for testing: stop after processing the first item
        #if do_crawl:
            #print(row[description_col_num])
            #break;

    new_workbook.save(new_excel_file)

    return 0
Example #41
import multiprocessing as mp

from crawler import Crawler

if __name__ == "__main__":
    Crawler().crawl()
Example #42
 def run(self, ent_number=0):
     Crawler.run(self, ent_number)
Example #43
                    help="print verbose output")
parser.add_argument(
    "--output",
    action="store",
    default="sitemap.xml",
    help="File path for output, if file exists it will be overwritten",
)

# parsing parameters
args = parser.parse_args()
url = args.url.rstrip("/")

found_links = []

# initializing crawler
crawler = Crawler(url, exclude=args.exclude, no_verbose=args.no_verbose)

# fetch links
links = crawler.start()

# write into file
with open(args.output, "w") as file:
    file.write(
        '<?xml version="1.0" encoding="UTF-8"?>\n\t<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    )
    priority = 1.0
    date = datetime.now().strftime("%Y-%m-%d")
    for link in links:
        #if "infinitysports.ai" not in link:
        file.write(
            "\n\t\t<url>\n\t\t\t<loc>{0}{1}</loc>\n\t\t\t<lastmod>{2}</lastmod>\n\t\t\t<priority>{3}</priority>\n\t\t</url>"
Example #44
class WebDeface(object):
    """Class for WebDeface."""

    def __init__(self, args):
        """Intialize WebDeface."""

        if int(platform.sys.version_info[0]) < 3:  # if Python 2.X.X
            self.url = raw_input(">> Enter the URL of the website: ")
            self.thread = int(raw_input(">> Enter the number of threads: "))
        else:
            self.url = input(">> Enter the URL of the website: ")
            self.thread = int(input(">> Enter the number of threads: "))

        if (self.url is not None and
            deface_utils.verify_url(self.url)):
            # Create crawler object
            self.crawler_obj = Crawler(url=self.url,
                                       threads=self.thread)
            self.crawler_obj.threading_crawl()

        # Create a cache object
        self.cache_obj = Cache()
        self.cache_obj.generate_cache()

        # Arguments
        self.args = args

        # Initialize empty objects
        self.twitter_obj = None
        self.slack_obj = None
        self.telegram_obj = None
        self.twilio_sms_obj = None

    def create_notifier_objs(self):
        """
        Create notification medium objects.

        Args:
            None

        Raises:
            None

        Returns:
            None
        """
        # Parse all the arguments
        if (self.args.twitter_api_key and
            self.args.twitter_access_token and
            self.args.twitter_api_secret_key and
            self.args.twitter_access_token_secret):
            cred = {}
            cred["api_key"] = self.args.twitter_api_key
            cred["access_token"] = self.args.twitter_access_token
            cred["api_secret_key"] = self.args.twitter_api_secret_key
            cred["access_token_secret"] = self.args.twitter_access_token_secret
            self.twitter_obj = twitter.Twitter(cred)

        if (self.args.twilio_to and
            self.args.twilio_from and
            self.args.twilio_token and
            self.args.twilio_sid):
            cred = {}
            cred["twilio_to"] = self.args.twilio_to
            cred["twilio_sid"] = self.args.twilio_sid
            cred["twilio_token"] = self.args.twilio_token
            cred["twilio_from"] = self.args.twilio_from
            self.twilio_sms_obj = twilio_sms.Twilio(cred)

        if (self.args.slack_token and
            self.args.slack_user_id):
            cred = {}
            cred["token"] = self.args.slack_token
            cred["user_id"] = self.args.slack_user_id
            self.slack_obj = slack.Slack(cred)

        if (self.args.telegram_user_id and
            self.args.telegram_bot_token):
            cred = {}
            cred["user_id"] = self.args.telegram_user_id
            cred["token"] = self.args.telegram_bot_token
            self.telegram_obj = telegram.Telegram(cred)

    def start(self):
        """
        Start Web Deface Detection.

        Args:
            None

        Returns:
            None

        Raises:
            None
        """
        print("[!] Remote Web Deface Detection started")
        # Create a monitor object
        self.monitor_obj = Monitor(twitter=self.twitter_obj,
                                   slack=self.slack_obj,
                                   twilio_sms=self.twilio_sms_obj,
                                   telegram=self.telegram_obj)
        # Start the monitor loop
        self.monitor_obj.monitor()
Example #45
 def __init__(self):
     self.redis = RedisClient()
     self.crawler = Crawler()
    parser.add_argument(
        '--body',
        help=
        'The application/x-www-form-urlencoded body content to send (\'param1=value1&param2=value2\')'
    )
    parser.add_argument(
        '--method',
        help='request method \'GET\' or \'POST\' (defaults to "GET")',
        default='GET')
    parser.add_argument(
        '--threads',
        help='Number of threads to use while crawling the site',
        default=5)
    parser.add_argument('--db',
                        help='sqlite3 database name. (defaults to \'xss.db\')',
                        default="xss.db")

    args = parser.parse_args()

    # Initialize thread pool
    thread_pool = ThreadPoolExecutor(max_workers=int(args.threads))

    # Start crawling and scanning the other found URLs
    crawler = Crawler(
        URLRequest(url=args.url, method=args.method, body=args.body),
        search_for_xss, thread_pool)
    crawler.start()

    for r in crawler.func_result:
        store_xss_result(r, args.db)
Example #47
def main():
    logger.info('Starting...')
    c = Crawler()
    c.start()
    logger.info('Finished.')
Example #48
def produtos(index=True):
    crawler_ = Crawler(database_url, index=index)
    crawler_.search_produtos()
Example #49
from crawler import Crawler
from indexer import Indexer
from query_processor import QuerryProcessor
from document import Document
from time import sleep

if __name__ == '__main__':

    # sleep(5.0)
    # print("THREAD-TIME!")
    crawler = Crawler('https://www.in.gr', 20, 5, True, 'BFS')
    crawler.initializeCrawl()

    ind = Indexer(Crawler.documents)

    query = input("Enter your search query:")
    ind.add_document(Document('search_query', query))
    print('Building Indexer...')
    ind.create_indexer()
    print('Calculating TF-IDFs. May take a while.')
    ind.calculate_scores()

    qp = QuerryProcessor(ind.inverted_index, len(ind.documents))
    docs_with_cos_ = qp.compare_documents()
    docs_with_cos_ = sorted(
        docs_with_cos_, key=lambda x: x[1],
        reverse=True)  # sorting based on cosine similarity scores
    print(f'Showing top results based on your query "{query}":')
    for doc in docs_with_cos_:
        print(doc[0].link)
Example #50
'''
Created on 2014. 8. 27.

@author: lsh
'''

import logging

#meerkat modules
from crawler import Crawler

if __name__ == '__main__':
    crawler = Crawler()

    print 'crawler activated'

    logging.info("Server Start..")

    try:
        crawler.collect_document()
    except:
        logging.exception('')
Example #51
def main():
    proxyips = Crawler.run()
    logger.info('Crawler finish, total ip: %s', len(proxyips))
    sniffer = Sniffer()
    sniffer.run(proxyips)
Example #52
from website import Website
from crawler import Crawler

crawler = Crawler()

# site_data = [
#     ['O\'Reilly Media', 'http://oreilly.com',
#      'https://ssearch.oreilly.com/?q=', 'article.product-result',
#      'p.title a', True, 'h1', 'section#product-description'],
#     ['Reuters', 'http://reuters.com',
#      ''],
#     ['Brookings', 'http://www.brookings.edu', 'h1',
#      'div.post-body'],
#     ['New York Times', 'http://nytimes.com', 'h1',
#      'p.story-content'],
# ]

site_data = [[
    'O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
    'article.product-result', 'p.title a', True, 'h1',
    'section#product-description'
]]

sites = []
for row in site_data:
    sites.append(
        Website(row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                row[7]))

    topics = ['python']
    for topic in topics:
Example #53
 def test_get_url_contents_checks_cache(self, mock_get_key):
     mock_get_key.return_value = "abc"
     crawler = Crawler(self.cache, Mock())
     with patch('crawler.requests') as mock_requests:
         crawler._get_url_contents("myurl")
         self.cache.exists.assert_called_once_with('abc')
Example #54
 def setUp(self):
     self.test_crawler = Crawler("aladinfoods.bg")
Example #55
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from crawler import Crawler
import os

url = "https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/topic-pages/tables/table-43"
options = webdriver.ChromeOptions()
options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
driver = webdriver.Chrome(options=options)
spider = Crawler(url, options, driver)
spider.inspect()
tables = spider.read_tables()

i = 0
if os.path.isdir("FBI_Data"):
    pass
else:
    os.mkdir("FBI_Data")

for table in tables:
    i += 1
    table.to_excel(f"FBI_Data/Table{i}.xlsx")
Example #56
 def test_init_sets_user_agent(self):
     crawler = Crawler(self.cache, Mock())
     self.assertEqual(USER_AGENT, crawler._headers['User-agent'])
Example #57
#!/usr/bin/python
# -*- coding: utf-8 -*-

# filename: run.py

import re
from crawler import Crawler, CrawlerCache

if __name__ == '__main__':

    # Using SQLite as a cache to avoid pulling twice

    crawler = Crawler(CrawlerCache('crawler.db'))
    root_re = re.compile('^/$').match
    crawler.crawl('http://techcrunch.com/', no_cache=root_re)
    #crawler.crawl('http://www.engadget.com/', no_cache=root_re)
    #crawler.crawl('http://gizmodo.com/', no_cache=root_re)
    #crawler.crawl('http://www.zdnet.com/', no_cache=root_re)
    #crawler.crawl('http://www.wired.com/', no_cache=root_re)
Example #58
from multiprocessing import Lock
from crawler import Crawler, ThreadScheduler
from constants import SITE_URL, MAX_PAGES_TO_CRAWL, MAX_CONCURRENT_REQUESTS, DOWNLOAD_DELAY

if __name__ == "__main__":
    lock = Lock()
    crawler = Crawler(base_url=SITE_URL,
                      max_pages_to_crawl=MAX_PAGES_TO_CRAWL,
                      lock=lock)
    thread_scheduler = ThreadScheduler(
        crawler=crawler,
        max_concurrent_requests=MAX_CONCURRENT_REQUESTS,
        download_delay=DOWNLOAD_DELAY)
    thread_scheduler.run()

    print("Page visited: %s, Bytes downloaded: %s" %
          (crawler.total_page_visited, crawler.bytes_downloaded))
Example #59
class CSSParser(object):
    """
    This class is a CSS parser of css **font** declarations.

    CSSParser instantiates its own Crawler in order to download external
    cascading style sheet files.

    Supported are:
        DECLARATIONS:
        External css declarations <link> and @import
        Internal css declarations in <style type='text/css'>

        PARSING PRIORITY (where 1=lowest and 3 = highest priority):
        1. TAG
        2. CLASS
        3. ID

        PARSED SELECTORS:
        tag                 a {}
        class               .myClass {}
        id                  #myNewId {}
        tag.class           a.myClass{}
        grouped selectors:  a, b, .myClass {}


    Not supported:
        Specificity (parsing is only on basis of priority which is shown above)
        Contextual selectors (like .myclass span a)
        Pseudo-classes and pseudo-elements (a:hover)
        Directive #style>body (what's the name of this??)
        Inline css declarations - THIS IS IN TODO!
    """

    def __init__(self):
        self._crawler = Crawler()
        self._crawler.set_handler(FileDownloader)
        self.cleaner = _MyCSSCleaner()
        self._last_url = None
        self._url = None
        self._rules = []
        self.cssfiles = []
        # init tokenizer (scanner)
        self.tokenizer = CSSTokenizer()
        # css style parser converts css rules to css styles
        self.cssstyleparser = _CSSStyleParser()
        # element -> font style mapper maps lxml elements to CSSStyle instances
        self._elem2style_map = Element2CSSStyleMapper()


    def _identical_domain(self, url1, url2):
        if url1 is None or url2 is None:
            return False
        p1 = urlparse(url1)
        p2 = urlparse(url2)
        return p1.netloc == p2.netloc


    def _get_onpage_styles(self):
        stylefields = self.elemtree.findall(".//style")
        _css = ''
        for style in stylefields:
            if style.get('type') != None and style.get('type') == 'text/css':
                _css += style.text
        self.tokenizer.parse_source(_css)
        self._rules.extend( self.tokenizer.get_rules() )


    def _get_css_files(self):
        # Method returns True if some css are to download, False otherwise.
        self.last_cssfiles = self.cssfiles
        if self.ident_last_domain:
            # If we had last URL's domain identical like this url, we have probably
            # the same css files. So check it!
            if self.cssfiles and set(self.cssfiles) == set(self.last_cssfiles):
                return False
        else:
            # delete css file list
            self.cssfiles = []
        # handle css 2.0 imports of extern files
        styles = self.elemtree.findall(".//style")
        for style in styles:
            if style.get('type') != None and style.get('type') == 'text/css':
                if style.text is not None and re.search("@import", style.text, re.I):
                    urlre = re.search('^(http|https|ftp)\://[a-z0-9\-\.]+\.[a-z]{2,3}(:[a-z0-9]*)?/?' + \
                                      '([a-z0-9\-\._\?\,\'/\\\+&amp;%\$#\=~])*$', style.text, re.I)
                    urlre != None and self.cssfiles.append(urlre.group(0))
        # handle usual <link> declarations of extern css files
        links = self.elemtree.findall(".//link")
        for link in links:
            if link.get('type') != None and link.get('type') == 'text/css' \
               and link.get('href') != None:
                link.make_links_absolute(self._url)
                self.cssfiles.append(link.get('href'))
        return len(self.cssfiles) != 0


    def parse(self, elemtree, url):
        """
        Main method for parsing.
        @param elemtree - lxml.etree._ElementTree of the page which is to be parsed
        @param url - URL identifier of the page
        @return CSSStyleContainer object with parsed css declarations.
        """
        # css parsing order
        # 1. Browser default
        # 2. External style sheet
        # 3. Internal style sheet
        # 4. Inline style FIXME not supported yet!

        self.elemtree = elemtree
        self._url = url

        # make all links absolute
        root = self.elemtree.getroot()
        root.make_links_absolute(self._url)

        # If we had last URL's domain identical like this url, we are probably
        # on the same site but different web page. There is very high probability
        # that we will have identical css files, so there's no need to download
        # and parse them again.
        if not self._identical_domain(self._url, self._last_url):
            self._styles = []
            self.ident_last_domain = False
        else:
            self.ident_last_domain = True
        # get css files if needed
        if self._get_css_files():
            # download css sheets
            files = self._crawler.start(self.cssfiles)
            for f in self.cssfiles:
                try:
                    # and parse them
                    self.tokenizer.parse_source(files[f])
                    self._rules.extend( self.tokenizer.get_rules() )
                except TypeError:
                    pass
        self._last_url = self._url
        # parse on-page definitions
        self._get_onpage_styles()
        # create cascade style sheet
        self._sheet = CascadeStyleSheet(self._rules)
        # stylesheet is instance of CSSSelector2CSSStyleMapper
        self._selector2style_map = self.cssstyleparser.get_style_mapper(self._sheet)
        # parse font styles

        for elem in root.iterdescendants():
            style = CSSStyle()
            style.parse_element(elem, self._selector2style_map, self._elem2style_map)
            elem.style = style


    def get_sheet(self):
        return self._sheet
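# A minimal usage sketch for CSSParser (assumptions only: lxml is available and
# page_source already holds the fetched markup; everything else follows the
# parse()/get_sheet() interface defined above).
from lxml import html

def parse_page_styles(url, page_source):
    parser = CSSParser()
    # parse() expects an lxml element tree plus the page URL, which it uses to
    # absolutize links before downloading external and @import style sheets
    elemtree = html.fromstring(page_source).getroottree()
    parser.parse(elemtree, url)
    # the cascade style sheet assembled from external, @import and <style> rules
    return parser.get_sheet()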
Example #60
	def __init__(self, user):
		super().__init__()
		self.crawler = Crawler(user)
		f1.write(user + '\n')