Example #1
    def run(self):
        robot_url = "http://allrecipes.com/"
        root = 'http://allrecipes.com/Recipes/ViewAll.aspx?Page=1'
        depth_limit = 5
        confine_reg = [r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
                       r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$']
        c = Crawler(root, depth_limit, confine_reg, robot_url)
        c.crawl()
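As a side note, the following standalone check (not part of the original project) shows which URLs the two confine_reg patterns above accept; the sample URLs are invented.

import re

# Not from the original project: a quick check of which URLs the two
# confine_reg patterns above accept (sample URLs are made up).
patterns = [
    r'http://allrecipes.com/Recipes/ViewAll.aspx\?Page\=[0-9]*$',
    r'http://allrecipes.com/Recipe/[a-zA-Z0-9\-]*/Detail.aspx$',
]
for url in ['http://allrecipes.com/Recipes/ViewAll.aspx?Page=2',
            'http://allrecipes.com/Recipe/banana-bread/Detail.aspx']:
    print(url, any(re.match(p, url) for p in patterns))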
Example #2
def runScan(target):
    crawler = Crawler()

    print("Scanning: ", target)

    findings = {"target": target, "sqlinjection": [], "WeakPassword": []}

    if not crawler.init(target):
        return

    crawler.crawl()
    crawler.findLoginPanel()

    AuthBypass.check_authbypass(crawler.loginFormEndpoints, findings)
    WeakPasswords.check_weak_passwords(crawler.loginFormEndpoints, findings)

    if len(crawler.loginFormEndpoints) > 0:
        findings["loginForm"] = "yes"
    else:
        findings["loginForm"] = "no"

    sqli_scan_urls(crawler.uEndPoints, findings)
    sqli_scan_forms(crawler.fEndpoints, findings)
    CommonFunctions.save_findings(findings)
Example #3
def main() -> None:
    with open('./urls.json') as f:
        urls = json.load(f)
    for i in urls:
        crawler = Crawler(i, urls[i])
        crawler.crawl()
        add_data(i, crawler.sorted_time_table)
    save()
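The contents of urls.json are not shown here; since each key and its value are passed straight to Crawler(i, urls[i]), a plausible but entirely hypothetical layout, written as Python that generates such a file, would be:

import json

# Hypothetical only: the real urls.json is not part of this snippet. Each key is
# assumed to name a site and each value to be the argument the Crawler expects.
urls = {
    "site_a": "https://example.com/a",
    "site_b": "https://example.com/b",
}
with open('./urls.json', 'w') as f:
    json.dump(urls, f, indent=4)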
Example #4
def main():
    # Configuration
    configurator = Config("./ConfigFile.xml")
    ret = configurator.config()

    # Crawl
    crawler = Crawler()
    crawler.crawl()

    # Parse
    parser = Parser()
    parser.parse()
    return
Example #5
def scrape_and_crawl(input_page: str,
                     file_path: str,
                     link_status_report: dict = None,
                     all_checked_links: dict = None,
                     is_local_file: bool = False):
    # Use None defaults instead of mutable {} defaults, which would be shared
    # between calls.
    if link_status_report is None:
        link_status_report = {}
    if all_checked_links is None:
        all_checked_links = {}
    scraper = Scraper()
    if is_local_file:
        links = list(scraper.extract_links(input_page, ""))
    else:
        links = list(scraper.extract_links(input_page, file_path))
    crawler = Crawler(urls=links, checked=all_checked_links)
    crawler.crawl()
    checked_links = crawler.get_responses()
    link_status_report[file_path] = checked_links
    return checked_links, crawler.get_checked()
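A purely hypothetical usage sketch of scrape_and_crawl as defined above; the page value and report dict are made up, and Scraper/Crawler are assumed to behave as in that snippet.

# Hypothetical usage only: values are illustrative, not from the original project.
report = {}
checked, seen = scrape_and_crawl("index.html", "index.html",
                                 link_status_report=report,
                                 is_local_file=True)
print(report["index.html"] is checked)  # the report stores the per-file responses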
Example #6
def main():
    parser = argparse.ArgumentParser(
        description='Process Parameters for searching')
    parser.add_argument('key', type=str, help='Input the keys of Google API')
    args = parser.parse_args()
    # command loop
    while True:
        print('1 Add new class data to events library\n'
              '2 Events classification\n'
              '3 Query classification\n'
              '4 Exit\n')
        cmd = input('Please input an option:\n')
        if cmd == '1':
            # create the crawler
            spider = Crawler(args.key)
            while True:
                print('\nEnter the search item and keywords like this\n'
                      'num_of_res item keyword_1 keyword_2 ... keyword_n\n'
                      "--- type 'exit' to exit ---\n")
                cmd = input('Please input your command\n')
                # input check for cmd
                if cmd == '':
                    print('Empty string!')
                    continue
                elif cmd == 'exit':
                    break
                else:
                    cmd = cmd.split(' ')
                    if cmd[0].isdigit():
                        spider.crawl(cmd[0], cmd[2:], cmd[1])
                        print('crawling...')
                        continue
                    else:
                        print('The number of search results is invalid!\n')
                        continue
            continue

        elif cmd == '2':
            print('Events classifier in developing...\n')
            continue
        elif cmd == '3':
            print('Query classifier in developing...\n')
            continue
        elif cmd == '4' or cmd == 'exit':
            break
        else:
            print('Command error, please input your option again\n')
            continue
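As a side note on the command format above: a hypothetical input such as '3 weather storm flood' is split so that the first field is the number of results, the second the search item, and the rest the keywords, matching the spider.crawl(cmd[0], cmd[2:], cmd[1]) call.

# Standalone illustration with a made-up command string; shows how the input is
# split before being passed as spider.crawl(num_of_res, keywords, item).
cmd = '3 weather storm flood'.split(' ')
num_of_res, item, keywords = cmd[0], cmd[1], cmd[2:]
print(num_of_res, item, keywords)  # 3 weather ['storm', 'flood']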
Example #7
    def main(self):
        if self.config.has_option("sources", "bootstrap"):
            self.bootstrap(filename=self.config.get("sources", "bootstrap"))
        b = Base(endpoint=self.config.get("xserver", "endpoint"),
                 base=self.config.get("xserver", "base"))
        c = Crawler(base=b)
        c.crawl(callback=self.callback)
        self.processCache()
        self.addTopConcepts()
        self.addLinks()
        self.write()
        self.writeTables()
        shutil.rmtree("temp")
Example #8
    def crawl(max_page):
        text.delete('1.0', END)
        text.insert(END, 'Currently Crawling, Please Wait\n')
        search_engine.update()

        count = int(max_page)
        while len(Crawler.queue) > 0 and count > 0:
            queue = str(Crawler.queue.pop())
            Crawler.crawl(queue)
            count -= 1
            text.insert(END, 'Currently Crawling: ' + queue + '\n')
            search_engine.update()

        print('Crawl Finished, Can Now Search')
        text.delete('1.0', END)
        text.insert(END, 'Crawl Finished, Can Now Search\n')
        text.insert(END, str(len(Crawler.crawled)) + " URLs have been Crawled and Indexed\n")
        text.insert(END, str(len(Crawler.queue)) + " Total Number of URLs In Queue\n")
        search_engine.update()

        Crawler.save_lists()
Example #9
    def test_crawl_limit(self):
        c = Crawler("http://a.com")
        c.SLEEP_TIME = 0

        def side_effect():
            c.process_q.pop(0)
        c._process_next_url = mock.Mock(side_effect=side_effect)
        c.render_sitemap = mock.Mock()

        c.URL_LIMIT = 10
        c.process_q = ["test"] * 5
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 5)

        c._process_next_url.call_count = 0
        c.process_q = ["test"] * 10
        c.URL_LIMIT = 5
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 5)

        c._process_next_url.call_count = 0
        c.process_q = ["test"] * 10
        c.URL_LIMIT = float("inf")
        c.crawl()
        self.assertEqual(c._process_next_url.call_count, 10)
Example #10
def scrape_documents(min_count=0):
    doc_count = 0

    s = Crawler()
    docs = s.crawl(min_count)

    while min_count <= 0 or doc_count < min_count:
        yielded_any = False
        for doc in docs:
            yielded_any = True
            log.debug('uploaded image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)
            yield doc
        if not yielded_any:
            # the crawler is exhausted; stop instead of looping forever
            break
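Since scrape_documents is a generator, a caller would iterate over it; a minimal, purely illustrative sketch:

# Illustrative only: consume the generator defined above and print each document URL.
for doc in scrape_documents(min_count=10):
    print(doc.url)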
Example #11
    def getWebPage(self, URL, depth):
        '''
        Retrieve all the text data from a webpage or set of webpages.

        @param URL: URL which is going to be the source
        @param depth: the depth of the links from the URL which should be searched,
        default = 0

        @return: string of all text from all webpages.
        '''
        if int(depth) != 0:
            t = ""
            crawler = Crawler(URL, int(depth) - 1)
            crawler.crawl()
            for l in crawler.links_remembered:
                text = self.Alchemy.URLGetText(str(l.dst))
                element = ET.XML(text)
                t += element.findtext("text")
        else:
            text = self.Alchemy.URLGetText(URL)
            element = ET.XML(text)
            t = element.findtext("text")
        return t.encode('ascii', 'ignore')
Example #13
    def test_crawl_inline(self):
        path_rules = {
            "start": "./",
            "file": {
                "include": ["\\.py$"]
            }
        }
        rules = {
            "search_author": {
                "include": "author",
                "result": {
                    "author": "author[\\s_]+=\\s+'([\\w\\s]+)'"
                }
            }
        }
        result = {
            "BUILT-IN": ["FILENAME"]
        }
        output = None
        crawl_res_sync = Crawler.crawl(path_rules, rules, result, output)
        current_test_file = "./test/test_crawler.py"
        self.assertIsNotNone(crawl_res_sync.get(current_test_file))
        self.assertTrue("matches" in crawl_res_sync[current_test_file] and len(crawl_res_sync[current_test_file]) > 0)
        self.assertEqual(crawl_res_sync[current_test_file]["matches"]["search_author"]["author"][0], __author__)
Example #14
def main():
    urls = raw_input("\n Pages to crawl: ")
    maxLinksToCrawl = int(raw_input(" Maximum amount of links to crawl: "))

    crawler = Crawler(urls, maxLinksToCrawl)
    crawler.crawl()
Example #15
# Written by Kevin Keraudren, 14/06/2011

import argparse

from Crawler import Crawler

parser = argparse.ArgumentParser(
    usage = "Usage: %(prog)s seed_url [options]" )
parser.add_argument(
    'seed',
    metavar='seed_url',        
    help='url for starting the crawl' )
parser.add_argument(
    '--dir',
    default='./',
    help="root directory to store the result of the crawl" )
parser.add_argument(
    '--verbose',
    action="store_true", default=True,
    help="verbose mode" ) 

args = parser.parse_args()

crawler = Crawler( args.seed, rootdir=args.dir, verbose=args.verbose )

print crawler
crawler.crawl()
print crawler
print "Crawl complete"

Example #16
import logging
import time

from Analyzer import Analyzer
from Cleaner import Cleaner
from Cluster import Cluster
from Crawler import Crawler
from Uploader import Uploader

this_date = time.strftime("%Y%m%d", time.localtime())
# Crawl the news
crawler = Crawler(this_date=this_date)
crawler.crawl()

# Cluster the articles
cluster = Cluster(date=this_date)
cluster.remove_useless_articles()
cluster.load_articles()
cluster.cluster()
cluster.upload_groups_to_DB()

# Sentiment analysis
analyzer = Analyzer(date=this_date)
analyzer.analyze()

# Upload to LeanCloud
uploader = Uploader(date=this_date)
uploader.upload_new_groups()

# Remove news groups that are too old or have too low a score
cleaner = Cleaner(date=this_date)
Example #17
            for regex, tags in self.regexes_tags:
                if regex.match(resource):
                    auto_tags.extend(tags)

            resources_tags.append((resource, auto_tags))

        assert isinstance(resources_tags, list)

        return resources_tags
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# ~~ Main ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    from Crawler import Crawler

    crawler = Crawler(white_list=set((
        '..',
    )))

    auto_tagger = AutoTagger({
        r'^.*\.py$': ['python', 'development'],
        r'^.*\.css$': ['css', 'development'],
        r'^.*\.js$': ['javascript', 'development'],
    })

    for resource, tags in auto_tagger.process(crawler.crawl()):
        print(resource, tags)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Example #18
    def test_crawl_fake_directCrawl(self):
        parameters = FileOperations.get_from_JSON_file("./test/search_parameters.json")
        Crawler.crawl(parameters["crawling"], parameters["rules"], parameters["result"], parameters["output"])
        self.assertTrue(os.path.isfile(parameters["output"]["path"]))
        result_from_file = FileOperations.get_from_JSON_file(parameters["output"]["path"])
        self.assertEqual(len(result_from_file), 3)
Example #19
            auto_tags = []

            for regex, tags in self.regexes_tags:
                if regex.match(resource):
                    auto_tags.extend(tags)

            resources_tags.append((resource, auto_tags))

        assert isinstance(resources_tags, list)

        return resources_tags


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~ Main ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    from Crawler import Crawler

    crawler = Crawler(white_list=set(('..', )))

    auto_tagger = AutoTagger({
        r'^.*\.py$': ['python', 'development'],
        r'^.*\.css$': ['css', 'development'],
        r'^.*\.js$': ['javascript', 'development'],
    })

    for resource, tags in auto_tagger.process(crawler.crawl()):
        print(resource, tags)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Example #20
from Crawler import Crawler
import argparse as parser

if __name__ == '__main__':
    spider = Crawler('severe weather')
    spider.crawl()
Example #21
import sys
import signal
from Crawler import Crawler


def exit_handler(sig, frame):
    sys.exit(0)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: main.py [root-url]")
        sys.exit(1)

    signal.signal(signal.SIGINT, exit_handler)
    crawler = Crawler()
    crawler.crawl(sys.argv[1])
Example #22
import re

from Crawler import Crawler, Crawler_SQLite

if __name__ == '__main__':
    crawler = Crawler(Crawler_SQLite('crawler.db'), depth=2)
    root_re = re.compile('^/$').match
    crawler.crawl('http://wmp.uksw.edu.pl', no_cache=root_re)
    print(crawler.content['wmp.uksw.edu.pl'].keys())
    print(len(crawler.content['wmp.uksw.edu.pl'].keys()))
Example #23
def main():

    config = get_args()
    crawler = Crawler(config)
    crawler.crawl()
Example #24
import argparse
import os.path

# FileOperations (used as FO) and Crawler are assumed to be imported from this
# project's own modules, as in the surrounding examples.

parser = argparse.ArgumentParser(description='Crawl files and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                    help="path to a parameter JSON file. The parameter file should contain 'crawling', 'rules' and 'result' keys")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'),
                    help='output file. This argument is required if no output is specified in the parameter file. The file must be either a .csv or .json')
parser.add_argument('-mt', metavar='ThreadCount', type=int,
                    help='run a multi-threaded crawler (1 thread per file) and specify the number of concurrent threads')
parser.add_argument('-s', metavar='StartDirectory', type=str,
                    help='directory in which the crawling will start. This parameter is necessary if there is no "crawling" dictionary in the parameter file')

args = parser.parse_args()
if "p" not in args or args.p is None:
    parser.error(parser.format_usage())
param = FO.get_from_JSON_file(args.p.name)
if "rules" not in param or ("o" not in args and "output" not in param):
    print("rules error")
    parser.error(parser.format_usage())
if "crawling" not in param and ("s" not in args or args.s is None):
    parser.error(parser.format_usage())
elif "s" in args and args.s is not None:
    param["crawling"] = { "start": args.s}
if "o" in args and args.o is not None:
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }
if "mt" in args and args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"), param.get("result"), param["output"], args.mt)
else:
    Crawler.crawl(param.get("crawling"), param.get("rules"),  param.get("result"), param["output"])
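For reference, a parameter file carrying the 'crawling', 'rules', 'result' and 'output' keys expected above could mirror the dictionaries used in the test_crawl_inline example earlier; the following sketch is hypothetical.

# Hypothetical parameter file contents, mirroring the structure used in the
# test_crawl_inline example above; keys and patterns are illustrative only.
param_example = {
    "crawling": {"start": "./", "file": {"include": ["\\.py$"]}},
    "rules": {
        "search_author": {
            "include": "author",
            "result": {"author": "author[\\s_]+=\\s+'([\\w\\s]+)'"}
        }
    },
    "result": {"BUILT-IN": ["FILENAME"]},
    "output": {"path": "./results.json", "type": "json"}
}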
Example #25
from Extractor import Extractor
from UrlMatcher import UrlMatcher
from Crawler import Crawler
from urllib.parse import urlparse
import re


class Title(Extractor):
    def get_url(self, url, bs):
        data = url
        msg = ''
        return data, msg

    def get_title(self, url, bs):
        data = bs.head.title.get_text()
        msg = ''
        return data, msg


args = {
    'scheme_pattern': r'http|https',
    'domain_pattern': r'.*',
    'path_pattern': r'.*',
    'extractors': [Title('title')],
    'workingList': [],
    'autoAddInternalLinks': True
}
c = Crawler(**args)
c.crawl()
Example #26
def start(url, numpages):
    if url != '' and numpages > 0:
        crawler = Crawler('Output', url, 5)
        crawler.crawl(numpages)
    else:
        raise ValueError('The input is invalid')
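A hypothetical invocation of start as defined above; the URL and page count are made up.

# Hypothetical example call; 'http://example.com' and 10 are placeholder values.
start('http://example.com', 10)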