Example #1
def vul_scan(domain):
    """
    Common vulnerability scan: crawl the target and run XSS, SQLi and Struts2 checks.
    """
    urls = Crawler(target=domain, dynamic=False).run()
    Xss(targets=urls).scan()
    Sqli(targets=urls).scan()
    Struts2(urls).scan()
Example #2
def crawl():
    crawler = Crawler()

    param_did = input("Enter the did value from this user's cookie first: ")
    crawler.set_did(param_did)

    uid = input("Enter the user id to crawl this time: ")
    crawler.add_to_list(uid)

    crawler.crawl()

    input("Press Enter to exit......")
Example #3
 def post(self, collection):
     """Crawl collection"""
     try:
         Crawler.crawl_and_save_articles_and_keywords(collection)
         return (
             json.dumps({"success": True}),
             200,
             {
                 "ContentType": "application/json"
             },
         )
     except Exception:
         return (
             json.dumps({"success": False}),
             500,
             {
                 "ContentType": "application/json"
             },
         )
Example #4
def site_scan(domain):
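    # Strip the scheme and a leading "www." so only the bare host remains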
    if domain.startswith('http://www.'):
        domain = domain[11:]
    elif domain.startswith('https://www.'):
        domain = domain[12:]
    elif domain.startswith('http://'):
        domain = domain[7:]
    elif domain.startswith('https://'):
        domain = domain[8:]

    id = Database().insert_task(domain)
    domains, ips = Domain(domain, id).run()
    for domain in domains:
        url = Crawler(domain).scan()
        Vul(url, id).run()
    Sendir(domains, id).run()
    Port(id).run(ips)
Example #5
def main():
    # Get the access tokens from .env or from the command-line arguments
    if dotenv.load_dotenv():
        consumer_key = os.getenv("CLIENT_ID")
        consumer_secret = os.getenv("CLIENT_SECRET")
        access_token = os.getenv("ACCESS_TOKEN")
        access_token_secret = os.getenv("ACCESS_TOKEN_SECRET")
    elif len(sys.argv) >= 5:
        consumer_key = sys.argv[1]
        consumer_secret = sys.argv[2]
        access_token = sys.argv[3]
        access_token_secret = sys.argv[4]
    else:
        print("\033[31mERROR\033[0m: No credentials has been passed.")
        return

    # Create a tweepy.API instance with the access tokens
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Print the logged-in user's info (nothing like auth.status was found, so this is a compromise)
    try:
        target_user_object = api.me()
        print(f"Fetch target: {target_user_object.name}(@{target_user_object.screen_name})")
    except tweepy.TweepError:
        print("\033[31mERROR\033[0m: Couldn't get user object.Make sure whether you have passed a valid token.")
        return
    
    # Start the tweet-collecting and tweet-deleting threads and wire them together with the queue
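    # (endreq_event is presumably a module-level threading.Event used to request shutdown; it is not defined in this excerpt)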
    tweetqueue = StatusQueue()
    crawlthread = CrawlThread(Crawler(api), tweetqueue, endreq_event)
    eliminatethread = EliminateThread(Eliminator(api), tweetqueue, endreq_event)
    crawlthread.start()
    eliminatethread.start()
    print("Process started.")

    try:
        crawlthread.join()
        eliminatethread.join()
    except KeyboardInterrupt:
        print("Ctrl+C")
        endreq_event.set()
        crawlthread.join()
        eliminatethread.join()
Example #6
def enumerate_twit(args):
    twit = Crawler(args.config_path, args.enum_search)
Example #7
from lib.config import Config
from lib.zip import Zip


def init_driver(config_instance: Config):
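    # PhantomJS driver with a 1-second explicit wait; the binary path comes from the config file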
    drv = webdriver.PhantomJS(executable_path=config_instance.phantomjs_bin_path)
    drv.wait = WebDriverWait(drv, 1)

    return drv


if __name__ == "__main__":
    # init config object and driver
    config = Config('conf/config.yml')
    driver = init_driver(config)
    crawler = Crawler(driver, config)

    print("====Driver logging in====")
    crawler.login()
    print("====Driver logged in====")

    print("====Starting backup process====")
    # head driver to pages section
    crawler.goto_pages()
    time.sleep(2)

    # create a folder for backups if it doesn't exist
    current_datetime = time.strftime('%d%m%Y%H%M%S')
    backup_dir_for_now = config.dir_backup + os.path.sep + current_datetime
    if not os.path.exists(config.dir_backup):
        os.makedirs(config.dir_backup)
Example #8
def main():

    opts, args = parse_options()

    url = args[0]

    if opts.links:
        getLinks(url)
        raise SystemExit, 0

    depth_limit = opts.depth_limit
    confine_prefix = opts.confine
    exclude = opts.exclude

    sTime = time.time()

    print >> sys.stderr, "Crawling %s (Max Depth: %d)" % (url, depth_limit)
    crawler = Crawler(url, depth_limit, confine_prefix, exclude)
    crawler.crawl()

    # create log directory
    if not os.path.exists(LOG_DIRECTORY):
        os.makedirs(LOG_DIRECTORY)

    num_links = 0
    if opts.out_urls:
        for url_crawl in crawler.urls_seen:

            parsed_uri = urlparse.urlparse(url_crawl)

            # only base url
            if not re.match(".*%s" % parsed_uri.netloc.replace('www.', ''),
                            url):  # and not opts.skip_host:
                continue

            if not opts.out_path:
                print url_crawl
            else:
                domain = '{uri.netloc}'.format(uri=parsed_uri)
                log_file = "%s/%s.log" % (LOG_DIRECTORY, domain)

                logging.basicConfig(
                    filename=log_file,
                    filemode='w+',
                    level=logging.DEBUG,
                    format=
                    '%(asctime)-15s [%(levelname)s] (%(threadName)-10s) %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')

                try:
                    directory = opts.out_path + domain + '/'
                    path = directory + toSeoFriendly(url_crawl, 50) + '.html'

                    if not os.path.exists(directory):
                        os.makedirs(directory)

                    r = requests.get(url_crawl,
                                     allow_redirects=True,
                                     timeout=30)
                    if not os.path.exists(path):
                        target = open(path, 'w')
                        target.write(r.text.encode('utf-8'))
                        target.close()

                        num_links = num_links + 1
                        logging.debug("Saving: {0}".format(url_crawl))

                except IOError as e:
                    logging.error("IOError: {0} {1}".format(url, e.message))
                    pass

                except Exception as e:
                    logging.error("Error({0}): {1}".format(
                        url, e.__doc__, e.message),
                                  exc_info=True)
                    pass

    if opts.out_links:
        print "\n".join([str(l) for l in crawler.links_remembered])

    if opts.out_dot:
        d = DotWriter()
        d.asDot(crawler.links_remembered)

    eTime = time.time()
    tTime = eTime - sTime

    print >> sys.stderr, "Found:    %d" % num_links
    print >> sys.stderr, "Stats:    (%d/s after %0.2fs)" % (int(
        math.ceil(float(num_links) / tTime)), tTime)
Example #9
class Taobaoke(MongoClient):
    """
    Taobaoke
    """
    start_urls = (
        'http://pub.alimama.com/promo/item/channel/index.htm?channel=qqhd&toPage={}&catIds={}&level=1&perPageSize=100'
    )
    crawler = Crawler()

    def start_requests(self):
        # iterate over the category and page ranges of the item list
        for cat in range(1, 18):
            for page in range(10, 31):
                print "crawl:%s,%s" % (cat, page)
                self.requests_url(self.start_urls.format(page, cat),
                                  callback=self.callback)

    def requests_url(self, url, callback=None):
        """

        :param url:
        :param callback:
        :return:
        """
        crawler = self.crawler.crawl(url)
        callback(crawler)

    def safe_febx_text(self, crawler, xpath, count=1, max_count=10):
        """

        :param crawler:
        :param xpath:
        :param count:
        :param max_count:
        :return:
        """
        if count > max_count:
            return ""
        try:
            return crawler.febx(xpath).text.strip()
        except Exception:
            print "try:%s" % count
            return self.safe_febx_text(crawler,
                                       xpath,
                                       count=count + 1,
                                       max_count=max_count)

    def callback(self, crawler):
        """
        Callback handler: parse items from the crawled listing page and save new ones.
        :param crawler:
        :return:
        """
        time.sleep(10)
        for index in range(1, 101):
            item = {
                "title":
                self.safe_febx_text(
                    crawler, u".//*[@id='J_search_results']/div/div[{}]"
                    u"/div[@class='box-content']/div[1]/p/a/node()".format(
                        index)),
                "category":
                self.safe_febx_text(crawler,
                                    u".//*[@class='top-nav-tag']/span")
            }
            if (not item["title"] or not item["category"]
                    or self.db.tbk_test.find({"title": item["title"]}).count()):
                print "continue", item["category"], item["title"]
                continue

            print item["category"], item["title"]
            self.db.tbk_test.insert(item)
Example #10
browser.quit()


def content_func(res):
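    # Extract programming-language keywords from the job-detail text blocks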
    words = []
    soup = BeautifulSoup(res.text, 'lxml')
    content = ''
    divs = soup.select('div.comp_detail')
    del divs[-1]
    for div in divs:
        content += div.text.replace('\n', '').replace('\xa0', '')
    words += list(
        set(
            re.findall('java script|objective c|visual basic|[A-Za-z.+#]+',
                       content, re.IGNORECASE)))
    return words


# Create an instance of Crawler class
crawler = Crawler(open_thread=True)

# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(alinks, content_func, sleep_time=1)

# Call get_counter to get word count result
print(crawler.get_counter().most_common())

with open('yes123_1_new.csv', 'w') as f:
    for lang, counts in crawler.get_counter().most_common():
        f.write('{},{}\n'.format(lang, counts))
Example #11
from lib.signature import Signature
from lib.crawler import Crawler

import coloredlogs
coloredlogs.install(fmt="%(asctime)s %(name)s [%(levelname)s] %(message)s")

signature = Signature(status_code=200)
signature.add(lambda x: "an error" not in x.text)

c = Crawler("https://www.youtube.com", "./test_dic.txt", signature=signature)

c.start()
Example #12
# lock = threading.Lock()     # global resource lock
# data extraction, multithreading, storage, exception handling, logging
initDir = "f:/thz/"
headers = {
    "user-agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Referer": "http://thzbbt.net",
}
initUrls = [
    "http://thzbbt.net/forum-181-{num}.html".format(num=num)
    for num in range(1, 10)
]
print "initUrls", initUrls

crawler = Crawler(initUrls, initDir, headers, "crawler-thz")


def fn1(url):
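    # Collect the per-thread detail links from a forum index page (4th <a> inside each <th class="common"> cell)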
    arr = []
    r = requests.get(url, headers=headers, timeout=100).text
    for father in BeautifulSoup(r, 'lxml').find_all('th', class_="common"):
        link = father.find_all("a")[3]["href"]
        arr.append("http://thzbbt.net/" + link)
    return arr


def fn2(url):
    arr = []
    #     print "url",url
    r = requests.get(url, headers=headers, timeout=100).text
Example #13
def vul_scan(domain):
    url = Crawler(domain).scan()
    Vul(url).run()
Example #14
def crawl(collection):
    Crawler.crawl_and_save_articles_and_keywords(collection)
Example #15
def crawl_scan(domain):
    Crawler(domain).scan()
Example #16
def crawl():
    c = Crawler()
    c.set_did(param_did)
    c.crawl_like("3xzixigzwy4kj5c")
Example #17
    links = soup.select('div.jbInfo > div > h3 > a')
    if res.status_code == 200 and len(links) != 0:
        for link in links:
            page_links.append('https:' + str(link['href']))
    return page_links


# Get total page from the website you want to crawl
HOST = 'https://www.1111.com.tw'
url = HOST + '/job-bank/job-index.asp?si=1&d0=140400,140200,140300&fs=1&ps=100&page=1'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'lxml')
page = int(soup.select_one('div.pagedata').text.split('/')[1].split('頁')[0])

# Create an instance of Crawler class
crawler = Crawler(open_thread=True)

#  Call grab_pagelinks_th_auto to get all links
page_url = HOST + '/job-bank/job-index.asp?si=1&d0=140400,140200,140300&fs=1&ps=100&page={}'
crawler.grab_pagelinks_th_auto(page_url, page_func, page, sleep_time=1)

# Call get_alinks to get all links crawled from previous pages
links = crawler.get_alinks()

# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=1)

# Call get_counter to get word count result
print(crawler.get_counter().most_common())

with open('1111_1_new.csv', 'w') as f:
Example #18
    "user-agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Referer": "http://www.mmjpg.com",
}
initUrls = [
    "http://www.mmjpg.com/mm/{num}".format(num=num) for num in range(1, 2)
]
print "initUrls", initUrls


def getCurFileName():
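    # Return this script's file name without its extension (used as the crawler's name)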
    filename = os.path.basename(__file__)
    return filename[0:filename.find(".")]


crawler = Crawler(initUrls, initDir, headers, getCurFileName())
print "crawler初始化成功"


def fn1(url):
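    # Read the gallery's last page number from the pagination links and build one URL per page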
    r = requests.get(url, headers=headers, timeout=100).text
    maxCount = BeautifulSoup(r,
                             'lxml').find('div',
                                          class_="page").find_all('a')[-2].text
    #     print maxCount
    page_urls = [url + "/" + str(i) for i in range(1, int(maxCount) + 1)]
    return page_urls


def fn2(url):
    r = requests.get(url, headers=headers, timeout=100).text
Example #19
URL = "https://www.ptt.cc/"
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)\
    Chrome/58.0.3029.96 Safari/537.36',
    "cookies": "over18=1"
}
board = "Soft_Job"
# Crawl the PTT board; the hrefs scraped from the page lack the base URL, so it has to be prepended manually
res = requests.get(URL + "bbs/" + board + "/index.html", headers=headers)
soup = BeautifulSoup(res.text, 'lxml')
buttons = soup.select('a.btn.wide')
# totalpage = the "previous page" index number + 1
totalpage = int(buttons[1]['href'].split('index')[1].split('.')[0]) + 1

crawler = Crawler(open_thread=True)

page_url = URL + "bbs/" + board + "/index{}.html"
crawler.grab_pagelinks_th_auto(page_url,
                               pttURL_crawler,
                               totalpage,
                               sleep_time=1,
                               header=headers)

links = crawler.get_alinks()

print(links)
# Call grab_content_th_auto to get content page by page
crawler.grab_content_th_auto(links, content_func, sleep_time=2)

# Call get_counter to get word count result
Example #20
def crawl():
    crawler = Crawler(False)
    crawler.set_did(param_did)
    crawler.crawl()