def deepCrawl(crawled):
    # Follow every URL collected so far and gather the links found one level deeper.
    tmp = []
    for each in crawled:
        crawl = Crawl(each['url'])
        crawl.filter()
        tmp.extend(crawl.get())
    return tmp

def appCrawl(url):
    # Crawl the start URL, go one level deeper, then deduplicate the results
    # and keep only the attackable entries.
    crawl = Crawl(url)
    crawl.filter()
    crawled = []
    crawled.extend(crawl.get())
    crawled.extend(SubCrawl.deepCrawl(crawled))
    crawled = SubCrawl.deleteOverlap(crawled)
    crawled = SubCrawl.replaceAttack(crawled)
    crawled = SubCrawl.getAttackable(crawled)
    return crawled

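# Hedged usage sketch for the two helpers above, assuming SubCrawl is the class
# (or module) that exposes them and that each returned entry is a dict with a
# 'url' key, as deepCrawl's each['url'] access implies. The target URL is only
# an illustration.
if __name__ == "__main__":
    results = SubCrawl.appCrawl("http://testphp.vulnweb.com")
    for entry in results:
        print(entry['url'])
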
def Run():
    start_urls = Ctrip.StartURL()
    my_crawl = Crawl(start_urls)
    try:
        my_crawl.Run()
        price_pannel_list = my_crawl.price_pannel_list
        for price_pannel in price_pannel_list:
            SplunkLog.Save(price_pannel)
    finally:
        my_crawl.Finish()

def test(self):
    session = Session()
    crawl = Crawl()
    crawl.begin()
    session.add(crawl)
    try:
        session.commit()
    except IntegrityError as e:
        session.close()
        Crawl.dropAndCreate(str(e))
        self.fail(str(e))

def testGviz(self):
    crawl = Crawl()
    crawl.begin()
    session = Session()
    session.add(crawl)
    session.commit()

    record = Record()
    record.setUrl("http://example.com/")
    record.setCrawlId(crawl.crawlId)
    record.setLastSeen(utcnow())
    session = Session()
    session.add(record)
    try:
        session.commit()
    except IntegrityError as e:
        session.close()
        Record.dropAndCreateTable(str(e))
        self.fail(str(e))

def start_simple(self):
    # get the crawler
    crawler = Crawl()
    crawler.add_url(self.ask_for_link())
    crawler.load_next_page()
    links = crawler.crawl_next_page_for_links()
    for link in links:
        crawler.add_url(link)

def main():
    # start by asking for a link
    toCrawl = []
    crawled = []
    toCrawl.append(getLinkToPage())
    crawlercl = Crawl()
    maxPagesSearched = 100
    i = 0
    # search pages for links until the queue is empty
    while len(toCrawl) > 0:
        # take the first entry of the "toCrawl" list
        crawl = toCrawl.pop(0)
        # skip links that are already in the "crawled" list
        if crawl in crawled:
            continue
        links = crawlercl.crawl_next_page_for_links(crawl)
        # put the page into the "crawled" list
        crawled.append(crawl)
        # add newly found links to the "toCrawl" list
        for link in links:
            # only queue links that are not in the "crawled" list
            if link not in crawled:
                absolute = makeAbsoluteLink(link, crawl)
                toCrawl.append(absolute)
                print(absolute)
        # stop the loop after maxPagesSearched iterations
        i += 1
        if i >= maxPagesSearched:
            break
    return 0

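# makeAbsoluteLink is called above but not defined in this excerpt. A minimal
# sketch, assuming it only resolves a possibly relative href against the page it
# was found on; the real helper may do more (e.g. strip fragments or queries).
from urllib.parse import urljoin

def makeAbsoluteLink(link, base_url):
    # urljoin keeps absolute links unchanged and resolves relative ones against base_url
    return urljoin(base_url, link)
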
def testInsert2(self):
    crawl = Crawl()
    crawl.begin()
    self.assertGreater(len(crawl.userName), 0, "no user name was given")
    self.assertGreater(len(crawl.userDomain), 0, "no user domain was given")
    crawl.end()
    session = Session()
    session.add(crawl)
    session.commit()
    debug("crawlId of inserted record is %s" % (crawl.crawlId))
    session.close()
    Crawl.dropTable()

def crawl(self):
    self.url = input(">>> Enter url of website: ")
    if not self.url:
        self.url = "http://testphp.vulnweb.com"
    if not self.url.startswith("http"):
        self.url = "http://" + self.url
    host = self.url.replace("http://", "").replace("https://", "").split("/")[0]
    current_path = os.path.dirname(os.path.realpath(__file__))
    self.output_dir = os.path.join(current_path, "output/" + host)
    if os.path.exists(self.output_dir):
        chose = input("Scan results are available at output/{}, continue? (Y/N | Default = Y)".format(host))
        if chose.upper() == "N":
            print("Stopping.....")
            exit()
        else:
            shutil.rmtree(self.output_dir)
    os.makedirs(self.output_dir)
    crawl = Crawl(self.url)
    return crawl

def dummy(cls, n_dummy):
    from random import randint
    from uuid import uuid1
    n_before = cls.count()
    session = Session()
    record = None
    for x in range(n_dummy):
        crawl = Crawl.dummy()
        record = Record()
        record.crawlId = crawl.crawlId
        record.uri = "http://example.com/" + uuid1().hex
        record.url = "http://example.com/" + uuid1().hex
        # randint() needs explicit bounds; the 0..2**20 range is an assumed placeholder size
        record.size = randint(0, 2 ** 20)
        record.lastSeen = utcnow()
        record.lastModified = utcnow()
        record.jsonString = {}
        record.belongsTo = None
        record.exhaustive = False
        session.add(record)
    session.commit()
    n_after = cls.count()
    assert n_before + n_dummy == n_after
    assert isinstance(record, Record)
    return record

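# Hedged usage sketch: how dummy() above might be called from a test, assuming
# it is exposed as a classmethod on Record (as the cls parameter suggests). The
# count of 5 is arbitrary.
def _example_dummy_usage():
    record = Record.dummy(5)   # inserts five dummy rows and returns the last one
    assert isinstance(record, Record)
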
sDB_User = _cf["StockAnalysis"]["DB_User"]
sDB_Pwd = _cf["StockAnalysis"]["DB_Pwd"]
sDB_Name = _cf["StockAnalysis"]["DB_Name"]
#endregion

log = Log.hyLog()  # first style of declaration: call through an instance (or pass the object as the first argument)

start_date = datetime.date(2019, 11, 1)  # .strftime("%Y%m%d")
end_date = datetime.date.today()  # .strftime("%Y%m%d")
day = datetime.timedelta(days=1)  # one-day delta used to step through the dates
log.writeLog(apname=_APName, text="Date range to process ({} ~ {})".format(start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")))

# initial crawl object
craw = Crawl(sDownloadFilePath)
db = DB(_APName, sDB_Host, sDB_User, sDB_Pwd, sDB_Name)

sleep_sec = 5
lastprocmonth = 0
while start_date <= end_date:
    try:
        # skip Saturdays and Sundays
        if start_date.weekday() == 5 or start_date.weekday() == 6:
            start_date = start_date + day
            continue

        # temporary code: flag the first processed day of each month
        procemonthdata = False
        if start_date.month != lastprocmonth:
            procemonthdata = True
            lastprocmonth = start_date.month

def setUp(self):
    try:
        Crawl.dropTable()
    except OperationalError as e:
        debug(str(e))