Example No. 1
def deepCrawl(crawled):
    # Crawl one level deeper: visit the URL of every entry that was already collected.
    tmp = []
    for each in crawled:
        crawl = Crawl(each['url'])
        crawl.filter()
        tmp.extend(crawl.get())
    return tmp
Example No. 2
def appCrawl(url):
    crawl = Crawl(url)
    crawl.filter()
    crawled = []
    crawled.extend(crawl.get())
    crawled.extend(SubCrawl.deepCrawl(crawled))
    crawled = SubCrawl.deleteOverlap(crawled)
    crawled = SubCrawl.replaceAttack(crawled)
    crawled = SubCrawl.getAttackable(crawled)
    return crawled
Example No. 3
def Run():
    start_urls = Ctrip.StartURL()
    my_crawl = Crawl(start_urls)
    try:
        my_crawl.Run()
        price_pannel_list = my_crawl.price_pannel_list
        for price_pannel in price_pannel_list:
            SplunkLog.Save(price_pannel)
    finally:
        my_crawl.Finish()
Example No. 4
 def test(self):
     session = Session()
     crawl = Crawl()
     crawl.begin()
     session.add(crawl)
     try:
         session.commit()
     except IntegrityError as e:
         session.close()
         Crawl.dropAndCreate(str(e))
         self.fail(str(e))
Example No. 5
 def testGviz(self):
     crawl = Crawl()
     crawl.begin()
     session = Session()
     session.add(crawl)
     session.commit()
     
     record = Record()
     record.setUrl("http://example.com/")
     record.setCrawlId(crawl.crawlId)
     record.setLastSeen(utcnow())
     session = Session()
     session.add(record)
     try:
         session.commit()
     except IntegrityError as e:
         session.close()
         Record.dropAndCreateTable(str(e))
         self.fail(str(e))
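Examples No. 4 and No. 5 (and the setUp in Example No. 12) catch SQLAlchemy exceptions, so the surrounding modules presumably import them from sqlalchemy.exc and configure a Session factory. A minimal sketch of that assumed setup; the engine URL and factory below are illustrative, not taken from the excerpts:

from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.orm import sessionmaker

# Hypothetical configuration: the excerpts only show Session() being called.
engine = create_engine("sqlite:///crawl.db")
Session = sessionmaker(bind=engine)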
Example No. 6
    def start_simple(self):

        # get the crawler
        crawler = Crawl()

        crawler.add_url(self.ask_for_link())
        crawler.load_next_page()
        links = crawler.crawl_next_page_for_links()
        for link in links:
            crawler.add_url(link)

Example No. 7
def main():
    #    start by asking for a link
    toCrawl = []
    crawled = []
    toCrawl.append(getLinkToPage())
    crawlercl = Crawl()
    #    print "got page " + pageLink

    maxPagesSearched = 100
    i = 0

    #   search this page for links
    while len(toCrawl) > 0:
        # take the next URL from the "toCrawl" list (pop() removes the last entry)
        crawl = toCrawl.pop()

        # skip this link if it has already been crawled
        if crawl in crawled:
            # continue with the next loop
            continue

        links = crawlercl.crawl_next_page_for_links(crawl)

        # put into "crawled" list
        crawled.append(crawl)

        # add new found links to "toCrawl" list
        for link in links:
            # ensure that the link is not in the "crawled" list
            if link not in crawled:
                toCrawl.append(makeAbsoluteLink(link, crawl))
                print(makeAbsoluteLink(link, crawl))

        # stop loop after .. iterations
        i += 1
        if i >= maxPagesSearched:
            break

    return 0
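Example No. 7 depends on two helpers that are not shown, getLinkToPage() and makeAbsoluteLink(). A minimal sketch of makeAbsoluteLink, assuming it only resolves a possibly relative href against the URL of the page it was found on (the project's actual helper may do more):

from urllib.parse import urljoin

def makeAbsoluteLink(link, base_url):
    # Hypothetical implementation: join a relative link with the URL of the
    # page it was extracted from; absolute links are returned unchanged.
    return urljoin(base_url, link)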
Example No. 8
 def testInsert2(self):
     crawl = Crawl()
     crawl.begin()
     self.assertGreater(len(crawl.userName), 0, "no user name was given")
     self.assertGreater(len(crawl.userDomain), 0, "no user domain was given")
     crawl.end()
     session = Session()
     session.add(crawl)
     session.commit()
     debug("crawlId of inserted record is %s" % (crawl.crawlId))
     session.close()
     Crawl.dropTable()
Example No. 9
 def crawl(self):
     self.url = input(">>> Enter url of website: ")
     if not self.url:
         self.url = "http://testphp.vulnweb.com"
     if not self.url.startswith("http"):
         self.url = "http://" + self.url
     host = self.url.replace("http://", "").replace("https://", "").split("/")[0]
     current_path = os.path.dirname(os.path.realpath(__file__))
     self.output_dir = os.path.join(current_path, "output/" + host)
     if os.path.exists(self.output_dir):
         chose = input("Scan results are available at output/{}, continue? (Y/N | Default = Y)".format(host))
         if chose.upper() == "N":
             print("Stopping.....")
             exit()
         else:
             shutil.rmtree(self.output_dir)
     os.makedirs(self.output_dir)
     crawl = Crawl(self.url)
     return crawl
Example No. 10
 def dummy(cls, n_dummy):
     n_before = cls.count()
     session = Session()
     record = None
     from uuid import uuid1
     from random import randint
     for x in range(n_dummy):
         crawl = Crawl.dummy()
         record = Record()
         record.crawlId = crawl.crawlId
         record.uri = "http://example.com/" + uuid1().hex
         record.url = "http://example.com/" + uuid1().hex
         record.size = randint(0, 1024)  # arbitrary dummy size
         record.lastSeen = utcnow()
         record.lastModified = utcnow()
         record.jsonString = {}
         record.belongsTo = None
         record.exhaustive = False
         session.add(record)
     session.commit()
     n_after = cls.count()
     assert n_before + n_dummy == n_after
     assert isinstance(record, Record)
     return record
Example No. 11
    sDB_User = _cf["StockAnalysis"]["DB_User"]
    sDB_Pwd = _cf["StockAnalysis"]["DB_Pwd"]
    sDB_Name = _cf["StockAnalysis"]["DB_Name"]
    #endregion

    log = Log.hyLog()  # first style of declaration: requires an instance (or pass the object as the first argument when calling)

    start_date = datetime.date(2019, 11, 1)  #.strftime("%Y%m%d")
    end_date = datetime.date.today()  #.strftime("%Y%m%d")
    day = datetime.timedelta(days=1)  # one-day step used to advance the date
    log.writeLog(apname=_APName,
                 text="Period to process ({} ~ {})".format(start_date.strftime("%Y%m%d"),
                                                           end_date.strftime("%Y%m%d")))

    # initialize the crawl object
    craw = Crawl(sDownloadFilePath)
    db = DB(_APName, sDB_Host, sDB_User, sDB_Pwd, sDB_Name)
    sleep_sec = 5
    lastprocmonth = 0
    while start_date <= end_date:
        try:
            # skip Saturdays and Sundays
            if start_date.weekday() == 5 or start_date.weekday() == 6:
                start_date = start_date + day
                continue
            # temporary code
            procemonthdata = False
            if start_date.month != lastprocmonth:
                procemonthdata = True
                lastprocmonth = start_date.month
Example No. 12
 def setUp(self):
     try:
         Crawl.dropTable()
     except OperationalError as e:
         debug(str(e))