Beispiel #1
0
 def get_words(self, url):
     """Fetch *url*, extract its words and store the analyzed result.

     The page content comes from ``GetWords.get_content``. Every item
     returned by ``self.catch_words`` is passed to ``self.analyze`` and the
     combined list is written with ``PyMongoUtil.write(url, ...)``.

     Exceptions raised while extracting, analyzing or writing are logged
     together with the URL and then swallowed. Exceptions raised by the
     fetch itself are not caught here and propagate to the caller.
     """
     html = GetWords.get_content(url)
     try:
         wlist = []
         for wd in self.catch_words(html):
             wlist.extend(self.analyze(wd))
         PyMongoUtil.write(url, wlist)
     # "except X as e" is accepted by Python 2.6+ and Python 3 alike; the
     # comma form is a syntax error on Python 3.
     except Exception as e:
         logger.error(url + " " + str(e))
Beispiel #2
0
    def testGetUrl():
        """Run the URL scanner against one fixed page and print the hit count.

        Cleans ``PyMongoUtil`` and ``MemcacheUtil``, instantiates
        ``SpiderBloomFilter``, fetches the page, scans it with
        ``UrlScan.scanpage`` and writes every returned item with a
        placeholder word list (``[""]``).
        """
        # Single definition so the fetched URL and the scan base cannot drift.
        page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

        PyMongoUtil.clean()
        MemcacheUtil.clean()
        # The instance is discarded.
        # NOTE(review): presumably constructed for its side effects -- confirm.
        SpiderBloomFilter()

        html = GetWords.get_content(page_url)
        # Named to avoid shadowing the builtin ``list``.
        found_items = UrlScan.scanpage(html, page_url, None)

        for item in found_items:
            PyMongoUtil.write(item, [""])
        # Call form prints the same text on Python 2 and Python 3.
        print(len(found_items))
Beispiel #3
0
    def testGetUrl():
        """Run the URL scanner against one fixed page and print the hit count.

        Cleans ``PyMongoUtil`` and ``MemcacheUtil``, instantiates
        ``SpiderBloomFilter``, fetches the page, scans it with
        ``UrlScan.scanpage`` and writes every returned item with a
        placeholder word list (``[""]``).
        """
        # Single definition so the fetched URL and the scan base cannot drift.
        page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

        PyMongoUtil.clean()
        MemcacheUtil.clean()
        # The instance is discarded.
        # NOTE(review): presumably constructed for its side effects -- confirm.
        SpiderBloomFilter()

        html = GetWords.get_content(page_url)
        # Named to avoid shadowing the builtin ``list``.
        found_items = UrlScan.scanpage(html, page_url, None)

        for item in found_items:
            PyMongoUtil.write(item, [""])
        # Call form prints the same text on Python 2 and Python 3.
        print(len(found_items))
Beispiel #4
0
 def catch_words(self, html):
     """Return the result of ``GetWords.get_chinese`` applied to *html*."""
     return GetWords.get_chinese(html)
Beispiel #5
0
# Module-level handles obtained from PyPool; their consumers are not visible
# in this part of the script.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
# NOTE(review): MyListener is defined elsewhere -- confirm what it listens for.
listener = MyListener()


def err():
    """Print a fixed message asking the user to enter a valid selection."""
    message = "please enter the right select"
    print(message)


# Keep prompting until GetWords.try_connect reports status 200 for the URL.
while True:
    url = raw_input("Please input url:\n")
    print("checking url...")
    # Test for a complete scheme prefix: a bare "http" check would mistake
    # hosts such as "httpbin.org" for URLs that already carry a scheme.
    # Lower-casing makes the check case-insensitive, as URL schemes are.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    try:
        statusCode = GetWords.try_connect(url)
    # "except X as e" is accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        print(str(e))
        continue
    if statusCode == 200:
        break
    print("cannot connect to the website")

# Keep prompting until the entered depth consists only of digits.
while True:
    depth = raw_input("Please input depth:\n")
    if depth.isdigit():
        break
    print("please enter a number\n")
Beispiel #6
0
 def catch_words(self, html):
     """Return the result of ``GetWords.get_by_regex`` for *html*.

     The pattern argument is the instance's private ``__pattern`` attribute.
     """
     return GetWords.get_by_regex(html, self.__pattern)
Beispiel #7
0
 def catch_words(self, html):
     """Return the result of ``GetWords.get_english`` on tag-filtered *html*.

     *html* is first passed through ``HtmlUtil.filter_tags``; that output is
     what ``GetWords.get_english`` receives.
     """
     return GetWords.get_english(HtmlUtil.filter_tags(html))
Beispiel #8
0
 def catch_words(self, html):
     """Return the result of ``GetWords.get_by_regex`` for *html*.

     The pattern argument is the instance's private ``__pattern`` attribute.
     """
     return GetWords.get_by_regex(html, self.__pattern)
Beispiel #9
0
 def catch_words(self, html):
     """Return the result of ``GetWords.get_korean`` applied to *html*."""
     return GetWords.get_korean(html)