class WordCount:
    """Runs a word-frequency map/reduce job over the spider's MongoDB data."""

    logger = LogFactory.getlogger("WordCount")

    # Map phase: emit each document's content with a count of 1.
    mapper = Code("""
        function() {
            emit(this.content, 1);
        }
        """)

    # Reduce phase: sum the emitted counts per key.
    reducer = Code("""
        function(key, values) {
            var sum = 0;
            values.forEach(function(value) {
                sum += Number(value);
            });
            return sum;
        };
        """)

    @staticmethod
    def calc_count():
        """Map/reduce word counts from db.spider into the 'result' collection.

        Connection parameters come from the 'mongo' section of Config.
        """
        WordCount.logger.info("start to count words")
        ip = Config.getProperty('mongo', 'addr')
        port = int(Config.getProperty('mongo', 'port'))
        client = pymongo.MongoClient(ip, port)
        try:
            db = client.spiderDB
            collection = db.spider
            collection.map_reduce(WordCount.mapper,
                                  WordCount.reducer,
                                  out="result",
                                  full_response=True)
        finally:
            # FIX: the client was never closed, leaking the connection pool
            # on every invocation.
            client.close()
# !/usr/bin/python # coding=utf-8 from urlparse import * from SpiderUtils.getWords import GetWords from Utils.logFactory import LogFactory from enums import Language from getUrls import UrlScan from spiderStrategy import SpiderStrategy from SpiderUtils.modeFactory import ModeFactory logger = LogFactory.getlogger("Spider") class Spider: __isout = False __url = "" __depth = 1 __url_pattern = None __mode = None def __init__(self, strategy=SpiderStrategy()): self.__isout = strategy.is_out self.__url = strategy.url self.__depth = strategy.depth self.__mode = strategy.mode pattern = strategy.pattern if pattern is None: if strategy.is_out is False: r = urlparse(strategy.url) self.__url_pattern = r.netloc
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
from WordSplit.splitAdapter import SplitAdapter

logger = LogFactory.getlogger("ChineseMode")


class ChineseMode(AbstractMode):
    """Language mode that extracts and segments Chinese text from HTML."""

    def __init__(self):
        super(ChineseMode, self).__init__()

    def catch_words(self, html):
        """Pull the runs of Chinese characters out of *html*."""
        return GetWords.get_chinese(html)

    def analyze(self, word):
        """Segment one run of Chinese characters into individual words."""
        return SplitAdapter.split(word)
import time
from abc import ABCMeta, abstractmethod
from Utils.logFactory import LogFactory
from memcacheUtil import MemcacheUtil

logger = LogFactory.getlogger("LockModel")


class LockModel:
    """Template for a memcache-backed distributed lock.

    Subclasses implement _do(), which runs while the lock is held;
    lock_and_do() spins until the lock is acquired.
    """
    __metaclass__ = ABCMeta

    # Memcache key used as the lock token (overwritten in __init__).
    __lock_key = "WRITEKEY"

    def __init__(self, key):
        self.__lock_key = key

    def lock_and_do(self):
        """Spin until the memcache lock is acquired, run _do(), release."""
        is_loop = True
        while is_loop:
            # print "loop",self.__lock_key
            if MemcacheUtil.get(self.__lock_key) is None:
                # add() only succeeds for one caller when the key is absent,
                # so a successful add means this process owns the lock.
                if MemcacheUtil.add(self.__lock_key, True):
                    # logger.debug("get memcache lock: " + self.__lock_key)
                    result = None
                    try:
                        result = self._do()
                    except Exception as e:
                        logger.error(e)
                    finally:
                        # Always release the lock, even if _do() raised.
                        MemcacheUtil.delete(self.__lock_key)
                        # NOTE(review): module appears truncated here in
                        # this view (loop exit / result handling not visible).
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("ProcessCnt")


class ProcessCntIncrease(LockModel):
    """Increments the shared running-process counter in memcache while
    holding the process write lock."""

    def __init__(self):
        super(ProcessCntIncrease, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        # Runs under the lock (see LockModel.lock_and_do).
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        if cnt is None:
            # First process: the counter key does not exist yet.
            cnt = 1
        else:
            cnt += 1
        MemcacheUtil.set(const.PROCESSCNTKEY, cnt)
        logger.debug("process cnt:" + str(cnt))
        return True


class ProcessCntReduce(LockModel):
    """Decrements the shared running-process counter under the same lock."""

    def __init__(self):
        super(ProcessCntReduce, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        # NOTE(review): no None check here (unlike the increase side) —
        # assumes a decrement always follows an increment; confirm.
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        cnt -= 1
        MemcacheUtil.set(const.PROCESSCNTKEY, cnt)
        # NOTE(review): module may be truncated here in this view.
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.enums import Language
from SpiderUtils.getWords import GetWords
from PyIO.excelUtil import ExcelUtil
from os import path

logger = LogFactory.getlogger("main")

# clean old data
PyMongoUtil.clean()
MemcacheUtil.clean()

# create bloom filter (constructed for its side effects — presumably
# initializes the shared filter state; confirm against SpiderBloomFilter)
SpiderBloomFilter()

# multitask prepare: shared queue + lock for the worker pool, plus the
# listener that dispatches queued strategies.
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


# NOTE(review): module truncated here in this view — err() has no visible body.
def err():
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
import re
from Utils.htmlUtil import HtmlUtil

logger = LogFactory.getlogger("EnglishMode")

# FIX: patterns were plain strings ("\d" is an invalid escape and a
# SyntaxWarning on modern Python) and were recompiled on every analyze()
# call; compile raw-string patterns once at module level.
_DIGIT_RE = re.compile(r"\d+")
_NONWORD_RE = re.compile(r"\W+")


class EnglishMode(AbstractMode):
    """Language mode that extracts plain English words from HTML."""

    def __init__(self):
        super(EnglishMode, self).__init__()

    def catch_words(self, html):
        """Strip the tags from *html* and return its lower-cased word tokens."""
        raw = HtmlUtil.filter_tags(html)
        return GetWords.get_english(raw)

    def analyze(self, word):
        """Keep *word* only if it contains no digits, no non-word characters,
        and is longer than 4 characters.

        Returns a (possibly empty) list so callers can extend() the result.
        """
        if _DIGIT_RE.search(word) or _NONWORD_RE.search(word):
            return []
        return [word] if len(word) > 4 else []
import time
from ProcessPool.pool import PyPool
from SpiderUtils.spider import Spider
from Utils.logFactory import LogFactory
from PyMemcached.Locks.processCntLock import ProcessCntReduce, ProcessCntIncrease
from PyMemcached.memcacheUtil import MemcacheUtil
from Consts.cacheKeyConstants import const
from Utils.logFactory import LogFactory  # NOTE(review): duplicate import

logger = LogFactory.getlogger("MyListener")


class MyListener:
    """Drains spider strategies from the shared queue and dispatches them,
    tracking in-flight work via the shared process counter."""
    __lock = None     # class-level defaults; listen() receives live objects
    __queue = None
    __wait_cnt = 10   # NOTE(review): unused in the visible portion

    def __init__(self):
        self.__pool = PyPool.get_pool()

    def listen(self, lock, queue):
        """Loop forever: take up to PyPool.limit strategies per pass."""
        loop_flag = True
        while loop_flag:
            try:
                lock.acquire()
                size = queue.qsize()
                # Never start more tasks than the pool allows in one pass.
                size = PyPool.limit if size > PyPool.limit else size
                for num in range(0, size):
                    strategy = queue.get_nowait()
                    # Count this task as a running process before launching it.
                    ProcessCntIncrease().lock_and_do()
                    # NOTE(review): module truncated here in this view
                    # (dispatch to the pool and lock release not visible).
from Consts.cacheKeyConstants import const
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.enums import Language
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.getUrls import UrlScan
from SpiderUtils.getWords import GetWords
import urllib, htmllib, formatter

logger = LogFactory.getlogger("test")

import re
from bs4 import BeautifulSoup
from PyIO.pyMongoUtil import PyMongoUtil  # NOTE(review): duplicate import


class Test:
    @staticmethod
    def testGetUrl():
        # Manual smoke test: reset the stores, fetch a page, scan its URLs.
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()
        html = GetWords.get_content(
            "http://www.leakedin.com/tag/emailpassword-dump/")
        # NOTE(review): module truncated here in this view — the scanpage()
        # call's arguments are cut off.
        list = UrlScan.scanpage(
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory
from SpiderUtils.bloomFilter import SpiderBloomFilter

logger = LogFactory.getlogger("BloomFilterLock")


class BloomFilterLock(LockModel):
    """Tests a URL against the shared bloom filter while holding the URL
    write lock, so the check-and-record step is serialized across workers."""

    __url = None  # the URL under test

    def __init__(self, url):
        self.__url = url
        super(BloomFilterLock, self).__init__(const.URLWRITEKEY)

    def _do(self):
        """Return False for an already-seen URL, True for a fresh one."""
        seen = SpiderBloomFilter.exists(self.__url)
        if seen:
            logger.debug("dup url: " + self.__url)
        else:
            logger.debug("access url: " + self.__url)
        return not seen
from Consts.cacheKeyConstants import const
from ProcessPool.pool import PyPool
from PyIO.pyMongoUtil import PyMongoUtil
from PyMemcached.memcacheUtil import MemcacheUtil
from QueueListener.listener import MyListener
from SpiderUtils.bloomFilter import SpiderBloomFilter
from SpiderUtils.enums import Language
from SpiderUtils.spider import Spider
from SpiderUtils.spiderStrategy import SpiderStrategy
from Statics.wordCount import WordCount
from Utils.logFactory import LogFactory
from SpiderUtils.SpiderMode.regexMode import Regex
from SpiderUtils.getUrls import UrlScan
from SpiderUtils.getWords import GetWords
import urllib, htmllib, formatter

logger = LogFactory.getlogger("test")

import re
from bs4 import BeautifulSoup
from PyIO.pyMongoUtil import PyMongoUtil  # NOTE(review): duplicate import


class Test:
    @staticmethod
    def testGetUrl():
        # Manual smoke test: reset the stores, fetch a page, scan its URLs.
        PyMongoUtil.clean()
        MemcacheUtil.clean()
        SpiderBloomFilter()
        html = GetWords.get_content("http://www.leakedin.com/tag/emailpassword-dump/")
        list = UrlScan.scanpage(html, "http://www.leakedin.com/tag/emailpassword-dump/", None)
        for l in list:
            # NOTE(review): module truncated here in this view — the loop
            # body is not visible.
#!/usr/bin/python
# coding=utf-8
from abc import ABCMeta, abstractmethod
import sys
from SpiderUtils.getWords import GetWords
from Utils.logFactory import LogFactory
from PyIO.pyMongoUtil import PyMongoUtil

reload(sys)
sys.setdefaultencoding('utf-8')

logger = LogFactory.getlogger("Mode")


class AbstractMode:
    """Base class for language modes.

    Template: get_words() downloads a page, extracts its words via the
    subclass hooks catch_words(html) and analyze(word), and persists them
    to MongoDB.
    """
    __metaclass__ = ABCMeta

    def __init__(self):
        pass

    def get_words(self, url):
        """Download *url*, extract and store its words, and return the HTML.

        Extraction errors are logged rather than raised, so one bad page
        does not stop the crawl; the raw HTML is returned regardless so the
        caller can still scan it for links.
        """
        html = GetWords.get_content(url)
        try:
            words = self.catch_words(html)
            wlist = []
            for wd in words:
                wlist.extend(self.analyze(wd))
            PyMongoUtil.write(url, wlist)
        except Exception as e:
            # FIX: 'except Exception, e' is Python-2-only syntax; the 'as'
            # form works on Python 2.6+ and 3.x alike.
            logger.error(url + " " + str(e))
        return html
from PyMemcached.lockModel import LockModel
from Consts.cacheKeyConstants import const
from PyMemcached.memcacheUtil import MemcacheUtil
from Utils.logFactory import LogFactory

logger = LogFactory.getlogger("ProcessCnt")


class ProcessCntIncrease(LockModel):
    """Increments the shared running-process counter in memcache while
    holding the process write lock."""

    def __init__(self):
        super(ProcessCntIncrease, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        # Runs under the lock (see LockModel.lock_and_do).
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        if cnt is None:
            # First process: the counter key does not exist yet.
            cnt = 1
        else:
            cnt += 1
        MemcacheUtil.set(const.PROCESSCNTKEY, cnt)
        logger.debug("process cnt:"+str(cnt))
        return True


class ProcessCntReduce(LockModel):
    """Decrements the shared running-process counter under the same lock."""

    def __init__(self):
        super(ProcessCntReduce, self).__init__(const.PROCESSWRITEKEY)

    def _do(self):
        # NOTE(review): no None check (unlike the increase side) — assumes
        # a decrement always follows an increment.
        cnt = MemcacheUtil.get(const.PROCESSCNTKEY)
        cnt -= 1
        # NOTE(review): module truncated here in this view — the write-back
        # to memcache is not visible.
#!/usr/bin/python
# coding=utf-8
from bs4 import BeautifulSoup
from PyMemcached.Locks.bloomFilterLock import BloomFilterLock
from Utils.logFactory import LogFactory
import htmllib, formatter, re

logger = LogFactory.getlogger("UrlScan")


class UrlScan:
    @staticmethod
    def scanpage(html, url, pattern=None):
        """Extract anchor links from *html* that match *pattern* (default
        'http') and have not been seen before, and return them as a list.

        The page's own *url* is recorded in the bloom filter first so it is
        not revisited.  Returns [] when parsing fails.

        FIX: the except path previously fell through and implicitly returned
        None, crashing callers that iterate the result; it now returns [].
        FIX: 'except Exception, e' -> 'as e'; local 'format' no longer
        shadows the builtin.
        """
        try:
            # Mark the page itself as visited.
            BloomFilterLock(url).lock_and_do()
            results = []
            fmt = formatter.AbstractFormatter(formatter.NullWriter())
            parser = htmllib.HTMLParser(fmt)
            parser.feed(html)
            for link in parser.anchorlist:
                if pattern is None:
                    pattern = 'http'
                r = re.findall(pattern, link)
                if r is None or len(r) == 0:
                    continue
                # Keep only links the bloom filter has not seen yet.
                if BloomFilterLock(link).lock_and_do():
                    results.append(link)
            return results
        except Exception as e:
            logger.error("catch urls exception url: " + url + " error: " + str(e))
            return []
#!/usr/bin/python # coding=utf-8 import re import sys import chardet from Utils.logFactory import LogFactory import urllib2 import zlib reload(sys) sys.setdefaultencoding('utf-8') logger = LogFactory.getlogger("GetWords") class GetWords: headers = headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'} @staticmethod def get_chinese(html): raw = GetWords.__get_unicode_content(html) words = re.findall(ur"[\u4e00-\u9fa5]+", raw) if words.__len__() == 0: raise Exception, "cannot find any words" return words @staticmethod def get_english(html): s = re.findall("\w+", str.lower(html)) return s
from abstractMode import AbstractMode
from Utils.logFactory import LogFactory
from SpiderUtils.getWords import GetWords
from WordSplit.splitAdapter import SplitAdapter

logger = LogFactory.getlogger("KoreanMode")


class KoreanMode(AbstractMode):
    """Language mode that extracts Korean text from HTML."""

    def __init__(self):
        super(KoreanMode, self).__init__()

    def catch_words(self, html):
        """Return the runs of Korean characters found in *html*."""
        return GetWords.get_korean(html)

    def analyze(self, word):
        """Korean runs are kept whole — no further segmentation is applied."""
        return [word]