Example 1
def test():
    runque = RedisQueueConnection('scan').conn

    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1

        runque.flushdb()
    exit(0)

    f = open('seeds995k.txt')

    urls = f.read().strip().split('\n')

    if size == 0:
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
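
Every example in this listing depends on a RedisQueueConnection helper imported from redis_inc, which never appears in the snippets themselves. A minimal sketch of what it is assumed to look like: a FIFO queue over a Redis list exposing the qsize/get/put/flushdb calls used throughout. The class body is an assumption, not the project's actual implementation.

import redis

class RedisQueueConnection(object):
    # assumed stand-in for redis_inc.RedisQueueConnection
    def __init__(self, name, host='127.0.0.1', port=6379, db=0):
        self.name = name
        self.conn = self                      # the examples always go through .conn
        self.r = redis.StrictRedis(host=host, port=port, db=db)

    def qsize(self):
        return self.r.llen(self.name)

    def put(self, item):
        self.r.rpush(self.name, item)

    def get(self):
        return self.r.lpop(self.name)         # the examples treat get() as non-blocking

    def flushdb(self):
        # the real helper may flush the whole redis db; here we only drop this queue's key
        self.r.delete(self.name)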
Example 2
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)

    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])
    
    ol = 'ret_%s.txt' % (port)
    # il, the input ip-list file, is expected to be defined at module level
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')
    
    slist = lines[ cur : len(lines) : cnt ]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp,'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)
    
    scanque = RedisQueueConnection('scan').conn
    
    ipd = dict()
    for ip in ips:
        h  = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Example 3
    def __init__(self):

        self.showpercounts = 10
        self.timeout = 20

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.run_que = RedisQueueConnection('running').conn
        self.doneque = RedisQueueConnection('robots').conn
        self.tempque = Queue()
        self.done = 1
        self.sent = 0
        self.quit = False

        self.err = Error()
        self.https_enable = 0

        self.httpget = self.httpget_requests  # down method self.httpget_requests | httpget_curl

        self.totalnettime = 0
        self.totaldownsize = 0
        self.starttime = time()

        self.ip = getip()
        self.headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }
Example 4
def insert():
    
    runque = RedisQueueConnection('extracturls').conn
    urls = flist('urlstogetrobots1.txt')[100000:300000]
    print len(urls)
    for url in urls:
        runque.put(url)
Example 5
    def __init__(self, done_que):

        self.showpercounts = 100
        self.timeout = 5
        self.starttime = time()

        self.quit = False

        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()
        self.https_enable = 0 

        self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.totaldownsize = 0
        
        self.ip = getip()
Example 6
def extract():
    icmd = "insert into orignal (dbname, dbid, url, cms, headers, head) values (%s, %s, %s, %s, %s, %s)"
    runque = RedisQueueConnection('cms').conn
    size =  runque.qsize()
    print "total:" , size

    i = 0
    while i < size:
        item = runque.get()
        #runque.put(item)
        data = pickle.loads(item)
        ndata = []
        for item in data:
            if isinstance(item, unicode):
                item = item.encode('utf8')
            ndata.append(item)

        #print ndata 
        if ndata[3]:
            print ndata[3], ndata[2]
        cur.execute(icmd , ndata)

        i += 1

 
    conn.commit()


    print "done"
    print runque.qsize()
Example 7
def run():
    runque = RedisQueueConnection('running').conn
    #########runque.flushdb()
    size =  runque.qsize()

    print size
    raw_input('flush running?')
    runque.flushdb()
Example 8
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)

    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])

    ol = 'ret_%s.txt' % (port)
    # il, the input ip-list file, is expected to be defined at module level
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')

    slist = lines[cur:len(lines):cnt]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp, 'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)

    scanque = RedisQueueConnection('scan').conn

    ipd = dict()
    for ip in ips:
        h = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Example 9
def daemon(tempque):
    st = random.random() + 0.5
    doneque = RedisQueueConnection('robots').conn

    while True:
        if not tempque.empty():
            dat = tempque.get()
            doneque.put(dat)
        else:
            sleep(st)
Example 10
def inserturls():
    
    runque = RedisQueueConnection('extracturls').conn
    print runque.qsize()
    raw_input('flushdb?')
    runque.flushdb()
    urls = flist('urlstogetip.txt')
    for url in urls:
        runque.put(url)
    
    print runque.qsize()
Example 11
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size

    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Example 12
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size = runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ')

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp = runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Example 13
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size



    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Example 14
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size =  runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ') 

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp =  runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

   
    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Example 15
def test():
    runque = RedisQueueConnection('robots').conn
    #########runque.flushdb()
    size =  runque.qsize()
    item = runque.get()
    runque.put(item)
    print pickle.loads(item)

    print size
    
    return
    raw_input('confirm')
    s = flist('urlstogetrobots.txt')
    for url in s:
        runque.put(url) 


    print runque.qsize()
Example 16
    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512M 536870912
        self.spend = 0

        #queue on which the daemon receives downloaded website info
        self.done_que = done_que
        #queue for the filtered urls extracted from the webpages
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute(
            "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
        )

        #when the ctrl+c signal is received, finish the pending extraction jobs and then quit
        self.quit = False
Example 17
    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912 # 512M 536870912
        self.spend = 0
        
        #queue on which the daemon receives downloaded website info
        self.done_que = done_que
        #queue for the filtered urls extracted from the webpages
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)")
       
        #when the ctrl+c signal is received, finish the pending extraction jobs and then quit
        self.quit = False
Example 18
def show(name):
    runque = RedisQueueConnection(name).conn
    cnt = 0
    while cnt < runque.qsize():
        data = runque.get()
        runque.put(data)
        cnt += 1   # count every rotated item so skipped entries cannot loop forever
        data = pickle.loads(data)
        
        seed =  data['seed']
        data = data['content'].replace('\r', '\n').replace('\n\n','\n').strip()
        if not data:
            continue
        if data.find('<') >= 0:
            #html page
            print seed
            continue
        
        robots = data.split('\n')
        print seed 
        print
        print "\n".join(robots)
        print
Example 19
class Daemon:

    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912 # 512M 536870912
        self.spend = 0
        
        #queue on which the daemon receives downloaded website info
        self.done_que = done_que
        #queue for the filtered urls extracted from the webpages
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)")
       
        #when the ctrl+c signal is received, finish the pending extraction jobs and then quit
        self.quit = False
 
    def getdbname(self, create=False):
        
        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"
        dbsize = 0
 
        ip = getip()
        findname = "%s%s" % (ip, suf)

        if create == True:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print "Create db: ", lastname
            return os.path.join(path, lastname)
        
        fnames = os.listdir(path)

        last = 0
        lastname = ""
        for fname in fnames:
            if fname.endswith(findname):
                fnow = fname.split('_')[1]
                fnown = int(time.mktime(time.strptime(fnow, tf)))
                if fnown > last:
                    last = fnown
                    lastname = fname
        #could not find an existing db file, so create one
        if not last:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            print "Create db: ", lastname
            self.dbsize = 0
        else:
            print "Reuse the last db: ", lastname
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size 
            
        return os.path.join(path, lastname)


    def geturls(self, seed, content):
        urls = []
        returls = []
        if not content  or len(content) == 0:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            returls = self.urlfilter.filter_urls(seed,urls)
        except:
            pass
        return returls


    def run(self):
        #background job: drain the done queue and store the pages
        sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit == True:
                        #url extraction runs faster than the crawler, so re-check once before quitting
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        print "Daemon run done and quit successfuly"
                        exit(0)
                    
                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue

                data = cPickle.loads(self.done_que.get())

                seed  = data['seed']
                content = data['content']
                headers = str(data['headers'])
                
                urls = self.geturls(seed, content)
                
                #put the extracted urls to urls_que
                for url in urls:
                    self.urls_que.put(url)
 
                #compress at level 1: a good enough ratio with high speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += ( len(gziphtml) + len(seed) + len(headers) )

                self.conn.execute("insert into mainpages (url,headers,content) values (?,?,?)", (seed, headers, gziphtml))
                
                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()
                    
                    print "\n%s\n\tExtract done:%d todo:%d size:%dM" % \
                         (self.ip, self.cnt, self.done_que.qsize(),  self.dbsize/1024/1024)
                    
                
                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = sqlite3.connect(self.fname)
                        self.conn.execute("create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)")
        
            except KeyboardInterrupt:
                print "Daemon recv quit signal, waiting for queue empty"
                self.quit = True
            except Exception as e:
                print e
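
The pages this daemon writes can be read back by decompressing the content BLOBs with zlib; a small sketch (the db path below is only an example):

import sqlite3, zlib

def dump_pages(dbfile):
    # read back pages stored by Daemon; content was compressed with zlib level 1
    conn = sqlite3.connect(dbfile)
    for url, headers, blob in conn.execute("select url, headers, content from mainpages"):
        html = zlib.decompress(str(blob))
        print url, len(html)
    conn.close()

# dump_pages('/work/db/sitedata_20150101-000000_1.2.3.4.db')   # example path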
Example 20
#coding:utf8
import re, os
import requests
from db_driver import db_driver
import db_file
from redis_inc import RedisQueueConnection
import cPickle as pickle
import multiprocessing as mp

cmsque = RedisQueueConnection('cms').conn

pscript = re.compile(u"<script.*?>(.*?)</script>", re.S)
pstyle = re.compile(u"<style.*?>(.*?)</style>", re.S)
ptitle = re.compile(u"<title>(.*?)</title>", re.S)

pby = re.compile(u'powered by (.*?)["\'</>]', re.S)
headtag = "</head>"
bodytag = "<body>"

htmltag = re.compile(r'<[^>]+>', re.S)


# check the version that follows the cms name
# dz x2.5
# phpcms v9
# we assume the version string following the cms name must contain digits
def body_version(poweredby):

    l = poweredby.strip().split()
    if len(l) == 1:
        return poweredby
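
body_version() is cut off above. A hedged sketch of how the pby pattern and such a helper might be combined to pull a "powered by" credit out of a page; get_poweredby is a hypothetical name, not from the source:

def get_poweredby(html):
    # drop scripts and styles so only visible text is searched
    html = pscript.sub('', html)
    html = pstyle.sub('', html)
    m = pby.search(html.lower())
    if not m:
        return None
    return m.group(1).strip()

# e.g. get_poweredby('... Powered by Discuz! X2.5 </div>') -> 'discuz! x2.5'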
Example 21
import gevent
from gevent import monkey
import sys
from gevent.pool import Pool
import requests
from redis_inc import RedisQueueConnection

monkey.patch_all(thread=False)




size = 100

pool = Pool(size)

runque = RedisQueueConnection('running').conn
robotsque = RedisQueueConnection('robots').conn



def httpget(url):
    url = url + "/robots.txt"
    con = ""
    try:
        with gevent.Timeout(2) as timeout:
            req = requests.get(url,timeout=(2,2))
            con = req.content
            #print url, len(con)
            req.close()

    except:
        pass
Example 22
#!/bin/python
# this script is used to test urls.txt in some tests
from pybloomfilter import BloomFilter
import sqlite3
import multiprocessing as mp
import zlib
import sys, os, getopt
import hashlib
import time
import re
import Queue
from redis_inc import RedisQueueConnection

#!!! db=1 use db1 to store the seeds
r = RedisQueueConnection('test').conn

cmd = "select id from mainpages"


#worker is a single process for each cpu on each machine
def worker(queue, lock, cpuid, outque):
    cpuurls = set()
    innerque = Queue.Queue()

    while queue.qsize() > 0:
        db = queue.get()
        lock.acquire()
        print "CPU-%s:runing : %s" % (cpuid, db)
        lock.release()
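
The snippet stops inside worker(). A hedged sketch of how such workers might be launched, one per cpu, over the sitedata db files; the directory, the queue feeding and the launch_workers name are all assumptions:

def launch_workers(dbdir='/work/db'):
    queue = mp.Queue()
    outque = mp.Queue()
    lock = mp.Lock()

    # feed every sitedata sqlite file into the shared work queue
    for fname in os.listdir(dbdir):
        if fname.endswith('.db'):
            queue.put(os.path.join(dbdir, fname))

    procs = []
    for cpuid in range(mp.cpu_count()):
        p = mp.Process(target=worker, args=(queue, lock, cpuid, outque))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()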
Example 23
class Daemon:
    def __init__(self, done_que):
        self.cnt = 0
        self.showpercounts = 100
        self.dbsize = 0
        self.dbsizelimit = 536870912  # 512M 536870912
        self.spend = 0

        #queue on which the daemon receives downloaded website info
        self.done_que = done_que
        #queue for the filtered urls extracted from the webpages
        self.urls_que = RedisQueueConnection('extracturls').conn
        self.urlfilter = Filter()

        self.ip = getip()
        self.fname = self.getdbname()
        self.conn = sqlite3.connect(self.fname)
        self.conn.execute(
            "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
        )

        #when the ctrl+c signal is received, finish the pending extraction jobs and then quit
        self.quit = False

    def getdbname(self, create=False):

        path = "/work/db"
        tf = "%Y%m%d-%H%M%S"
        pre = "sitedata"
        suf = ".db"
        dbsize = 0

        ip = getip()
        findname = "%s%s" % (ip, suf)

        if create == True:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            self.dbsize = 0
            print "Create db: ", lastname
            return os.path.join(path, lastname)

        fnames = os.listdir(path)

        last = 0
        lastname = ""
        for fname in fnames:
            if fname.endswith(findname):
                fnow = fname.split('_')[1]
                fnown = int(time.mktime(time.strptime(fnow, tf)))
                if fnown > last:
                    last = fnown
                    lastname = fname
        #could not find an existing db file, so create one
        if not last:
            date = time.strftime(tf, time.localtime())
            lastname = "_".join([pre, date, ip]) + suf
            print "Create db: ", lastname
            self.dbsize = 0
        else:
            print "Reuse the last db: ", lastname
            self.dbsize = os.stat(os.path.join(path, lastname)).st_size

        return os.path.join(path, lastname)

    def geturls(self, seed, content):
        urls = []
        returls = []
        if not content or len(content) == 0:
            return []
        try:
            urls = re.findall(self.urlfilter.urlpatern, content)
            returls = self.urlfilter.filter_urls(seed, urls)
        except:
            pass
        return returls

    def run(self):
        #background job: drain the done queue and store the pages
        sleep(2)
        while True:
            try:
                if self.done_que.empty():
                    if self.quit == True:
                        #url extraction runs faster than the crawler, so re-check once before quitting
                        sleep(1)
                        if not self.done_que.empty():
                            continue
                        print "Daemon run done and quit successfuly"
                        exit(0)

                    #print "Downloaded queue empty, wait crawler ..."
                    sleep(10)
                    continue

                data = cPickle.loads(self.done_que.get())

                seed = data['seed']
                content = data['content']
                headers = str(data['headers'])

                urls = self.geturls(seed, content)

                #put the extracted urls to urls_que
                for url in urls:
                    self.urls_que.put(url)

                #compress at level 1: a good enough ratio with high speed
                gziphtml = sqlite3.Binary(gzip.zlib.compress(content, 1))
                self.dbsize += (len(gziphtml) + len(seed) + len(headers))

                self.conn.execute(
                    "insert into mainpages (url,headers,content) values (?,?,?)",
                    (seed, headers, gziphtml))

                self.cnt += 1
                if self.cnt % self.showpercounts == 0:
                    self.conn.commit()

                    print "\n%s\n\tExtract done:%d todo:%d size:%dM" % \
                         (self.ip, self.cnt, self.done_que.qsize(),  self.dbsize/1024/1024)

                    if self.dbsize > self.dbsizelimit:
                        self.fname = self.getdbname(True)
                        self.conn.close()
                        self.conn = sqlite3.connect(self.fname)
                        self.conn.execute(
                            "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
                        )

            except KeyboardInterrupt:
                print "Daemon recv quit signal, waiting for queue empty"
                self.quit = True
            except Exception as e:
                print e
Example 24
import gevent
from gevent import monkey
import sys
from gevent.pool import Pool
import requests
from redis_inc import RedisQueueConnection

monkey.patch_all(thread=False)

size = 100

pool = Pool(size)

runque = RedisQueueConnection('running').conn
robotsque = RedisQueueConnection('robots').conn


def httpget(url):
    url = url + "/robots.txt"
    con = ""
    try:
        with gevent.Timeout(2) as timeout:
            req = requests.get(url, timeout=(2, 2))
            con = req.content
            #print url, len(con)
            req.close()

    except:
        pass
    data = (url, con)
    cb(data)
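
cb() is referenced but never defined in this snippet; presumably it hands the fetched robots.txt to the robots queue. A hypothetical version, together with the pool loop that would drive httpget (both are assumptions, mirroring the seed/content dicts used elsewhere in this listing):

import cPickle as pickle

def cb(data):
    # push (url, robots.txt body) onto the robots queue; skip empty fetches
    url, con = data
    if con:
        robotsque.put(pickle.dumps({'seed': url, 'content': con}))

def main():
    while runque.qsize() > 0:
        url = runque.get()
        pool.spawn(httpget, url)
    pool.join()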
Example 25
class Crawler:

    def __init__(self, done_que):

        self.showpercounts = 100
        self.timeout = 5
        self.starttime = time()

        self.quit = False

        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()
        self.https_enable = 0 

        self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.totaldownsize = 0
        
        self.ip = getip()

    #callback invoked when an httpget greenlet finishes
    def cb_httpget(self, data = None):

        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err,seed)
            return

        data={'seed':seed,'headers':headers,'content':content}
        
        dat = cPickle.dumps(data)
        #self.done_que.put_nowait(dat)

        #print "done", seed
        if self.done % self.showpercounts == 0:
            self.out(seed)


    def out(self, seed):

        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )
        print "\n%s\t%s D:%-4d R:%-7d [QPS: %.2f  %.2f]  %s" % (self.ip, now, (self.done), self.run_que.qsize(), \
            self.done/spendtime, self.done/self.totalnettime , str(self.err) )
    
    
    def run(self):

        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print "run que empty"
                    sleep(10)
                    continue
                url = self.run_que.get()
                self.run_que.put(url)
                #self.down_pool.apply_cb(self.httpget, (url,), callback=self.cb_httpget)
                # is spawn faster?
                #url = 'http://www.sdust.edu.cn'
                self.down_pool.spawn(self.httpget, url)
                self.done += 1
            except KeyboardInterrupt:
                print "Crawler recv quit singal"
                self.quit = True

        self.down_pool.join()
        print "Crawler over, quit"

    def handle_error(self, e, url):

        self.err.lasterrurl = url
        # do not record every err url, only keep the latest one to show
        if e.find('DNSError') > 0 :
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif e.find('reset') > 0 :#Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:  # connection timeout / aborted
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif e.find('refused') > 0: #Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)

        else:
            self.err.others +=1
            #self.err.rothers.append(url)
            print "Error", url, e

    
    # requests is better than curl in tests
    def httpget_requests(self, url):
       
        #return data
        data = None 
        st = time()
        con = ""
        e = ""
        res_headers = ""
        headers = {
                    'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
                    'Accept-Encoding':'gzip,deflate',
                    'Connection':'close',
                    'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
                }


        res = None
        done = False
        try:
            with gevent.Timeout(3, False) as timeout:
                #req.max_redirects = 2
                res = requests.get(url, headers = headers )
                con = res.content
                res.close()
                done = True
        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()

            #spawn has no callback, so we invoke the callback ourselves
            data = (url, e, None, None)
            #return url,e,None,None

        et = time()
        self.totalnettime += (et-st)
        #spawn
        if done:
            data = (url, e, res.headers, con)
            
        self.cb_httpget(data)
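
Crawler and the Daemon class shown earlier both take the same done_que, so they are presumably wired together along these lines; the sketch below assumes a gevent queue and is only one possible arrangement:

import gevent
from gevent.queue import Queue

def main():
    done_que = Queue()            # shared between crawler and url-extracting daemon
    crawler = Crawler(done_que)
    daemon = Daemon(done_que)
    gevent.spawn(daemon.run)      # daemon drains done_que in the background
    crawler.run()                 # crawler fills it in the foreground

if __name__ == '__main__':
    main()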
Example 26
ThreadRunning = True
serverips = ['127.0.0.1']
MAX_RUNNING_COUNT = 2**19  #  2**19 = 524 288
MAX_QPS = 1000  # 700+
showpercounts = 1000  #print progress after every 1000 queries sent

s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind(('', 5310))
cur = 0
s.settimeout(0.1)

done_sites_fname = 'done_sites.bin'
bfdone = BloomFilter.open(done_sites_fname)

urlsque = RedisQueueConnection('extracturls').conn
runque = RedisQueueConnection('running').conn

#count of queries sent out, and of successful query urls
querysent = 0
querysuc = 0

#dnslocalserver = "dns_server_ip"

dns_server_died = False


def insert_redis(data):
    #extract and re-form the url, then put it onto runque

    #domain format:
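
BloomFilter.open() expects done_sites.bin to exist already. With pybloomfilter it can be created once up front, roughly like this; capacity and error rate are example values:

from pybloomfilter import BloomFilter

# one-off creation of the shared 'done sites' filter backed by an mmap file
bf = BloomFilter(10000000, 0.001, 'done_sites.bin')
bf.add('www.example.com')
print 'www.example.com' in bf     # True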
Example 27
#coding:utf8
import re, os
import redis_inc
from redis_inc import RedisQueueConnection
import cPickle as pickle
import multiprocessing as mp
import requests
import mysql_inc

cmsque = RedisQueueConnection('robots').conn
tempset = redis_inc.RedisConnection('test').conn

conn, cur = mysql_inc.gethandler()

cms = dict()
cms['disallow'] = dict()
cd = cms['disallow']

cd['dedecms'] = ['ad_js.php', 'mytag_js.php', 'feedback_js.php']
cd['phpcms'] = ['phpcms', 'phpsso_server']
cd['wordpress'] = ['wp-admin', 'wp-content', 'wp-includes']
cd['xiaocms'] = ['Print.aspx']
cd['discuz'] = ['forum.php?mod=']
cd['yiqicms'] = ['captcha']
cd['ecshop'] = ['goods_script.php']
cd['empirecms'] = ['e/enews']  # original: /e/enew/

forks = list()
for key in cms.keys():
    disk = cms[key]
    for cmsk in disk.keys():
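
The snippet breaks off while building the per-cms worklist. A hedged sketch of how the disallow fingerprints above might be matched against a robots.txt body; match_cms is a hypothetical helper, not from the source:

def match_cms(robots_txt):
    # return the first cms whose known disallow paths all appear in the robots.txt
    body = robots_txt.lower()
    for name, paths in cms['disallow'].items():
        if all(p.lower() in body for p in paths):
            return name
    return None

# e.g. match_cms('Disallow: /wp-admin\nDisallow: /wp-content\nDisallow: /wp-includes') -> 'wordpress'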
Example 28
def getsize(name):
    runque = RedisQueueConnection(name).conn
    print runque.qsize()
    i = runque.get()
    runque.put(i)
    print i
Example 29
class Crawler:
    def __init__(self):

        self.showpercounts = 10
        self.timeout = 20

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.run_que = RedisQueueConnection('running').conn
        self.doneque = RedisQueueConnection('robots').conn
        self.tempque = Queue()
        self.done = 1
        self.sent = 0
        self.quit = False

        self.err = Error()
        self.https_enable = 0

        self.httpget = self.httpget_requests  # down method self.httpget_requests | httpget_curl

        self.totalnettime = 0
        self.totaldownsize = 0
        self.starttime = time()

        self.ip = getip()
        self.headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }

    #callback invoked when an httpget greenlet finishes
    def cb_httpget(self, data=None):
        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        #content is robots.txt, normally it's pure text
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):

        spendtime = time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        print "\n%s D:%-4d  DT: %4d R:%-7d [QPS: %.2f  %.2f]  %s" % (self.ip, self.done,self.doneque.qsize(), self.run_que.qsize(), \
            self.done/spendtime, self.done/self.totalnettime , str(self.err) )

    def run(self):

        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print "run que empty"
                    sleep(60)
                    continue

                url = self.run_que.get()
                self.down_pool.spawn(self.httpget, url)
                self.sent += 1
            except KeyboardInterrupt:
                print "Crawler recv quit singal"
                self.quit = True

        self.down_pool.join()
        print "Crawler over, quit"

    def handle_error(self, e, url):

        self.err.lasterrurl = url
        # do not record every err url, only keep the latest one to show
        if e.find('DNSError') > 0:
            self.err.dns += 1
            #self.err.rdns.append(url)
        elif e.find('reset') > 0:  #Connection reset
            self.err.reset += 1
            #self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:  # connection timeout / aborted
            self.err.conntimeout += 1
            #self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  #Connection refused
            self.err.refuse += 1
            #self.err.rrefuse.append(url)

        else:
            self.err.others += 1
            #self.err.rothers.append(url)
            print "Error", url, e

    # requests is better than curl in tests
    def httpget_requests(self, url):

        #return data
        data = None
        st = time()
        con = ""
        e = ""
        res_headers = ""

        res = None
        done = False
        try:
            with gevent.Timeout(self.timeout, False) as timeout:
                url = url + '/robots.txt'
                res = requests.get(url, headers=self.headers)
                if res.status_code == 200:
                    con = res.content
                    done = True
                res.close()
        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()

            data = (url, e, None, None)

        et = time()
        self.totalnettime += (et - st)
        #spawn
        if done:
            data = (url, e, res.headers, con)

        #self.cb_httpget(data)
        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        #content is robots.txt, normally it's pure text
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            #self.out(seed)
            spendtime = time() - self.starttime
            spendtime = 1 if spendtime == 0 else spendtime
            print "\n%s D:%-4dDT:%4d R:%-7d [QPS: %.2f  %.2f]  %s" % (self.ip, self.done,self.doneque.qsize(), self.run_que.qsize(), \
                self.done/spendtime, self.sent/spendtime , str(self.err) )
Example 30
def rmdb(test):
    runque = RedisQueueConnection(test).conn
    print runque.qsize()
    raw_input('yes?')
    runque.flushdb()