Example #1
0
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size =  runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ') 

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp =  runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

   
    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Example #2
0
def extract():
    icmd = "insert into orignal (dbname, dbid, url, cms, headers, head) values (%s, %s, %s, %s, %s, %s)"
    runque = RedisQueueConnection('cms').conn
    size =  runque.qsize()
    print "total:" , size

    i = 0
    while i < size:
        item = runque.get()
        #runque.put(item)
        data = pickle.loads(item)
        ndata = []
        for item in data:
            if isinstance(item, unicode):
                    item = item.encode('utf8')
            ndata.append(item)

        #print ndata 
        if ndata[3]:
            print ndata[3], ndata[2]
        cur.execute(icmd , ndata)

        i += 1

 
    conn.commit()


    print "done"
    print runque.qsize()
Example #3
0
def test():
    runque = RedisQueueConnection('scan').conn
    #########runque.flushdb()
    size = runque.qsize()
    t = 0
    cnt = 0
    port = 0
    data = []
    if size == 0:
        return
    tmp = runque.get()
    runque.put(tmp)
    port, iph, ipl = tmp.split(' ')

    print port, size

    raw_input('confirm:')

    while runque.qsize() > 0:
        tmp = runque.get()
        try:
            t += len(tmp.split(' ')[-1].split(','))
            data.append(tmp)
        except:
            pass
        cnt += 1

    f = "china%s_%s_%s.txt" % (port, size, t)
    fp = open(f, 'w')
    for item in data:
        fp.write(item + '\n')

    fp.close()
Example #4
0
def inserturls():
    
    runque = RedisQueueConnection('extracturls').conn
    print runque.qsize()
    raw_input('flushdb?')
    runque.flushdb()
    urls = flist('urlstogetip.txt')
    for url in urls:
        runque.put(url)
    
    print runque.qsize()
Example #5
0
def test():
    runque = RedisQueueConnection('scan').conn

    size = runque.qsize()
    print size
    sleep(1)
    cnt = 0
    if size:
        while cnt < size:
            i = runque.get()
            print i
            runque.put(i)
            cnt += 1

        runque.flushdb()
    exit(0)

    f = open('seeds995k.txt')

    urls = f.read().strip().split('\n')

    if size == 0:
        i = 0
        st = time()
        for url in urls:
            runque.put(url)
Example #6
0
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)
        

        
    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])
    
    ol = 'ret_%s.txt' %  (port)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')
    
    slist = lines[ cur : len(lines) : cnt ]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp,'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

       
    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)
    
    scanque = RedisQueueConnection('scan').conn
    
    ipd = dict()
    for ip in ips:
        h  = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Example #7
0
def run():
    runque = RedisQueueConnection('running').conn
    #########runque.flushdb()
    size =  runque.qsize()

    print size
    raw_input('flush runing')
    runque.flushdb()
Example #8
0
def test():
    runque = RedisQueueConnection('robots').conn
    #########runque.flushdb()
    size =  runque.qsize()
    item = runque.get()
    runque.put(item)
    print pickle.loads(item)

    print size
    
    return
    raw_input('cofrim')
    s = flist('urlstogetrobots.txt')
    for url in s:
        runque.put(url) 


    print runque.qsize()
Example #9
0
def main():
    if len(sys.argv) != 4:
        print "wrong param"
        exit(0)

    port = int(sys.argv[1])
    cnt = int(sys.argv[2])
    cur = int(sys.argv[3])

    ol = 'ret_%s.txt' % (port)
    f = open(il)
    i = 0
    slist = list()
    lines = f.read().strip().split('\n')

    slist = lines[cur:len(lines):cnt]
    scancnt = len(slist)

    tmp = "scanips_%s_%s_%s" % (port, cnt, cur)
    if os.path.isfile(tmp):
        os.unlink(tmp)

    f = open(tmp, 'w')
    for ip in slist:
        f.write(ip + '\n')
    f.close()

    cmd = scancmd % (port, tmp, ol)
    print cmd
    run(cmd)
    # when done
    # make bitmap to store the scanret and then insert into redis
    print "run done, we collect the ips"
    tmp = open(ol).read().split('\n')[1:-2]
    ips = list()
    for ip in tmp:
        ips.append(ip.split()[3])

    alivecnt = len(ips)
    print "SCAN: %d ALIVE: %d " % (scancnt, alivecnt)

    scanque = RedisQueueConnection('scan').conn

    ipd = dict()
    for ip in ips:
        h = calc(ip)
        ht = ip.split('.')[-1]
        if h in ipd:
            ipd[h].append(ht)
        else:
            ipd[h] = [ht]
    for h in ipd:
        i = [port, h, ipd[h]]
        item = pickle.dumps(i)
        scanque.put(item)
    print "Insert into redis done "
    print "Total: %d" % (scanque.qsize())
Example #10
0
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size

    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Example #11
0
def test():
    runque = RedisQueueConnection('running').conn

    size = runque.qsize()
    print size



    exit(0)
    if size == 0:
        f = open('seeds995k.txt')
        for c in f:
            url = c.strip()
            runque.put(url)
Example #12
0
def show(name):
    runque = RedisQueueConnection(name).conn
    cnt = 0
    while cnt < runque.qsize():
        data = runque.get()
        runque.put(data)
        data = pickle.loads(data)
        
        seed =  data['seed']
        data = data['content'].replace('\r', '\n').replace('\n\n','\n').strip()
        if not data:
            continue
        if data.find('<') >= 0:
            #html page
            print seed
            continue
        
        robots = data.split('\n')
        print seed 
        print
        print "\n".join(robots)
        print  
        cnt += 1
Example #13
0
            con = req.content
            #print url, len(con)
            req.close()
        
    except:
        pass
    data =  (url, con)
    cb(data)
   

from time import time


def cb(data):
    """Accept a two-item ``(seed, content)`` result and discard it.

    The unpacking still checks that *data* holds exactly two items; the
    values themselves are not used.
    """
    (origin, body) = data

 
# Driver loop: keeps cycling through runque, spawning one httpget per url on
# the pool, and never terminates on its own.
cnt = 0
sst = time()  # start of the whole run; basis for the rate printed below
while True:
    # Take a url and put it straight back, so the queue is not consumed.
    url = runque.get()
    runque.put(url)
    st = time()  # not read again in the code visible here
    pool.spawn(httpget, url)
    et = time()
    cnt += 1

    # Every 10th spawn: spawns per second since start, then both queue sizes.
    if cnt % 10 == 0:
        print cnt / (et-sst) ,runque.qsize(), robotsque.qsize()
Example #14
0
class Crawler:
    """Fetch URLs from the 'running' queue on a pool of greenlets.

    ``run`` keeps taking a URL from the queue, puts it straight back, and
    spawns ``self.httpget`` for it on ``down_pool``.  Each download calls
    ``cb_httpget`` itself with ``(url, error, headers, content)``; errors
    are tallied on ``self.err`` by ``handle_error``.

    Every ``print`` uses the single-argument ``print(...)`` form, which
    parses the same way on Python 2 and Python 3.
    """

    def __init__(self, done_que):
        """Set up counters, the run queue and the download pool.

        done_que -- queue object stored as ``self.done_que``; the only
                    write to it in this class is currently commented out.
        """
        self.showpercounts = 100  # print a status line every N spawned urls
        # NOTE(review): not read in this class; httpget_requests enforces a
        # hard-coded 3 second limit instead.
        self.timeout = 5
        self.starttime = time()

        self.quit = False

        self.run_que = RedisQueueConnection('running').conn
        self.done_que = done_que
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()
        self.https_enable = 0

        # down method: self.httpget_requests | httpget_curl
        self.httpget = self.httpget_requests

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.totaldownsize = 0

        self.ip = getip()

    # callback function when greenlet of httpget run done
    def cb_httpget(self, data=None):
        """Handle one download result.

        data -- ``(seed, err, headers, content)`` or a falsy value, which
                is ignored.  A truthy ``err`` is passed to
                ``handle_error``.
        """
        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        # NOTE(review): the pickled payload is unused while the put below
        # stays commented out.
        dat = cPickle.dumps(data)
        #self.done_que.put_nowait(dat)

        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        """Print one status line: ip, elapsed time, counters, rates, errors."""
        # Zero elapsed or zero accumulated network time would divide by zero.
        spendtime = (time() - self.starttime) or 1
        nettime = self.totalnettime or 1
        nowh = "%d:" % (int(spendtime) // 3600) if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60)
        print("\n%s\t%s D:%-4d R:%-7d [QPS: %.2f  %.2f]  %s" % (
            self.ip, now, self.done, self.run_que.qsize(),
            self.done / spendtime, self.done / nettime, str(self.err)))

    def run(self):
        """Spawn downloads until ``self.quit`` is set by Ctrl-C.

        Sleeps 10 seconds and retries while the run queue is empty, and
        joins the pool before returning.
        """
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print("run que empty")
                    sleep(10)
                    continue
                # Take and put back, so the queue is cycled, not consumed.
                url = self.run_que.get()
                self.run_que.put(url)
                self.down_pool.spawn(self.httpget, url)
                self.done += 1
            except KeyboardInterrupt:
                print("Crawler recv quit singal")
                self.quit = True

        self.down_pool.join()
        print("Crawler over, quit")

    def handle_error(self, e, url):
        """Count the error text *e* for *url* in the matching bucket.

        Only the most recent failing url is remembered.  Matching uses
        ``in``: the previous ``find(...) > 0`` missed a match at index 0,
        and the bare ``e.find('Connection aborted')`` was truthy for -1,
        which sent almost every message to ``conntimeout``.
        """
        self.err.lasterrurl = url
        if 'DNSError' in e:
            self.err.dns += 1
        elif 'reset' in e:  # Connection reset
            self.err.reset += 1
        elif 'Max retries' in e or 'Connection aborted' in e:
            self.err.conntimeout += 1
        elif 'refused' in e:  # Connection refused
            self.err.refuse += 1
        else:
            self.err.others += 1
            print("Error %s %s" % (url, e))

    # requests is better than curl in tests
    def httpget_requests(self, url):
        """Download *url* with ``requests`` and pass the result on.

        Calls ``cb_httpget`` with ``(url, "", headers, content)`` on
        success, ``(url, error_text, None, None)`` on an exception, and
        ``None`` when the 3 second limit expires.  Time spent is added to
        ``self.totalnettime``.
        """
        data = None
        st = time()
        con = ""
        err = ""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        }

        res = None
        done = False
        try:
            # The second argument False makes the timeout silent: the
            # block is simply left and ``done`` stays False.
            with gevent.Timeout(3, False):
                res = requests.get(url, headers=headers)
                con = res.content
                done = True
        except Exception as exc:
            # KeyboardInterrupt is not an Exception subclass, so it still
            # propagates.  An empty message falls back to the class name
            # so the failure is not mistaken for success.
            err = str(exc) or exc.__class__.__name__
            # as for spawn, no callback, so the result is delivered below
            data = (url, err, None, None)
        finally:
            # ``is not None`` rather than truthiness: a Response is falsy
            # for 4xx/5xx.  This also closes the response after a timeout.
            if res is not None:
                res.close()

        self.totalnettime += time() - st
        if done:
            data = (url, err, res.headers, con)

        self.cb_httpget(data)
Example #15
0
            con = req.content
            #print url, len(con)
            req.close()

    except:
        pass
    data = (url, con)
    cb(data)


from time import time


def cb(data):
    """Accept a two-item ``(seed, content)`` result and discard it.

    The unpacking still checks that *data* holds exactly two items; the
    values themselves are not used.
    """
    (origin, body) = data


# Driver loop: keeps cycling through runque, spawning one httpget per url on
# the pool, and never terminates on its own.
cnt = 0
sst = time()  # start of the whole run; basis for the rate printed below
while True:
    # Take a url and put it straight back, so the queue is not consumed.
    url = runque.get()
    runque.put(url)
    st = time()  # not read again in the code visible here
    pool.spawn(httpget, url)
    et = time()
    cnt += 1

    # Every 10th spawn: spawns per second since start, then both queue sizes.
    if cnt % 10 == 0:
        print cnt / (et - sst), runque.qsize(), robotsque.qsize()
Example #16
0
def rmdb(test):
    runque = RedisQueueConnection(test).conn
    print runque.qsize()
    raw_input('yes?')
    runque.flushdb()
Example #17
0
def getsize(name):
    runque = RedisQueueConnection(name).conn
    print runque.qsize()
    i = runque.get()
    runque.put(i)
    print i
Example #18
0
class Crawler:
    """Fetch ``/robots.txt`` for every URL taken from the 'running' queue.

    ``run`` takes URLs from the queue and spawns ``self.httpget`` for each
    on ``down_pool``.  Successful, non-empty responses are pickled as
    ``{'seed', 'headers', 'content'}`` and put on the in-process
    ``tempque``; errors are tallied on ``self.err`` by ``handle_error``.

    Every ``print`` uses the single-argument ``print(...)`` form, which
    parses the same way on Python 2 and Python 3.
    """

    def __init__(self):
        """Set up counters, queues, the download pool and request headers."""
        self.showpercounts = 10  # print a status line every N stored results
        self.timeout = 20  # seconds allowed per download

        self.poolsize = 100
        self.down_pool = Pool(size=self.poolsize)

        self.run_que = RedisQueueConnection('running').conn
        self.doneque = RedisQueueConnection('robots').conn
        # Results are put here; nothing in this class reads them back.
        self.tempque = Queue()
        self.done = 1
        self.sent = 0
        self.quit = False

        self.err = Error()
        self.https_enable = 0

        # down method: self.httpget_requests | httpget_curl
        self.httpget = self.httpget_requests

        self.totalnettime = 0
        self.totaldownsize = 0
        self.starttime = time()

        self.ip = getip()
        self.headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        }

    # callback function when greenlet of httpget run done
    def cb_httpget(self, data=None):
        """Handle one download result.

        data -- ``(seed, err, headers, content)`` or a falsy value, which
                is ignored.  A truthy ``err`` is passed to
                ``handle_error``; empty content is dropped; anything else
                is pickled onto ``tempque``.
        """
        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        # content is robots.txt, normally it's pure text
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):
        """Print one status line: ip, counters, queue sizes, rates, errors."""
        # Zero elapsed or zero accumulated network time would divide by zero.
        spendtime = (time() - self.starttime) or 1
        nettime = self.totalnettime or 1
        print("\n%s D:%-4d  DT: %4d R:%-7d [QPS: %.2f  %.2f]  %s" % (
            self.ip, self.done, self.doneque.qsize(), self.run_que.qsize(),
            self.done / spendtime, self.done / nettime, str(self.err)))

    def run(self):
        """Spawn downloads until ``self.quit`` is set by Ctrl-C.

        Sleeps 60 seconds when the run queue looks empty, and joins the
        pool before returning.
        """
        while self.quit == False:
            try:
                if self.run_que.qsize() == 0:
                    print("run que empty")
                    sleep(60)

                url = self.run_que.get()
                self.down_pool.spawn(self.httpget, url)
                self.sent += 1
            except KeyboardInterrupt:
                print("Crawler recv quit singal")
                self.quit = True

        self.down_pool.join()
        print("Crawler over, quit")

    def handle_error(self, e, url):
        """Count the error text *e* for *url* in the matching bucket.

        Only the most recent failing url is remembered.  Matching uses
        ``in``: the previous ``find(...) > 0`` missed a match at index 0,
        and the bare ``e.find('Connection aborted')`` was truthy for -1,
        which sent almost every message to ``conntimeout``.
        """
        self.err.lasterrurl = url
        if 'DNSError' in e:
            self.err.dns += 1
        elif 'reset' in e:  # Connection reset
            self.err.reset += 1
        elif 'Max retries' in e or 'Connection aborted' in e:
            self.err.conntimeout += 1
        elif 'refused' in e:  # Connection refused
            self.err.refuse += 1
        else:
            self.err.others += 1
            print("Error %s %s" % (url, e))

    # requests is better than curl in tests
    def httpget_requests(self, url):
        """Download ``url + '/robots.txt'`` and store a successful result.

        Only a 200 response with non-empty content is stored, pickled onto
        ``tempque``.  Exceptions are tallied through ``handle_error``.  A
        timeout or a non-200 status stores nothing.  The stored-result
        steps are inlined here rather than calling ``cb_httpget``.
        """
        data = None
        st = time()
        con = ""
        err = ""

        res = None
        done = False
        try:
            # The second argument False makes the timeout silent: the
            # block is simply left and ``done`` stays False.
            with gevent.Timeout(self.timeout, False):
                url = url + '/robots.txt'
                res = requests.get(url, headers=self.headers)
                if res.status_code == 200:
                    con = res.content
                    done = True
        except Exception as exc:
            # KeyboardInterrupt is not an Exception subclass, so it still
            # propagates.  An empty message falls back to the class name;
            # it used to fall through to len(None) below.
            err = str(exc) or exc.__class__.__name__
            data = (url, err, None, None)
        finally:
            # ``is not None`` rather than truthiness: a Response is falsy
            # for 4xx/5xx.  This also closes the response after a timeout.
            if res is not None:
                res.close()

        self.totalnettime += time() - st
        if done:
            data = (url, err, res.headers, con)

        if not data:
            return
        seed, err, headers, content = data

        if err:
            self.handle_error(err, seed)
            return
        if len(content) <= 0:
            return

        data = {'seed': seed, 'headers': headers, 'content': content}

        # content is robots.txt, normally it's pure text
        dat = cPickle.dumps(data)
        self.tempque.put(dat)
        self.done += 1
        if self.done % self.showpercounts == 0:
            spendtime = (time() - self.starttime) or 1
            print("\n%s D:%-4dDT:%4d R:%-7d [QPS: %.2f  %.2f]  %s" % (
                self.ip, self.done, self.doneque.qsize(), self.run_que.qsize(),
                self.done / spendtime, self.sent / spendtime, str(self.err)))