Ejemplo n.º 1
0
def main():
    """Prompt for the crawl settings, seed the URL queue and start the workers.

    Reads the start URL, the number of threads, the depth and an optional
    search keyword from standard input. The shared state is published through
    the module globals ``b``, ``ct``, ``w``, ``s``, ``urlq`` and ``want``
    (``url_parse`` reads ``ct``, ``want`` and ``s``).

    When the start page cannot be fetched (``url_parse`` returns ``0``) a
    message is written to stderr and no thread is started.
    """
    import sys  # local import: only used for the failure message below

    # ``o`` is declared global but never assigned in this function.
    global b, o, ct, w, s, urlq, want
    b = ctime()
    stamp = b.split()
    # Fields 1 and 2 of the timestamp are joined into the suffix used for the
    # names built in url_parse (presumably month + day if ctime is
    # time.ctime -- confirm against the file's imports).
    ct = stamp[1] + stamp[2]
    urlq = Slist()
    s = Save()
    u = raw_input('Please input the url:')
    t = int(raw_input('Please input the number of thread you want: '))
    n = int(raw_input('Please input the depth you want: '))
    w = raw_input('Do you want do search some urls you want?(y/n): ')
    # Any answer other than 'y' disables keyword search; 0 is the sentinel
    # url_parse checks for.
    if w == 'y':
        want = raw_input('What do you want to search?: ')
    else:
        want = 0

    # Add the scheme only when it is missing, so input that already starts
    # with 'http://' does not become 'http://http://...'.
    url = u if u.startswith('http://') else 'http://' + u
    m = url_parse(url)
    if m == 0:
        # url_parse signals failure with 0; iterating it would raise TypeError.
        sys.stderr.write('Failed to fetch the start url: %s\n' % url)
        return

    urlq.put(m)
    urlq.l_del_sa()
    for i in m:
        s.s_wirte(nadom, i)

    # Build every worker first, then start them all.
    thread_list = [Thread(target=forward, args=(n,)) for _ in range(t)]
    for thread in thread_list:
        thread.start()
Ejemplo n.º 2
0
Archivo: dark.py Proyecto: 3bobo/spider
    def search(self,page,keyword):
        """Collect the ``http://`` links in *page* that contain *keyword*.

        Args:
            page: text of the fetched page.
            keyword: text the link must contain. It is inserted into the
                regular expression verbatim (not escaped).

        Returns:
            The ``li2`` attribute of a fresh ``Slist`` after the matches were
            ``put`` into it and ``l_del_sa()`` was called (presumably a
            de-duplication pass -- confirm against ``Slist``).
        """
        search_list = Slist()
        # NOTE: the '+' right after the keyword quantifies only the keyword's
        # last character when the keyword is a plain word.
        what = r'(http://[\.\w\s\d//-=/?]*' + str(keyword) + r'+/*\d*[^>\'\"<]*)'
        search_re = re.compile(what, re.S)
        # The flags already live in the compiled pattern. The second positional
        # argument of Pattern.findall() is ``pos``, so passing re.S (== 16)
        # there would skip the first 16 characters of the page.
        search_get = search_re.findall(page)
        search_list.put(search_get)
        search_list.l_del_sa()

        return search_list.li2
Ejemplo n.º 3
0
def search(page,date):
    """Collect the ``http://`` links in *page* that contain *date*.

    Args:
        page: text of the fetched page.
        date: keyword the link must contain. It is inserted into the regular
            expression verbatim (not escaped).

    Returns:
        The ``li2`` attribute of a fresh ``Slist`` after the matches were
        ``put`` into it and ``l_del_sa()`` was called (presumably a
        de-duplication pass -- confirm against ``Slist``).
    """
    search_list = Slist()
    # Shape sketched by the author for the keyword 'vuls':
    #   http://[\.\w\s\d//-=/?]*(vuls)+/*\d*[^><'"]*
    # NOTE: the pattern built below has no group around the keyword, so the
    # '+' quantifies only the keyword's last character.
    what = r'(http://[\.\w\s\d//-=/?]*' + str(date) + r'+/*\d*[^>\'\"<]*)'
    search_re = re.compile(what, re.S)
    # The flags already live in the compiled pattern. The second positional
    # argument of Pattern.findall() is ``pos``, so passing re.S (== 16) there
    # would skip the first 16 characters of the page.
    search_get = search_re.findall(page)
    search_list.put(search_get)
    search_list.l_del_sa()

    return search_list.li2
Ejemplo n.º 4
0
def url_parse(url):
    """Fetch *url* and return the links found on it that share its domain.

    Side effects:
        * sets the module globals ``nadom`` and ``nerror`` to
          ``'thread_<domain>_<ct>'`` and ``'error_<domain>_<ct>'``;
        * passes every link whose domain differs from the page's domain to
          ``s.s_wirte(nerror, link)`` after removing it with ``l_del_da``.

    When the global ``want`` is not ``0`` only the links returned by
    ``search(html, want)`` are considered; otherwise every ``http://`` link
    matched by the built-in pattern is.

    Returns:
        ``urld.li2`` on success, or ``0`` if anything raised along the way.
    """
    global nadom, nerror
    adom = domain(url)
    suffix = str(adom) + '_' + str(ct)
    nadom = 'thread_' + suffix
    nerror = 'error_' + suffix
    try:
        page = urllib2.urlopen(url)
        try:
            html = page.read()
        finally:
            # Release the connection even when read() fails.
            page.close()

        if want != 0:
            par_url = search(html, want)
        else:
            url_re = re.compile(r'http://[\.\w//]+',re.S)
            # No second argument: for a compiled pattern it would be ``pos``
            # (start offset), not flags.
            par_url = url_re.findall(html)

        urld = Slist()
        urld.put(par_url)
        urld.l_del_sa()

        # Walk a snapshot so that deleting entries cannot make the iteration
        # skip elements; each off-domain link is handled exactly once.
        for link in list(urld.li2):
            if domain(link) != adom:
                urld.l_del_da(link)
                s.s_wirte(nerror, link)

        return urld.li2
    except Exception:
        # Best effort: callers treat 0 as "page could not be processed".
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return 0
Ejemplo n.º 5
0
Archivo: dark.py Proyecto: 3bobo/spider
class WSR(object):
    """Crawler that keeps the URLs it has collected in ``self.s1``.

    NOTE: the two prompts below are class-body statements, so they run once
    when the class is defined (i.e. when the module is imported). The answers
    are stored as the class attributes ``isearch`` and ``keyword``.
    """

    isearch = raw_input('Need search?(y/n)')
    # Any answer other than 'y' disables keyword filtering ('' is the sentinel
    # checked in imit).
    keyword = raw_input('Keyword(only support english):') if isearch == 'y' else ''

    def __init__(self):
        """Create the ``Slist`` that accumulates the collected URLs."""
        self.s1 = Slist()

    def imit(self,url):
        """Fetch *url* and add its same-domain links to ``self.s1``.

        With a non-empty ``keyword`` only the links returned by ``search`` are
        considered, otherwise every ``http://`` link matched by the built-in
        pattern. Links whose domain differs from the page's domain are removed
        with ``l_del_da`` before the rest is put into ``self.s1``.

        If anything raises, an empty list is put into ``self.s1`` instead.

        Returns:
            ``self.s1.li2`` -- the accumulated list, not only this page's
            links.
        """
        found = Slist()
        adom = domain(url)
        try:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Release the connection even when read() fails.
                page.close()

            if self.keyword != '':
                found.put(self.search(html, self.keyword))
            else:
                url_re = re.compile(r'http://[\.\w//]+',re.S)
                # No second argument: for a compiled pattern it would be
                # ``pos`` (start offset), not flags.
                found.put(url_re.findall(html))
            found.l_del_sa()

            # Walk a snapshot so that deleting entries cannot make the
            # iteration skip elements.
            for link in list(found.li2):
                if domain(link) != adom:
                    found.l_del_da(link)

            self.s1.put(found.li2)
            self.s1.l_del_sa()
        except Exception:
            # Best effort: a page that cannot be processed contributes nothing.
            self.s1.put([])

        return self.s1.li2

    def go(self,n):
        """Run *n* crawl rounds, then print ``Done`` once.

        Each round takes one URL from ``self.s1``, processes it with ``imit``
        and puts the result back into ``self.s1``. A round whose ``imit``
        result equals ``0`` does not count towards *n* (the ``imit`` defined
        in this class never returns ``0``).
        """
        # A plain loop: recursing from inside the loop body would repeat the
        # remaining rounds at every level (about 2**n - 1 rounds in total).
        while n > 0:
            u = self.s1.get()
            aa = self.imit(u)
            if aa == 0:
                continue
            self.s1.put(aa)
            self.s1.l_del_sa()
            n = n - 1
        # The parenthesised form prints the same text under Python 2.
        print('Done')

    def search(self,page,keyword):
        """Collect the ``http://`` links in *page* that contain *keyword*.

        *keyword* is inserted into the regular expression verbatim (not
        escaped).

        Returns:
            The ``li2`` attribute of a fresh ``Slist`` after the matches were
            ``put`` into it and ``l_del_sa()`` was called.
        """
        search_list = Slist()
        what = r'(http://[\.\w\s\d//-=/?]*' + str(keyword) + r'+/*\d*[^>\'\"<]*)'
        search_re = re.compile(what, re.S)
        # The flags already live in the compiled pattern; a second positional
        # argument to findall() would be ``pos`` and skip the first 16
        # characters (re.S == 16).
        search_get = search_re.findall(page)
        search_list.put(search_get)
        search_list.l_del_sa()

        return search_list.li2
Ejemplo n.º 6
0
Archivo: dark.py Proyecto: 3bobo/spider
    def imit(self,url):
        """Fetch *url* and add its same-domain links to ``self.s1``.

        With a non-empty ``self.keyword`` only the links returned by
        ``self.search`` are considered, otherwise every ``http://`` link
        matched by the built-in pattern. Links whose domain differs from the
        page's domain are removed with ``l_del_da`` before the rest is put
        into ``self.s1``.

        If anything raises, an empty list is put into ``self.s1`` instead.

        Returns:
            ``self.s1.li2`` -- the accumulated list, not only this page's
            links.
        """
        found = Slist()
        adom = domain(url)
        try:
            page = urllib2.urlopen(url)
            try:
                html = page.read()
            finally:
                # Release the connection even when read() fails.
                page.close()

            if self.keyword != '':
                found.put(self.search(html, self.keyword))
            else:
                url_re = re.compile(r'http://[\.\w//]+',re.S)
                # No second argument: for a compiled pattern it would be
                # ``pos`` (start offset), not flags.
                found.put(url_re.findall(html))
            found.l_del_sa()

            # Walk a snapshot so that deleting entries cannot make the
            # iteration skip elements.
            for link in list(found.li2):
                if domain(link) != adom:
                    found.l_del_da(link)

            self.s1.put(found.li2)
            self.s1.l_del_sa()
        except Exception:
            # Best effort: a page that cannot be processed contributes nothing.
            self.s1.put([])

        return self.s1.li2
Ejemplo n.º 7
0
Archivo: dark.py Proyecto: 3bobo/spider
 def __init__(self):
     """Create the instance's ``s1`` attribute as a fresh ``Slist``."""
     self.s1 = Slist()