Beispiel #1
0
 def linklistappend(self, qualifiedlink):
     """Register ``qualifiedlink`` if it is not yet in ``glovar.linklist``.

     While holding ``glovar.linklock``, a link that is not already present
     in ``glovar.linklist`` is appended to both ``self.linklist`` and
     ``glovar.linklist`` and handed to ``CrawThread.writetocrawfile``.
     Links that are already present are ignored.
     """
     glovar.linklock.acquire()
     try:
         if qualifiedlink not in glovar.linklist:
             self.linklist.append(qualifiedlink)
             glovar.linklist.append(qualifiedlink)
             CrawThread.writetocrawfile(qualifiedlink)
     finally:
         # Release even if writing the link fails; otherwise every other
         # caller waiting on linklock would block forever.
         glovar.linklock.release()
Beispiel #2
0
 def formlistappend(self):
     """Record the current form in ``glovar.formlist`` unless an equal one exists.

     A one-entry dict mapping ``self.current_actionlink`` to
     ``self.current_forminputlist`` is built and rendered with
     ``self.transformtostr``.  While holding ``glovar.formlock`` the
     rendering is compared with that of every stored form; when none
     matches, the new dict is appended to ``glovar.formlist`` and passed to
     ``CrawThread.writetoformfile``.  Afterwards
     ``self.current_forminputlist`` is rebound to a fresh empty list.
     """
     formdict = {self.current_actionlink: self.current_forminputlist}
     formstr = self.transformtostr(formdict)
     glovar.formlock.acquire()
     try:
         # Stop at the first stored form whose rendering matches.
         already_known = any(
             self.transformtostr(form) == formstr for form in glovar.formlist
         )
         if not already_known:
             glovar.formlist.append(formdict)
             CrawThread.writetoformfile(formdict)
     finally:
         # Release even if rendering or writing raises, so other callers
         # waiting on formlock are not blocked forever.
         glovar.formlock.release()
     # Rebind (rather than clear) so the list now stored inside formdict
     # is not emptied.
     self.current_forminputlist = []
Beispiel #3
0
 def formlistappend(self):
     """Record the current form in ``glovar.formlist`` unless an equal one exists.

     A one-entry dict mapping ``self.current_actionlink`` to
     ``self.current_forminputlist`` is built and rendered with
     ``self.transformtostr``.  While holding ``glovar.formlock`` the
     rendering is compared with that of every stored form; when none
     matches, the new dict is appended to ``glovar.formlist`` and passed to
     ``CrawThread.writetoformfile``.  Afterwards
     ``self.current_forminputlist`` is rebound to a fresh empty list.
     """
     formdict = {self.current_actionlink: self.current_forminputlist}
     formstr = self.transformtostr(formdict)
     glovar.formlock.acquire()
     try:
         # Stop at the first stored form whose rendering matches.
         already_known = any(
             self.transformtostr(form) == formstr for form in glovar.formlist
         )
         if not already_known:
             glovar.formlist.append(formdict)
             CrawThread.writetoformfile(formdict)
     finally:
         # Release even if rendering or writing raises, so other callers
         # waiting on formlock are not blocked forever.
         glovar.formlock.release()
     # Rebind (rather than clear) so the list now stored inside formdict
     # is not emptied.
     self.current_forminputlist = []
Beispiel #4
0
 def linklistappend(self, qualifiedlink):
     """Register ``qualifiedlink`` if it is not yet in ``glovar.linklist``.

     While holding ``glovar.linklock``, a link that is not already present
     in ``glovar.linklist`` is appended to both ``self.linklist`` and
     ``glovar.linklist`` and handed to ``CrawThread.writetocrawfile``.
     Links that are already present are ignored.
     """
     glovar.linklock.acquire()
     try:
         if qualifiedlink not in glovar.linklist:
             self.linklist.append(qualifiedlink)
             glovar.linklist.append(qualifiedlink)
             CrawThread.writetocrawfile(qualifiedlink)
     finally:
         # Release even if writing the link fails; otherwise every other
         # caller waiting on linklock would block forever.
         glovar.linklock.release()
Beispiel #5
0
 def hreflistappend(self, attrs):
     """Record the data extracted from ``attrs`` under the current href link.

     ``self.hrefdatadict_from_attrs(attrs)`` supplies the data; a falsy
     result means there is nothing to record.  Otherwise a one-entry dict
     mapping ``self.current_hreflink`` to that data is appended to
     ``glovar.hreflist`` and passed to ``CrawThread.writetohreffile``,
     unless the list already holds an entry with the same key whose value
     yields the same set of items when iterated (its keys, if the value is
     a dict).  The check and the append happen under ``glovar.hreflock``.
     """
     hrefdata = self.hrefdatadict_from_attrs(attrs)
     if not hrefdata:
         return
     link = self.current_hreflink
     hrefdict = {link: hrefdata}
     glovar.hreflock.acquire()
     try:
         # An entry with the same key set has exactly the one key ``link``,
         # so its value can be looked up directly.
         duplicate = any(
             set(known) == set(hrefdict)
             and set(known[link]) == set(hrefdata)
             for known in glovar.hreflist
         )
         if not duplicate:
             glovar.hreflist.append(hrefdict)
             CrawThread.writetohreffile(hrefdict)
     finally:
         # Release even if the comparison or the write raises.
         glovar.hreflock.release()
Beispiel #6
0
 def hreflistappend(self, attrs):
     """Record the data extracted from ``attrs`` under the current href link.

     ``self.hrefdatadict_from_attrs(attrs)`` supplies the data; a falsy
     result means there is nothing to record.  Otherwise a one-entry dict
     mapping ``self.current_hreflink`` to that data is appended to
     ``glovar.hreflist`` and passed to ``CrawThread.writetohreffile``,
     unless the list already holds an entry with the same key whose value
     yields the same set of items when iterated (its keys, if the value is
     a dict).  The check and the append happen under ``glovar.hreflock``.
     """
     hrefdata = self.hrefdatadict_from_attrs(attrs)
     if not hrefdata:
         return
     link = self.current_hreflink
     hrefdict = {link: hrefdata}
     glovar.hreflock.acquire()
     try:
         # An entry with the same key set has exactly the one key ``link``,
         # so its value can be looked up directly.
         duplicate = any(
             set(known) == set(hrefdict)
             and set(known[link]) == set(hrefdata)
             for known in glovar.hreflist
         )
         if not duplicate:
             glovar.hreflist.append(hrefdict)
             CrawThread.writetohreffile(hrefdict)
     finally:
         # Release even if the comparison or the write raises.
         glovar.hreflock.release()
Beispiel #7
0
    def main(self, link):
        """Fetch ``link``, feed its HTML to a ``URLParser`` and mark it parsed.

        Steps, as visible in this method:

        1. Sleep ``self.sleep_download_time`` seconds (set to 10 here) and
           open ``link`` with a 20 second default socket timeout.
        2. Take the final URL reported by the response.  If it is already
           in ``glovar.parsedlist`` the method returns without parsing;
           otherwise it is stored in the module-global ``linkparse``.
        3. Decode the body with the declared charset or, when none is
           declared, try ``gb2312``, ``utf-8`` and ``GB18030`` in turn,
           and feed the text to the parser.
        4. Pass ``linkparse`` to ``CrawThread.writetolinkfile`` and append
           it to ``glovar.parsedlist`` under ``glovar.parselock``.

        Returns ``None`` in every case; failures are reported with
        ``print`` and end the method early.
        """
        global linkparse
        timeout = 20
        self.sleep_download_time = 10
        socket.setdefaulttimeout(timeout)
        url_parser = URLParser(strict=False)
        try:
            time.sleep(self.sleep_download_time)
            u = urllib.request.urlopen(link)
            try:
                backurl = u.geturl()  # final URL after any redirection
                print("backurl:", backurl)
                glovar.parselock.acquire()
                try:
                    if backurl in glovar.parsedlist:
                        print("this link has been paresd")
                        return None
                    linkparse = backurl
                finally:
                    # Must run on the early return above as well; leaving
                    # the lock held would block every other thread.
                    glovar.parselock.release()
                the_html = u.read()
                charset = u.info().get_content_charset()
            finally:
                # Close the response on every path, including early return.
                u.close()
        except socket.timeout as e:
            # Listed before ``except Exception``: socket.timeout is a
            # subclass of Exception and would otherwise never get here.
            print("socket timout:", link)
            print(e)
            return None
        except Exception as e:
            print(time.strftime("%Y-%m-%d %H:%M:%S"))
            print("fail to access!!!")
            print("urllib.error")
            print(e)
            print("link:" + link + "\n")
            return None
        if not charset:
            # No charset declared: try the candidate encodings in order.
            try:
                url_parser.feed(the_html.decode('gb2312'))
                print('gb2312')
            except UnicodeDecodeError:
                try:
                    url_parser.feed(the_html.decode('utf-8'))
                    print('utf-8')
                except UnicodeDecodeError:
                    try:
                        url_parser.feed(the_html.decode('GB18030'))
                        print('GB18030')
                    except HTMLParseError as e:
                        print(" HTMLParseError!!!")
                        print(e)
                        print("link:" + linkparse + "\n")
                        return None
                    except UnicodeDecodeError as e:
                        print("UnicodeDecodeError")
                        print(e)
                        print("link:" + linkparse + "\n")
                        return None
            except HTMLParseError as e:
                print(" HTMLParseError:")
                print(e)
                print("link:" + linkparse + "\n")
                return None
        else:
            # NOTE(review): decode/parse errors on this path are not caught
            # and propagate to the caller, as in the original code.
            print("charset:" + charset)
            url_parser.feed(the_html.decode(charset))

        if link != linkparse:
            print(link, "redict to: ", linkparse)
        print(time.strftime("%Y-%m-%d %H:%M:%S"))
        print("access successfully!!!")
        print(linkparse)
        CrawThread.writetolinkfile(linkparse)
        glovar.parselock.acquire()
        try:
            glovar.parsedlist.append(linkparse)
        finally:
            glovar.parselock.release()
Beispiel #8
0
       def main(self, link):
            """Fetch ``link``, feed its HTML to a ``URLParser`` and mark it parsed.

            Steps, as visible in this method:

            1. Sleep ``self.sleep_download_time`` seconds (set to 10 here)
               and open ``link`` with a 20 second default socket timeout.
            2. Take the final URL reported by the response.  If it is
               already in ``glovar.parsedlist`` the method returns without
               parsing; otherwise it is stored in the module-global
               ``linkparse``.
            3. Decode the body with the declared charset or, when none is
               declared, try ``gb2312``, ``utf-8`` and ``GB18030`` in turn,
               and feed the text to the parser.
            4. Pass ``linkparse`` to ``CrawThread.writetolinkfile`` and
               append it to ``glovar.parsedlist`` under
               ``glovar.parselock``.

            Returns ``None`` in every case; failures are reported with
            ``print`` and end the method early.
            """
            global linkparse
            timeout = 20
            self.sleep_download_time = 10
            socket.setdefaulttimeout(timeout)
            url_parser = URLParser(strict=False)
            try:
                time.sleep(self.sleep_download_time)
                u = urllib.request.urlopen(link)
                try:
                    backurl = u.geturl()  # final URL after any redirection
                    print("backurl:", backurl)
                    glovar.parselock.acquire()
                    try:
                        if backurl in glovar.parsedlist:
                            print("this link has been paresd")
                            return None
                        linkparse = backurl
                    finally:
                        # Must run on the early return above as well;
                        # leaving the lock held would block every other
                        # thread.
                        glovar.parselock.release()
                    the_html = u.read()
                    charset = u.info().get_content_charset()
                finally:
                    # Close the response on every path, including the
                    # early return.
                    u.close()
            except socket.timeout as e:
                # Listed before ``except Exception``: socket.timeout is a
                # subclass of Exception and would otherwise never get here.
                print("socket timout:", link)
                print(e)
                return None
            except Exception as e:
                print(time.strftime("%Y-%m-%d %H:%M:%S"))
                print("fail to access!!!")
                print("urllib.error")
                print(e)
                print("link:" + link + "\n")
                return None
            if not charset:
                # No charset declared: try the candidate encodings in order.
                try:
                    url_parser.feed(the_html.decode('gb2312'))
                    print('gb2312')
                except UnicodeDecodeError:
                    try:
                        url_parser.feed(the_html.decode('utf-8'))
                        print('utf-8')
                    except UnicodeDecodeError:
                        try:
                            url_parser.feed(the_html.decode('GB18030'))
                            print('GB18030')
                        except HTMLParseError as e:
                            print(" HTMLParseError!!!")
                            print(e)
                            print("link:" + linkparse + "\n")
                            return None
                        except UnicodeDecodeError as e:
                            print("UnicodeDecodeError")
                            print(e)
                            print("link:" + linkparse + "\n")
                            return None
                except HTMLParseError as e:
                    print(" HTMLParseError:")
                    print(e)
                    print("link:" + linkparse + "\n")
                    return None
            else:
                # NOTE(review): decode/parse errors on this path are not
                # caught and propagate to the caller, as in the original.
                print("charset:" + charset)
                url_parser.feed(the_html.decode(charset))

            if link != linkparse:
                print(link, "redict to: ", linkparse)
            print(time.strftime("%Y-%m-%d %H:%M:%S"))
            print("access successfully!!!")
            print(linkparse)
            CrawThread.writetolinkfile(linkparse)
            glovar.parselock.acquire()
            try:
                glovar.parsedlist.append(linkparse)
            finally:
                glovar.parselock.release()