Beispiel #1
0
 def handle_data(self, data):
     """Route a text node to the collector for the tag currently open.

     * ``title``: the keys returned by ``doclex.lex`` are appended to
       ``self.keywords``; the text cleaned by ``doclex.delspace`` becomes
       ``self.title`` when it is non-empty.
     * ``a``: every token from ``doclex.simplesplit`` gets an entry in
       ``self.key_url``; ``self.link_url`` is appended to that entry when
       it differs from ``self.url`` and ``judged_url`` accepts it.
     * ``p`` / ``div``: the raw text is appended to ``self.data``.

     Text inside any other tag is ignored.
     """
     tag = self.current_tag
     if tag == 'title':
         keys = doclex.lex(data)
         if isinstance(keys, list) and keys:
             self.keywords.extend(keys)
         data = doclex.delspace(data)
         if len(data) > 0:
             self.title = data
     elif tag == 'a':
         keys = doclex.simplesplit(data)
         if isinstance(keys, list) and keys:
             # The link is the same for every token, so it is validated once
             # instead of once per key.
             # NOTE(review): assumes judged_url has no side effects - confirm.
             link_ok = (self.link_url != self.url
                        and judged_url(self.link_url))
             for key in keys:
                 # setdefault replaces the deprecated dict.has_key(); the
                 # entry is still created even when the link is rejected.
                 urls = self.key_url.setdefault(key, [])
                 if link_ok:
                     urls.append(self.link_url)
     elif tag in ('p', 'div'):
         self.data.append(data)
Beispiel #2
0
def process_page(url, data):
    """Parse a fetched page and build its keyword -> URL index.

    Args:
        url: Address the page was fetched from.
        data: Raw page body, or None when nothing was fetched.

    Returns:
        The tuple ``(htmlp.link, url_profile, keywords, htmlp.profile,
        key_url, htmlp.title)`` on success.  None when ``data`` is None,
        when chardet reports no encoding, or when any step raises an
        ``Exception`` (processing is best-effort).
    """
    if data is None:
        return None

    try:
        htmlp = htmlprocess(url)
        encoding = chardet.detect(data)
        if encoding['encoding'] is None:
            return None
        # Decode with the detected charset and feed the parser UTF-8 bytes.
        udata = unicode(data, encoding['encoding'])
        htmlp.feed(udata.encode('utf-8'))

        keywords = htmlp.keywords

        # Copy the parser's key -> links mapping, dropping duplicate links
        # while keeping first-seen order.  The previous list comprehension
        # tested against a list that was still empty (so nothing was ever
        # de-duplicated) and, on Python 2, leaked its loop variable over the
        # ``url`` parameter, so later keys were indexed under the wrong URL.
        key_url = {}
        for key, links in htmlp.key_url.items():
            seen = set()
            unique_links = []
            for link in links:
                if link not in seen:
                    seen.add(link)
                    unique_links.append(link)
            key_url[key] = unique_links

        url_profile = ""
        for text in htmlp.data:
            text = doclex.delspace(text)
            if len(text) < 32:
                # Short fragments go into the profile verbatim.
                url_profile += text
                keys = doclex.simplesplit(text)
            else:
                # Only fragments longer than 100 characters contribute a
                # (truncated) excerpt; 32..100 add keywords only.
                if len(text) > 100:
                    url_profile += text[0:100] + "..."
                keys = doclex.lex(text)

            # Guard before extending so a non-list result from the lexer
            # skips this fragment instead of aborting the whole page.
            if not isinstance(keys, list):
                continue
            keywords.extend(keys)
            for key in keys:
                urls = key_url.setdefault(key, [])
                if url not in urls:
                    urls.append(url)

        return htmlp.link, url_profile, keywords, htmlp.profile, key_url, htmlp.title

    except Exception:
        # Best-effort: a page that cannot be decoded or parsed yields None.
        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit propagate.
        return None
Beispiel #3
0
    def handle_data(self, data):
        """Index a text node according to the tag currently open.

        * ``title``: the text cleaned by ``doclex.delspace`` is stored as
          ``self.urlinfo['title']`` (when non-empty) and its ``doclex.lex``
          keys are appended to ``self.urlinfo['keys']['2']``.
        * ``a`` (only while ``self.sub_url`` is non-empty): each
          ``doclex.simplesplit`` token is removed from the link's ``'3'``
          key list and added to ``'2'`` unless already in ``'1'`` or
          ``'2'``.  When chardet detects an encoding, the first 16 decoded
          characters are appended to the link's ``titlegen`` and, for text
          longer than 16 characters, the first 32 become ``profile['1']``.
        * ``p`` / ``div``: unless ``doclex.invialddata`` rejects the text,
          the first 16 decoded characters go to ``self.urlinfo['titlegen']``,
          a 32-character excerpt plus ``"..."`` goes to ``profile['2']`` for
          text longer than 32 characters, and the ``doclex.lex`` keys go to
          ``self.urlinfo['keys']['3']``.

        Text inside any other tag is ignored.  Exceptions are reported with
        ``traceback.print_exc()`` and not propagated.
        """
        tag = self.current_tag
        try:
            if tag == 'title':
                data = doclex.delspace(data)
                keys = doclex.lex(data)
                if isinstance(keys, list) and keys:
                    self.urlinfo['keys']['2'].extend(keys)
                if len(data) > 0:
                    self.urlinfo['title'] = data

            elif tag == 'a':
                if self.sub_url != "":
                    keys = doclex.simplesplit(data)
                    if isinstance(keys, list) and keys:
                        link_keys = self.urllist[self.sub_url]['keys']
                        for key in keys:
                            if key in link_keys['3']:
                                link_keys['3'].remove(key)
                            if (key not in link_keys['1']
                                    and key not in link_keys['2']):
                                link_keys['2'].append(key)

                    encoding = chardet.detect(data)['encoding']
                    if encoding:
                        udata = unicode(data, encoding)
                        link = self.urllist[self.sub_url]
                        # Slicing already clamps to the string length.
                        link['titlegen'].append(udata[0:16].encode('utf-8'))
                        if len(udata) > 16:
                            link['profile']['1'] = udata[0:32].encode('utf-8')

            elif tag in ('p', 'div'):
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)

                    encoding = chardet.detect(data)['encoding']
                    if not encoding:
                        # No detectable encoding (e.g. empty text): skip the
                        # node, as the 'a' branch does, instead of letting
                        # unicode() raise TypeError on a None encoding.
                        return
                    udata = unicode(data, encoding)
                    self.urlinfo['titlegen'].append(
                        udata[0:16].encode('utf-8'))

                    if len(udata) > 32:
                        self.urlinfo['profile']['2'].append(
                            (udata[0:32] + u"...").encode('utf-8'))

                    self.urlinfo['keys']['3'].extend(doclex.lex(data))

        except Exception:
            # Parsing is best-effort: report and continue.  Narrower than a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()
Beispiel #4
0
    def handle_data(self, data):
        """Index a text node according to the tag currently open.

        The text is first decoded with the encoding chardet detects; nodes
        with no detectable encoding are skipped.

        * ``title`` and ``h1``..``h6``: unless ``doclex.invialddata`` rejects
          the text, it is appended to ``self.urlinfo['title']`` (when
          non-empty), its ``doclex.lex`` keys go to
          ``self.urlinfo['keys']['2']`` and its ``doclex.vaguesplit`` keys
          to ``self.urlinfo['keys']['3']``.
        * ``a`` / ``A`` (only while ``self.sub_url`` is non-empty): each
          ``doclex.simplesplit`` token is removed from the link's ``'3'``
          key list and added to ``'1'`` unless already in ``'1'`` or
          ``'2'``; ``lex`` keys go to ``'2'`` and ``vaguesplit`` keys to
          ``'3'``.  The first 16 characters are appended to the link's
          ``title`` and, for text longer than 32 characters, the first 32
          to its ``profile``.
        * ``div`` / ``p``: text that passes ``invialddata``, does not start
          with ``<`` after ``doclex.delspace`` and is longer than 100
          characters contributes a 16-character title, a 32-character
          profile excerpt plus ``"..."``, ``lex`` keys (``'2'``) and
          ``vaguesplit`` keys (``'3'``), and adds 200 to ``self.weight``.

        Text inside any other tag is ignored.  Exceptions are reported with
        ``traceback.print_exc()`` and not propagated.
        """
        tag = self.current_tag
        try:
            # 'title' and the heading tags were handled by two identical
            # branches; they now share one.
            if tag in ('title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                encoding = chardet.detect(data)['encoding']
                if encoding:
                    data = unicode(data, encoding)

                    if not doclex.invialddata(data):
                        if len(data) > 0:
                            self.urlinfo['title'].append(data)

                        keys = doclex.lex(data)
                        if isinstance(keys, list) and keys:
                            self.urlinfo['keys']['2'].extend(keys)

                        keys = doclex.vaguesplit(data)
                        if isinstance(keys, list) and keys:
                            self.urlinfo['keys']['3'].extend(keys)

            elif tag in ('a', 'A'):
                if self.sub_url != "":
                    encoding = chardet.detect(data)['encoding']
                    if encoding:
                        data = unicode(data, encoding)
                        link = self.urllist[self.sub_url]

                        keys = doclex.simplesplit(data)
                        if isinstance(keys, list) and keys:
                            link_keys = link['keys']
                            for key in keys:
                                if key in link_keys['3']:
                                    link_keys['3'].remove(key)
                                if (key not in link_keys['1']
                                        and key not in link_keys['2']):
                                    link_keys['1'].append(key)

                        for key in doclex.lex(data):
                            link['keys']['2'].append(key)

                        for key in doclex.vaguesplit(data):
                            link['keys']['3'].append(key)

                        # Slicing already clamps to the string length.
                        link['title'].append(data[0:16])

                        if len(data) > 32:
                            link['profile'].append(data[0:32])

            elif tag in ('div', 'p'):
                encoding = chardet.detect(data)['encoding']
                if encoding:
                    data = unicode(data, encoding)

                    if not doclex.invialddata(data):
                        data = doclex.delspace(data)

                        # Check for emptiness first: indexing data[0] on text
                        # that delspace reduced to nothing raised IndexError.
                        if not data or data[0] == u'<':
                            return

                        if len(data) > 100:
                            # len(data) > 100 here, so both slices are full
                            # length; the former clamp and "> 32" test could
                            # never take their other path.
                            self.urlinfo['title'].append(data[0:16])
                            self.urlinfo['profile'].append(
                                data[0:32] + u"...")

                            for key in doclex.lex(data):
                                self.urlinfo['keys']['2'].append(key)

                            for key in doclex.vaguesplit(data):
                                self.urlinfo['keys']['3'].append(key)

                            self.weight += 200

        except Exception:
            # Parsing is best-effort: report and continue.  Narrower than a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()
Beispiel #5
0
    def handle_data(self, data):
        """Index a text node according to the tag currently open.

        * ``title``: the text cleaned by ``doclex.delspace`` is stored as
          ``self.urlinfo['title']`` (when non-empty) and its ``doclex.lex``
          keys are appended to ``self.urlinfo['keys']['2']``.
        * ``a`` (only while ``self.sub_url`` is non-empty): each
          ``doclex.simplesplit`` token is removed from the link's ``'3'``
          key list and added to ``'2'`` unless already in ``'1'`` or
          ``'2'``.  When chardet detects an encoding, the first 16 decoded
          characters are appended to the link's ``titlegen`` and, for text
          longer than 16 characters, the first 32 become ``profile['1']``.
        * ``p`` / ``div``: unless ``doclex.invialddata`` rejects the text,
          the first 16 decoded characters go to ``self.urlinfo['titlegen']``,
          a 32-character excerpt plus ``"..."`` goes to ``profile['2']`` for
          text longer than 32 characters, and the ``doclex.lex`` keys go to
          ``self.urlinfo['keys']['3']``.

        Text inside any other tag is ignored.  Exceptions are reported with
        ``traceback.print_exc()`` and not propagated.
        """
        tag = self.current_tag
        try:
            if tag == 'title':
                data = doclex.delspace(data)
                keys = doclex.lex(data)
                if isinstance(keys, list) and keys:
                    self.urlinfo['keys']['2'].extend(keys)
                if len(data) > 0:
                    self.urlinfo['title'] = data

            elif tag == 'a':
                if self.sub_url != "":
                    keys = doclex.simplesplit(data)
                    if isinstance(keys, list) and keys:
                        link_keys = self.urllist[self.sub_url]['keys']
                        for key in keys:
                            if key in link_keys['3']:
                                link_keys['3'].remove(key)
                            if key not in link_keys['1'] and key not in link_keys['2']:
                                link_keys['2'].append(key)

                    encoding = chardet.detect(data)['encoding']
                    if encoding:
                        udata = unicode(data, encoding)
                        link = self.urllist[self.sub_url]
                        # Slicing already clamps to the string length.
                        link['titlegen'].append(udata[0:16].encode('utf-8'))
                        if len(udata) > 16:
                            link['profile']['1'] = udata[0:32].encode('utf-8')

            elif tag in ('p', 'div'):
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)

                    encoding = chardet.detect(data)['encoding']
                    if not encoding:
                        # No detectable encoding (e.g. empty text): skip the
                        # node, as the 'a' branch does, instead of letting
                        # unicode() raise TypeError on a None encoding.
                        return
                    udata = unicode(data, encoding)
                    self.urlinfo['titlegen'].append(udata[0:16].encode('utf-8'))

                    if len(udata) > 32:
                        self.urlinfo['profile']['2'].append((udata[0:32] + u"...").encode('utf-8'))

                    self.urlinfo['keys']['3'].extend(doclex.lex(data))

        except Exception:
            # Parsing is best-effort: report and continue.  Narrower than a
            # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
            import traceback
            traceback.print_exc()