def handle_data(self, data):
    """Collect the text node ``data`` according to the tag currently open.

    * ``title``: every token returned by ``doclex.lex`` is appended to
      ``self.keywords``; the text cleaned by ``doclex.delspace`` becomes
      ``self.title`` when it is non-empty.
    * ``a``: every token returned by ``doclex.simplesplit`` gets an entry
      in ``self.key_url``; ``self.link_url`` is appended to that entry
      when it differs from ``self.url`` and ``judged_url`` accepts it.
    * ``p`` / ``div``: the raw text is appended to ``self.data``.

    Text inside any other tag is ignored.
    """
    tag = self.current_tag
    if tag == 'title':
        keys = doclex.lex(data)
        # The tokeniser result is only used when it is a list.
        if isinstance(keys, list):
            self.keywords.extend(keys)
        data = doclex.delspace(data)
        if len(data) > 0:
            self.title = data
    elif tag == 'a':
        keys = doclex.simplesplit(data)
        if isinstance(keys, list) and keys:
            link = self.link_url
            # The link test does not depend on the key, so evaluate it once.
            keep_link = link != self.url and judged_url(link)
            for key in keys:
                # Every anchor token gets an entry, even if the link is
                # rejected, so later lookups find an (empty) list.
                targets = self.key_url.setdefault(key, [])
                if keep_link:
                    targets.append(link)
    elif tag == 'p' or tag == 'div':
        self.data.append(data)
def process_page(url, data):
    """Parse a fetched page and build its keyword index.

    ``data`` is the raw page body. It is decoded with the encoding
    reported by ``chardet.detect``, re-encoded as UTF-8 and fed to an
    ``htmlprocess`` parser created for ``url``.

    Returns ``None`` when ``data`` is ``None``, when no encoding can be
    detected, or when any step raises. Otherwise returns the tuple
    ``(htmlp.link, url_profile, keywords, htmlp.profile, key_url,
    htmlp.title)`` where

    * ``url_profile`` concatenates every text fragment shorter than 32
      characters plus the first 100 characters (followed by ``"..."``) of
      every fragment longer than 100 characters;
    * ``keywords`` is the parser's keyword list extended with the tokens
      of every text fragment;
    * ``key_url`` maps each token to a list of URLs: the de-duplicated
      link URLs collected by the parser, plus ``url`` itself for tokens
      found in the page text.
    """
    if data is None:
        return None
    try:
        htmlp = htmlprocess(url)
        detected = chardet.detect(data)
        if detected['encoding'] is None:
            return None
        udata = unicode(data, detected['encoding'])
        htmlp.feed(udata.encode('utf-8'))

        keywords = htmlp.keywords

        # Copy the parser's link index, dropping repeated URLs while
        # keeping first-seen order. A dedicated loop variable is used so
        # the ``url`` parameter is never rebound (list-comprehension
        # variables leak into the enclosing scope on Python 2).
        key_url = {}
        for key, links in htmlp.key_url.items():
            unique_links = []
            for link in links:
                if link not in unique_links:
                    unique_links.append(link)
            key_url[key] = unique_links

        profile_parts = []
        for text in htmlp.data:
            text = doclex.delspace(text)
            if len(text) < 32:
                # Short fragments go into the profile verbatim.
                profile_parts.append(text)
                keys = doclex.simplesplit(text)
            else:
                if len(text) > 100:
                    profile_parts.append(text[0:100] + "...")
                keys = doclex.lex(text)
            # A tokeniser result that is not a list is skipped instead of
            # aborting the whole page.
            if not isinstance(keys, list):
                continue
            keywords.extend(keys)
            for key in keys:
                urls = key_url.setdefault(key, [])
                if url not in urls:
                    urls.append(url)

        url_profile = "".join(profile_parts)
        return (htmlp.link, url_profile, keywords, htmlp.profile, key_url,
                htmlp.title)
    except Exception:
        # Best effort: a page that cannot be processed yields no result.
        return None
def handle_data(self, data):
    """Index the text node ``data`` according to the tag currently open.

    * ``title``: the text is cleaned with ``doclex.delspace``; tokens from
      ``doclex.lex`` are appended to ``self.urlinfo['keys']['2']`` and a
      non-empty text is stored as ``self.urlinfo['title']``.
    * ``a`` (only while ``self.sub_url`` is non-empty): tokens from
      ``doclex.simplesplit`` are removed from the link's ``keys['3']``
      list and appended to ``keys['2']`` unless already in ``keys['1']``
      or ``keys['2']``. When chardet detects an encoding, the first 16
      decoded characters are appended to the link's ``titlegen`` and, for
      text longer than 16 characters, the first 32 become
      ``profile['1']``.
    * ``p`` / ``div``: unless ``doclex.invialddata`` rejects the text, the
      first 16 decoded characters are appended to
      ``self.urlinfo['titlegen']``, a 32-character excerpt followed by
      ``...`` is appended to ``self.urlinfo['profile']['2']`` for text
      longer than 32 characters, and tokens from ``doclex.lex`` are
      appended to ``self.urlinfo['keys']['3']``.

    Snippets are stored UTF-8 encoded. Exceptions are printed with
    ``traceback.print_exc`` and otherwise ignored.
    """
    tag = self.current_tag
    if tag == 'title':
        try:
            data = doclex.delspace(data)
            keys = doclex.lex(data)
            if isinstance(keys, list):
                self.urlinfo['keys']['2'].extend(keys)
            if len(data) > 0:
                self.urlinfo['title'] = data
        except Exception:
            import traceback
            traceback.print_exc()
    elif tag == 'a':
        try:
            if self.sub_url != "":
                keys = doclex.simplesplit(data)
                if isinstance(keys, list) and keys:
                    link_keys = self.urllist[self.sub_url]['keys']
                    for key in keys:
                        if key in link_keys['3']:
                            link_keys['3'].remove(key)
                        if key not in link_keys['1'] and key not in link_keys['2']:
                            link_keys['2'].append(key)
                encoding = chardet.detect(data)['encoding']
                if encoding:
                    udata = unicode(data, encoding)
                    link = self.urllist[self.sub_url]
                    # Slicing already clamps to the text length.
                    link['titlegen'].append(udata[0:16].encode('utf-8'))
                    if len(udata) > 16:
                        link['profile']['1'] = udata[0:32].encode('utf-8')
        except Exception:
            import traceback
            traceback.print_exc()
    elif tag == 'p' or tag == 'div':
        try:
            if not doclex.invialddata(data):
                data = doclex.delspace(data)
                encoding = chardet.detect(data)['encoding']
                # Without a detected encoding the text cannot be decoded,
                # so the snippets are skipped; the keywords below do not
                # need the decoded form and are still indexed.
                if encoding:
                    udata = unicode(data, encoding)
                    self.urlinfo['titlegen'].append(
                        udata[0:16].encode('utf-8'))
                    if len(udata) > 32:
                        self.urlinfo['profile']['2'].append(
                            (udata[0:32] + u"...").encode('utf-8'))
                keys = doclex.lex(data)
                if isinstance(keys, list):
                    self.urlinfo['keys']['3'].extend(keys)
        except Exception:
            import traceback
            traceback.print_exc()
def handle_data(self, data):
    """Index the text node ``data`` according to the tag currently open.

    In every handled branch the text is first decoded with the encoding
    reported by ``chardet.detect``; text whose encoding is not detected
    is skipped.

    * ``title`` and ``h1``..``h6``: unless ``doclex.invialddata`` rejects
      the text, a non-empty text is appended to ``self.urlinfo['title']``,
      tokens from ``doclex.lex`` go to ``self.urlinfo['keys']['2']`` and
      tokens from ``doclex.vaguesplit`` to ``self.urlinfo['keys']['3']``.
    * ``a`` / ``A`` (only while ``self.sub_url`` is non-empty): tokens
      from ``doclex.simplesplit`` are removed from the link's ``keys['3']``
      and appended to ``keys['1']`` unless already in ``keys['1']`` or
      ``keys['2']``; ``lex`` tokens go to ``keys['2']`` and ``vaguesplit``
      tokens to ``keys['3']``. The first 16 characters are appended to the
      link's ``title`` list and, for text longer than 32 characters, the
      first 32 to its ``profile`` list.
    * ``div`` / ``p``: unless ``doclex.invialddata`` rejects the text, it
      is cleaned with ``doclex.delspace``; empty text and text starting
      with ``<`` are dropped. Text longer than 100 characters contributes
      its first 16 characters to ``self.urlinfo['title']``; text longer
      than 32 characters contributes a 32-character excerpt followed by
      ``...`` to ``self.urlinfo['profile']``. ``lex`` tokens go to
      ``keys['2']``, ``vaguesplit`` tokens to ``keys['3']`` and
      ``self.weight`` grows by 200.

    Exceptions are printed with ``traceback.print_exc`` and otherwise
    ignored.
    """
    tag = self.current_tag
    if tag == 'title' or tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Page title and headings are indexed identically.
        try:
            encoding = chardet.detect(data)['encoding']
            if encoding:
                data = unicode(data, encoding)
                if not doclex.invialddata(data):
                    if len(data) > 0:
                        self.urlinfo['title'].append(data)
                    keys = doclex.lex(data)
                    if isinstance(keys, list):
                        self.urlinfo['keys']['2'].extend(keys)
                    keys = doclex.vaguesplit(data)
                    if isinstance(keys, list):
                        self.urlinfo['keys']['3'].extend(keys)
        except Exception:
            import traceback
            traceback.print_exc()
    elif tag == 'a' or tag == 'A':
        try:
            if self.sub_url != "":
                encoding = chardet.detect(data)['encoding']
                if encoding:
                    data = unicode(data, encoding)
                    link = self.urllist[self.sub_url]
                    link_keys = link['keys']
                    keys = doclex.simplesplit(data)
                    if isinstance(keys, list):
                        for key in keys:
                            if key in link_keys['3']:
                                link_keys['3'].remove(key)
                            if key not in link_keys['1'] and key not in link_keys['2']:
                                link_keys['1'].append(key)
                    keys = doclex.lex(data)
                    if isinstance(keys, list):
                        link_keys['2'].extend(keys)
                    keys = doclex.vaguesplit(data)
                    if isinstance(keys, list):
                        link_keys['3'].extend(keys)
                    # Slicing already clamps to the text length.
                    link['title'].append(data[0:16])
                    if len(data) > 32:
                        link['profile'].append(data[0:32])
        except Exception:
            import traceback
            traceback.print_exc()
    elif tag == 'div' or tag == 'p':
        try:
            encoding = chardet.detect(data)['encoding']
            if encoding:
                data = unicode(data, encoding)
                if not doclex.invialddata(data):
                    data = doclex.delspace(data)
                    # Empty text has no first character to inspect; drop
                    # it together with leftover markup.
                    if not data or data[0] == u'<':
                        return
                    if len(data) > 100:
                        self.urlinfo['title'].append(data[0:16])
                    if len(data) > 32:
                        self.urlinfo['profile'].append(data[0:32] + u"...")
                    keys = doclex.lex(data)
                    if isinstance(keys, list):
                        self.urlinfo['keys']['2'].extend(keys)
                    keys = doclex.vaguesplit(data)
                    if isinstance(keys, list):
                        self.urlinfo['keys']['3'].extend(keys)
                    self.weight += 200
        except Exception:
            import traceback
            traceback.print_exc()
def handle_data(self, data):
    """Index the text node ``data`` according to the tag currently open.

    * ``title``: the text is cleaned with ``doclex.delspace``; tokens from
      ``doclex.lex`` are appended to ``self.urlinfo['keys']['2']`` and a
      non-empty text is stored as ``self.urlinfo['title']``.
    * ``a`` (only while ``self.sub_url`` is non-empty): tokens from
      ``doclex.simplesplit`` are removed from the link's ``keys['3']``
      list and appended to ``keys['2']`` unless already in ``keys['1']``
      or ``keys['2']``. When chardet detects an encoding, the first 16
      decoded characters are appended to the link's ``titlegen`` and, for
      text longer than 16 characters, the first 32 become
      ``profile['1']``.
    * ``p`` / ``div``: unless ``doclex.invialddata`` rejects the text, the
      first 16 decoded characters are appended to
      ``self.urlinfo['titlegen']``, a 32-character excerpt followed by
      ``...`` is appended to ``self.urlinfo['profile']['2']`` for text
      longer than 32 characters, and tokens from ``doclex.lex`` are
      appended to ``self.urlinfo['keys']['3']``.

    Snippets are stored UTF-8 encoded. Exceptions are printed with
    ``traceback.print_exc`` and otherwise ignored.
    """
    current = self.current_tag
    if current == 'title':
        try:
            data = doclex.delspace(data)
            tokens = doclex.lex(data)
            if isinstance(tokens, list):
                self.urlinfo['keys']['2'].extend(tokens)
            if len(data) > 0:
                self.urlinfo['title'] = data
        except Exception:
            import traceback
            traceback.print_exc()
    elif current == 'a':
        try:
            if self.sub_url != "":
                tokens = doclex.simplesplit(data)
                if isinstance(tokens, list) and tokens:
                    buckets = self.urllist[self.sub_url]['keys']
                    for token in tokens:
                        if token in buckets['3']:
                            buckets['3'].remove(token)
                        if token not in buckets['1'] and token not in buckets['2']:
                            buckets['2'].append(token)
                detected = chardet.detect(data)
                if detected['encoding']:
                    text = unicode(data, detected['encoding'])
                    target = self.urllist[self.sub_url]
                    # A slice never runs past the end of the text, so no
                    # explicit length clamp is needed.
                    target['titlegen'].append(text[0:16].encode('utf-8'))
                    if len(text) > 16:
                        target['profile']['1'] = text[0:32].encode('utf-8')
        except Exception:
            import traceback
            traceback.print_exc()
    elif current == 'p' or current == 'div':
        try:
            if not doclex.invialddata(data):
                data = doclex.delspace(data)
                detected = chardet.detect(data)
                # Decoding needs a detected encoding; when there is none
                # only the snippets are skipped and the keywords are
                # still indexed from the cleaned text.
                if detected['encoding']:
                    text = unicode(data, detected['encoding'])
                    self.urlinfo['titlegen'].append(
                        text[0:16].encode('utf-8'))
                    if len(text) > 32:
                        self.urlinfo['profile']['2'].append(
                            (text[0:32] + u"...").encode('utf-8'))
                tokens = doclex.lex(data)
                if isinstance(tokens, list):
                    self.urlinfo['keys']['3'].extend(tokens)
        except Exception:
            import traceback
            traceback.print_exc()