def extract_content(doc):
    """Collect paragraph text from the company-introduction div of ``doc``.

    Looks up the first ``div`` tagged ``class="company-introduction clearfix"``
    and passes the inner HTML of each ``<p>`` element through
    ``htmlfind.remove_tag``, appending ``#`` after every paragraph.

    Returns an empty string when no such div is found; a ``unicode`` result
    is encoded to UTF-8 before being returned.
    """
    blocks = htmlfind.findTag(doc, 'div', 'class="company-introduction clearfix"')
    if not blocks:
        return ''

    paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', blocks[0], re.S)
    text = ''.join([htmlfind.remove_tag(para, True) + "#" for para in paragraphs])
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return text
def extract_content(self):
    """Collect span text from the ``company_intro_text`` div of the current page.

    The inner HTML of every ``<span>`` inside the first matching div is
    passed through ``htmlfind.remove_tag`` and followed by ``#``.

    Returns an empty string when the div is absent; a ``unicode`` result is
    encoded to UTF-8 before being returned.
    """
    blocks = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="company_intro_text"')
    if not blocks:
        return ''

    fragments = re.findall(r'<span[^<>]*>(.*?)</span>', blocks[0], re.S)
    text = ''.join([htmlfind.remove_tag(frag, True) + "#" for frag in fragments])
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return text
def extract_content(self):
    """Collect paragraph text from the ``class="in"`` div of the current page.

    The inner HTML of every ``<p>`` inside the first matching div is passed
    through ``htmlfind.remove_tag`` and followed by ``#``.

    Returns an empty string when the div is absent; a ``unicode`` result is
    encoded to UTF-8 before being returned.
    """
    containers = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="in"')
    if not containers:
        return ''

    paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', containers[0], re.S)
    text = ''.join([htmlfind.remove_tag(para, True) + "#" for para in paragraphs])
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return text
def extract_content(self):
    """Return the text of the first ``posMsg borb`` div of the current page.

    The first match from ``htmlfind.findTag`` is passed through
    ``htmlfind.remove_tag``.

    Returns:
        The value produced by ``htmlfind.remove_tag``, or ``None`` when the
        div cannot be extracted (the page is then dumped via
        ``Log.errorbin``).
    """
    content = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'posMsg borb')
    try:
        content = htmlfind.remove_tag(content[0], 1)
    except Exception:
        # Only ordinary errors mean "bad page"; KeyboardInterrupt and
        # SystemExit must keep propagating instead of becoming None.
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
    return content
def run_job(self, jobid):
    """Fetch the lagou.com job page for ``jobid`` and store it.

    Nothing happens when ``page_store.check_should_fetch`` rejects the id.
    When the request yields ``None`` the job is re-queued through
    ``re_add_job`` and the failure is logged.  Otherwise the page is saved
    with the current Unix timestamp; a page containing a ``position_del``
    div is additionally reported on stdout but is still saved.

    Args:
        jobid: identifier substituted into the job URL.
    """
    if not self.page_store.check_should_fetch(jobid):
        return

    url = "http://www.lagou.com/jobs/{}.html".format(jobid)
    res = self.speed_control_requests.with_sleep_requests(url, sleep=0.1)

    # The failure case has to be handled before res.text is touched;
    # otherwise a None response raises AttributeError and the job is never
    # re-queued.
    if res is None:
        self.re_add_job(jobid)
        Log.error("failed get url", url)
        return

    if htmlfind.findTag(res.text, 'div', 'position_del'):
        print("jobid: {} match nothing".format(jobid))
    self.page_store.save(int(time.time()), jobid, url, res.text)
def extract_content(self):
    """Flatten the cells of the ``i_table`` table on the current page.

    Every ``<td>`` of every ``<tr>`` in the first matching table is passed
    through ``htmlfind.remove_tag`` and followed by ``#``.  The result is
    printed to stdout and returned.

    Returns an empty string when the table is absent; a ``unicode`` result
    is encoded to UTF-8 before being printed and returned.
    """
    pieces = []
    tables = htmlfind.findTag(self.get_cur_doc().cur_content, 'table', 'class="i_table"')
    if tables:
        rows = re.findall(r'<tr[^<>]*>(.*?)</tr>', tables[0], re.S)
        for row in rows:
            for cell in re.findall(r'<td[^<>]*>(.*?)</td>', row, re.S):
                pieces.append(htmlfind.remove_tag(cell, True) + "#")

    text = ''.join(pieces)
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    print(text)
    return text
def page_time(self):
    """Extract the publication time of the current page.

    Two page layouts are tried in order:

    1. a labelled ``<dd class="text_dd">`` entry following the
       publish-date ``</dt>`` label;
    2. a date ending in the "published" marker inside the first
       ``class="jtag inbox"`` div.

    The captured text is handed to ``TimeHandler.fmt_time``.  Returns
    ``None`` when neither layout matches.
    """
    html = self.get_cur_doc().cur_content
    if isinstance(html, unicode):
        # The patterns below are byte strings, so match against UTF-8 bytes.
        html = html.encode('utf-8')

    labelled = re.search(r'发布日期:</dt>.*?<dd class="text_dd">(.*?)</dd>', html, re.S)
    if labelled:
        return TimeHandler.fmt_time(labelled.group(1))

    boxes = htmlfind.findTag(html, 'div', 'class="jtag inbox"')
    if not boxes:
        return None

    stamped = re.search(r'(\d*-?\d+-\d+发布)', boxes[0])
    if not stamped:
        return None
    return TimeHandler.fmt_time(stamped.group(1))
def page_time(self):
    """Extract the publication time from the ``last-modified`` span.

    The first ``span`` tagged ``class="last-modified"`` is passed through
    ``htmlfind.remove_tag`` and the result is handed to
    ``TimeHandler.fmt_time``.

    Raises:
        Whatever the extraction raised (for example when no span is found);
        the page is dumped via ``Log.errorbin`` before re-raising.
    """
    html = self.get_cur_doc().cur_content
    if isinstance(html, unicode):
        html = html.encode('utf-8')

    matches = htmlfind.findTag(html, 'span', 'class="last-modified"')
    try:
        stamp = htmlfind.remove_tag(matches[0], 1)
    except:
        # Log the offending page, then let the original error propagate.
        Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        raise

    if isinstance(stamp, unicode):
        stamp = stamp.encode('utf-8')
    return TimeHandler.fmt_time(stamp)
def extract_content(self):
    """Build the job text from the ``job_request`` spans and ``job_bt`` node.

    The inner HTML of every ``<span>`` inside each ``dd class="job_request"``
    element is joined with ``#`` and accumulated; a ``unicode`` accumulator
    is encoded to UTF-8.  The text of the ``class="job_bt"`` node is then
    appended.

    Returns:
        The combined text, or ``None`` when the ``job_bt`` text cannot be
        appended (the page is then dumped via ``Log.errorbin``).
    """
    content = ''
    jobbt = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="job_bt"', 0)
    job_request = htmlfind.findTag(self.get_cur_doc().cur_content, 'dd', 'class="job_request"')
    for e in job_request:
        tags = re.findall(r'<span[^<>]*>(.*?)</span>', e)
        content += '#'.join(tags)
    if isinstance(content, unicode):
        content = content.encode('utf-8')

    try:
        content += jobbt.get_text()
    except Exception:
        # Only ordinary errors mean "bad page"; KeyboardInterrupt and
        # SystemExit must keep propagating instead of becoming None.
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
    return content
def extract_content(self):
    """Combine the tag spans and the job-message div of the current page.

    The header part is built from the ``<span>`` elements of the first
    ``class="jtag inbox">`` div, each passed through ``htmlfind.remove_tag``
    and followed by ``#``; the last span is left out.  The body part is the
    ``<div class="bmsg job_msg inbox">`` node passed through
    ``htmlfind.remove_tag``.  Both parts are encoded to UTF-8 when they are
    ``unicode`` and returned concatenated.
    """
    header = ''
    tag_boxes = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="jtag inbox">')
    if tag_boxes:
        labels = re.findall(r'<span[^<>]*>(.*?)</span>', tag_boxes[0], re.S)
        # The trailing span is the update time and is ignored.
        for label in labels[:-1]:
            header += htmlfind.remove_tag(label, True) + "#"
    if isinstance(header, unicode):
        header = header.encode('utf-8')

    finder = htmlfind(self.get_cur_doc().cur_content, '<div class="bmsg job_msg inbox">', 0)
    body = htmlfind.remove_tag(finder.get_node(), 1)
    if isinstance(body, unicode):
        body = body.encode('utf-8')
    return header + body
def extract_content(self): content = '' uls = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul', 'class="terminal-ul clearfix"') if len(uls): strongs = re.findall(r'<strong[^<>]*>(.*?)</strong>', uls[0], re.S) for index, strong in enumerate(strongs): if 2 == index: # updateTime 忽略 continue content += htmlfind.remove_tag(strong, True) + "#" m = re.search(ur'''<div class="tab-inner-cont">(.*?)</button>''', self.get_cur_doc().cur_content, re.S) if m: a = re.sub(ur'<[a-zA-Z/!][^<>]*>', '', m.group(1)) content += a.strip() return content Log.error(self.get_cur_doc().cur_url, "no content") return None
def page_time(self):
    """Derive the publication time from the ``headTag`` list of the page.

    The first ``ul class="headTag"`` is passed through
    ``htmlfind.remove_tag``.  When the resulting text lacks the
    "days ago" marker, the current time in milliseconds is returned.
    Otherwise the first number captured by the pattern is handed to
    ``TimeHandler.getTimeOfNDayBefore``.

    Raises:
        Whatever the extraction raised (the page is dumped via
        ``Log.errorbin`` first), or ``Exception`` when the marker is
        present but the number pattern does not match.
    """
    matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul', 'class="headTag"')
    try:
        label = htmlfind.remove_tag(matches[0], 1)
    except:
        # Log the offending page, then let the original error propagate.
        Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        raise

    if isinstance(label, unicode):
        label = label.encode('utf-8')

    if "天前" in label:
        numbers = re.search('(\d+).*?(\d+).*?(\d+)', label, re.S)
        if numbers:
            return TimeHandler.getTimeOfNDayBefore(numbers.group(1))
        raise Exception("not copy time pattern: {}".format(label))
    return int(time.time() * 1000)
#!/usr/bin/env python
# -*- coding:utf8 -*-
"""Scratch script exercising the htmlfind helpers against ``test.html``.

Reads the file, appends the cleaned tag spans of the first
``class="jtag inbox">`` div to the loaded text, looks up the job-message
div and the company text, and finally prints the first "published" date
found in the text.
"""
import re

from spider.util import htmlfind, TimeHandler

with open('test.html', 'rb') as f:
    content = f.read()

divs = htmlfind.findTag(content, 'div', 'class="jtag inbox">')
if divs:
    spans = re.findall(r'<span[^<>]*>(.*?)</span>', divs[0], re.S)
    if spans:
        # The trailing span is the update time and is ignored.
        spans = spans[:-1]
        for span in spans:
            # NOTE(review): this appends onto the loaded page text itself,
            # so the searches below run over the page plus the appended
            # spans - confirm that is intended.
            content += htmlfind.remove_tag(span, True) + "#"
if isinstance(content, unicode):
    content = content.encode('utf-8')

hf = htmlfind(content, '<div class="bmsg job_msg inbox">', 0)
t2 = htmlfind.remove_tag(hf.get_node(), 1)

find = re.search(r'tCompany_text">(.*?)</div>', content, re.S)
# print htmlfind.remove_tag(find.group(1), 1)

s = re.search(r'(\d*-?\d+-\d+发布)', content, re.S)
print(htmlfind.remove_tag(s.group(1), True))
def extract_content(self):
    """Return the text of the ``rich-text`` description div of the page.

    The first ``div`` tagged ``class="rich-text" itemprop="description"`` is
    passed through ``htmlfind.remove_tag``.  When nothing is found, the
    empty result of ``htmlfind.findTag`` is returned unchanged.
    """
    matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                               'class="rich-text" itemprop="description"')
    if not matches or len(matches) <= 0:
        return matches
    return htmlfind.remove_tag(matches[0], 1)