def extract_content(self):
    """Assemble the text content of the current document.

    When the page contains a ``tCompany_text`` element, only that element's
    text (passed through ``htmlfind.remove_tag``) is returned.  Otherwise the
    result is the "#"-terminated texts of the spans found in the first
    ``jtag inbox`` div, minus the last span, followed by the
    ``bmsg job_msg inbox`` node passed through ``htmlfind.remove_tag``.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    company = re.search(r'tCompany_text">(.*?)</div>', page, re.S)
    if company:
        return htmlfind.remove_tag(company.group(1), True)

    content = ''
    tag_divs = htmlfind.findTag(page, 'div', 'class="jtag inbox">')
    if tag_divs:
        found = re.findall(r'<span[^<>]*>(.*?)</span>', tag_divs[0], re.S)
        # The last span holds the update time and is ignored.
        for span in found[:-1]:
            content += htmlfind.remove_tag(span, True) + "#"
    if isinstance(content, unicode):
        content = content.encode('utf-8')

    # The message block is searched in the document content as stored,
    # not in the utf-8 encoded copy used above.
    finder = htmlfind(self.get_cur_doc().cur_content,
                      '<div class="bmsg job_msg inbox">', 0)
    message = htmlfind.remove_tag(finder.get_node(), 1)
    if isinstance(message, unicode):
        message = message.encode('utf-8')
    return content + message
def extract_content(self):
    """Return the text of the first table that mentions the industry label.

    Tables are scanned in order.  The first one containing the Chinese
    "industry" label is passed through ``htmlfind.remove_tag`` and becomes
    the result; a table containing ``Industry:`` instead marks the page as
    English, records the id via ``_save_not_need_cv`` and stops the scan.

    Returns None when the page carries either of the two "not open to
    headhunters / not open to companies" notices; in the latter case the
    id is also appended to ``_not_access_by_qiye``.  Returns '' when no
    table matched.
    """
    html = self.get_cur_doc().cur_content
    if isinstance(html, unicode):
        html = html.encode('utf-8')

    content = ''
    for table in htmlfind.findTag(html, 'table'):
        if r'所在行业:' in table:
            content = htmlfind.remove_tag(table, True)
            break
        # NOTE(review): both operands of this `or` are identical as written;
        # one of them may have been meant to test a different spelling.
        if r'Industry:' in table or r'Industry:' in table:
            print("Ignore..... is English page!")
            self._save_not_need_cv(self.get_cur_doc().cur_jdid)
            break

    # Notice: "sorry, this resume is set as not open to headhunters".
    if r'抱歉,该简历已经设置为对猎头顾问不开放!' in html:
        print("Ignore..... can not access by lietou")
        return None

    # Notice: the candidate has set the resume as not open to companies.
    if r'该简历人才已经设置了对企业不开放简历,可能该人才已经找到工作,或者暂时没有换工作的意向。' in html:
        print("Ignore..... can not access by qiye")
        self._not_access_by_qiye.append(self.get_cur_doc().cur_jdid)
        return None

    return content
def extract_content(doc):
    """Join the paragraph texts of the first ``class="in"`` div in *doc*.

    Each ``<p>`` body is passed through ``htmlfind.remove_tag`` and followed
    by "#".  A unicode result is encoded to utf-8.  Returns '' when no such
    div is found.
    """
    containers = htmlfind.findTag(doc, 'div', 'class="in"')
    if not containers:
        return ''

    paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', containers[0], re.S)
    content = ''.join(
        [htmlfind.remove_tag(paragraph, True) + "#" for paragraph in paragraphs])
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    return content
def extract_content(doc):
    """Join the span texts of the first ``company_intro_text`` div in *doc*.

    Each ``<span>`` body is passed through ``htmlfind.remove_tag`` and
    followed by "#".  A unicode result is encoded to utf-8.  Returns ''
    when no such div is found.
    """
    intro_divs = htmlfind.findTag(doc, 'div', 'class="company_intro_text"')
    if not intro_divs:
        return ''

    span_bodies = re.findall(r'<span[^<>]*>(.*?)</span>', intro_divs[0], re.S)
    content = ''.join(
        [htmlfind.remove_tag(body, True) + "#" for body in span_bodies])
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    return content
def extract_content(self):
    """Return the contact list text followed by the job-intention field.

    The first ``contact-list`` ``<ul>`` and the first ``field`` ``<div>``
    containing the Chinese "job intention" label are each passed through
    ``htmlfind.remove_tag`` and concatenated.

    Returns None when the page has no ``contact-list`` element.  When no
    ``field`` div carries the label, only the contact list text is
    returned.
    """
    cur_content = self.get_cur_doc().cur_content
    if isinstance(cur_content, unicode):
        cur_content = cur_content.encode('utf-8')

    contact_lists = htmlfind.findTag(cur_content, 'ul', 'contact-list')
    if not contact_lists:
        return None
    contact_text = htmlfind.remove_tag(contact_lists[0], True)

    # Default to an empty second part: previously, when no field matched,
    # the whole list returned by findTag was handed to remove_tag instead
    # of a single field.
    intention_text = ''
    for field in htmlfind.findTag(cur_content, 'div', 'field'):
        if r'求职意向' in field:
            intention_text = htmlfind.remove_tag(field, True)
            break

    return contact_text + intention_text
def extract_content(self):
    """Assemble the text content of the current document.

    The result is the "#"-terminated texts of the spans found in the first
    ``jtag inbox`` div, minus the last span, followed by the
    ``bmsg job_msg inbox`` node passed through ``htmlfind.remove_tag``.
    Unicode pieces are encoded to utf-8 before they are joined.
    """
    header = ''
    tag_divs = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                'class="jtag inbox">')
    if tag_divs:
        found = re.findall(r'<span[^<>]*>(.*?)</span>', tag_divs[0], re.S)
        # The last span holds the update time and is ignored.
        for span in found[:-1]:
            header += htmlfind.remove_tag(span, True) + "#"
    if isinstance(header, unicode):
        header = header.encode('utf-8')

    finder = htmlfind(self.get_cur_doc().cur_content,
                      '<div class="bmsg job_msg inbox">', 0)
    message = htmlfind.remove_tag(finder.get_node(), 1)
    if isinstance(message, unicode):
        message = message.encode('utf-8')
    return header + message
def extract_content(self):
    """Return the text of the first ``posMsg borb`` div of the current page.

    The div is passed through ``htmlfind.remove_tag``.  If it cannot be
    extracted (for example because no such div was found), the page is
    logged through ``Log.errorbin`` and None is returned.
    """
    matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                               'posMsg borb')
    try:
        return htmlfind.remove_tag(matches[0], 1)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not reported as an invalid
        # page and silently swallowed.
        Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        return None
def extract_content(self):
    """Return the job description block of the current page, or None.

    The page is searched for the ``content content-word`` element that
    follows the Chinese "job description" label inside the
    ``job-main main-message`` section; its body is passed through
    ``htmlfind.remove_tag``.  None is returned when the pattern is absent.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    description = re.search(
        r'"job-main main-message ">.*?职位描述:.*?"content content-word">(.*?)</div>',
        page, re.S)
    if not description:
        return None
    return htmlfind.remove_tag(description.group(1), 1)
def extract_content(self):
    """Return the text of the ``comTinyDes`` table of the current page.

    The text obtained from the ``htmlfind`` lookup is passed through
    ``htmlfind.remove_tag``.  Any exception raised while extracting is
    printed together with the document id and None is returned.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    finder = htmlfind(page, '<table class="comTinyDes">', 0)
    try:
        return htmlfind.remove_tag(finder.get_text(), 1)
    except Exception as e:
        print("co_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
        return None
def extract_content(self):
    """Join the cell texts of the first ``i_table`` table of the page.

    Every ``<td>`` body of every ``<tr>`` is passed through
    ``htmlfind.remove_tag`` and followed by "#".  A unicode result is
    encoded to utf-8.  The result is printed before it is returned; it is
    '' when no such table is found.
    """
    content = ''
    tables = htmlfind.findTag(self.get_cur_doc().cur_content, 'table',
                              'class="i_table"')
    if tables:
        rows = re.findall(r'<tr[^<>]*>(.*?)</tr>', tables[0], re.S)
        for row in rows:
            for cell in re.findall(r'<td[^<>]*>(.*?)</td>', row, re.S):
                content = content + (htmlfind.remove_tag(cell, True) + "#")
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    print(content)
    return content
def extract_content(self):
    """Return the text of the ``inforBase`` div of the current page.

    The text obtained from the ``htmlfind`` lookup is passed through
    ``htmlfind.remove_tag``.  Any exception raised while extracting is
    printed together with the document id and None is returned.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    finder = htmlfind(page, '<div class="inforBase">', 0)
    try:
        return htmlfind.remove_tag(finder.get_text(), True)
    except Exception as e:
        print("cv_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
        return None
def page_time(self):
    """Return the page time derived from the ``last-modified`` span.

    The first matching span is passed through ``htmlfind.remove_tag``,
    encoded to utf-8 if it is unicode, and handed to
    ``TimeHandler.fmt_time``.

    Raises whatever the extraction raised (for example when no span was
    found), after logging the page through ``Log.errorbin``.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    modified_spans = htmlfind.findTag(page, 'span', 'class="last-modified"')
    try:
        stamp = htmlfind.remove_tag(modified_spans[0], 1)
    except:
        # Log the offending page, then let the original error propagate.
        Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        raise

    if isinstance(stamp, unicode):
        stamp = stamp.encode('utf-8')
    return TimeHandler.fmt_time(stamp)
def extract_content(self):
    """Return the text of the ``j-edit hasVist dlli mb10`` list of the page.

    Returns None, after printing the document id, when the page carries the
    "page may have been deleted or is temporarily unavailable" notice.
    Otherwise the text obtained from the ``htmlfind`` lookup is passed
    through ``htmlfind.remove_tag``; any exception raised while extracting
    is printed and None is returned.
    """
    page = self.get_cur_doc().cur_content
    if isinstance(page, unicode):
        page = page.encode('utf-8')

    if r'你查找的页面可能已被删除、或暂时不可用!' in page:
        print("jobid: %r, is not valid or deleted" % self.get_cur_doc().cur_jdid)
        return None

    finder = htmlfind(page, '<dl class="j-edit hasVist dlli mb10">', 0)
    try:
        return htmlfind.remove_tag(finder.get_text(), 1)
    except Exception as e:
        print("co_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
        return None
def extract_content(self):
    """Return the summary items plus the ``tab-inner-cont`` text, or None.

    The ``<strong>`` bodies of the first ``terminal-ul clearfix`` list are
    passed through ``htmlfind.remove_tag`` and "#"-terminated, skipping the
    third one.  The text between ``<div class="tab-inner-cont">`` and the
    next ``</button>`` is appended with its tags removed and surrounding
    whitespace stripped.  When that section is missing, the URL is logged
    through ``Log.error`` and None is returned.
    """
    summary = ''
    summary_lists = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                                     'class="terminal-ul clearfix"')
    if len(summary_lists) > 0:
        items = re.findall(r'<strong[^<>]*>(.*?)</strong>', summary_lists[0],
                           re.S)
        for position, item in enumerate(items):
            # The third item is the update time and is skipped.
            if position != 2:
                summary += htmlfind.remove_tag(item, True) + "#"

    # Neither pattern contains a backslash, so a plain unicode literal has
    # the same value as a raw one.
    detail = re.search(u'<div class="tab-inner-cont">(.*?)</button>',
                       self.get_cur_doc().cur_content, re.S)
    if not detail:
        Log.error(self.get_cur_doc().cur_url, "no content")
        return None

    body = re.sub(u'<[a-zA-Z/!][^<>]*>', '', detail.group(1))
    return summary + body.strip()
def page_time(self):
    """Return the page time derived from the ``headTag`` list.

    The first matching ``<ul>`` is passed through ``htmlfind.remove_tag``
    and encoded to utf-8 if it is unicode.  When the text does not contain
    the Chinese "days ago" marker, the current time in milliseconds is
    returned.  Otherwise the text must contain three runs of digits; the
    first run is handed to ``TimeHandler.getTimeOfNDayBefore``.

    Raises whatever the extraction raised, after logging the page through
    ``Log.errorbin``, and raises Exception when the marker is present but
    the digit pattern does not match.
    """
    head_tags = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                                 'class="headTag"')
    try:
        text = htmlfind.remove_tag(head_tags[0], 1)
    except:
        # Log the offending page, then let the original error propagate.
        Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        raise

    if isinstance(text, unicode):
        text = text.encode('utf-8')

    if "天前" not in text:
        return int(time.time() * 1000)

    numbers = re.search(r'(\d+).*?(\d+).*?(\d+)', text, re.S)
    if numbers:
        return TimeHandler.getTimeOfNDayBefore(numbers.group(1))
    raise Exception("not copy time pattern: {}".format(text))
#!/usr/bin/env python
# -*- coding:utf8 -*-
# Scratch script: runs the span / message / publish-date lookups against a
# local ``test.html`` and prints the publish-date match.
import re

from spider.util import htmlfind, TimeHandler

with open('test.html', 'rb') as f:
    content = f.read()

divs = htmlfind.findTag(content, 'div', 'class="jtag inbox">')
if divs:
    # The last span holds the update time and is ignored.
    spans = re.findall(r'<span[^<>]*>(.*?)</span>', divs[0], re.S)[:-1]
    for span in spans:
        # NOTE(review): the span texts are appended to the page content
        # itself, which is what the later searches run against.
        content += htmlfind.remove_tag(span, True) + "#"
if isinstance(content, unicode):
    content = content.encode('utf-8')

# These lookups are executed but their results are not printed.
hf = htmlfind(content, '<div class="bmsg job_msg inbox">', 0)
t2 = htmlfind.remove_tag(hf.get_node(), 1)
find = re.search(r'tCompany_text">(.*?)</div>', content, re.S)
# print htmlfind.remove_tag(find.group(1), 1)

# Matches a date followed by the Chinese "published" marker.
s = re.search(r'(\d*-?\d+-\d+发布)', content, re.S)
print(htmlfind.remove_tag(s.group(1), True))
def extract_content(self):
    """Return the text of the first ``rich-text`` description div.

    The first div matching ``class="rich-text" itemprop="description"`` is
    passed through ``htmlfind.remove_tag``.  When nothing matched, the
    (empty) lookup result itself is returned unchanged.
    """
    matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                               'class="rich-text" itemprop="description"')
    if not matches or len(matches) <= 0:
        return matches
    return htmlfind.remove_tag(matches[0], 1)