def parse(self, source=None, ruleset=None):
    """
    Parse content according to the ruleset.
    """
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset
    if isinstance(source, bytes):
        try:
            source = utils.decode(source)
        except Exception:
            source = str(source)
    if source and ruleset:
        source = source.strip()
        ll = source[0:1]
        rl = source[-1:]
        if (ll == '{' and rl == '}') or (ll == '[' and rl == ']'):
            # Plain JSON object or array.
            data = self._jsonparse(source, ruleset)
        else:
            # Possibly JSONP: strip the callback wrapper before parsing.
            haslp = source.find('(')
            hasrp = source.rfind(')')
            if haslp == -1 or hasrp == -1:
                raise CDSpiderParserJsonLoadFaild(
                    'Invalid json data: %s' % re.sub(
                        r'\s{2,}', ' ',
                        str(source).replace("\r", "").replace("\n", "")),
                    rule=ruleset)
            data = self._jsonparse(source[haslp + 1:hasrp], ruleset)
        return data
    return source

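# A minimal, self-contained sketch of the JSONP handling above, assuming only
# the standard-library json module; jsonp_loads and the sample payloads are
# hypothetical and exist purely for illustration.
import json

def jsonp_loads(source):
    source = source.strip()
    if (source[:1], source[-1:]) in (('{', '}'), ('[', ']')):
        return json.loads(source)            # plain JSON object or array
    lp, rp = source.find('('), source.rfind(')')
    if lp == -1 or rp == -1:
        raise ValueError('Invalid json data: %s' % source)
    return json.loads(source[lp + 1:rp])     # JSONP: strip the callback wrapper

# jsonp_loads('cb({"title": "demo"})')  -> {'title': 'demo'}
# jsonp_loads('[1, 2, 3]')              -> [1, 2, 3]
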
def parser_time(html, now=False):
    # Date forms: 2021年5月12日 10:30[:00], 2021-5-12 10:30[:00], 2021/5/12 10:30[:00].
    rule1 = r'((?:(?:(?:(?:20)?[012]\d)|(?:19)?[98]\d)年\d{1,2}月\d{1,2}日?(?:\D*\d{2}:\d{2}(?::\d{2})?)?)|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?)|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?))'
    # Date directly after a tag boundary.
    rule11 = r'\>\s*' + rule1 + r'[\s&\<]'
    # Date preceded by a label such as 发布于 / 发表于 / 发布时间 / 日期 / 时间.
    rule12 = r'[\>\s;](?:(?:本文)?发布于|发表于|发布(?:时间|日期)|日期|时间)\s*(?:\:|:)?\s*' + rule1
    # Date followed by 发布.
    rule13 = rule1 + r'\s*发布'
    # Date padded by spaces or non-breaking spaces before a closing tag.
    rule14 = r'[\s\>;]((?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?)|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2}(?:\D*\d{1,2}:\d{2}(?::\d{2})?)?))(?:\040|\xa0)+\<'
    # Date that must carry a time-of-day component.
    rule15 = r'((?:(?:(?:(?:20)?[012]\d)|(?:19)?[98]\d)年\d{1,2}月\d{1,2}日\D?\d{2}:\d{2}(?::\d{2})?)|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)-\d{1,2}-\d{1,2} \d{1,2}:\d{2}(?::\d{2})?)|(?:(?:(?:(?:20)?[012]\d)|(?:19)?[9]\d)\/\d{1,2}\/\d{1,2} \d{1,2}:\d{2}(?::\d{2})?))'
    if isinstance(html, bytes):
        try:
            html = decode(html)
        except Exception:
            html = str(html)
    if html:
        # Try the most specific patterns first.
        g1 = (re.findall(rule12, html) or re.findall(rule13, html)
              or re.findall(rule15, html) or re.findall(rule11, html)
              or re.findall(rule14, html))
        if len(g1) == 1:
            return g1[0]
        elif len(g1) > 1:
            # Prefer the first candidate that carries a time-of-day component.
            for item in g1:
                if item.find(':') > 0:
                    return item
    if now:
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    return None

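# Hedged usage sketch for parser_time; the sample markup is hypothetical.
# A labeled date (rule12) is matched first; with no match at all, now=True
# falls back to the current local time and otherwise None is returned.
def _demo_parser_time():
    assert parser_time('<span>发布时间:2021-05-12 10:30</span>') == '2021-05-12 10:30'
    assert parser_time('<div>no date here</div>') is None
    assert parser_time('<div>no date here</div>', now=True) is not None
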
def parse(self, source=None, ruleset=None):
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset or {}
    # Delegate to the JSON parser when the rule declares a JSON filter.
    if ('filter' in ruleset and ruleset['filter'] and ruleset['filter'].startswith('@json:')) \
            or (((ruleset.get('item') or {}).get("url") or {}).get("filter") or '').startswith('@json'):
        parser = JsonParser(source=source, ruleset=ruleset, logger=self.logger,
                            domain=self.domain, subdomain=self.subdomain)
        return parser.parse()
    g = Goose({
        "target_language": "zh",
        'stopwords_class': StopWordsChinese,
        "enable_fewwords_paragraphs": True,
        "logger": self.logger,
        "domain": self.domain,
        "subdomain": self.subdomain,
        "custom_rule": ruleset
    })
    if isinstance(source, bytes):
        try:
            catalogue = g.fetch(raw_html=utils.decode(source), encoding='UTF8')
        except Exception:
            catalogue = g.fetch(raw_html=source)
    else:
        catalogue = g.fetch(raw_html=source, encoding='UTF8')
    return catalogue.infos

def parse(self, source=None, ruleset=None):
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset or {}
    if "item" in ruleset:
        if 'filter' in ruleset and ruleset['filter'] and ruleset['filter'].startswith('@json:'):
            parser = JsonParser(source=source, ruleset=ruleset, logger=self.logger,
                                domain=self.domain, subdomain=self.subdomain)
            return parser.parse()
        # "item" rules always yield a list of records.
        ruleset['onlyOne'] = 0
    elif ruleset:
        rule = list(ruleset.values())[0]
        if 'filter' in rule and rule['filter'] and rule['filter'].startswith('@json:'):
            parser = JsonParser(source=source, ruleset=ruleset, logger=self.logger,
                                domain=self.domain, subdomain=self.subdomain)
            return parser.parse()
    onlyOne = ruleset.pop('onlyOne', 1)
    g = Goose({
        "target_language": "zh",
        'stopwords_class': StopWordsChinese,
        "enable_fewwords_paragraphs": True,
        "logger": self.logger,
        "domain": self.domain,
        "subdomain": self.subdomain,
        "custom_rule": ruleset,
        "final_url": self.final_url
    })
    if isinstance(source, bytes):
        try:
            catalogue = g.parse(raw_html=utils.decode(source), encoding='UTF8')
        except Exception:
            catalogue = g.parse(raw_html=source)
    else:
        catalogue = g.parse(raw_html=source, encoding='UTF8')
    data = catalogue.infos
    if onlyOne:
        # Collapse to a single record when only one result is expected.
        return data[0] if (isinstance(data, list) and data) else data
    return data

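# Hedged usage sketch: how the onlyOne flag above shapes the return value.
# The rule contents are assumptions for illustration; the real rule schema is
# defined elsewhere in the project.
#
#   parser.parse(ruleset={'title': {'filter': '//h1/text()'}})
#       -> a single record (onlyOne defaults to 1)
#   parser.parse(ruleset={'item': {...}})
#       -> a list of records ("item" rules force onlyOne = 0)
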
def parse(self, source=None, ruleset=None):
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset
    if isinstance(source, bytes):
        try:
            source = decode(source)
        except Exception:
            source = str(source)
    if source and ruleset:
        # Apply each rule in the ruleset to the source independently.
        data = {}
        for key in ruleset:
            data[key] = self._filter(source, ruleset[key])
        return data
    return None

def parse(self, source=None, ruleset=None):
    """
    Parse content according to the ruleset.
    """
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset
    if isinstance(source, bytes):
        try:
            source = decode(source)
        except Exception:
            pass
    if source and ruleset:
        pq = PyQuery(source)
        data = {}
        for k in ruleset:
            if not ruleset[k]:
                # No rule for this key: fall back to the raw source.
                data[k] = source
                continue
            if 'type' in ruleset[k] or 'filter' in ruleset[k]:
                # A single rule dict: apply it directly.
                self.info("Pyquery rule: %s" % str(ruleset[k]))
                data[k] = self._filter(pq, ruleset[k])
                self.info("Pyquery data: %s" % str(data[k]))
            else:
                rule = ruleset[k]
                if isinstance(rule, list):
                    # A list of candidate rules: use the first that matches.
                    for r in rule:
                        self.info("Pyquery rule: %s" % str(r))
                        item = self._filter(pq, r)
                        self.info("Pyquery data: %s" % str(item))
                        if item:
                            data[k] = item
                            break
                else:
                    # A nested dict of sub-rules: apply each one.
                    rest = {}
                    for idx in rule:
                        self.info("Pyquery rule: %s=>%s" % (idx, str(rule[idx])))
                        rest[idx] = self._filter(pq, rule[idx])
                        self.info("Pyquery data: %s=>%s" % (idx, str(rest[idx])))
                    data[k] = rest
        return data
    return source

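# A self-contained sketch of the PyQuery extraction pattern the loop above
# builds on (the _filter rule schema itself is project-specific and not
# reproduced here); the markup and selectors are hypothetical.
from pyquery import PyQuery as _PQ

def _demo_pyquery():
    pq = _PQ('<html><h1>Demo</h1><span class="byline">alice</span></html>')
    return {'title': pq('h1').text(), 'author': pq('.byline').text()}

# _demo_pyquery() -> {'title': 'Demo', 'author': 'alice'}
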
def parse(self, source=None, ruleset=None):
    if not source:
        source = self.source
    if not ruleset:
        ruleset = self.ruleset or {}
    # Keep only sub-rules that actually define a filter.
    item_ruleset = {k: item for k, item in ruleset.items()
                    if item and isinstance(item, dict) and item.get('filter')}
    if item_ruleset:
        rule = list(item_ruleset.values())[0]
        if rule['filter'].startswith('@json:'):
            parser = JsonParser(source=source, ruleset=item_ruleset, logger=self.logger,
                                domain=self.domain, subdomain=self.subdomain)
            return parser.parse()
    local_storage_path = self._settings.get('attach_storage', None)
    g = Goose({
        "target_language": "zh",
        'stopwords_class': StopWordsChinese,
        "enable_fewwords_paragraphs": True,
        "logger": self.logger,
        "domain": self.domain,
        "subdomain": self.subdomain,
        "custom_rule": item_ruleset,
        "local_storage_path": local_storage_path,
        "final_url": self.final_url
    })
    if isinstance(source, bytes):
        try:
            article = g.extract(raw_html=utils.decode(source), encoding='UTF8')
        except UnicodeDecodeError:
            article = g.extract(raw_html=source)
    else:
        article = g.extract(raw_html=source, encoding='UTF8')

    def _raw_content():
        # Normalize the extracted top node into a single string and replace
        # ideographic/non-breaking spaces with plain spaces.
        raw = '\r\n'.join(article.top_node_html) if isinstance(
            article.top_node_html, (list, tuple)) else article.top_node_html
        return re.sub('[\u3000\xa0]', ' ', str(raw))

    data = {}
    for i in ruleset:
        if i == 'title':
            data[i] = article.infos['title']['clean_title']
        elif i == 'content':
            data[i] = article.infos['cleaned_text']
            data["raw_content"] = _raw_content()
        elif i == 'pubtime':
            data[i] = article.infos['publish_date']
        elif i == 'author':
            data[i] = self.get_author(article.infos['authors'])
        else:
            data[i] = article.infos.get(i, None)
    # Fill in defaults for fields the ruleset did not request.
    if 'title' not in data:
        data['title'] = article.infos['title']['clean_title']
    if 'content' not in data:
        data['content'] = article.infos['cleaned_text']
        data["raw_content"] = _raw_content()
    if 'pubtime' not in data:
        data['pubtime'] = article.infos['publish_date']
    if 'author' not in data:
        data['author'] = self.get_author(article.infos['authors'])
    return data

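# Hedged sketch of the record this method returns (values are illustrative);
# title, content, pubtime and author are always filled in, and raw_content is
# attached whenever content is extracted:
#
#   {
#       'title': 'Example headline',
#       'content': 'Cleaned article text ...',
#       'raw_content': '<p>Original top-node HTML ...</p>',
#       'pubtime': '2021-05-12 10:30:00',
#       'author': 'alice',
#   }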