def parse(self, response):
    url = response.url
    links = self.extract_urls(response)

    article_urls = []
    subindexes = []
    subindex_urls = SiteRule.get_subindex_urls_by_url(url)
    patterns = Site.get_article_patterns(url)

    # Split the extracted links into sub-index pages and article pages.
    for link in links:
        if link.url in subindexes or link.url in subindex_urls:
            subindexes.append(link.url)
        else:
            for pattern in patterns:
                if re.match(pattern, link.url.strip()):
                    article_urls.append(link.url)
                    break  # one match is enough; avoid duplicate appends

    # Queue sub-index pages for the sub-index worker.
    if subindexes:
        self.server.rpush(SUB_INDEX_WORKER_KEY, *set(subindexes))

    if article_urls:
        # Keep only URLs that are not already stored as articles.
        articles = Article.objects(source_url__in=article_urls)
        existing_article_urls = [article.source_url for article in articles]
        new_urls = set(article_urls) - set(existing_article_urls)
        if new_urls:
            self.server.rpush(TOP_LEVEL_ARTICLES_WORKER_KEY, *new_urls)
            name = Site.get_name_by_url(url)
            Article.objects.insert([
                Article(source_url=article_url, source='Home',
                        site_name=name, site_url=url,
                        category=['Top Stories'])
                for article_url in new_urls
            ])
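# A minimal sketch of the documents and Redis keys the spider above relies on.
# The field names, key values, and helper bodies below are assumptions inferred
# from the calls in parse(), not the project's actual definitions; Site is a
# separate assumed document providing get_article_patterns / get_name_by_url.
from mongoengine import Document, StringField, ListField

SUB_INDEX_WORKER_KEY = 'worker:sub_index'          # hypothetical key name
TOP_LEVEL_ARTICLES_WORKER_KEY = 'worker:articles'  # hypothetical key name

class Article(Document):
    source_url = StringField(unique=True)
    source = StringField()
    site_name = StringField()
    site_url = StringField()
    category = ListField(StringField())

class SiteRule(Document):
    title = StringField()
    site_url = StringField()
    sub_index_title = StringField()
    sub_index_url = StringField()
    category_name = StringField()
    article_patterns = ListField(StringField())

    @classmethod
    def get_subindex_urls_by_url(cls, url):
        # Assumed behaviour: all sub-index URLs registered for a site.
        return [rule.sub_index_url for rule in cls.objects(site_url=url)]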
def main():
    SiteRule.drop_collection()
    with open('scripts/sub_or_article.csv', 'r') as fin:
        for line in fin:
            title, site_url, sub_index_url, sub_index_title, sub_index_cate = \
                line.strip().split(',')
            if sub_index_title == 'Home':
                continue
            # Normalize the site URL: add a scheme, drop a trailing slash.
            if 'http' not in site_url:
                site_url = 'http://%s' % site_url
            site_url = re.sub('/$', '', site_url)

            rule = SiteRule()
            rule.title = title.strip()
            rule.site_url = site_url.strip()
            # The CSV is GB18030-encoded; store the title as UTF-8.
            rule.sub_index_title = sub_index_title.strip().decode('gb18030').encode('utf8')
            rule.sub_index_url = sub_index_url.strip()
            if sub_index_cate != 'Null':
                rule.category_name = sub_index_cate.strip()
            print title, site_url, sub_index_cate, \
                sub_index_title.decode('gb18030').encode('utf8'), sub_index_url
            rule.save()
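# A plain split(',') above breaks if any field itself contains a comma. A
# minimal sketch of a safer variant using the stdlib csv module, assuming the
# same five-column layout (everything else unchanged):
import csv

with open('scripts/sub_or_article.csv', 'rb') as fin:  # csv wants binary mode on Python 2
    for row in csv.reader(fin):
        if len(row) != 5:
            continue  # skip malformed rows instead of crashing on unpacking
        title, site_url, sub_index_url, sub_index_title, sub_index_cate = row
        # ... same normalization and SiteRule construction as above ...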
def main():
    # Load precise article URL patterns keyed by site title ('title###pattern').
    url_pattern = {}
    with open('scripts/article_pattern_precise.txt', 'r') as fin:
        for line in fin:
            title, pattern = line.strip().split('###')
            url_pattern[title] = pattern

    # Attach the matching pattern to each stored SiteRule.
    for siterule in SiteRule.objects():
        if siterule.title in url_pattern:
            print siterule.title
            siterule.article_patterns = [url_pattern[siterule.title]]
            siterule.save()
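# The loop above issues one fetch-and-save round trip per document. A sketch of
# the same update pushed server-side with a MongoEngine queryset update,
# assuming the same url_pattern dict:
for title, pattern in url_pattern.items():
    SiteRule.objects(title=title).update(set__article_patterns=[pattern])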