Ejemplo n.º 1
0
Archivo: home.py Proyecto: ICCV/chaos
 def parse(self, response):
     """Sort the links of an index page into sub-indexes and new articles.

     Links whose URL is one of the page's known sub-index URLs are pushed
     (de-duplicated) onto ``SUB_INDEX_WORKER_KEY``.  Any other link whose
     stripped URL matches one of the site's article patterns is treated as
     an article; article URLs that have no stored ``Article`` yet are
     pushed onto ``TOP_LEVEL_ARTICLES_WORKER_KEY`` and inserted as new
     ``Article`` documents with source ``'Home'`` and category
     ``['Top Stories']``.

     :param response: the fetched page; only ``response.url`` is read here
         directly, the links come from ``self.extract_urls(response)``.
     :return: None.
     """
     page_url = response.url
     extracted = self.extract_urls(response)
     known_subindexes = SiteRule.get_subindex_urls_by_url(page_url)
     article_patterns = Site.get_article_patterns(page_url)

     subindex_hits = []
     article_hits = []
     for link in extracted:
         if link.url in known_subindexes:
             subindex_hits.append(link.url)
             continue
         # One entry per matching pattern; the set() calls below collapse
         # any duplicates this produces.
         article_hits.extend(
             link.url
             for pattern in article_patterns
             if re.match(pattern, link.url.strip())
         )

     if subindex_hits:
         self.server.rpush(SUB_INDEX_WORKER_KEY, *set(subindex_hits))

     if not article_hits:
         return

     stored = Article.objects(source_url__in=article_hits)
     stored_urls = set(article.source_url for article in stored)
     new_urls = set(article_hits) - stored_urls
     if not new_urls:
         return

     self.server.rpush(TOP_LEVEL_ARTICLES_WORKER_KEY, *set(new_urls))
     site_name = Site.get_name_by_url(page_url)
     fresh_articles = []
     for article_url in set(new_urls):
         fresh_articles.append(
             Article(source_url=article_url, source='Home',
                     site_name=site_name, site_url=page_url,
                     category=['Top Stories']))
     Article.objects.insert(fresh_articles)
Ejemplo n.º 2
0
def main():
    """Rebuild the SiteRule collection from ``scripts/sub_or_article.csv``.

    Every non-blank line must contain exactly five comma-separated fields:
    title, site URL, sub-index URL, sub-index title and sub-index category.
    Fields are stripped of surrounding whitespace before use.  Rows whose
    sub-index title is ``Home`` are skipped.  A site URL that does not
    contain ``http`` gets an ``http://`` prefix, and one trailing slash is
    removed.  The sub-index title is transcoded from GB18030 to UTF-8.  A
    category of ``Null`` leaves ``category_name`` unset.

    Raises:
        IOError: if the CSV file cannot be opened; this now happens before
            the existing collection is dropped.
        ValueError: if a non-blank line does not have exactly five fields.
    """
    # Open the input first so that a missing file does not leave an emptied
    # collection behind; ``with`` also closes the file if a row fails.
    with open('scripts/sub_or_article.csv', 'r') as fin:
        SiteRule.drop_collection()
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing one) carry no rule.
                continue

            # Strip every field up front so the 'Home' / 'Null' checks and the
            # trailing-slash removal see the same values that get stored.
            fields = [field.strip() for field in line.split(',')]
            title, site_url, sub_index_url, sub_index_title, sub_index_cate = fields

            if sub_index_title == 'Home':
                continue

            if 'http' not in site_url:
                site_url = 'http://%s' % site_url
            site_url = re.sub(r'/$', '', site_url)

            # The source file stores titles as GB18030 bytes; convert once.
            sub_index_title_utf8 = sub_index_title.decode('gb18030').encode('utf8')

            rules = SiteRule()
            rules.title = title
            rules.site_url = site_url
            rules.sub_index_title = sub_index_title_utf8
            rules.sub_index_url = sub_index_url
            if sub_index_cate != 'Null':
                rules.category_name = sub_index_cate

            # Single-argument print: same space-separated output as before.
            print('%s %s %s %s %s' % (title, site_url, sub_index_cate,
                                      sub_index_title_utf8, sub_index_url))
            rules.save()
Ejemplo n.º 3
0
def main():
    """Attach article URL patterns to SiteRule documents by title.

    Reads ``scripts/article_pattern_precise.txt``, where every non-blank
    line has the form ``title###pattern``.  Each SiteRule whose ``title``
    appears in the file has its ``article_patterns`` replaced by a
    one-element list holding that pattern, and is then saved.  If a title
    occurs more than once in the file, the last occurrence wins.

    Raises:
        IOError: if the pattern file cannot be opened.
        ValueError: if a non-blank line contains no ``###`` separator.
    """
    url_pattern = {}
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open('scripts/article_pattern_precise.txt', 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing one) carry no mapping.
                continue
            # Split on the first separator only, so a pattern that itself
            # contains '###' is kept intact instead of breaking the unpack.
            title, pattern = line.split('###', 1)
            url_pattern[title] = pattern

    for siterule in SiteRule.objects():
        if siterule.title not in url_pattern:
            continue
        print(siterule.title)
        siterule.article_patterns = [url_pattern[siterule.title]]
        siterule.save()