def handle(self, website_url, **kwargs):
    # the Wikipedia language editions to check for links to the website
    langs = ["ar", "bg", "ca", "cs", "da", "de", "el", "en", "eo", "es",
             "eu", "fa", "fi", "fr", "he", "hu", "id", "it", "ja", "ko",
             "lt", "ms", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl",
             "sv", "tr", "uk", "vi", "vo", "zh"]
    for lang in langs:
        for src, target in links(website_url, lang=lang):
            print lang, "\t", src.encode('utf8'), "\t", target.encode('utf8')
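Neither snippet shows the links() helper itself. Below is a rough sketch of what such a generator might look like, assuming it talks to the MediaWiki list=exturlusage API: it yields (wikipedia_page_url, target_url) pairs for articles in one language edition whose external links point into website_url. Continuation over large result sets is omitted for brevity, and the project's real helper may well differ.

import json
import urllib
import urllib2

def links(website_url, lang="en"):
    # exturlusage matches against the URL with its protocol stripped
    query = website_url.split("://", 1)[-1]
    params = urllib.urlencode({
        "action": "query",
        "list": "exturlusage",
        "euquery": query,
        "euprop": "title|url",
        "eulimit": 500,
        "format": "json",
    })
    api_url = "http://%s.wikipedia.org/w/api.php?%s" % (lang, params)
    results = json.load(urllib2.urlopen(api_url))
    for hit in results.get("query", {}).get("exturlusage", []):
        # reconstruct the article URL from the page title
        source = "http://%s.wikipedia.org/wiki/%s" % (
            lang, hit["title"].replace(" ", "_"))
        yield source, hit["url"]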
def crawl(website):
    """
    Execute a crawl, but only if it hasn't been started already.
    """
    logging.info("starting crawl for %s" % website.url)
    crawl = m.Crawl(website=website)
    crawl.started = datetime.datetime.now()
    crawl.save()

    # look at all wikipedia pages that reference a particular website
    count = 0
    for source, target in wikipedia.links(website.url):

        # get the wikipedia page
        page, created = m.WikipediaPage.new_from_wikipedia(url=source)
        if created:
            logging.info("created wikipedia page for %s" % source)

        # create the link
        link, created = m.Link.objects.get_or_create(
            website=website,
            wikipedia_page=page,
            target=target)
        if created:
            logging.info("created link: %s -> %s" % (source, target))
        link.last_checked = datetime.datetime.now()
        link.save()

        # keep Django's query log from growing without bound in DEBUG mode
        reset_queries()

        count += 1
        if CRAWL_CUTOFF and count > CRAWL_CUTOFF:
            logging.info("stopping crawl at crawl cutoff: %s" % CRAWL_CUTOFF)
            break

    crawl.finished = datetime.datetime.now()
    crawl.save()
    logging.info("finished crawl for %s" % crawl.website.url)
    return crawl
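For context, here is a rough sketch of the Django models that crawl() appears to rely on. Every field and method name is inferred from how the function uses them, so the project's actual models almost certainly differ in detail.

from django.db import models

class Website(models.Model):
    url = models.URLField()

class Crawl(models.Model):
    website = models.ForeignKey(Website)
    started = models.DateTimeField(null=True)
    finished = models.DateTimeField(null=True)

class WikipediaPage(models.Model):
    url = models.URLField(unique=True)

    @classmethod
    def new_from_wikipedia(cls, url):
        # mirrors the (page, created) tuple the caller unpacks; the real
        # method presumably also fetches page metadata from Wikipedia
        return cls.objects.get_or_create(url=url)

class Link(models.Model):
    website = models.ForeignKey(Website)
    wikipedia_page = models.ForeignKey(WikipediaPage)
    target = models.URLField()
    last_checked = models.DateTimeField(null=True)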