def test_retrieve(url, debug=True, rss_debug=False):

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
	)

	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()

def go(self):
	self.log.info("Fetching URLs via local fetcher!")
	for url in self.urls:
		with db.session_context() as sess:
			archiver = SiteArchiver(None, sess, None)
			archiver.synchronousJobRequest(url, ignore_cache=True, debug=True)

def test(url, debug=True):

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
	)

	if debug:
		print(new)

	archiver = SiteArchiver(None)
	ret = archiver.taskProcess(job_test=new)

	if debug:
		print(archiver)
		print(ret.keys())

	if "plainLinks" in ret and "rsrcLinks" in ret:
		# Looks like a HTML page. Print the relevant info
		print_html_response(archiver, new, ret)

	if "rss-content" in ret:
		print_rss_response(archiver, new, ret)

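# print_html_response() and print_rss_response() are called by test() above but are
# not defined in this section. Below is a minimal sketch of the HTML variant,
# assuming only the 'plainLinks' and 'rsrcLinks' keys that test() checks for; the
# repo's own helper may print considerably more detail. The name is suffixed with
# _sketch to make clear it is illustrative, not the actual implementation.
def print_html_response_sketch(archiver, job, ret):
	# Dump the extracted navigation and resource links for a fetched HTML page.
	print("URL: %s" % job.url)
	print("Plain links (%s):" % len(ret.get("plainLinks", [])))
	for link in ret.get("plainLinks", []):
		print("	->", link)
	print("Resource links (%s):" % len(ret.get("rsrcLinks", [])))
	for link in ret.get("rsrcLinks", []):
		print("	->", link)
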
def test():
	print("Test mode!")
	import logSetup
	import settings
	from WebMirror.Engine import SiteArchiver
	logSetup.initLogging()

	urls = [
		'https://royalroadl.com/api/fiction/updates?apiKey=' + settings.RRL_API_KEY,
		# 'https://royalroadl.com/api/fiction/newreleases?apiKey=' + settings.RRL_API_KEY,
	]

	for url in urls:
		with db.session_context() as sess:
			archiver = SiteArchiver(None, sess, None)
			archiver.synchronousJobRequest(url, ignore_cache=True)

def exposed_fetch(url, debug=True, rss_debug=False):
	'''
	Do a synchronous fetch of content from url `url`.
	'''

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
	)

	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()

def test(url, debug=True, rss_debug=False):
	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
	)

	if debug:
		print(new)

	archiver = SiteArchiver(None)
	archiver.taskProcess(job_test=new)

def exposed_fetch(url, debug=True, rss_debug=False):
	'''
	Do a synchronous fetch of content from url `url`.
	'''

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url=url,
		starturl=root,
		netloc=parsed.netloc,
		distance=50000,
		is_text=True,
		priority=500000,
		type='unknown',
		fetchtime=datetime.datetime.now(),
	)

	if debug:
		print(new)

	try:
		with db.session_context() as sess:
			archiver = SiteArchiver(None, sess, None)
			archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()

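# Usage sketch (not part of the module): exposed_fetch() above is intended to be
# called directly with a target URL for a one-off fetch that bypasses the page
# cache. The URL here is a placeholder, not one taken from the repo.
#
#	exposed_fetch("https://www.example.org/some-page", debug=True)
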
def fetch(url):
	with db.session_context() as sess:
		archiver = SiteArchiver(cookie_lock=None, db_interface=sess, new_job_queue=None)
		archiver.synchronousJobRequest(url, ignore_cache=True, debug=True)

def exposed_retrigger_feed_urls():
	'''
	Retrigger the content urls from each feed item.
	'''

	# RssFeedPost attributes:
	#  id
	#  type
	#  feed_id
	#  contenturl
	#  contentid
	#  title
	#  contents
	#  updated
	#  published
	#  tag_rel
	#  author_rel
	#  tags
	#  author

	# Collect every usable URL mentioned in the stored feed posts, including links
	# extracted from each post's cached contents.
	urls = set()
	with db.session_context() as sess:
		processor = WebMirror.processor.RssProcessor.RssProcessor(
				loggerPath="Main.RssDb",
				pageUrl='http://www.example.org',
				pgContent='',
				type='application/atom+xml',
				transfer=False,
				debug_print=True,
				db_sess=sess,
				write_debug=False)

		print("Loading posts....")
		items = sess.query(db.RssFeedPost).all()
		print("Loaded %s rows" % len(items))

		have_content = [tmp for tmp in items if tmp.contents]
		print("%s rows have content" % len(have_content))

		pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
		for post in pbar:
			if post.contenturl.startswith("tag:blogger.com"):
				continue
			if post.contenturl and '#comment_' not in post.contenturl:
				urls.add(post.contenturl)
			if post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
				soup = WebRequest.as_soup(post.contents)
				# print(post.contents)

				# Make all the page URLs fully qualified, so they're unambiguous
				soup = urlFuncs.canonizeUrls(soup, post.contenturl)

				# pull out the page content and enqueue it. Filtering is
				# done in the parent.
				plainLinks = processor.extractLinks(soup, post.contenturl)
				imageLinks = processor.extractImages(soup, post.contenturl)

				# if plainLinks or imageLinks:
				# 	print((len(plainLinks), len(imageLinks)))

				urls.update(plainLinks)
				urls.update(imageLinks)

			# pbar.set_description("Links: %s" % len(urls))

	# Group the collected URLs by netloc so each site gets a single synthetic job row.
	urls = list(urls)
	urld = {}
	for url in [tmp for tmp in urls if tmp]:
		nl = urllib.parse.urlsplit(url).netloc
		if nl:
			urld.setdefault(nl, [])
			urld[nl].append(url)

	print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

	# rules = WebMirror.rules.load_rules()
	# feeds = [item['feedurls'] for item in rules]
	# feeds = [item for sublist in feeds for item in sublist]
	# url = feeds[0]
	# parsed = urllib.parse.urlparse(url)
	# root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
	# print("Using feed url %s for job base" % url)

	# Upsert the grouped URLs as new crawl links, in batches of 500 per netloc.
	try:
		with db.session_context() as sess:
			archiver = SiteArchiver(None, sess, None)
			for key, urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
				sel_url = urls[0]

				parsed = urllib.parse.urlparse(sel_url)
				root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

				job = db.WebPages(
					url=sel_url,
					starturl=root,
					netloc=key,
					distance=0,
					is_text=True,
					priority=db.DB_LOW_PRIORITY,
					type='unknown',
					fetchtime=datetime.datetime.now(),
				)

				for chunk in chunks(urls, 500):
					archiver.upsertResponseLinks(job, plain=chunk, resource=[], debug=True, interactive=True)

	except Exception as e:
		traceback.print_exc()

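# The chunks() helper used in exposed_retrigger_feed_urls() above is not defined in
# this section; the repo presumably provides its own implementation elsewhere. A
# minimal sketch follows, assuming the conventional "split a list into fixed-size
# slices" behaviour implied by the call chunks(urls, 500).
def chunks(lst, chunk_size):
	# Yield successive chunk_size-sized slices of lst.
	for idx in range(0, len(lst), chunk_size):
		yield lst[idx:idx + chunk_size]
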