Example #1
def test_retrieve(url, debug=True, rss_debug=False):

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

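	# Reduce the target URL to its scheme + netloc, used as the start/root URL for the job.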
	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

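	# Synthetic WebPages row describing the fetch; in this helper it is only used for the debug printout below.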
	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)

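	# Run the fetch synchronously through SiteArchiver, and make sure the DB session is released afterwards.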
	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job     = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
Example #2
	def go(self):
		self.log.info("Fetching URLs via local fetcher!")

		for url in self.urls:
			with db.session_context() as sess:
				archiver = SiteArchiver(None, sess, None)
				archiver.synchronousJobRequest(url, ignore_cache=True, debug=True)
Example #3
def test(url, debug=True):

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)
	archiver = SiteArchiver(None)
	ret = archiver.taskProcess(job_test=new)

	if debug:
		print(archiver)
		print(ret.keys())

		if "plainLinks" in ret and "rsrcLinks" in ret: # Looks like an HTML page. Print the relevant info
			print_html_response(archiver, new, ret)
		if "rss-content" in ret:
			print_rss_response(archiver, new, ret)


Example #4
def test():
    print("Test mode!")
    import logSetup
    import settings
    from WebMirror.Engine import SiteArchiver

    logSetup.initLogging()

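    # Royal Road API endpoint(s) to fetch through the local fetcher.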
    urls = [
        'https://royalroadl.com/api/fiction/updates?apiKey=' +
        settings.RRL_API_KEY,
        # 'https://royalroadl.com/api/fiction/newreleases?apiKey=' + settings.RRL_API_KEY,
    ]

    for url in urls:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            archiver.synchronousJobRequest(url, ignore_cache=True)
Example #5
def exposed_fetch(url, debug=True, rss_debug=False):
	'''
	Do a synchronous fetch of content from url `url`.
	'''

	# try:
	# 	WebMirror.SpecialCase.startAmqpFetcher()
	# except RuntimeError:  # Fetcher already started
	# 	pass

	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)

	try:
		archiver = SiteArchiver(None, db.get_db_session(), None)
		job     = archiver.synchronousJobRequest(url, ignore_cache=True)
	except Exception as e:
		traceback.print_exc()
	finally:
		db.delete_db_session()
Example #6
def test(url, debug=True, rss_debug=False):
	if rss_debug:
		print("Debugging RSS")
		flags.RSS_DEBUG = True

	parsed = urllib.parse.urlparse(url)
	root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

	new = db.WebPages(
		url       = url,
		starturl  = root,
		netloc    = parsed.netloc,
		distance  = 50000,
		is_text   = True,
		priority  = 500000,
		type      = 'unknown',
		fetchtime = datetime.datetime.now(),
		)

	if debug:
		print(new)
	archiver = SiteArchiver(None)
	archiver.taskProcess(job_test=new)
Example #7
def exposed_fetch(url, debug=True, rss_debug=False):
    '''
    Do a synchronous fetch of content from url `url`.
    '''

    # try:
    # 	WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    # 	pass

    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    new = db.WebPages(
        url=url,
        starturl=root,
        netloc=parsed.netloc,
        distance=50000,
        is_text=True,
        priority=500000,
        type='unknown',
        fetchtime=datetime.datetime.now(),
    )

    if debug:
        print(new)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            archiver.synchronousJobRequest(url, ignore_cache=True)
    except Exception as e:
        traceback.print_exc()
Example #8
def fetch(url):
    with db.session_context() as sess:
        archiver = SiteArchiver(cookie_lock=None,
                                db_interface=sess,
                                new_job_queue=None)
        archiver.synchronousJobRequest(url, ignore_cache=True, debug=True)
Example #9
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content urls from each feed item.
    '''

    # RssFeedPost attributes:
    # 	id
    # 	type
    # 	feed_id
    # 	contenturl
    # 	contentid
    # 	title
    # 	contents
    # 	updated
    # 	published
    # 	tag_rel
    # 	author_rel
    # 	tags
    # 	author

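    # Collect candidate URLs: each post's content URL plus any links extracted from its stored contents.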
    urls = set()
    with db.session_context() as sess:
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))
        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

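        # For each post, collect its content URL and any plain/image links found in its stored HTML contents.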
        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            if post.contenturl.startswith("tag:blogger.com"):
                continue

            if post.contenturl and '#comment_' not in post.contenturl:
                urls.add(post.contenturl)

            if post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)
                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                # 	print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)
            # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)

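    # Group the collected URLs by netloc so each site can be enqueued as a single job.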
    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, [])
            urld[nl].append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

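    # For each netloc, build a low-priority synthetic job row and upsert its URLs as plain links in batches of 500.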
    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for key, urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse(
                    (parsed[0], parsed[1], "", "", "", ""))

                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=key,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )
                for chunk in chunks(urls, 500):
                    archiver.upsertResponseLinks(job,
                                                 plain=chunk,
                                                 resource=[],
                                                 debug=True,
                                                 interactive=True)

    except Exception as e:
        traceback.print_exc()
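The `chunks` helper used in Example #9 is not shown in these excerpts. A minimal sketch of the slicing generator it presumably corresponds to (an assumption about its shape, not the project's actual definition):

def chunks(seq, chunk_size):
    # Yield successive chunk_size-sized slices of seq; the final slice may be shorter.
    for i in range(0, len(seq), chunk_size):
        yield seq[i:i + chunk_size]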