Example #1
def initializeStartUrls(rules):
	print("Initializing all start URLs in the database")
	sess = db.get_db_session()
	for ruleset in [rset for rset in rules if rset['starturls']]:
		for starturl in ruleset['starturls']:
			have = sess.query(db.WebPages) \
				.filter(db.WebPages.url == starturl)   \
				.count()
			if not have:
				netloc = urlFuncs.getNetLoc(starturl)
				new = db.WebPages(
						url               = starturl,
						starturl          = starturl,
						netloc            = netloc,
						type              = ruleset['type'],
						priority          = db.DB_IDLE_PRIORITY,
						distance          = db.DB_DEFAULT_DIST,
						normal_fetch_mode = ruleset['normal_fetch_mode'],
					)
				print("Missing start-url for address: '{}'".format(starturl))
				sess.add(new)
			# Commit after each URL so a failed insert only
			# rolls back that one row.
			try:
				sess.commit()
			except sqlalchemy.SQLAlchemyError:
				print("Failure inserting start url for address: '{}'".format(starturl))

				sess.rollback()
	sess.close()
	db.delete_db_session()
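
For context, `initializeStartUrls` only reads the `starturls`, `type`, and `normal_fetch_mode` keys from each ruleset. A minimal invocation sketch with entirely hypothetical values (the real rulesets come from the project's rule loader):

# Hypothetical ruleset; only the keys the function actually reads are shown.
rules = [
    {
        'starturls':         ['https://example.org/'],
        'type':              'unknown',
        'normal_fetch_mode': True,
    },
]

initializeStartUrls(rules)
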
Example #2
def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    with common.database.session_context() as sess:
        for ruleset in [
                rset for rset in rules
                if rset['starturls'] and rset['rewalk_disabled'] is False
        ]:
            for starturl in ruleset['starturls']:
                have = sess.query(db.WebPages) \
                    .filter(db.WebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = db.WebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        type=ruleset['type'],
                        priority=db.DB_IDLE_PRIORITY,
                        distance=db.DB_DEFAULT_DIST,
                        normal_fetch_mode=ruleset['normal_fetch_mode'],
                        epoch=0,
                    )
                    print(
                        "Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)
                try:
                    sess.commit()
                except sqlalchemy.SQLAlchemyError:
                    print(
                        "Failure inserting start url for address: '{}'".format(
                            starturl))

                    sess.rollback()
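
Example #2 replaces the manual `get_db_session()`/`close()`/`delete_db_session()` bookkeeping of Example #1 with a `session_context()` context manager. Its implementation isn't shown here; a minimal sketch of the usual shape of such a helper, assuming a SQLAlchemy sessionmaker (the engine URL and factory below are placeholders, not the project's real configuration):

import contextlib

import sqlalchemy
import sqlalchemy.orm

# Hypothetical engine/session factory; the real project wires these
# up inside its db module.
engine = sqlalchemy.create_engine("sqlite://")
SessionFactory = sqlalchemy.orm.sessionmaker(bind=engine)

@contextlib.contextmanager
def session_context():
    # Hand out a session, commit on clean exit, roll back on error,
    # and always close the session afterwards.
    sess = SessionFactory()
    try:
        yield sess
        sess.commit()
    except Exception:
        sess.rollback()
        raise
    finally:
        sess.close()
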
Example #3
def exposed_fetch(url, debug=True, rss_debug=False):
    '''
    Do a synchronous fetch of content from url `url`.
    '''

    # try:
    # 	WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    # 	pass

    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    # Built only for debug display below; the SiteArchiver call
    # performs the actual fetch.
    new = db.WebPages(
        url=url,
        starturl=root,
        netloc=parsed.netloc,
        distance=50000,
        is_text=True,
        priority=500000,
        type='unknown',
        fetchtime=datetime.datetime.now(),
    )

    if debug:
        print(new)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            archiver.synchronousJobRequest(url, ignore_cache=True)
    except Exception:
        traceback.print_exc()
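
The `root` computation in `exposed_fetch` keeps only the scheme and netloc of the parsed URL, discarding path, query, and fragment; that stripped-down form is used as the job's start-URL. A quick standard-library-only illustration:

import urllib.parse

parsed = urllib.parse.urlparse("https://example.org/some/page?q=1#frag")
root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
print(root)  # -> https://example.org
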
Example #4
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content urls from each feed item.
    '''

    # RssFeedPost attributes:
    # 	id
    # 	type
    # 	feed_id
    # 	contenturl
    # 	contentid
    # 	title
    # 	contents
    # 	updated
    # 	published
    # 	tag_rel
    # 	author_rel
    # 	tags
    # 	author

    urls = set()
    with db.session_context() as sess:
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))
        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            if post.contenturl.startswith("tag:blogger.com"):
                continue

            if post.contenturl and '#comment_' not in post.contenturl:
                urls.add(post.contenturl)

            if post.contents and post.contents not in ('Disabled?', 'wat'):
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)
                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                # 	print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)
            # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)

    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, []).append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for key, nl_urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = nl_urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse(
                    (parsed[0], parsed[1], "", "", "", ""))

                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=key,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )
                for chunk in chunks(nl_urls, 500):
                    archiver.upsertResponseLinks(job,
                                                 plain=chunk,
                                                 resource=[],
                                                 debug=True,
                                                 interactive=True)

    except Exception:
        traceback.print_exc()
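
Example #4 pushes the per-netloc URL lists through a `chunks()` helper in batches of 500, but the helper itself isn't shown. It is presumably the usual fixed-size slicing generator; a minimal sketch under that assumption:

def chunks(seq, chunk_size):
    # Yield successive chunk_size-sized slices of seq; the final
    # slice may be shorter than chunk_size.
    for idx in range(0, len(seq), chunk_size):
        yield seq[idx:idx + chunk_size]
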
Example #5
    def retrigger_page(self, release_url):

        trigger_priority = db.DB_MED_PRIORITY

        if self.db_sess is None:
            return
        while True:
            try:
                have = self.db_sess.query(db.WebPages) \
                    .filter(db.WebPages.url == release_url) \
                    .scalar()

                # If we don't have the page, ignore
                # it as the normal new-link upsert mechanism
                # will add it.
                if not have:
                    if 'job' not in self.kwargs:
                        self.log.warning(
                            "Cannot upsert URL due to no job passed to filters!"
                        )
                        self.log.info("New (deferring): '%s'", release_url)
                        return

                    url_netloc = urllib.parse.urlsplit(release_url).netloc

                    assert release_url.startswith("http")
                    assert url_netloc

                    self.log.info("New: '%s'", release_url)
                    new = db.WebPages(
                        url=release_url,
                        starturl=self.kwargs['job'].starturl,
                        netloc=url_netloc,
                        distance=self.kwargs['job'].distance + 1,
                        is_text=True,
                        priority=self.kwargs['job'].priority,
                        type=self.kwargs['job'].type,
                        state="new",
                        addtime=datetime.datetime.now(),
                        epoch=WebMirror.misc.get_epoch_for_url(release_url),
                    )
                    self.db_sess.add(new)
                    self.db_sess.commit()

                    break

                # Also, don't reset if it's in-progress
                if (have.state in ['new', 'fetching', 'processing', 'removed']
                        and have.priority <= trigger_priority
                        and have.distance > 1
                        and have.epoch <= WebMirror.misc.get_epoch_for_url(release_url)):
                    self.log.info("Skipping: '%s' (%s, %s)", release_url,
                                  have.state, have.priority)
                    break

                self.log.info("Retriggering page '%s' (%s, %s)", release_url,
                              have.state, have.priority)
                have.state = 'new'
                have.epoch = WebMirror.misc.get_epoch_for_url(release_url) - 2
                have.distance = 1
                have.priority = trigger_priority
                self.db_sess.commit()
                break

            except sqlalchemy.exc.InvalidRequestError:
                print("InvalidRequest error!")
                self.db_sess.rollback()
                traceback.print_exc()
            except sqlalchemy.exc.OperationalError:
                print("Operational error!")
                self.db_sess.rollback()
            except sqlalchemy.exc.IntegrityError:
                print("[upsertRssItems] -> Integrity error!")
                traceback.print_exc()
                self.db_sess.rollback()
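
Structurally, Example #5 is a retry loop: transient SQLAlchemy errors roll the session back and re-run the whole read-check-write sequence, while a successful commit (or a deliberate skip) breaks out. Stripped of the domain logic, the pattern reduces to something like the following sketch (`mutate` is a hypothetical callable standing in for the query-and-update body):

import traceback

import sqlalchemy.exc

def commit_with_retry(sess, mutate):
    # Re-run the read/modify step from scratch after every rollback,
    # until a commit goes through.
    while True:
        try:
            mutate(sess)
            sess.commit()
            return
        except (sqlalchemy.exc.InvalidRequestError,
                sqlalchemy.exc.OperationalError,
                sqlalchemy.exc.IntegrityError):
            traceback.print_exc()
            sess.rollback()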