def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    sess = db.get_db_session()
    for ruleset in [rset for rset in rules if rset['starturls']]:
        for starturl in ruleset['starturls']:
            have = sess.query(db.WebPages) \
                .filter(db.WebPages.url == starturl) \
                .count()
            if not have:
                netloc = urlFuncs.getNetLoc(starturl)
                new = db.WebPages(
                    url               = starturl,
                    starturl          = starturl,
                    netloc            = netloc,
                    type              = ruleset['type'],
                    priority          = db.DB_IDLE_PRIORITY,
                    distance          = db.DB_DEFAULT_DIST,
                    normal_fetch_mode = ruleset['normal_fetch_mode'],
                )
                print("Missing start-url for address: '{}'".format(starturl))
                sess.add(new)
                try:
                    sess.commit()
                except sqlalchemy.exc.SQLAlchemyError:
                    print("Failure inserting start url for address: '{}'".format(starturl))
                    sess.rollback()

    sess.close()
    db.delete_db_session()

def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    with common.database.session_context() as sess:
        for ruleset in [rset for rset in rules if rset['starturls'] and rset['rewalk_disabled'] is False]:
            for starturl in ruleset['starturls']:
                have = sess.query(db.WebPages) \
                    .filter(db.WebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = db.WebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        type=ruleset['type'],
                        priority=db.DB_IDLE_PRIORITY,
                        distance=db.DB_DEFAULT_DIST,
                        normal_fetch_mode=ruleset['normal_fetch_mode'],
                        epoch=0,
                    )
                    print("Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)
                    try:
                        sess.commit()
                    except sqlalchemy.exc.SQLAlchemyError:
                        print("Failure inserting start url for address: '{}'".format(starturl))
                        sess.rollback()

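
# Usage sketch (assumption): the shape of the rule dicts below is inferred from
# the keys the function reads ('starturls', 'rewalk_disabled', 'type',
# 'normal_fetch_mode'); the real rule loader may supply more fields, and the
# example values here are purely illustrative.
example_rules = [
    {
        'starturls'         : ['https://www.example.org/'],
        'rewalk_disabled'   : False,
        'type'              : 'western',
        'normal_fetch_mode' : True,
    },
]
# initializeStartUrls(example_rules)
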
def exposed_fetch(url, debug=True, rss_debug=False):
    '''
    Do a synchronous fetch of content from url `url`.
    '''

    # try:
    #     WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    #     pass

    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    new = db.WebPages(
        url=url,
        starturl=root,
        netloc=parsed.netloc,
        distance=50000,
        is_text=True,
        priority=500000,
        type='unknown',
        fetchtime=datetime.datetime.now(),
    )
    if debug:
        print(new)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            archiver.synchronousJobRequest(url, ignore_cache=True)
    except Exception:
        traceback.print_exc()

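
# Illustration (standard library only) of the `root` derivation above: keeping
# only the scheme and netloc from urlparse() and rebuilding with urlunparse()
# yields the bare site root.
#
#     >>> parsed = urllib.parse.urlparse("https://www.example.org/some/page?q=1")
#     >>> urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
#     'https://www.example.org'
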
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content urls from each feed item.
    '''

    # RssFeedPost attributes:
    #     id
    #     type
    #     feed_id
    #     contenturl
    #     contentid
    #     title
    #     contents
    #     updated
    #     published
    #     tag_rel
    #     author_rel
    #     tags
    #     author

    urls = set()
    with db.session_context() as sess:
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))

        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            if post.contenturl.startswith("tag:blogger.com"):
                continue
            if post.contenturl and '#comment_' not in post.contenturl:
                urls.add(post.contenturl)
            if post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)

                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                #     print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)

            # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)
    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, [])
            urld[nl].append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for key, urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=key,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )
                for chunk in chunks(urls, 500):
                    archiver.upsertResponseLinks(job, plain=chunk, resource=[], debug=True, interactive=True)
    except Exception:
        traceback.print_exc()

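
# `chunks()` is used above but not defined in this snippet. A minimal sketch,
# assuming it just yields fixed-size slices of a list (the real helper may
# differ):
def chunks(lst, n):
    # Yield successive n-sized slices from lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
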
def retrigger_page(self, release_url):
    trigger_priority = db.DB_MED_PRIORITY

    if self.db_sess is None:
        return

    while 1:
        try:
            have = self.db_sess.query(db.WebPages) \
                .filter(db.WebPages.url == release_url) \
                .scalar()

            # If we don't have the page, either defer to the normal new-link
            # upsert mechanism (when no job was passed in) or insert it here.
            if not have:
                if 'job' not in self.kwargs:
                    self.log.warning("Cannot upsert URL due to no job passed to filters!")
                    self.log.info("New (deferring): '%s'", release_url)
                    return

                url_netloc = urllib.parse.urlsplit(release_url).netloc

                assert release_url.startswith("http")
                assert url_netloc

                self.log.info("New: '%s'", release_url)
                new = db.WebPages(
                    url=release_url,
                    starturl=self.kwargs['job'].starturl,
                    netloc=url_netloc,
                    distance=self.kwargs['job'].distance + 1,
                    is_text=True,
                    priority=self.kwargs['job'].priority,
                    type=self.kwargs['job'].type,
                    state="new",
                    addtime=datetime.datetime.now(),
                    epoch=WebMirror.misc.get_epoch_for_url(release_url),
                )
                self.db_sess.add(new)
                self.db_sess.commit()
                break

            # Also, don't reset the row if it's in-progress.
            if (have.state in ['new', 'fetching', 'processing', 'removed']
                    and have.priority <= trigger_priority
                    and have.distance > 1
                    and have.epoch <= WebMirror.misc.get_epoch_for_url(release_url)):
                self.log.info("Skipping: '%s' (%s, %s)", release_url, have.state, have.priority)
                break

            self.log.info("Retriggering page '%s' (%s, %s)", release_url, have.state, have.priority)
            have.state = 'new'
            have.epoch = WebMirror.misc.get_epoch_for_url(release_url) - 2
            have.distance = 1
            have.priority = trigger_priority
            self.db_sess.commit()
            break

        except sqlalchemy.exc.InvalidRequestError:
            print("InvalidRequest error!")
            self.db_sess.rollback()
            traceback.print_exc()
        except sqlalchemy.exc.OperationalError:
            print("Operational error!")
            self.db_sess.rollback()
        except sqlalchemy.exc.IntegrityError:
            print("[retrigger_page] -> Integrity error!")
            traceback.print_exc()
            self.db_sess.rollback()
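
# `WebMirror.misc.get_epoch_for_url()` is not shown in this snippet. The logic
# above only relies on it mapping a URL to an integer "rewalk epoch" that grows
# over time. A hypothetical stand-in for local testing might look like the
# sketch below; the real implementation almost certainly differs (e.g. by
# varying the interval per netloc).
def get_epoch_for_url_stub(url, rewalk_interval_days=7):
    # Bucket "today" into fixed-width intervals; rows stamped with an older
    # bucket than the current one are considered stale.
    import datetime
    return datetime.date.today().toordinal() // rewalk_interval_days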