def initializeRawStartUrls():
    """Make sure each raw-archiver start URL has a ``RawWebPages`` row.

    Iterates the start URLs returned by ``get_start_urls()`` on every module
    in ``RawArchiver.RawActiveModules.ACTIVE_MODULES``. For each URL with no
    existing row (matched on ``RawWebPages.url``), a new row is added using
    ``DB_IDLE_PRIORITY`` and ``DB_DEFAULT_DIST``. A commit is attempted per
    URL; if it raises, a message is printed and the session is rolled back
    so the remaining URLs are still processed.
    """
    print("Initializing all start URLs in the database")

    database = common.database
    with database.session_context() as sess:
        for module in RawArchiver.RawActiveModules.ACTIVE_MODULES:
            for url in module.get_start_urls():
                lookup = sess.query(database.RawWebPages)
                lookup = lookup.filter(database.RawWebPages.url == url)
                existing_count = lookup.count()

                if existing_count == 0:
                    netloc = urlFuncs.getNetLoc(url)
                    row = database.RawWebPages(
                        url=url,
                        starturl=url,
                        netloc=netloc,
                        priority=database.DB_IDLE_PRIORITY,
                        distance=database.DB_DEFAULT_DIST,
                    )
                    print("Missing start-url for address: '{}'".format(url))
                    sess.add(row)

                # NOTE(review): the commit runs for every URL, including ones
                # that already had a row -- confirm this nesting is intended.
                try:
                    sess.commit()
                except Exception:
                    # Best-effort: report and keep going with the next URL.
                    print("Failure inserting start url for address: '{}'".format(url))
                    sess.rollback()
def initializeStartUrls(rules):
    """Make sure every start URL of the active rulesets has a ``WebPages`` row.

    Args:
        rules: Iterable of ruleset mappings. Each one is read for the keys
            ``'starturls'``, ``'rewalk_disabled'``, ``'type'`` and
            ``'normal_fetch_mode'``. Rulesets with no start URLs, or whose
            ``'rewalk_disabled'`` is anything other than ``False``, are
            skipped.

    For each start URL with no existing row (matched on ``WebPages.url``), a
    new row is added with ``DB_IDLE_PRIORITY``, ``DB_DEFAULT_DIST`` and
    ``epoch=0``. A commit is attempted per URL; a SQLAlchemy error is
    reported on stdout and rolled back so the remaining URLs are still
    processed.
    """
    print("Initializing all start URLs in the database")
    with common.database.session_context() as sess:
        active_rulesets = [
            rset for rset in rules
            if rset['starturls'] and rset['rewalk_disabled'] is False
        ]
        for ruleset in active_rulesets:
            for starturl in ruleset['starturls']:
                have = sess.query(db.WebPages) \
                    .filter(db.WebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = db.WebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        type=ruleset['type'],
                        priority=db.DB_IDLE_PRIORITY,
                        distance=db.DB_DEFAULT_DIST,
                        normal_fetch_mode=ruleset['normal_fetch_mode'],
                        epoch=0,
                    )
                    print(
                        "Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)

                # NOTE(review): the commit runs for every URL, including ones
                # that already had a row -- confirm this nesting is intended.
                try:
                    sess.commit()
                # SQLAlchemyError lives in sqlalchemy.exc; the top-level
                # package does not export it, so the previous
                # `sqlalchemy.SQLAlchemyError` raised AttributeError exactly
                # when a commit failed, and the rollback never ran.
                except sqlalchemy.exc.SQLAlchemyError:
                    print(
                        "Failure inserting start url for address: '{}'".format(
                            starturl))
                    sess.rollback()
def initializeStartUrls(rules):
    """Make sure every start URL of the given rulesets has a ``WebPages`` row.

    Args:
        rules: Iterable of ruleset mappings. Each one is read for the keys
            ``'starturls'``, ``'type'`` and ``'normal_fetch_mode'``.
            Rulesets with no start URLs are skipped.

    For each start URL with no existing row (matched on ``WebPages.url``), a
    new row is added with ``DB_IDLE_PRIORITY`` and ``DB_DEFAULT_DIST``. A
    commit is attempted per URL; a SQLAlchemy error is reported on stdout and
    rolled back so the remaining URLs are still processed.

    The session from ``db.get_db_session()`` is closed and
    ``db.delete_db_session()`` is called even if the loop raises.
    """
    print("Initializing all start URLs in the database")
    sess = db.get_db_session()
    try:
        for ruleset in [rset for rset in rules if rset['starturls']]:
            for starturl in ruleset['starturls']:
                have = sess.query(db.WebPages) \
                    .filter(db.WebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = db.WebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        type=ruleset['type'],
                        priority=db.DB_IDLE_PRIORITY,
                        distance=db.DB_DEFAULT_DIST,
                        normal_fetch_mode=ruleset['normal_fetch_mode'],
                    )
                    print("Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)

                # NOTE(review): the commit runs for every URL, including ones
                # that already had a row -- confirm this nesting is intended.
                try:
                    sess.commit()
                # SQLAlchemyError lives in sqlalchemy.exc; the top-level
                # package does not export it, so the previous
                # `sqlalchemy.SQLAlchemyError` raised AttributeError exactly
                # when a commit failed, and the rollback never ran.
                except sqlalchemy.exc.SQLAlchemyError:
                    print("Failure inserting start url for address: '{}'".format(starturl))
                    sess.rollback()
    finally:
        # Release the session on every exit path; previously an exception in
        # the loop (bad ruleset key, query failure, ...) skipped both calls.
        try:
            sess.close()
        finally:
            db.delete_db_session()
def initializeRawStartUrls():
    """Make sure each raw-archiver start URL has a ``RawWebPages`` row.

    Iterates the start URLs returned by ``get_start_urls()`` on every module
    in ``RawArchiver.RawActiveModules.ACTIVE_MODULES``. For each URL with no
    existing row (matched on ``RawWebPages.url``), a new row is added using
    ``DB_IDLE_PRIORITY`` and ``DB_DEFAULT_DIST``. A commit is attempted per
    URL; if it raises, a message is printed and the session is rolled back
    so the remaining URLs are still processed.

    The session from ``common.database.get_db_session()`` is closed and
    ``common.database.delete_db_session()`` is called even if the loop
    raises.
    """
    print("Initializing all start URLs in the database")
    sess = common.database.get_db_session()
    try:
        for module in RawArchiver.RawActiveModules.ACTIVE_MODULES:
            for starturl in module.get_start_urls():
                have = sess.query(common.database.RawWebPages) \
                    .filter(common.database.RawWebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = common.database.RawWebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        priority=common.database.DB_IDLE_PRIORITY,
                        distance=common.database.DB_DEFAULT_DIST,
                    )
                    print("Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)

                # NOTE(review): the commit runs for every URL, including ones
                # that already had a row -- confirm this nesting is intended.
                try:
                    sess.commit()
                except Exception:
                    # Best-effort: report and keep going with the next URL.
                    print("Failure inserting start url for address: '{}'".format(starturl))
                    sess.rollback()
    finally:
        # Release the session on every exit path; previously an exception in
        # the loop (module failure, query failure, ...) skipped both calls.
        try:
            sess.close()
        finally:
            common.database.delete_db_session()