Esempio n. 1
0
def initializeRawStartUrls():
	"""
	Make sure every start URL of every active raw-archiver module has a row
	in the RawWebPages table.

	Walks RawArchiver.RawActiveModules.ACTIVE_MODULES, asks each module for
	its start URLs, and inserts a RawWebPages row (idle priority, default
	distance) for any URL that is not yet present. A commit is attempted
	after every URL; if it raises, the failure is printed and the session is
	rolled back so the remaining URLs can still be processed.
	"""
	print("Initializing all start URLs in the database")
	with common.database.session_context() as sess:
		# Lazily flatten the module -> start-url nesting, keeping the order.
		pending_urls = (
			address
			for archiver in RawArchiver.RawActiveModules.ACTIVE_MODULES
			for address in archiver.get_start_urls()
		)

		for address in pending_urls:
			page_table = common.database.RawWebPages
			lookup = sess.query(page_table).filter(page_table.url == address)
			match_count = lookup.count()

			if not match_count:
				host = urlFuncs.getNetLoc(address)
				record = page_table(
					url=address,
					starturl=address,
					netloc=host,
					priority=common.database.DB_IDLE_PRIORITY,
					distance=common.database.DB_DEFAULT_DIST,
				)
				print("Missing start-url for address: '{}'".format(address))
				sess.add(record)

			# The commit is attempted for every URL, whether or not a row was added.
			try:
				sess.commit()
			except Exception:
				print("Failure inserting start url for address: '{}'".format(address))

				sess.rollback()
Esempio n. 2
0
def initializeStartUrls(rules):
    """
    Make sure every start URL named in `rules` has a row in the WebPages table.

    Args:
        rules: iterable of ruleset mappings. Each one is read for the keys
            'starturls', 'rewalk_disabled', 'type' and 'normal_fetch_mode'.
            Only rulesets with a truthy 'starturls' value and with
            'rewalk_disabled' set to exactly False are processed.

    For each start URL that has no matching WebPages row, a new row is added
    with idle priority, default distance and epoch 0. A commit is attempted
    after every URL; a SQLAlchemy error during commit is printed and the
    session rolled back so the remaining URLs are still processed.
    """
    print("Initializing all start URLs in the database")
    with common.database.session_context() as sess:
        active_rulesets = [
            rset for rset in rules
            if rset['starturls'] and rset['rewalk_disabled'] is False
        ]
        for ruleset in active_rulesets:
            for starturl in ruleset['starturls']:
                have = sess.query(db.WebPages) \
                    .filter(db.WebPages.url == starturl) \
                    .count()
                if not have:
                    netloc = urlFuncs.getNetLoc(starturl)
                    new = db.WebPages(
                        url=starturl,
                        starturl=starturl,
                        netloc=netloc,
                        type=ruleset['type'],
                        priority=db.DB_IDLE_PRIORITY,
                        distance=db.DB_DEFAULT_DIST,
                        normal_fetch_mode=ruleset['normal_fetch_mode'],
                        epoch=0,
                    )
                    print(
                        "Missing start-url for address: '{}'".format(starturl))
                    sess.add(new)
                try:
                    sess.commit()
                # SQLAlchemyError is defined in sqlalchemy.exc; referencing it
                # through the top-level package would fail while handling the
                # error instead of reaching the rollback below.
                except sqlalchemy.exc.SQLAlchemyError:
                    print(
                        "Failure inserting start url for address: '{}'".format(
                            starturl))

                    sess.rollback()
Esempio n. 3
0
def initializeStartUrls(rules):
	"""
	Make sure every start URL named in `rules` has a row in the WebPages table.

	Args:
		rules: iterable of ruleset mappings. Each one is read for the keys
			'starturls', 'type' and 'normal_fetch_mode'. Rulesets with a
			falsy 'starturls' value are skipped.

	For each start URL that has no matching WebPages row, a new row is added
	with idle priority and default distance. A commit is attempted after
	every URL; a SQLAlchemy error during commit is printed and the session
	rolled back so the remaining URLs are still processed.

	The session obtained from db.get_db_session() is closed and released via
	db.delete_db_session() even if an unexpected exception escapes the loop.
	"""
	print("Initializing all start URLs in the database")
	sess = db.get_db_session()
	try:
		for ruleset in [rset for rset in rules if rset['starturls']]:
			for starturl in ruleset['starturls']:
				have = sess.query(db.WebPages) \
					.filter(db.WebPages.url == starturl) \
					.count()
				if not have:
					netloc = urlFuncs.getNetLoc(starturl)
					new = db.WebPages(
							url               = starturl,
							starturl          = starturl,
							netloc            = netloc,
							type              = ruleset['type'],
							priority          = db.DB_IDLE_PRIORITY,
							distance          = db.DB_DEFAULT_DIST,
							normal_fetch_mode = ruleset['normal_fetch_mode'],
						)
					print("Missing start-url for address: '{}'".format(starturl))
					sess.add(new)
				try:
					sess.commit()
				# SQLAlchemyError is defined in sqlalchemy.exc; referencing it
				# through the top-level package would fail while handling the
				# error instead of reaching the rollback below.
				except sqlalchemy.exc.SQLAlchemyError:
					print("Failure inserting start url for address: '{}'".format(starturl))

					sess.rollback()
	finally:
		# Always release the session, including when a query or row
		# construction raises something the inner handler does not catch.
		sess.close()
		db.delete_db_session()
Esempio n. 4
0
def initializeStartUrls(rules):
	"""
	Insert a WebPages row for each start URL in `rules` that lacks one.

	Args:
		rules: iterable of ruleset mappings. The keys 'starturls', 'type'
			and 'normal_fetch_mode' are read from each; a ruleset whose
			'starturls' value is falsy is ignored.

	New rows get idle priority and default distance. The session is
	committed after each URL. If the commit raises a SQLAlchemy error, a
	message is printed and the session is rolled back, then processing
	continues with the next URL.

	Cleanup (sess.close() followed by db.delete_db_session()) runs in a
	`finally` block so the session is not leaked when the loop raises.
	"""
	print("Initializing all start URLs in the database")
	sess = db.get_db_session()
	try:
		for ruleset in [rset for rset in rules if rset['starturls']]:
			for starturl in ruleset['starturls']:
				have = sess.query(db.WebPages) \
					.filter(db.WebPages.url == starturl) \
					.count()
				if not have:
					netloc = urlFuncs.getNetLoc(starturl)
					new = db.WebPages(
							url               = starturl,
							starturl          = starturl,
							netloc            = netloc,
							type              = ruleset['type'],
							priority          = db.DB_IDLE_PRIORITY,
							distance          = db.DB_DEFAULT_DIST,
							normal_fetch_mode = ruleset['normal_fetch_mode'],
						)
					print("Missing start-url for address: '{}'".format(starturl))
					sess.add(new)
				try:
					sess.commit()
				# The exception class lives in sqlalchemy.exc, so it is
				# referenced through that module to keep the rollback path
				# reachable when a commit fails.
				except sqlalchemy.exc.SQLAlchemyError:
					print("Failure inserting start url for address: '{}'".format(starturl))

					sess.rollback()
	finally:
		# Release the session regardless of how the loop exits.
		sess.close()
		db.delete_db_session()
Esempio n. 5
0
def initializeRawStartUrls():
	"""
	Make sure every start URL of every active raw-archiver module has a row
	in the RawWebPages table.

	Iterates RawArchiver.RawActiveModules.ACTIVE_MODULES and each module's
	get_start_urls(). A URL with no matching RawWebPages row gets a new row
	with idle priority and default distance. A commit is attempted after
	every URL; any exception from the commit is printed and the session is
	rolled back so the remaining URLs are still processed.

	The session obtained from common.database.get_db_session() is closed and
	released via common.database.delete_db_session() even if an exception
	escapes the loop.
	"""
	print("Initializing all start URLs in the database")
	sess = common.database.get_db_session()
	try:
		for module in RawArchiver.RawActiveModules.ACTIVE_MODULES:
			for starturl in module.get_start_urls():
				have = sess.query(common.database.RawWebPages) \
					.filter(common.database.RawWebPages.url == starturl) \
					.count()
				if not have:
					netloc = urlFuncs.getNetLoc(starturl)
					new = common.database.RawWebPages(
							url               = starturl,
							starturl          = starturl,
							netloc            = netloc,
							priority          = common.database.DB_IDLE_PRIORITY,
							distance          = common.database.DB_DEFAULT_DIST,
						)
					print("Missing start-url for address: '{}'".format(starturl))
					sess.add(new)
				try:
					sess.commit()
				except Exception:
					print("Failure inserting start url for address: '{}'".format(starturl))

					sess.rollback()
	finally:
		# Always release the session, including when the existence query or
		# row construction raises outside the commit handler.
		sess.close()
		common.database.delete_db_session()