def set_aliases(path=None):
    """Instantiate the core HarvestMan objects and register each with SetAlias.

    The objects are created and registered in a fixed order: state
    (configuration) object, logger, data manager, rules checker, network
    connector, URL connector factory, crawler queue and event object.

    Args:
        path: Optional directory to add to ``sys.path`` before the
            HarvestMan modules are imported.  ``None`` (the default)
            leaves ``sys.path`` untouched.

    Returns:
        None.  The function works purely through side effects
        (``sys.path``, module imports and ``SetAlias`` registrations).
    """
    # Identity test for None; also skip paths that are already present so
    # repeated calls do not pile up duplicate sys.path entries.
    if path is not None and path not in sys.path:
        sys.path.append(path)

    # The state object is registered before the remaining modules are
    # imported, exactly as in the original statement order.
    # NOTE(review): presumably those modules expect the configuration alias
    # to exist at import time -- confirm before reordering.
    import config
    SetAlias(config.HarvestManStateObject())

    import datamgr
    import rules
    import connector
    import urlqueue
    import logger
    import event

    SetAlias(logger.HarvestManLogger())

    # Data manager object
    dmgr = datamgr.HarvestManDataManager()
    dmgr.initialize()
    SetAlias(dmgr)

    # Rules checker object
    ruleschecker = rules.HarvestManRulesChecker()
    SetAlias(ruleschecker)

    # Connector manager object
    connmgr = connector.HarvestManNetworkConnector()
    SetAlias(connmgr)

    # Connector factory, built with the 'connections' value read from
    # objects.config.
    conn_factory = connector.HarvestManUrlConnectorFactory(
        objects.config.connections)
    SetAlias(conn_factory)

    # Crawler queue object
    queuemgr = urlqueue.HarvestManCrawlerQueue()
    SetAlias(queuemgr)

    # Event object
    SetAlias(event.HarvestManEvent())
self.csslinks.append(url) self.links.append(url) for item in l3: if not item: continue url = item[1].replace("'", '').replace('"', '') if url not in self.links: self.links.append(url) if __name__ == "__main__": import os import config import logger SetAlias(config.HarvestManStateObject()) SetAlias(logger.HarvestManLogger()) cfg = objects.config cfg.verbosity = 5 SetLogSeverity() cfg.getquerylinks = True p = HarvestManSimpleParser() #p.enable_feature('option') #p = HarvestManSGMLOpParser() urls = ['http://projecteuler.net/index.php?section=problems'] urls = [ 'http://www.evvs.dk/index.php?cPath=30&osCsid=3b110c689f01d722dbbe53c5cee0bf2d'
def __init__(self):
    """Create a HarvestMan state object and keep it on the instance as 'cfg'."""
    # Locate HarvestMan modules (imported at call time, not at module load).
    import config

    state = config.HarvestManStateObject()
    # Written directly into the instance dictionary rather than through
    # normal attribute assignment.
    # NOTE(review): presumably this bypasses a custom __setattr__ on the
    # class -- confirm against the class definition.
    self.__dict__['cfg'] = state