def testWebSiteInfoProvider(self):
    localConfig = readConfig('../config_local.yaml')
    testConfig = readConfig('../config_test.yaml')
    wsInfo = WebSiteInfoProvider(user_agent=localConfig['user_agent'],
                                 max_wait_time_secs=10,
                                 default_crawl_delay=3)
    '''
    Information we need:
    1. the last time we visited the site
    2. the canFetch result, which must stay exposed
    '''
    wsInfo.canFetchOrWait("https://es.wikipedia.org/wiki/Wikipedia:Portada")
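# A minimal sketch of how a provider like WebSiteInfoProvider could expose the
# two pieces of information listed in the docstring above, using the standard
# library's urllib.robotparser. The class and attribute names here are
# hypothetical, not the project's actual implementation.
import time
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

class RobotsInfoSketch:
    def __init__(self, user_agent, default_crawl_delay=3):
        self.user_agent = user_agent
        self.default_crawl_delay = default_crawl_delay
        self.last_visit = {}  # host -> timestamp of the last visit (exposed)

    def can_fetch(self, url):
        # Exposed canFetch result, read straight from the site's robots.txt.
        host = urlparse(url).netloc
        rp = RobotFileParser('https://{}/robots.txt'.format(host))
        rp.read()
        return rp.can_fetch(self.user_agent, url)

    def record_visit(self, url):
        self.last_visit[urlparse(url).netloc] = time.time()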
def testSentenceProcessor(self):
    localConfig = readConfig('../config_local.yaml')
    testConfig = readConfig('../config_test.yaml')
    db = getServer(localConfig)[localConfig["db"]["db"]]
    sp = SentenceProcessorCouch(db)
    sp.addSentence('https://es.wikipedia.org', 'This is a test',
                   ['this', 'is', 'a', 'test'])
    sp.addSentence('https://es.wikipedia.org', 'This is another test',
                   ['this', 'is', 'another', 'test'])
    self.assertTrue(sp.isSentence('https://es.wikipedia.org', 'This is a test'))
    self.assertTrue(sp.isSentence('https://es.wikipedia.org', 'This is another test'))
def readConfig(self):
    for name, value in common.readConfig(Hud.config_name):
        if name == 'background':
            self.backgroundColor = common.hex_to_rgba(value)
        elif name == 'foreground':
            self.foregroundColor = common.hex_to_rgb(value)
        elif name == 'highlight':
            self.highlightColor = common.hex_to_rgb(value)
        else:
            raise Exception('unknown Hud config option: {}'.format(name))
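# Hypothetical sketch of the common.hex_to_rgb / common.hex_to_rgba helpers
# called above; the real ones live in the project's common module. This assumes
# '#rrggbb' / '#rrggbbaa' input and float components in [0, 1].
def hex_to_rgb(value):
    value = value.lstrip('#')
    return tuple(int(value[i:i + 2], 16) / 255.0 for i in (0, 2, 4))

def hex_to_rgba(value):
    value = value.lstrip('#')
    return tuple(int(value[i:i + 2], 16) / 255.0 for i in (0, 2, 4, 6))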
parser = argparse.ArgumentParser(
    description='Consume inbound mails, generating opens, clicks, OOBs and FBLs. '
                'Config file {} must be present in current directory.'.format(configFileName()))
parser.add_argument(
    'directory', type=str,
    help='directory to ingest .msg files, process and delete them')
parser.add_argument(
    '-f', action='store_true',
    help='Keep looking for new files forever (like tail -f does)')
args = parser.parse_args()

cfg = readConfig(configFileName())
logger = createLogger(cfg['Sink'].get('Logfile', baseProgName() + '.log'),
                      cfg['Sink'].getint('Logfile_backupCount', 10))
if args.directory:
    if args.f:
        # Process the inbound directory forever
        while True:
            fnameList = glob.glob(os.path.join(args.directory, '*.msg'))
            if fnameList:
                consumeFiles(logger, fnameList, cfg)
            time.sleep(5)
            cfg = readConfig(configFileName())  # re-read config, in case it has changed
    else:
def setUp(self):
    self.localConfig = readConfig('../config_local.yaml')
    self.testConfig = readConfig('../config_test.yaml')
    year, week, day = t.isocalendar()
    odd_week_offset = (week % nweeks) * 7
    i = day - 1 + odd_week_offset
    return d[i], i


# -----------------------------------------------------------------------------------------
# Main code
# -----------------------------------------------------------------------------------------
# This script should be run once per hour from crontab
if __name__ == "__main__":
    logger = createLogger(baseProgName() + '.log', 10)
    try:
        t = datetime.utcnow()
        cfg = readConfig('consume-mail.ini')
        weekly_cycle_bounce_rate = cfg.get('Weekly_Cycle_Bounce_Rate', '3').split(',')
        today_bounce_rate, x = nWeeklyCycle(weekly_cycle_bounce_rate, t)
        logger.info('Today is day {} (zero based) in the {}-day cycle. Bounce rate will be {}%'
                    .format(x, len(weekly_cycle_bounce_rate), today_bounce_rate))
        filename = '/etc/pmta/config'
        logger.info('Changing line of file {}'.format(filename))
        with open(filename, "r+") as f:
            pmta_cfg = f.readlines()
            pmta_cfg_bounce_param = 'dummy-smtp-blacklist-bounce-percent'
            for i, s in enumerate(pmta_cfg):
                if pmta_cfg_bounce_param in s:
                    pmta_cfg[i] = '{} {}\t# Updated by script {} on {} UTC\n'.format(
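# A hedged reconstruction of the nWeeklyCycle indexing, for illustration only:
# the head of the real function is not shown above, so the signature and the
# derivation of nweeks from len(d) are assumptions. Given a list d with one
# value per day across nweeks whole ISO weeks, it picks today's entry.
from datetime import datetime

def nWeeklyCycleSketch(d, t):
    nweeks = max(1, len(d) // 7)  # assumed: whole weeks, one value per day
    year, week, day = t.isocalendar()
    odd_week_offset = (week % nweeks) * 7
    i = day - 1 + odd_week_offset
    return d[i], i

# e.g. a two-week cycle: week A bounces harder than week B
rates = ['3', '5', '3', '4', '3', '6', '3',
         '2', '2', '2', '2', '2', '2', '2']
print(nWeeklyCycleSketch(rates, datetime.utcnow()))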
def setUp(self):
    self.localConfig = readConfig('../config_local.yaml')
    self.testConfig = readConfig('../config_test.yaml')
    self.db = getServer(self.localConfig)[self.localConfig["db"]["db"]]
    self.up = UrlProcessorCouch(self.db)
    data['sentences'] = []
    for sentenceId in wordData['sentences']:
        sentenceData = db[sentenceId]
        data['sentences'].append({
            'sentence': sentenceData['sentence'],
            'source': sentenceData['source'],
            'date': sentenceData['date']
        })
    file.write(json.dumps(data) + "\n")


if __name__ == '__main__':
    print("corpus extractor v.1.0")
    logging.info("corpus extractor v.1.0")
    (localConfigFile, configFile, loggingFile) = parseArguments(sys.argv[1:])
    setupLogger(loggingFile)
    localConfig = readConfig(localConfigFile)
    server = getServer(localConfig)
    db = getDatabaseConnection(server, localConfig['db']['db'])
    if 'sentence_threshold' in localConfig:
        sentenceThreshold = localConfig['sentence_threshold']
    else:
        sentenceThreshold = getMinimumSentenceThreshold(
            db, 'all_words/sentences_length', threshold=95)
    logging.info(
        f"Processing word entries that have at least {sentenceThreshold} sentences; "
        f"the discarded words will be written to {DISCARDED_WORDS_TXT}")
    processEntries(db, localConfig['corpus_result_dir'], sentenceThreshold)
    print("Finished")
    logging.info("finished")
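# Hypothetical sketch of getMinimumSentenceThreshold, whose body is not shown
# above: collect the per-word sentence counts emitted by the given CouchDB view
# and return the count at the requested percentile. The row layout (sentence
# count as the row value) is an assumption.
def getMinimumSentenceThresholdSketch(db, view_name, threshold=95):
    counts = sorted(row.value for row in db.view(view_name))
    if not counts:
        return 0
    idx = min(int(len(counts) * threshold / 100), len(counts) - 1)
    return counts[idx]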
def readConfig(self):
    for name, command in common.readConfig(Aliases.file_name):
        self.items.append((name, command))
        urls_to_visit.append(child.attrs['href'])
    return urls_to_visit


def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    for url in db.iterview(not_visited_view, 100):
        urlDoc = db[url.id]
        urlDoc['visited'] = False
        db.save(urlDoc)


if __name__ == '__main__':
    print("webcrawler v.1.0")
    (local_config_file, config_file, logging_file) = parseArguments(sys.argv[1:])
    local_config = readConfig(local_config_file)
    setupLogger(logging_file)
    server = getServer(local_config)
    db = server[local_config["db"]["db"]]
    sp = SentenceProcessorCouch(db)
    up = UrlProcessorCouch(db)
    exclusions = ExclusionRules(config_file['exclusion_rules'])
    webSiteInfoProvider = WebSiteInfoProvider(local_config['user_agent'],
                                              max_wait_time_secs=10,
                                              default_crawl_delay=5)
    engine = Engine(local_config['working_hours'], local_config['max_jobs'],
                    local_config['user_agent'],
                    UrlsProviderReal(db, "urls/not_visited"),
                    sp, up, webSiteInfoProvider, exclusions)
    engine.start()
    print("finished")
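# Hypothetical sketch of the "urls/not_visited" design document that
# UrlsProviderReal and set_urls_as_not_visited rely on. The 'visited' field is
# confirmed by the code above; the 'type' field and the map function are
# assumptions, shown only to make the view's contract explicit.
not_visited_design = {
    '_id': '_design/urls',
    'views': {
        'not_visited': {
            'map': "function(doc) {"
                   "  if (doc.type == 'url' && !doc.visited) emit(doc._id, null);"
                   "}"
        }
    }
}
# db.save(not_visited_design)  # create the view once per database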