コード例 #1
0
 def testSentenceProcessor(self):
     """Store two sentences for one source and check both are then recognised.

     The database handle is looked up on the server described by the local
     config file, under the name found at ``["db"]["db"]`` in that config.
     """
     source = 'https://es.wikipedia.org'
     samples = (
         ('This is a test', ['this', 'is', 'a', 'test']),
         ('This is another test', ['this', 'is', 'another', 'test']),
     )

     local_settings = readConfig('../config_local.yaml')
     # The test config is read here but not otherwise used by this test.
     test_settings = readConfig('../config_test.yaml')
     server = getServer(local_settings)
     processor = SentenceProcessorCouch(server[local_settings["db"]["db"]])

     # Insert every sample first, then verify each one can be found again.
     for text, words in samples:
         processor.addSentence(source, text, words)
     for text, _words in samples:
         self.assertTrue(processor.isSentence(source, text))
コード例 #2
0
 def setUp(self):
     """Load both config files and prepare the database handle and URL processor.

     Sets ``self.localConfig``, ``self.testConfig``, ``self.db`` (looked up
     under the name stored at ``["db"]["db"]`` in the local config) and
     ``self.up`` (a ``UrlProcessorCouch`` wrapping that database).
     """
     local_settings = readConfig('../config_local.yaml')
     self.localConfig = local_settings
     self.testConfig = readConfig('../config_test.yaml')

     server = getServer(local_settings)
     database_name = local_settings["db"]["db"]
     self.db = server[database_name]
     self.up = UrlProcessorCouch(self.db)
コード例 #3
0
    data['sentences'] = []
    for sentenceId in wordData['sentences']:
        sentenceData = db[sentenceId]
        data['sentences'].append({
            'sentence': sentenceData['sentence'],
            'source': sentenceData['source'],
            'date': sentenceData['date']
        })
    file.write(json.dumps(data) + "\n")


if __name__ == '__main__':
    # Script entry point: parse the command line, configure logging, connect
    # to the database named in the local config, determine the sentence
    # threshold and process the word entries into the result directory.
    print("corpus extractor v.1.0")
    (localConfigFile, configFile, loggingFile) = parseArguments(sys.argv[1:])
    # Configure logging BEFORE the first log call. A logging.info() issued on
    # an unconfigured root logger implicitly calls logging.basicConfig() and
    # the record is dropped at the default WARNING level, so the start-up
    # banner would never reach the configured log destination.
    setupLogger(loggingFile)
    logging.info("corpus extractor v.1.0")
    localConfig = readConfig(localConfigFile)
    server = getServer(localConfig)
    db = getDatabaseConnection(server, localConfig['db']['db'])
    # An explicit threshold in the local config wins; otherwise it is derived
    # from the 'all_words/sentences_length' view (only queried when needed).
    if 'sentence_threshold' in localConfig:
        sentenceThreshold = localConfig['sentence_threshold']
    else:
        sentenceThreshold = getMinimumSentenceThreshold(
            db, 'all_words/sentences_length', threshold=95)
    logging.info(
        f"Processing word entries that have at least {sentenceThreshold} sentences, the discarded words would be put into {DISCARDED_WORDS_TXT}"
    )
    processEntries(db, localConfig['corpus_result_dir'], sentenceThreshold)
    print("Finished")
    logging.info("finished")
コード例 #4
0
    return urls_to_visit


def set_urls_as_not_visited(db: couchdb.Database, not_visited_view):
    """Clear the ``visited`` flag on every document listed by a view.

    Args:
        db: Database that is both queried and written to.
        not_visited_view: Name of the view whose rows identify the documents
            to update.

    Each row's document is fetched by id, its ``'visited'`` field is set to
    ``False`` and the document is saved back. Returns ``None``.
    """
    batch_size = 100  # passed as the second argument to iterview()
    for row in db.iterview(not_visited_view, batch_size):
        document = db[row.id]
        document['visited'] = False
        db.save(document)


if __name__ == '__main__':
    # Script entry point: parse the command line, load configuration, set up
    # logging, connect to the database and run the crawler engine.
    print("webcrawler v.1.0")
    (local_config_file, config_file,
     logging_file) = parseArguments(sys.argv[1:])
    local_config = readConfig(local_config_file)
    setupLogger(logging_file)
    server = getServer(local_config)
    db = server[local_config["db"]["db"]]
    sp = SentenceProcessorCouch(db)
    up = UrlProcessorCouch(db)
    # config_file comes from parseArguments alongside two values that are
    # used as file paths, so it has to be loaded before it can be indexed;
    # subscripting a path string with 'exclusion_rules' raises TypeError.
    # A value that is not a string is used as-is, as before.
    config = (readConfig(config_file)
              if isinstance(config_file, str) else config_file)
    exclusions = ExclusionRules(config['exclusion_rules'])
    webSiteInfoProvider = WebSiteInfoProvider(local_config['user_agent'],
                                              max_wait_time_secs=10,
                                              default_crawl_delay=5)
    # The engine reads its URLs from the "urls/not_visited" view.
    engine = Engine(local_config['working_hours'], local_config['max_jobs'],
                    local_config['user_agent'],
                    UrlsProviderReal(db, "urls/not_visited"), sp, up,
                    webSiteInfoProvider, exclusions)
    engine.start()
    print("finished")