コード例 #1
0
def crawl(crawler):
    """Run the crawl helpers for the 'my' and 'my-t-d0-zawgyi' outputs.

    The 'my' output receives the UDHR file 'udhr_mya.txt' followed by
    BBC News under '/burmese/'.  A second, separately tagged output
    ('my-t-d0-zawgyi') is then handed to the Than Lwin Times helper.

    Args:
        crawler: Crawler object that provides ``get_output`` and is
            forwarded to every helper.
    """
    unicode_sink = crawler.get_output(language='my')
    crawl_udhr(crawler, unicode_sink, filename='udhr_mya.txt')
    crawl_bbc_news(crawler, unicode_sink, urlprefix='/burmese/')

    # This source gets its own output instead of sharing the 'my' one.
    zawgyi_sink = crawler.get_output(language='my-t-d0-zawgyi')
    crawl_than_lwin_times(crawler, zawgyi_sink)
コード例 #2
0
ファイル: crawl_id.py プロジェクト: usApp-stAck/corpuscrawler
def crawl(crawler):
    """Run every crawl helper configured for the 'id' output.

    Sources are visited in a fixed order: abc.net.au (program
    'indonesian'), Voice of America ('voaindonesia.com'), BBC News
    ('/indonesia/'), Deutsche Welle ('/id/'), the UDHR file
    'udhr_ind.txt', and finally three bibleis identifiers.

    Args:
        crawler: Crawler object that provides ``get_output`` and the
            ``crawl_abc_net_au`` / ``crawl_voice_of_america`` methods.
    """
    corpus = crawler.get_output(language='id')
    crawler.crawl_abc_net_au(corpus, program_id='indonesian')
    crawler.crawl_voice_of_america(corpus, host='voaindonesia.com')
    crawl_bbc_news(crawler, corpus, urlprefix='/indonesia/')
    crawl_deutsche_welle(crawler, corpus, prefix='/id/')
    crawl_udhr(crawler, corpus, filename='udhr_ind.txt')
    # Same helper once per bible identifier; the tuple keeps the order.
    for bible_id in ('INDASV', 'INDWBT', 'INDSHV'):
        crawl_bibleis(crawler, corpus, bible=bible_id)
コード例 #3
0
ファイル: crawl_cy.py プロジェクト: zhezhe123/corpuscrawler
def crawl(crawler):
    """Run the crawl helpers for the 'cy' output.

    Feeds the UDHR file 'udhr_cym.txt' and then BBC News under
    '/cymrufyw/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='cy')
    crawl_udhr(crawler, sink, filename='udhr_cym.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/cymrufyw/')
コード例 #4
0
def crawl(crawler):
    """Run the crawl helpers for the 'ja' output.

    Feeds the UDHR file 'udhr_jpn.txt' and then BBC News under
    '/japanese/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='ja')
    crawl_udhr(crawler, sink, filename='udhr_jpn.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/japanese/')
コード例 #5
0
def crawl(crawler):
    """Run the crawl helpers for the 'si' output.

    Feeds the UDHR file 'udhr_sin.txt' and then BBC News under
    '/sinhala/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='si')
    crawl_udhr(crawler, sink, filename='udhr_sin.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/sinhala/')
コード例 #6
0
ファイル: crawl_ky.py プロジェクト: keshan/corpuscrawler
def crawl(crawler):
    """Run the crawl helpers for the 'ky' output.

    Order of sources: the UDHR file 'udhr_kir.txt', BBC News under
    '/kyrgyz/', then the azattyk.org helper.
    """
    sink = crawler.get_output(language='ky')
    crawl_udhr(crawler, sink, filename='udhr_kir.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/kyrgyz/')
    crawl_azattyk_org(crawler, sink)
コード例 #7
0
ファイル: crawl_gd.py プロジェクト: zhezhe123/corpuscrawler
def crawl(crawler):
    """Run the crawl helpers for the 'gd' output.

    Order of sources: the UDHR file 'udhr_gla.txt', the module-private
    DASG helper, then BBC News under '/naidheachdan/'.
    """
    sink = crawler.get_output(language='gd')
    crawl_udhr(crawler, sink, filename='udhr_gla.txt')
    _crawl_dasg(crawler, sink)
    crawl_bbc_news(crawler, sink, urlprefix='/naidheachdan/')
コード例 #8
0
def crawl(crawler):
    """Run the crawl helpers for the 'pcm' output.

    Feeds the UDHR file 'udhr_pcm.txt' and then BBC News under
    '/pidgin/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='pcm')
    crawl_udhr(crawler, sink, filename='udhr_pcm.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/pidgin/')
コード例 #9
0
def crawl(crawler):
    """Run the crawl helpers for the 'sw' output.

    Order of sources: the UDHR file 'udhr_swh.txt', BBC News under
    '/swahili/', then Deutsche Welle under '/sw/'.
    """
    sink = crawler.get_output(language='sw')
    crawl_udhr(crawler, sink, filename='udhr_swh.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/swahili/')
    crawl_deutsche_welle(crawler, sink, prefix='/sw/')
コード例 #10
0
def crawl(crawler):
    """Run the crawl helpers for the 'es' output.

    Order of sources: the UDHR file 'udhr_spa.txt', BBC News under
    '/mundo/', then Deutsche Welle under '/es/'.
    """
    sink = crawler.get_output(language='es')
    crawl_udhr(crawler, sink, filename='udhr_spa.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/mundo/')
    crawl_deutsche_welle(crawler, sink, prefix='/es/')
コード例 #11
0
def crawl(crawler):
    """Run the crawl helpers for the 'vi' output.

    Order of sources: the UDHR file 'udhr_vie.txt', BBC News under
    '/vietnamese/', then Voice of America on 'voatiengviet.com'.
    """
    sink = crawler.get_output(language='vi')
    crawl_udhr(crawler, sink, filename='udhr_vie.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/vietnamese/')
    crawler.crawl_voice_of_america(sink, host='voatiengviet.com')
コード例 #12
0
def crawl(crawler):
    """Run the crawl helpers for the 'my' output.

    Feeds the UDHR file 'udhr_mya.txt' and then BBC News under
    '/burmese/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='my')
    crawl_udhr(crawler, sink, filename='udhr_mya.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/burmese/')
コード例 #13
0
def crawl(crawler):
    """Run the crawl helpers for the 'ur' output.

    Feeds the UDHR file 'udhr_urd.txt' and then BBC News under
    '/urdu/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='ur')
    crawl_udhr(crawler, sink, filename='udhr_urd.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/urdu/')
コード例 #14
0
def crawl(crawler):
    """Run the crawl helpers for the 'ta' output.

    Feeds the UDHR file 'udhr_tam.txt' and then BBC News under
    '/tamil/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='ta')
    crawl_udhr(crawler, sink, filename='udhr_tam.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/tamil/')
コード例 #15
0
ファイル: crawl_rw.py プロジェクト: zhezhe123/corpuscrawler
def crawl(crawler):
    """Run the crawl helpers for the 'rw' output.

    Feeds the UDHR file 'udhr_kin.txt' and then BBC News under
    '/gahuza/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='rw')
    crawl_udhr(crawler, sink, filename='udhr_kin.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/gahuza/')
コード例 #16
0
ファイル: crawl_ar.py プロジェクト: keshan/corpuscrawler
def crawl_modern_standard_arabic(crawler):
    """Run the crawl helpers for the 'ar' output.

    Order of sources: the UDHR file 'udhr_arb.txt', Deutsche Welle
    under '/ar/', Sputnik News on 'arabic.sputniknews.com', then BBC
    News under '/arabic/'.

    Args:
        crawler: Crawler object that provides ``get_output`` and is
            forwarded to every helper.
    """
    sink = crawler.get_output(language='ar')
    crawl_udhr(crawler, sink, filename='udhr_arb.txt')
    crawl_deutsche_welle(crawler, sink, prefix='/ar/')
    crawl_sputnik_news(crawler, sink, host='arabic.sputniknews.com')
    crawl_bbc_news(crawler, sink, urlprefix='/arabic/')
コード例 #17
0
def crawl(crawler):
    """Run the crawl helpers for the 'so' output.

    Feeds the UDHR file 'udhr_som.txt' and then BBC News under
    '/somali/' into the output obtained from ``crawler``.
    """
    sink = crawler.get_output(language='so')
    crawl_udhr(crawler, sink, filename='udhr_som.txt')
    crawl_bbc_news(crawler, sink, urlprefix='/somali/')