Ejemplo n.º 1
0
    downloads_path = config.get('general',
                                'downloads_path',
                                fallback='/tmp/downloads/')
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
    elif not os.path.isdir(downloads_path):
        print('ERROR: downloads_path parameter points to file!')
        sys.exit(1)

    crawler = Crawler(config, 'captions')
    crawler.get(config.get('captions', 'url'))

    captions_list = []

    for row in crawler.get_elements('tbody tr'):
        items = crawler.get_elements('td', root=row)
        filename = items[0].text
        print("Current Filename is {}".format(filename))
        url = crawler.get_attr('a', 'href', root=items[0])
        crawler.download(url, filename)

        convert_filename = filename.replace('.pdf', '.txt')
        os.system("pdftotext '%s' '%s'" %
                  (downloads_path + 'Captions/' + filename,
                   downloads_path + 'Captions/' + convert_filename))
        file_handle = open(downloads_path + 'Captions/' + convert_filename,
                           encoding="utf-8")
        content = file_handle.readlines()
        content = list(filter(None, [x.strip() for x in content]))