Esempio n. 1
0
 def test_url_save_guess_file(self):
     """Check the file name returned by ``url_save_guess_file``.

     The expected name is taken from the first line of the MD5SUMS listing;
     the first element of the ``url_save_guess_file`` result must equal it.
     """
     listing = url_get_content(URL_DEBIAN_CD_PATH.format('MD5SUMS'),
                               fake_headers())
     # Only the first line is used; it is split into exactly two fields.
     md5, file = listing.splitlines()[0].split()
     Log.d(TAG, 'md5={}, file={}'.format(md5, file))
     saved = url_save_guess_file(URL_DEBIAN_CD_PATH.format(file))
     self.assertEqual(file, saved[0])
Esempio n. 2
0
 def test_url_save(self):
     """Save the first file listed in MD5SUMS and verify its MD5 digest.

     The expected digest and file name come from the first line of the
     MD5SUMS listing; the saved file is hashed in 512 KiB chunks.
     """
     first_line = url_get_content(URL_DEBIAN_CD_PATH.format('MD5SUMS'),
                                  fake_headers()).splitlines()[0]
     md5, file = first_line.split()
     Log.d(TAG, 'md5={}, file={}'.format(md5, file))

     def report(a, b):
         # Logs a * 100 / b, rounded to one decimal, as the percentage.
         return Log.d(TAG,
                      '{:>5}% downloaded'.format(round(a * 100 / b, 1)))

     file_actual, size = url_save(URL_DEBIAN_CD_PATH.format(file),
                                  reporthook=report)
     Log.d(TAG, 'file size: {} MiB'.format(round(size / 1024 / 1024, 1)))

     digest = hashlib.md5()
     chunk_size = 512 * 1024
     with open(file_actual, 'rb') as f:
         # read() returns b'' at end of file, which stops the iteration.
         for chunk in iter(lambda: f.read(chunk_size), b''):
             digest.update(chunk)
     self.assertEqual(md5, digest.hexdigest())
Esempio n. 3
0
 async def do_get(word: str) -> List[str]:
     """Look up one word via ``self.get_card`` on the default executor.

     Returns the card fields when the resolved name has not been recorded in
     ``visited`` yet. Returns ``None`` when the lookup raises (the word is
     then appended to ``skipped``) or when the resolved name is already in
     ``visited``.
     """
     async with sem:
         try:
             loop = asyncio.get_running_loop()
             actual, fields = await loop.run_in_executor(None, self.get_card, word)
         except Exception as e:
             Log.e(TAG, 'can\'t get card: "{}", {}'.format(word, e))
             async with lock:
                 skipped.append(word)
             Log.e(TAG, 'skipped: "{}"'.format(word))
             return None

         async with lock:
             # Progress is advanced for every successful lookup, including
             # ones that turn out to be duplicates.
             bar.extra = actual
             bar.increment()
             if actual in visited:
                 return None
             visited.add(word)
             visited.add(actual)
             return fields
Esempio n. 4
0
 def get_card(self, word: str) -> Tuple[str, List[str]]:
     """Query a word and return the resolved name with its extracted fields.

     The resolved name is the last path segment of the response's final URL
     with every '-' replaced by a space.

     :param word: word to query; '/' is replaced by a space before quoting.
     :return: ``(actual, fields)`` where ``fields`` comes from
         ``self._extract_fields`` applied to the response content.
     :raises WordNotFoundError: if the resolved name is empty.
     """
     Log.i(TAG, 'querying "{}"'.format(word))
     query_url = URL_QUERY.format(urllib.parse.quote(word.replace('/', ' ')))
     response = urlopen_with_retry(query_url, fake_headers())

     final_path = urllib.parse.urlsplit(response.geturl()).path
     slug = final_path.rsplit('/', 1)[-1]
     actual = slug.replace('-', ' ')
     if not actual:
         raise WordNotFoundError('can\'t find: "{}"'.format(word))
     if actual != word:
         Log.i(TAG, 'redirected to: "{}"'.format(actual))

     fields = self._extract_fields(url_get_content(response))
     Log.i(TAG, 'parsed: "{}"'.format(actual))
     return actual, fields
Esempio n. 5
0
    def generate_cards(self, *words: str):
        """Fetch a card for every word concurrently and append them as CSV rows.

        Each word is looked up with ``self.get_card`` on the event loop's
        default executor, with at most ``DEFAULT_CONCURRENCY`` lookups running
        at once. Words whose lookup raises are collected and logged at the
        end. A lookup whose resolved name was already recorded by an earlier
        finished lookup produces no card. The remaining cards are appended to
        the path derived from ``self.cards_file``.

        :param words: words to look up.
        """
        Log.i(TAG, 'generating {} cards'.format(len(words)))
        file = valid_path(self.cards_file)

        # region Shared state mutated by the coroutines
        visited = set()
        skipped = []
        bar = ProgressBar(len(words))

        # endregion

        async def do_generate():
            # Create the asyncio primitives inside the running loop. Before
            # Python 3.10 they bind to the event loop that is current when
            # they are constructed, which is not the loop asyncio.run() starts.
            sem = asyncio.Semaphore(DEFAULT_CONCURRENCY)
            lock = asyncio.Lock()

            async def do_get(word: str) -> List[str]:
                """Return the card fields for ``word``, or ``None`` when the
                lookup fails or the resolved name was already visited."""
                async with sem:
                    try:
                        actual, fields = await asyncio.get_running_loop().run_in_executor(None, self.get_card, word)
                    except Exception as e:
                        Log.e(TAG, 'can\'t get card: "{}", {}'.format(word, e))
                        async with lock:
                            skipped.append(word)
                        Log.e(TAG, 'skipped: "{}"'.format(word))
                    else:
                        async with lock:
                            bar.extra = actual
                            bar.increment()
                            if actual not in visited:
                                visited.add(word)
                                visited.add(actual)
                                return fields

            # gather all tasks to keep results stable
            return await asyncio.gather(*[do_get(w) for w in words])

        bar.update()
        cards = asyncio.run(do_generate())
        # Failed and duplicate lookups yield None; drop them.
        cards = [card for card in cards if card]
        bar.done()
        # The csv module requires newline='' on the file object; without it
        # the writer's '\r\n' row terminator is translated again on platforms
        # that use '\r\n' line endings, producing '\r\r\n'.
        with open(file, 'a', encoding='utf8', newline='') as fp:
            writer = csv.writer(fp)
            writer.writerows(cards)
        Log.i(TAG, 'generated {} cards to: {}'.format(len(cards), file))
        if skipped:
            Log.e(TAG, 'skipped {} words:\n{}'.format(len(skipped), '\n'.join(skipped)))
Esempio n. 6
0
 def _retrieve_styling(self) -> str:
     """Download the stylesheet, font and scripts and return them as HTML.

     The font is saved into ``self.media_path`` under its guessed file name
     prefixed with '_', and the stylesheet's ``url(...)`` reference to the
     font is rewritten to that bare file name.

     :return: a ``<style>`` element with the rewritten stylesheet followed by
         two ``<script>`` elements holding the content of ``URL_AMP`` and
         ``URL_AMP_ACCORDION``.
     """
     Log.i(TAG, 'retrieving styling')
     style = url_get_content(URL_STYLE, fake_headers())
     font = url_save_guess_file(URL_FONT, fake_headers())[0]
     # add '_' to tell Anki that the file is used by template
     _font = url_save(URL_FONT,
                      headers=fake_headers(),
                      filename=valid_path(
                          os.path.join(self.media_path, '_' + font)))[0]
     Log.i(TAG, 'saved font file to: {}'.format(_font))
     _font = os.path.basename(_font)
     # Escape the file name so '.' and any other regex metacharacter in it is
     # matched literally, and use a function replacement so the new name is
     # inserted verbatim rather than parsed as a replacement template.
     style = re.sub(r'url\([\S]*?/{}'.format(re.escape(font)),
                    lambda _match: 'url({}'.format(_font),
                    style)
     style = '<style>{}</style>'.format(style)
     style += '<script type="text/javascript">{}</script>'.format(
         url_get_content(URL_AMP, fake_headers()))
     style += '<script type="text/javascript">{}</script>'.format(
         url_get_content(URL_AMP_ACCORDION, fake_headers()))
     Log.i(TAG, 'retrieved styling')
     return style
Esempio n. 7
0
 def test_url_get_content(self):
     """Fetch the MD5SUMS listing and log its content at debug level."""
     md5sums_url = URL_DEBIAN_CD_PATH.format('MD5SUMS')
     content = url_get_content(md5sums_url, fake_headers())
     Log.d(TAG, content)
Esempio n. 8
0
 def test_urlopen_with_retry(self):
     """Open the query URL for 'cater to' and log the response metadata."""
     quoted = urllib.parse.quote('cater to')
     url = URL_CAMBRIDGE_QUERY.format(quoted)
     with urlopen_with_retry(url, fake_headers()) as response:
         Log.d(TAG, 'headers={}'.format(response.headers))
         status_line = 'status={}, url={}'.format(response.status,
                                                  response.url)
         Log.d(TAG, status_line)
Esempio n. 9
0
 def collapse2(h):
     """Wrap the second ``parse_tag`` group of ``h`` in ``HTML_COLLAPSE``.

     Groups 1 and 3 are kept unchanged around the wrapped group 2.
     """
     m = parse_tag.match(h)
     head = m.group(1)
     inner = m.group(2)
     tail = m.group(3)
     Log.d(TAG, '{}\n{}\n{}'.format(head, inner, tail))
     return head + HTML_COLLAPSE.format(inner) + tail
Esempio n. 10
0
 def generate_styling(self):
     """Write ``self._styling`` to the styling file, replacing its content."""
     Log.i(TAG, 'generating styling')
     target = valid_path(self.styling_file)
     with open(target, mode='w', encoding='utf8') as out:
         out.write(self._styling)
     Log.i(TAG, 'generated styling to: {}'.format(target))
Esempio n. 11
0
 def test_findall(self):
     """Log each item ``htmls.findall`` yields for tag 'a' in the fixture."""
     matches = htmls.findall(self.HTML, 'a')
     for element in matches:
         Log.d(TAG, element)
Esempio n. 12
0
 def generate_front_template(self):
     """Write ``self._front_template`` to the front template file."""
     Log.i(TAG, 'generating front template')
     target = valid_path(self.front_template_file)
     with open(target, mode='w', encoding='utf8') as out:
         out.write(self._front_template)
     Log.i(TAG, 'generated front template to: {}'.format(target))
Esempio n. 13
0
 def generate_cards(self, *words: str):
     """Look up each word in turn and append its card to the cards file.

     Cards are written as tab-separated lines. A word is skipped when it, or
     the name it resolves to, has already been handled in this call, so the
     same card is not written twice. Words whose lookup raises are collected
     and reported at the end.

     :param words: words to look up.
     """
     Log.i(TAG, 'trying to generate {} cards'.format(len(words)))
     visited = set()
     skipped = []
     # Count the cards actually written; duplicates are neither skipped
     # failures nor generated cards, so the total cannot be derived from
     # len(words) - len(skipped).
     generated = 0
     cf = valid_path(self.cards_file)
     with open(cf, 'a', encoding='utf8') as fp:
         for word in words:
             if word in visited:
                 Log.i(TAG, 'skipping duplicate: "{}"'.format(word))
                 continue
             try:
                 actual, fields = self.get_card(word)
             except Exception as e:
                 Log.e(TAG, e)
                 skipped.append(word)
                 Log.w(TAG, 'skipped: "{}"'.format(word))
                 continue
             # Check before recording: a different word may already have
             # resolved to the same name and produced this card.
             duplicate = actual in visited
             visited.add(word)
             visited.add(actual)
             if duplicate:
                 Log.i(TAG, 'skipping duplicate: "{}"'.format(actual))
                 continue
             # Separate from any earlier content, but never start the file
             # with an empty line.
             if fp.tell():
                 fp.write('\n')
             fp.write('\t'.join(fields))
             generated += 1
     if skipped:
         Log.w(
             TAG, 'skipped {} words:\n'.format(len(skipped)) +
             '\n'.join(skipped))
     Log.i(TAG, 'generated {} cards to: {}'.format(generated, cf))
Esempio n. 14
0
 def test_removeall(self):
     """Log what ``htmls.removeall`` returns for tag 'a' on the fixture."""
     stripped = htmls.removeall(self.HTML, 'a')
     Log.d(TAG, stripped)
Esempio n. 15
0
    def test_sub(self):
        """Log the result of ``htmls.sub`` with a tag-stripping callback.

        The callback is applied for tag 'a' with the attribute string
        'href="http://example.com/"'.
        """
        wrapper = re.compile(r'<[\s\S]*?>([\s\S]*)<[\s\S]*>')

        def rm_tag(s):
            # Keep only what the pattern's single capture group matched.
            return wrapper.sub(r'\g<1>', s)

        replaced = htmls.sub(self.HTML, rm_tag, 'a',
                             'href="http://example.com/"')
        Log.d(TAG, replaced)
Esempio n. 16
0
 def generate_back_template(self):
     """Write ``self._back_template`` to the back template file."""
     Log.i(TAG, 'generating back template')
     target = valid_path(self.back_template_file)
     with open(target, mode='w', encoding='utf8') as out:
         out.write(self._back_template)
     Log.i(TAG, 'generated back template to: {}'.format(target))
Esempio n. 17
0
 def test_find_positions(self):
     """Log each pair ``htmls.find_positions`` yields for the fixture.

     The search is for tag 'a' with the attribute string
     'href="http://example.org/"'.
     """
     positions = htmls.find_positions(self.HTML, 'a',
                                      'href="http://example.org/"')
     for pair in positions:
         i, j = pair
         Log.d(TAG, 'i={}, j={}'.format(i, j))