def update_story_has_mathjax(storys=None):
    """Backfill the ``has_mathjax`` flag for the given stories.

    Each story is loaded and saved inside its own transaction so that a
    failure on one story does not roll back updates already committed
    for the others. Progress is shown via a tqdm bar.
    """
    ids = _get_story_ids(storys)
    LOG.info('total %s storys', len(ids))
    progress = tqdm.tqdm(ids, ncols=80, ascii=True)
    for pk in progress:
        with transaction.atomic():
            # only load the fields the check and save actually need
            item = Story.objects.only('id', 'content', '_version').get(pk=pk)
            if processor.story_has_mathjax(item.content):
                item.has_mathjax = True
                item.save()
def test_story_has_mathjax():
    """story_has_mathjax() should detect TeX / AsciiMath markup while
    ignoring dollar signs used for money, shell prompts and jQuery."""
    positive_samples = [
        r'$x^{y^z}=(1+{\rm e}^x)^{-2xy^w}$',
        r'$f(x,y,z) = 3y^2z \left( 3+\frac{7x+5}{1+y^2} \right)$',
        r'$1 \over 3$',
        r'$\vec{a} \cdot \vec{b}=0$',
        r'<p>这里 $n$ 是特征',
        r'向量 $\vec x$ 的长度,即特征的维数。',
        r'<code>$v_i$</code> 是长度',
        r'为 $k$ 的向量,与特征 id 对应,称为特征的隐向量。',
        r'`sum_(i=1)^n i^3=((n(n+1))/2)^2`',
        r'<code>`sum_(i=1)^n i^3=((n(n+1))/2)^2`</code>',
    ]
    negative_samples = [
        r'$10 aaa $10 $10 aaa $10',
        r'$10 $10 $10 $10',
        r'$10.0',
        r'100$ 100$',
        r'console.log($.fn.jquery); window.$;',
        r'$ === jQuery; typeof($);',
        r"$('p,div'); $('p.red,p.green');",
        r"""
        The model of subscription premium audio content is popular in China,
        where Ximalaya, a unicorn consumer audio platform, has a subscription
        feature for $3 monthly that enables users to access over 4000 e-books
        and over 300 premium audio courses or podcasts. Audio content is also
        available a la carte starting at $0.03 per short, serialized book
        chapter, or anywhere from $10 to $45 for paid audio courses.
        """,
        r"""$ shellcheck test.sh

        In test.sh line 4:
        if[ $# -eq 0 ]""",
        r'$ shellcheck if[ $# -eq 0 ]',
        '$x^\n{y^z}$',
        r'$x^{$y^z}$',
        '`x^\n{y^z}`',
        r'```x^{y^z}```',
    ]
    for sample in positive_samples:
        assert story_has_mathjax(sample), sample
    for sample in negative_samples:
        assert not story_has_mathjax(sample), sample
def _get_storys(entries: list):
    """Convert raw feed entries into story dicts.

    Consumes ``entries`` (pops from the end) and keeps at most the last
    300 results. Each story dict carries cleaned content, a shortened
    summary/title, a validated link and a content hash.
    """
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        # both content and summary will in content list, peek the longest
        content = ''
        if data["content"]:
            for piece in data["content"]:
                candidate = piece["value"]
                if candidate and len(candidate) > len(content):
                    content = candidate
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        # oversized stories are reduced to plain text before storing
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"] or content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        story['title'] = title
        story['content_hash_base64'] = compute_hash_base64(content, summary, title)
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
def _get_storys(entries: list):
    """Convert raw feed entries into story dicts, keeping the last 300.

    Consumes ``entries`` by popping from the end; each resulting dict
    carries a unique id, cleaned content, shortened summary/title and a
    content hash.
    """
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        # both content and summary will in content list, peek the longest
        content = ''
        if data["content"]:
            for piece in data["content"]:
                candidate = piece["value"]
                if candidate and len(candidate) > len(content):
                    content = candidate
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = process_story_links(story_html_clean(content), data["link"])
        story['content'] = content
        summary = data["summary"] or content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        story['title'] = title
        story['content_hash_base64'] = compute_hash_base64(content, summary, title)
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)