def test_find_snippets_for_multiple_patterns(self):
    """Extracting with two patterns stores one snippet per pattern for the same post."""
    # Raw strings are used for lines containing the regex so the backslash in
    # "\w" is spelled explicitly instead of relying on an unrecognized escape.
    findall_line = r'characters = re.findall(r"\w", string)'
    create_post(body=self._make_post_body('\n'.join([
        'import re',
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
        ' print c',
    ])))

    self._extract(['re.findall', '"foo"'])

    self.assertEqual(PostSnippet.select().count(), 2)
    records = list(PostSnippet.select())
    snippets = [record.snippet for record in records]
    patterns = [record.pattern.pattern for record in records]

    # Snippet centered on the line containing 're.findall'
    self.assertIn('\n'.join([
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
    ]), snippets)
    # Snippet centered on the line containing '"foo"'
    self.assertIn('\n'.join([
        'import re',
        '',
        'string = "foo"',
        findall_line,
        '',
    ]), snippets)

    # Both patterns should be linked from the stored snippets
    self.assertIn('re.findall', patterns)
    self.assertIn('"foo"', patterns)
def test_find_multiple_snippets_in_one_post(self):
    """A pattern that occurs twice in one post produces two separate snippets."""
    # Raw strings spell the regex backslash explicitly rather than depending on
    # the unrecognized escape "\w" being passed through.
    characters_line = r'characters = re.findall(r"\w", string)'
    digits_line = r'digits = re.findall(r"\w", string)'
    create_post(body=self._make_post_body('\n'.join([
        'import re',
        '',
        'string = "foo"',
        characters_line,
        'for c in characters:',
        ' print c',
        '',
        digits_line,
        'for d in digits:',
        ' print d',
    ])))

    self._extract(['re.findall'])

    self.assertEqual(PostSnippet.select().count(), 2)
    snippets = [code.snippet for code in PostSnippet.select()]

    # Context window around the first occurrence
    self.assertIn('\n'.join([
        '',
        'string = "foo"',
        characters_line,
        'for c in characters:',
        ' print c',
    ]), snippets)
    # Context window around the second occurrence
    self.assertIn('\n'.join([
        ' print c',
        '',
        digits_line,
        'for d in digits:',
        ' print d',
    ]), snippets)
def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """Scan posts for code snippets matching each pattern and store them.

    For every post, each pattern's scanner is run over the parsed post body
    and a ``PostSnippet`` record is created for every snippet it returns.

    Args:
        patterns: iterable of pattern strings; a ``SnippetPattern`` record is
            fetched or created for each one.
        tags: list of tag names used to restrict which posts are scanned, or
            ``None`` to scan all posts.
        compute_index: value stored on every ``PostSnippet`` created here.
        lines_of_context: passed through to ``PythonSnippetExtractor``.
        show_progress: when true, display a progress bar while scanning.
    """
    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # The join yields one row per matching tag, so a post carrying
            # several of the requested tags would otherwise be scanned (and
            # its snippets stored) once per tag.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Zip all patterns with a scanner that scans for it
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            snippets = scanner.scan(document)

            # Store a record of each snippet that was found
            for snippet in snippets:
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
def test_skip_nonpython_code(self):
    """No snippet is stored when the pattern only appears in non-Python code."""
    non_python_lines = [
        'var $ = require("jquery")',
        '$("div").text("div text")',
        'var ranomString = "re.match";',
    ]
    post_body = self._make_post_body('\n'.join(non_python_lines))
    create_post(body=post_body, view_count=375)

    self._extract(['re.match'])

    snippet_count = PostSnippet.select().count()
    self.assertEqual(snippet_count, 0)
def main(patterns, tags, lines_of_context, show_progress, *args, **kwargs):
    """Read patterns from a file and run snippet extraction under a new compute index.

    Args:
        patterns: path to a text file containing one pattern per line.
        tags: forwarded to ``extract_snippets`` to restrict the posts scanned.
        lines_of_context: forwarded to ``extract_snippets``.
        show_progress: forwarded to ``extract_snippets``.
        *args, **kwargs: accepted and ignored.
    """
    # Create a new index for this computation (starts at 1 when no snippets exist)
    last_compute_index = PostSnippet.select(fn.Max(PostSnippet.compute_index)).scalar() or 0
    compute_index = last_compute_index + 1

    # Read patterns from a file, skipping blank lines so that an empty string
    # is never submitted as a pattern
    with open(patterns) as patterns_file:
        stripped_lines = (line.strip() for line in patterns_file)
        pattern_list = [pattern for pattern in stripped_lines if pattern]

    # Run snippet extraction
    extract_snippets(pattern_list, tags, compute_index, lines_of_context, show_progress)
def test_find_snippet(self):
    """Extracting one pattern from one post stores a snippet linked to post and pattern."""
    # Raw string so the regex backslash is spelled explicitly instead of
    # relying on the unrecognized escape "\w" being passed through.
    findall_line = r'characters = re.findall(r"\w", string)'

    # First, we create the models in memory
    post = create_post(body=self._make_post_body('\n'.join([
        'import re',
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
        ' print c',
    ])))

    # Here is the line of code that actually performs the extraction for a pattern
    # By default, it should run extraction for all posts
    self._extract(['re.findall'])

    # There are a few effects that we check
    # First, that the number of snippets has increased
    self.assertEqual(PostSnippet.select().count(), 1)

    # The content of this snippet should show context around the pattern
    self.assertEqual(PostSnippet.select().first().snippet, '\n'.join([
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
    ]))

    # The snippet should link back to the post that it was created from
    self.assertEqual(PostSnippet.select().first().post, post)

    # A model for the pattern should have been created
    self.assertEqual(SnippetPattern.select().count(), 1)
    self.assertEqual(SnippetPattern.select().first().pattern, 're.findall')

    # The snippet should be linked back to the pattern
    self.assertEqual(SnippetPattern.select().first(), PostSnippet.select().first().pattern)
def test_handle_missing_post_context(self):
    """Extraction succeeds when no lines follow the line that matches the pattern."""
    # If there is no context available in the lines below the one where a pattern is found,
    # make sure that the extraction is still successful.
    # The raw string spells the regex backslash explicitly instead of relying
    # on the unrecognized escape "\w" being passed through.
    code_lines = [
        'import re',
        '',
        r'characters = re.findall(r"\w", string)',
    ]
    create_post(body=self._make_post_body('\n'.join(code_lines)))

    self._extract(['re.findall'])

    # The snippet is expected to contain everything that was available
    self.assertEqual(PostSnippet.select().first().snippet, '\n'.join(code_lines))
def test_find_snippet_with_tags(self):
    """Only posts carrying one of the requested tags are scanned for snippets."""
    # Raw string so the regex backslash is spelled explicitly instead of
    # relying on the unrecognized escape "\w" being passed through.
    code = '\n'.join([
        'import re',
        r'characters = re.findall(r"\w", "foo")',
        'for c in characters:',
        ' print c',
    ])

    # These two posts are equivalent, except that only one is tagged with the tag
    # that we will use for filtering in the test.
    post1 = create_post(body=self._make_post_body(code))
    post2 = create_post(body=self._make_post_body(code))
    tag1 = create_tag(tag_name='javascript')
    tag2 = create_tag(tag_name='python')
    PostTag.create(post_id=post1.id, tag_id=tag1.id)
    PostTag.create(post_id=post2.id, tag_id=tag2.id)

    self._extract(['re.findall'], tags=['python'])

    # Only the post tagged 'python' should have produced a snippet
    self.assertEqual(PostSnippet.select().count(), 1)
    self.assertEqual(PostSnippet.select().first().post, post2)
def test_specify_lines_of_context(self):
    """With lines_of_context=1 the snippet holds one line either side of the match."""
    # Raw string so the regex backslash is spelled explicitly instead of
    # relying on the unrecognized escape "\w" being passed through.
    findall_line = r'characters = re.findall(r"\w", string)'
    create_post(body=self._make_post_body('\n'.join([
        'import re',
        '',
        'string = "foo"',
        findall_line,
        '',
        'for c in characters:',
        ' print c',
    ])))

    self._extract(['re.findall'], lines_of_context=1)

    self.assertEqual(PostSnippet.select().first().snippet, '\n'.join([
        'string = "foo"',
        findall_line,
        '',
    ]))
def test_skip_non_code_nodes_plaintext(self):
    """A pattern that appears only inside a <p> element yields no snippets."""
    paragraph_only_body = '<p>re.findall</p>'
    create_post(body=paragraph_only_body)

    self._extract(['re.findall'])

    stored_snippets = PostSnippet.select().count()
    self.assertEqual(stored_snippets, 0)