def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """
    Scan post bodies for snippets matching each pattern and store every match.

    For every selected post, the HTML body is parsed and each pattern's scanner
    is run over it; one PostSnippet record is created per snippet found.

    Args:
        patterns: iterable of patterns to look for. Each one is registered with
            SnippetPattern.get_or_create and handed to a PythonSnippetExtractor.
        tags: None to process every post, or a collection of tag names; only
            posts joined to a Tag whose tag_name is in this collection are
            processed.
        compute_index: value stored in the compute_index field of every
            PostSnippet record created by this call.
        lines_of_context: passed through to PythonSnippetExtractor
            (presumably the number of lines kept around each match; confirm
            against the extractor).
        show_progress: when true, display a progress bar while posts are scanned.

    Returns:
        None. Results are written to the database as PostSnippet records.
    """

    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # The join yields one row per matching tag, so a post that carries
            # several of the requested tags would otherwise be scanned (and its
            # snippets stored) once per tag.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Zip all patterns with a scanner that scans for it.
    # This stays an explicit loop because get_or_create may write to the database.
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        # The body is parsed once per post and shared by all scanners
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            snippets = scanner.scan(document)

            # Store a record of each snippet that was found
            for snippet in snippets:
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
Beispiel #2
0
def extract_snippets(patterns, tags, compute_index, lines_of_context, show_progress=False):
    """
    Extract snippets matching each pattern from post bodies and save them.

    Each selected post's HTML body is parsed once; every pattern's scanner is
    then run over the parsed document and a PostSnippet record is created for
    each snippet it returns.

    Args:
        patterns: iterable of patterns. Each is looked up or created as a
            SnippetPattern and used to build a PythonSnippetExtractor.
        tags: None to scan all posts, or a collection of tag names restricting
            the scan to posts linked (through PostTag) to a Tag with one of
            those names.
        compute_index: stored as compute_index on every PostSnippet created.
        lines_of_context: forwarded to PythonSnippetExtractor (presumably the
            amount of surrounding context to keep; verify in the extractor).
        show_progress: if true, show a progress bar over the posts.

    Returns:
        None. The function's output is the PostSnippet rows it creates.
    """

    # Fetch all posts, filtering by those for which tags have been specified
    posts = Post.select(Post.id, Post.body)
    if tags is not None:
        posts = (
            posts
            .join(PostTag, on=(Post.id == PostTag.post_id))
            .join(Tag, on=(Tag.id == PostTag.tag_id))
            .where(Tag.tag_name << tags)
            # Without DISTINCT, a post tagged with more than one of the
            # requested tags comes back once per tag and its snippets would be
            # stored multiple times; the progress total would be inflated too.
            .distinct()
        )

    # Initialize the progress bar
    if show_progress:
        post_count = posts.count()
        progress_bar = ProgressBar(maxval=post_count, widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Processing web page ', Counter(), ' / ' + str(post_count) + '.'
        ])
        progress_bar.start()

    # Pair every pattern's database record with a scanner that searches
    # 'pre' and 'code' nodes for it
    pattern_scanner_pairs = []
    for pattern in patterns:
        snippet_pattern, _ = SnippetPattern.get_or_create(pattern=pattern)
        extractor = PythonSnippetExtractor(pattern, lines_of_context)
        scanner = NodeScanner(extractor, tags=['pre', 'code'])
        pattern_scanner_pairs.append((snippet_pattern, scanner))

    # For each post, extract snippets for all patterns
    # Note that currently there is some repeated work: each extractor will
    # try to parse all relevant nodes as Python
    for post_index, post in enumerate(posts, start=1):
        document = BeautifulSoup(post.body, 'html.parser')

        for snippet_pattern, scanner in pattern_scanner_pairs:
            snippets = scanner.scan(document)

            # Store a record of each snippet that was found
            for snippet in snippets:
                PostSnippet.create(
                    post=post,
                    snippet=snippet,
                    compute_index=compute_index,
                    pattern=snippet_pattern,
                )

        # post_index is 1-based, so it equals the number of posts processed so far
        if show_progress:
            progress_bar.update(post_index)

    if show_progress:
        progress_bar.finish()
Beispiel #3
0
    def test_find_snippet(self):
        """Extracting 're.findall' from a single post stores one linked snippet."""

        # First, we create the models in memory.
        # The line containing \w is a raw string: in a plain literal, \w is an
        # invalid escape sequence that newer Python versions warn about.
        post = create_post(body=self._make_post_body('\n'.join([
            'import re',
            '',
            'string = "foo"',
            r'characters = re.findall(r"\w", string)',
            '',
            'for c in characters:',
            '    print c',
        ])))

        # Here is the line of code that actually performs the extraction for a pattern
        # By default, it should run extraction for all posts
        self._extract(['re.findall'])

        # There are a few effects that we check
        # First, that the number of snippets has increased
        self.assertEqual(PostSnippet.select().count(), 1)
        post_snippet = PostSnippet.select().first()

        # The content of this snippet should show context around the pattern:
        # the matching line plus the two lines on either side of it
        self.assertEqual(post_snippet.snippet, '\n'.join([
            '',
            'string = "foo"',
            r'characters = re.findall(r"\w", string)',
            '',
            'for c in characters:',
        ]))

        # The snippet should link back to the post that it was created from
        self.assertEqual(post_snippet.post, post)

        # A model for the pattern should have been created
        self.assertEqual(SnippetPattern.select().count(), 1)
        snippet_pattern = SnippetPattern.select().first()
        self.assertEqual(snippet_pattern.pattern, 're.findall')

        # The snippet should be linked back to the pattern
        self.assertEqual(snippet_pattern, post_snippet.pattern)