Exemples Python de parse, exemples Python de wikichatter.parse

Exemple #1

0

Afficher le fichier

Fichier : import_data.py Projet : amyxzhang/wikum

def get_wiki_talk_posts(article, current_task, total_count):
    """Download a wiki talk page, parse it with wikichatter and import it.

    The API endpoint is derived from ``article.url`` (everything before
    ``/wiki/Talk:``).  ``article.title`` is split on ``' - '``: the first
    part is the page title sent to the API and an optional second part is
    a section title.  When a section title is present, every top-level
    section whose heading (with square brackets removed) equals it has its
    comments passed to ``import_wiki_talk_posts`` and its subsections
    become the starting point for ``import_wiki_sessions``; otherwise all
    top-level sections are used.

    ``re``, ``import_wiki_talk_posts`` and ``import_wiki_sessions`` are
    expected to be available from the enclosing module.

    Note that the running ``total_count`` is updated locally but this
    function has no ``return`` statement, so callers receive ``None``.
    """
    from wikitools import wiki, api

    api_root = article.url.split('/wiki/Talk:')[0] + '/w/api.php'
    site = wiki.Wiki(api_root)

    title_parts = article.title.split(' - ')

    response = api.APIRequest(site, {
        'action': 'query',
        'titles': title_parts[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
    }).query()

    # The part of disqus_id before '#' is used as the key into the
    # 'pages' mapping of the API response.
    page_id = article.disqus_id.split('#')[0]
    wikitext = response['query']['pages'][page_id]['revisions'][0]['*']

    import wikichatter as wc
    parsed = wc.parse(wikitext.encode('ascii', 'ignore'))
    top_sections = parsed['sections']
    start_sections = top_sections

    if len(title_parts) > 1:
        wanted_heading = title_parts[1]
        for section in top_sections:
            # Drop '[' and ']' so wiki-link markup in headings does not
            # prevent a match.
            heading = re.sub(r'[\[\]]', '', section.get('heading', ''))
            if heading != wanted_heading:
                continue
            start_sections = section['subsections']
            total_count = import_wiki_talk_posts(
                section['comments'], article, None, current_task, total_count)

    total_count = import_wiki_sessions(
        start_sections, article, None, current_task, total_count)

Exemple #2

0

Afficher le fichier

Fichier : import_data.py Projet : trusttri/RfC_dumper

    def find_outer_section(title, text, id):
        """Return the text of an enclosing section, or ``text`` unchanged.

        ``title`` is a list whose first item is the page title and whose
        optional second item is a section title.  The whole page is
        fetched and parsed; for each top-level section that contains the
        wanted section among its subsections, the section's own comments
        are tested against ``_CLOSE_COMMENT_RE``.  On a match the page's
        section list is requested, and if an entry's ``line`` equals the
        outer heading that section's wikitext is fetched and returned.
        In every other case the original ``text`` is returned.

        ``api``, ``site``, ``get_section``, ``_clean_wiki_text``, ``re``
        and ``_CLOSE_COMMENT_RE`` come from the enclosing scopes.
        """
        # Without a section component there is no outer section to find.
        if len(title) <= 1:
            return text

        wanted_title = title[1].encode('ascii', 'ignore')
        page_query = {
            'action': 'query',
            'titles': title[0],
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'json',
            'redirects': 'yes'
        }
        page_result = api.APIRequest(site, page_query).query()
        raw_page = page_result['query']['pages'][id]['revisions'][0]['*']
        whole_text = _clean_wiki_text(raw_page)

        import wikichatter as wc
        parsed_page = wc.parse(whole_text.encode('ascii', 'ignore'))

        for outer_section in parsed_page['sections']:
            if not get_section(outer_section['subsections'], wanted_title):
                continue

            for comment in outer_section['comments']:
                comment_text = '\n'.join(comment['text_blocks'])
                if not re.search(_CLOSE_COMMENT_RE, comment_text):
                    continue

                # Ask the API for the page's section list so the outer
                # heading can be translated into a section index.
                listing = api.APIRequest(site, {
                    'action': 'parse',
                    'prop': 'sections',
                    'page': title[0],
                    'redirects': 'yes'
                }).query()

                for entry in listing['parse']['sections']:
                    if entry['line'] != outer_section.get('heading').strip():
                        continue
                    section_query = {
                        'action': 'query',
                        'titles': title[0],
                        'prop': 'revisions',
                        'rvprop': 'content',
                        'rvsection': entry['index'],
                        'format': 'json',
                        'redirects': 'yes'
                    }
                    section_result = api.APIRequest(site,
                                                    section_query).query()
                    return section_result['query']['pages'][id][
                        'revisions'][0]['*']

        return text

Exemple #3

0

Afficher le fichier

Fichier : import_data.py Projet : VonRosenchild/wikum

def get_wiki_talk_posts(article, current_task, total_count):
    """Download a wiki talk page, parse it with wikichatter and import it.

    The API endpoint is built from the part of ``article.url`` before
    ``/wiki/``.  ``article.title`` is split on ``' - '`` into a page title
    and an optional section title.  With a section title, each top-level
    section whose cleaned heading matches it has its comments handed to
    ``import_wiki_talk_posts`` and its subsections become the input to
    ``import_wiki_sessions``; without one, all top-level sections are
    passed to ``import_wiki_sessions``.

    ``re``, ``import_wiki_talk_posts`` and ``import_wiki_sessions`` are
    expected to be available from the enclosing module.

    The running ``total_count`` is updated locally only; the function has
    no ``return`` statement.
    """
    from wikitools import wiki, api

    def plain_heading(section):
        # Square brackets are removed first, then anything shaped like an
        # HTML tag; the order matters for headings mixing both.
        cleaned = re.sub(r'\]', '', section.get('heading', ''))
        cleaned = re.sub(r'\[', '', cleaned)
        cleaned = re.sub('<[^<]+?>', '', cleaned)
        return cleaned.strip()

    base_url = article.url.split('/wiki/')[0]
    site = wiki.Wiki(base_url + '/w/api.php')

    title = article.title.split(' - ')

    query = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    result = api.APIRequest(site, query).query()

    # The part of disqus_id before '#' keys the 'pages' mapping.
    page_id = article.disqus_id.split('#')[0]
    wikitext = result['query']['pages'][page_id]['revisions'][0]['*']

    import wikichatter as wc
    parsed_text = wc.parse(wikitext.encode('ascii', 'ignore'))

    all_sections = parsed_text['sections']
    start_sections = all_sections

    if len(title) > 1:
        section_title = title[1]
        for section in all_sections:
            if plain_heading(section) != str(section_title).strip():
                continue
            start_sections = section['subsections']
            total_count = import_wiki_talk_posts(section['comments'], article,
                                                 None, current_task,
                                                 total_count)

    total_count = import_wiki_sessions(start_sections, article, None,
                                       current_task, total_count)

Exemple #4

0

Afficher le fichier

Fichier : example.py Projet : kjschiroo/WikiChatter

import os
import wikichatter as wc
import json

# Root directory that holds the sample talk pages to parse.
talk_samples_base = "./talk_samples/"

# Collect the path of every file beneath the samples directory.
talk_files = []
for (name, directories, files) in os.walk(talk_samples_base):
    # os.path.join inserts a separator only where one is needed, so the
    # trailing "/" on the base directory no longer yields "dir//file".
    talk_files.extend(os.path.join(name, f) for f in files)

# Parse each sample and print the parsed structure as one JSON document.
for f_path in talk_files:
    with open(f_path, "r") as f:
        text = f.read()
    # The file is closed before parsing; only its text is needed from here.
    parsed = wc.parse(text)
    print(json.dumps(parsed))

Exemple #5

0

Afficher le fichier

Fichier : import_data.py Projet : trusttri/RfC_dumper

def get_wiki_talk_posts(article_id, disqus_id, section_index, original_title,
                        total_count, rfc_DB):
    """Fetch a wiki talk page (or one section of it), parse it and import it.

    ``original_title`` is split on ``' - '`` into a page title and an
    optional section title.  The page is requested from the API at
    ``_DOMAIN + '/w/api.php'``; when ``section_index`` is truthy it is sent
    as ``rvsection`` so only that section is returned.  The part of
    ``disqus_id`` before ``'#'`` is used as the key into the response's
    ``pages`` mapping; if that key is absent nothing is imported.

    If the fetched text does not match ``_CLOSE_COMMENT_RE`` the enclosing
    section is looked up with ``find_outer_section``.  The text is then
    passed through ``_clean_wiki_text``, parsed with ``wikichatter`` and
    handed to ``import_wiki_talk_posts`` / ``import_wiki_sessions``.

    ``re``, ``_DOMAIN``, ``_CLOSE_COMMENT_RE``, ``_clean_wiki_text``,
    ``import_wiki_talk_posts`` and ``import_wiki_sessions`` are not defined
    in this block and are expected from the enclosing module.

    The running ``total_count`` is updated locally only; the function has
    no ``return`` statement.
    """
    def get_section(sections, section_title):
        """Return the first section whose cleaned heading equals the title.

        The heading has ']' and '[' removed, then anything shaped like an
        HTML tag, and is compared after stripping whitespace on both
        sides.  Falls through (returning ``None``) when nothing matches.
        """
        for s in sections:
            heading_title = s.get('heading', '')
            heading_title = re.sub(r'\]', '', heading_title)
            heading_title = re.sub(r'\[', '', heading_title)
            heading_title = re.sub('<[^<]+?>', '', heading_title)
            if heading_title.strip() == str(section_title).strip():
                return s

    def find_outer_section(title, text, id):
        """Return the wikitext of the enclosing section, or ``text`` as-is.

        Uses ``api`` and ``site`` from the enclosing function; both are
        bound further down, before this helper is first called.
        """
        # Look for an outer (parent) section that contains the wanted section
        # as a subsection. The outer section is chosen only if one of its own
        # comments matches _CLOSE_COMMENT_RE; otherwise `text` is returned.
        if len(title) > 1:
            section_title = title[1].encode('ascii', 'ignore')
            # Fetch the whole page (no 'rvsection') so that parent sections
            # are visible to the parser.
            params = {
                'action': 'query',
                'titles': title[0],
                'prop': 'revisions',
                'rvprop': 'content',
                'format': 'json',
                'redirects': 'yes'
            }
            result = api.APIRequest(site, params).query()
            whole_text = _clean_wiki_text(
                result['query']['pages'][id]['revisions'][0]['*'])

            import wikichatter as wc
            parsed_whole_text = wc.parse(whole_text.encode('ascii', 'ignore'))
            sections = parsed_whole_text['sections']

            for outer_section in sections:
                # Only direct subsections of each top-level section are
                # searched here.
                found_subection = get_section(outer_section['subsections'],
                                              section_title)
                if found_subection:
                    outer_comments = outer_section['comments']
                    for comment in outer_comments:
                        comment_text = '\n'.join(comment['text_blocks'])
                        if re.search(_CLOSE_COMMENT_RE, comment_text):
                            # Request the page's section list to map the
                            # outer heading to a section index.
                            params = {
                                'action': 'parse',
                                'prop': 'sections',
                                'page': title[0],
                                'redirects': 'yes'
                            }
                            result = api.APIRequest(site, params).query()
                            for s in result['parse']['sections']:
                                if s['line'] == outer_section.get(
                                        'heading').strip():
                                    # Local to this helper; does not change
                                    # the enclosing function's parameter.
                                    section_index = s['index']
                                    params = {
                                        'action': 'query',
                                        'titles': title[0],
                                        'prop': 'revisions',
                                        'rvprop': 'content',
                                        'rvsection': section_index,
                                        'format': 'json',
                                        'redirects': 'yes'
                                    }
                                    result = api.APIRequest(site,
                                                            params).query()
                                    final_section_text = result['query'][
                                        'pages'][id]['revisions'][0]['*']
                                    return final_section_text
        # No section title, or no suitable outer section was found.
        return text

    from wikitools import wiki, api
    site = wiki.Wiki(_DOMAIN + '/w/api.php')
    title = original_title.split(' - ')

    params = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'redirects': 'yes'
    }
    # Restrict the request to a single section when an index was supplied.
    if section_index:
        params['rvsection'] = section_index

    request = api.APIRequest(site, params)
    result = request.query()
    page_id = disqus_id.split('#')[0]

    # Nothing is imported when the response has no entry for this page id.
    if page_id in result['query']['pages']:
        text = result['query']['pages'][page_id]['revisions'][0]['*']

        # If there is no closing statement, the RfC may exist as a subsection
        # of another section, with the closing statement in the parent section.
        # Example: https://en.wikipedia.org/wiki/Talk:Alexz_Johnson#Lead_image
        if not re.search(_CLOSE_COMMENT_RE, text):
            text = find_outer_section(title, text, page_id)

        text = _clean_wiki_text(text)

        import wikichatter as wc
        parsed_text = wc.parse(text.encode('ascii', 'ignore'))

        # Default: import every top-level section of the parsed text.
        start_sections = parsed_text['sections']
        if len(title) > 1:
            section_title = title[1].encode('ascii', 'ignore')
            sections = parsed_text['sections']
            found_section = get_section(sections, section_title)
            if found_section:
                # Import the matched section's own comments first, then
                # continue with its subsections below.
                start_sections = found_section['subsections']
                start_comments = found_section['comments']
                total_count = import_wiki_talk_posts(start_comments,
                                                     article_id, None,
                                                     total_count, rfc_DB)

        total_count = import_wiki_sessions(start_sections, article_id, None,
                                           total_count, rfc_DB)

Exemple #6

0

Afficher le fichier

Fichier : import_data.py Projet : vibster/wikum

def get_wiki_talk_posts(article, current_task, total_count):
    """Fetch a wiki talk page (or one section of it), parse it and import it.

    The API endpoint is built from the part of ``article.url`` before
    ``/wiki/``.  ``article.title`` is split on ``' - '`` into a page title
    and an optional section title.  When ``article.section_index`` is
    truthy it is sent as ``rvsection`` so only that section is fetched.

    If the fetched text does not match ``_CLOSE_COMMENT_RE`` the enclosing
    section is looked up and, when found, used instead.  The text is then
    parsed with ``wikichatter`` and handed to ``import_wiki_talk_posts`` /
    ``import_wiki_sessions``.

    ``re``, ``_CLOSE_COMMENT_RE``, ``import_wiki_talk_posts`` and
    ``import_wiki_sessions`` are expected from the enclosing module.

    The running ``total_count`` is updated locally only; the function has
    no ``return`` statement.
    """
    from wikitools import wiki, api

    def get_section(sections, section_title):
        """Return the first section whose cleaned heading equals the title."""
        for candidate in sections:
            heading = candidate.get('heading', '')
            # Brackets are removed before tag-shaped text; order matters.
            for pattern in (r'\]', r'\[', '<[^<]+?>'):
                heading = re.sub(pattern, '', heading)
            if heading.strip() == str(section_title).strip():
                return candidate
        return None

    def find_outer_section(title, text, page_key):
        """Return the enclosing section's wikitext, or ``text`` unchanged.

        An enclosing section is used only when one of its own comments
        matches ``_CLOSE_COMMENT_RE`` and its heading appears in the
        page's section list.
        """
        if len(title) <= 1:
            return text

        wanted_title = title[1].encode('ascii', 'ignore')
        whole_page = api.APIRequest(site, {
            'action': 'query',
            'titles': title[0],
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'json',
            'redirects': 'yes'
        }).query()
        whole_text = whole_page['query']['pages'][page_key]['revisions'][0]['*']

        import wikichatter as wc
        parsed_page = wc.parse(whole_text.encode('ascii', 'ignore'))

        for outer_section in parsed_page['sections']:
            if not get_section(outer_section['subsections'], wanted_title):
                continue

            for comment in outer_section['comments']:
                comment_text = '\n'.join(comment['text_blocks'])
                if not re.search(_CLOSE_COMMENT_RE, comment_text):
                    continue

                # Map the outer heading to a section index through the
                # page's section list.
                listing = api.APIRequest(site, {
                    'action': 'parse',
                    'prop': 'sections',
                    'page': title[0],
                    'redirects': 'yes'
                }).query()

                for entry in listing['parse']['sections']:
                    if entry['line'] != outer_section.get('heading').strip():
                        continue
                    section_query = {
                        'action': 'query',
                        'titles': title[0],
                        'prop': 'revisions',
                        'rvprop': 'content',
                        'rvsection': entry['index'],
                        'format': 'json',
                        'redirects': 'yes'
                    }
                    fetched = api.APIRequest(site, section_query).query()
                    return fetched['query']['pages'][page_key][
                        'revisions'][0]['*']

        return text

    site = wiki.Wiki(article.url.split('/wiki/')[0] + '/w/api.php')

    title = article.title.split(' - ')
    # "section_index" is the position of the section within the page.
    # Parsing a whole page sometimes fails to recognise a section as such,
    # so when an index is available only that section is fetched and parsed.
    section_index = article.section_index

    params = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'redirects': 'yes'
    }
    if section_index:
        params['rvsection'] = section_index

    result = api.APIRequest(site, params).query()
    page_id = article.disqus_id.split('#')[0]
    text = result['query']['pages'][page_id]['revisions'][0]['*']

    # Without a closing statement the RfC may be a subsection of another
    # section, with the closing statement in the parent section.
    # Example: https://en.wikipedia.org/wiki/Talk:Alexz_Johnson#Lead_image
    if re.search(_CLOSE_COMMENT_RE, text) is None:
        text = find_outer_section(title, text, page_id)

    import wikichatter as wc
    parsed_text = wc.parse(text.encode('ascii', 'ignore'))

    start_sections = parsed_text['sections']
    if len(title) > 1:
        found_section = get_section(start_sections,
                                    title[1].encode('ascii', 'ignore'))
        if found_section:
            start_sections = found_section['subsections']
            total_count = import_wiki_talk_posts(found_section['comments'],
                                                 article, None, current_task,
                                                 total_count)

    total_count = import_wiki_sessions(start_sections, article, None,
                                       current_task, total_count)