def get_wiki_talk_posts(article, current_task, total_count):
    """Download a wiki talk page and import its comments for ``article``.

    The page wikitext is fetched through the MediaWiki API, parsed with
    wikichatter, and handed to ``import_wiki_talk_posts`` and
    ``import_wiki_sessions``.

    ``article.title`` is split on ``' - '``: the first part is the page
    title and the optional second part is a section heading. When a heading
    is given, the comments of every top-level section whose heading matches
    are imported and the sub-sections of the last match become the sections
    to import; when no heading is given (or nothing matches) all top-level
    sections of the page are imported.

    Args:
        article: object providing ``url``, ``title`` and ``disqus_id``. The
            part of ``disqus_id`` before ``'#'`` is used as the page key in
            the API response.
        current_task: passed through unchanged to the import helpers.
        total_count: running count threaded through the import helpers.

    Returns:
        None. The count produced by the import helpers is not propagated.

    Raises:
        KeyError: if the API response has no entry for the page key.
    """
    from wikitools import wiki, api
    import wikichatter as wc

    # Split on '/wiki/' rather than '/wiki/Talk:' so that pages outside the
    # "Talk:" namespace still yield the bare site root; with the narrower
    # separator such URLs were left whole and produced a bogus API endpoint.
    domain = article.url.split('/wiki/')[0]
    site = wiki.Wiki(domain + '/w/api.php')

    title = article.title.split(' - ')

    params = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    result = api.APIRequest(site, params).query()

    # Named page_id (not id) to avoid shadowing the builtin.
    page_id = article.disqus_id.split('#')[0]
    text = result['query']['pages'][page_id]['revisions'][0]['*']

    parsed_text = wc.parse(text.encode('ascii', 'ignore'))

    start_sections = parsed_text['sections']
    if len(title) > 1:
        section_title = title[1].strip()
        for section in parsed_text['sections']:
            # Normalise the heading the same way the other variants of this
            # function do: drop wiki-link brackets and inline HTML tags, and
            # ignore surrounding whitespace.
            heading_title = section.get('heading', '')
            heading_title = re.sub(r'\]', '', heading_title)
            heading_title = re.sub(r'\[', '', heading_title)
            heading_title = re.sub('<[^<]+?>', '', heading_title)
            if heading_title.strip() == section_title:
                start_sections = section['subsections']
                start_comments = section['comments']
                total_count = import_wiki_talk_posts(
                    start_comments, article, None, current_task, total_count)

    # The returned count is intentionally not propagated: the original
    # function has no return value.
    import_wiki_sessions(start_sections, article, None, current_task,
                         total_count)
def find_outer_section(title, text, id):
    """Return the parent section's wikitext when it carries the closing comment.

    Used when ``text`` itself has no closing comment: the requested section
    may be a sub-section whose closing statement lives in the enclosing
    (outer) section. The outer section is chosen only if one of its own
    comments matches ``_CLOSE_COMMENT_RE``.

    Relies on ``site``, ``api``, ``get_section``, ``_clean_wiki_text`` and
    ``_CLOSE_COMMENT_RE`` being available from the surrounding scope.

    Args:
        title: list of title parts; ``title[0]`` is the page title and
            ``title[1]`` (if present) the section heading.
        text: wikitext to fall back to.
        id: key of the page inside the API response's ``pages`` mapping.

    Returns:
        The wikitext of the outer section when one qualifies, otherwise
        ``text`` unchanged.
    """
    # No section heading means there is no sub-section to locate.
    if len(title) <= 1:
        return text

    wanted_heading = title[1].encode('ascii', 'ignore')

    # Fetch the whole page so that parent sections are visible.
    page_query = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'redirects': 'yes'
    }
    page_result = api.APIRequest(site, page_query).query()
    page_wikitext = _clean_wiki_text(
        page_result['query']['pages'][id]['revisions'][0]['*'])

    import wikichatter as wc
    parsed_page = wc.parse(page_wikitext.encode('ascii', 'ignore'))

    for parent in parsed_page['sections']:
        # Only parents that actually contain the wanted sub-section matter.
        if not get_section(parent['subsections'], wanted_heading):
            continue

        for comment in parent['comments']:
            body = '\n'.join(comment['text_blocks'])
            if not re.search(_CLOSE_COMMENT_RE, body):
                continue

            # Look up the parent's index in the page's section listing so
            # that just that section can be requested.
            listing_query = {
                'action': 'parse',
                'prop': 'sections',
                'page': title[0],
                'redirects': 'yes'
            }
            listing = api.APIRequest(site, listing_query).query()
            for entry in listing['parse']['sections']:
                if entry['line'] != parent.get('heading').strip():
                    continue
                section_query = {
                    'action': 'query',
                    'titles': title[0],
                    'prop': 'revisions',
                    'rvprop': 'content',
                    'rvsection': entry['index'],
                    'format': 'json',
                    'redirects': 'yes'
                }
                section_result = api.APIRequest(site, section_query).query()
                return section_result['query']['pages'][id]['revisions'][0]['*']

    return text
def get_wiki_talk_posts(article, current_task, total_count):
    """Fetch a wiki page through the MediaWiki API and import its discussion.

    ``article.title`` is split on ``' - '`` into a page title and an optional
    section heading. With a heading, the comments of each top-level section
    whose cleaned heading matches are imported via ``import_wiki_talk_posts``
    and the last match's sub-sections are then imported via
    ``import_wiki_sessions``; without one (or without a match) all top-level
    sections are imported.

    Args:
        article: object providing ``url``, ``title`` and ``disqus_id``.
        current_task: forwarded unchanged to the import helpers.
        total_count: running count threaded through the import helpers.

    Returns:
        None. The final count is not returned to the caller.
    """
    from wikitools import wiki, api

    site_root = article.url.split('/wiki/')[0]
    site = wiki.Wiki(site_root + '/w/api.php')

    title_parts = article.title.split(' - ')

    query = {
        'action': 'query',
        'titles': title_parts[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json'
    }
    response = api.APIRequest(site, query).query()

    # The part of disqus_id before '#' keys the page in the API response.
    page_key = article.disqus_id.split('#')[0]
    wikitext = response['query']['pages'][page_key]['revisions'][0]['*']

    import wikichatter as wc
    parsed = wc.parse(wikitext.encode('ascii', 'ignore'))

    sections_to_import = parsed['sections']
    if len(title_parts) > 1:
        for section in parsed['sections']:
            # Strip wiki-link brackets and inline HTML tags before comparing.
            heading = section.get('heading', '')
            for pattern in (r'\]', r'\[', '<[^<]+?>'):
                heading = re.sub(pattern, '', heading)
            if heading.strip() != str(title_parts[1]).strip():
                continue
            sections_to_import = section['subsections']
            total_count = import_wiki_talk_posts(
                section['comments'], article, None, current_task, total_count)

    import_wiki_sessions(sections_to_import, article, None, current_task,
                         total_count)
import os
import wikichatter as wc
import json

# Root of the directory tree holding the sample talk-page files.
talk_samples_base = "./talk_samples/"

# Collect the path of every file below the samples root. os.path.join is
# used instead of manual "/" concatenation so that no doubled separator is
# produced at the root and the platform's separator is respected.
talk_files = []
for (name, directories, files) in os.walk(talk_samples_base):
    talk_files.extend(os.path.join(name, f) for f in files)

# Parse each sample with wikichatter and print the result as JSON, one
# document per file.
for f_path in talk_files:
    with open(f_path, "r") as f:
        text = f.read()
    parsed = wc.parse(text)
    print(json.dumps(parsed))
def get_wiki_talk_posts(article_id, disqus_id, section_index, original_title, total_count, rfc_DB):
    """Fetch a wiki discussion from ``_DOMAIN`` and import its comments.

    ``original_title`` is split on ``' - '`` into a page title and an
    optional section heading. The page (or only section ``section_index``
    of it, when that is truthy) is requested from the MediaWiki API. If the
    response has no entry for the page key nothing is imported.

    Args:
        article_id: forwarded unchanged to the import helpers.
        disqus_id: string whose part before ``'#'`` keys the page in the
            API response.
        section_index: optional section index sent as ``rvsection``.
        original_title: ``"<page title>"`` or ``"<page title> - <heading>"``.
        total_count: running count threaded through the import helpers.
        rfc_DB: forwarded unchanged to the import helpers.

    Returns:
        None. The final count is not returned to the caller.
    """

    def get_section(sections, section_title):
        """Return the first section whose cleaned heading equals ``section_title``.

        Falls through (returning None) when nothing matches.
        """
        for candidate in sections:
            heading = candidate.get('heading', '')
            # Strip wiki-link brackets and inline HTML tags before comparing.
            for pattern in (r'\]', r'\[', '<[^<]+?>'):
                heading = re.sub(pattern, '', heading)
            if heading.strip() == str(section_title).strip():
                return candidate

    def find_outer_section(title, text, id):
        """Return the parent section's wikitext if it holds the closing comment.

        The parent is chosen only when one of its own comments matches
        ``_CLOSE_COMMENT_RE``; otherwise ``text`` is returned unchanged.
        """
        if len(title) <= 1:
            return text

        wanted_heading = title[1].encode('ascii', 'ignore')
        page_query = {
            'action': 'query',
            'titles': title[0],
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'json',
            'redirects': 'yes'
        }
        page_result = api.APIRequest(site, page_query).query()
        page_wikitext = _clean_wiki_text(
            page_result['query']['pages'][id]['revisions'][0]['*'])

        import wikichatter as wc
        parsed_page = wc.parse(page_wikitext.encode('ascii', 'ignore'))

        for parent in parsed_page['sections']:
            if not get_section(parent['subsections'], wanted_heading):
                continue
            for comment in parent['comments']:
                body = '\n'.join(comment['text_blocks'])
                if not re.search(_CLOSE_COMMENT_RE, body):
                    continue
                # Find the parent's index in the page's section listing so
                # that only that section is requested.
                listing_query = {
                    'action': 'parse',
                    'prop': 'sections',
                    'page': title[0],
                    'redirects': 'yes'
                }
                listing = api.APIRequest(site, listing_query).query()
                for entry in listing['parse']['sections']:
                    if entry['line'] != parent.get('heading').strip():
                        continue
                    section_query = {
                        'action': 'query',
                        'titles': title[0],
                        'prop': 'revisions',
                        'rvprop': 'content',
                        'rvsection': entry['index'],
                        'format': 'json',
                        'redirects': 'yes'
                    }
                    section_result = api.APIRequest(site, section_query).query()
                    return section_result['query']['pages'][id]['revisions'][0]['*']
        return text

    from wikitools import wiki, api
    site = wiki.Wiki(_DOMAIN + '/w/api.php')

    title = original_title.split(' - ')

    query = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'redirects': 'yes'
    }
    if section_index:
        query['rvsection'] = section_index
    response = api.APIRequest(site, query).query()

    page_id = disqus_id.split('#')[0]
    pages = response['query']['pages']
    if page_id not in pages:
        # The API returned nothing for this page key; nothing to import.
        return

    text = pages[page_id]['revisions'][0]['*']

    # If there isn't a closing statement, the RfC could exist as a
    # subsection of another section, with the closing statement in the
    # parent section.
    # Example: https://en.wikipedia.org/wiki/Talk:Alexz_Johnson#Lead_image
    if not re.search(_CLOSE_COMMENT_RE, text):
        text = find_outer_section(title, text, page_id)

    text = _clean_wiki_text(text)

    import wikichatter as wc
    parsed = wc.parse(text.encode('ascii', 'ignore'))

    sections_to_import = parsed['sections']
    if len(title) > 1:
        wanted_heading = title[1].encode('ascii', 'ignore')
        match = get_section(parsed['sections'], wanted_heading)
        if match:
            sections_to_import = match['subsections']
            total_count = import_wiki_talk_posts(
                match['comments'], article_id, None, total_count, rfc_DB)

    import_wiki_sessions(sections_to_import, article_id, None, total_count,
                         rfc_DB)
def get_wiki_talk_posts(article, current_task, total_count):
    """Fetch a wiki discussion for ``article`` and import its comments.

    ``article.title`` is split on ``' - '`` into a page title and an optional
    section heading. The page (or only section ``article.section_index`` of
    it, when that is truthy) is requested from the MediaWiki API, parsed
    with wikichatter and handed to ``import_wiki_talk_posts`` /
    ``import_wiki_sessions``.

    Args:
        article: object providing ``url``, ``title``, ``section_index`` and
            ``disqus_id``.
        current_task: forwarded unchanged to the import helpers.
        total_count: running count threaded through the import helpers.

    Returns:
        None. The final count is not returned to the caller.
    """

    def get_section(sections, section_title):
        """Return the first section whose cleaned heading equals ``section_title``.

        Falls through (returning None) when nothing matches.
        """
        for candidate in sections:
            heading = candidate.get('heading', '')
            # Strip wiki-link brackets and inline HTML tags before comparing.
            for pattern in (r'\]', r'\[', '<[^<]+?>'):
                heading = re.sub(pattern, '', heading)
            if heading.strip() == str(section_title).strip():
                return candidate

    def find_outer_section(title, text, id):
        """Return the parent section's wikitext if it holds the closing comment.

        The parent is chosen only when one of its own comments matches
        ``_CLOSE_COMMENT_RE``; otherwise ``text`` is returned unchanged.
        """
        if len(title) <= 1:
            return text

        wanted_heading = title[1].encode('ascii', 'ignore')
        page_query = {
            'action': 'query',
            'titles': title[0],
            'prop': 'revisions',
            'rvprop': 'content',
            'format': 'json',
            'redirects': 'yes'
        }
        page_result = api.APIRequest(site, page_query).query()
        page_wikitext = page_result['query']['pages'][id]['revisions'][0]['*']

        import wikichatter as wc
        parsed_page = wc.parse(page_wikitext.encode('ascii', 'ignore'))

        for parent in parsed_page['sections']:
            if not get_section(parent['subsections'], wanted_heading):
                continue
            for comment in parent['comments']:
                body = '\n'.join(comment['text_blocks'])
                if not re.search(_CLOSE_COMMENT_RE, body):
                    continue
                # Find the parent's index in the page's section listing so
                # that only that section is requested.
                listing_query = {
                    'action': 'parse',
                    'prop': 'sections',
                    'page': title[0],
                    'redirects': 'yes'
                }
                listing = api.APIRequest(site, listing_query).query()
                for entry in listing['parse']['sections']:
                    if entry['line'] != parent.get('heading').strip():
                        continue
                    section_query = {
                        'action': 'query',
                        'titles': title[0],
                        'prop': 'revisions',
                        'rvprop': 'content',
                        'rvsection': entry['index'],
                        'format': 'json',
                        'redirects': 'yes'
                    }
                    section_result = api.APIRequest(site, section_query).query()
                    return section_result['query']['pages'][id]['revisions'][0]['*']
        return text

    from wikitools import wiki, api
    site_root = article.url.split('/wiki/')[0]
    site = wiki.Wiki(site_root + '/w/api.php')

    title = article.title.split(' - ')

    # "section_index" is the index number of the section within the page.
    # There are some cases when wikicode does not parse a section as a
    # section when given a "whole page". To prevent this, we first grab only
    # the section (not the entire page) using "section_index" and parse it.
    section_index = article.section_index
    query = {
        'action': 'query',
        'titles': title[0],
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'redirects': 'yes'
    }
    if section_index:
        query['rvsection'] = section_index
    response = api.APIRequest(site, query).query()

    # The part of disqus_id before '#' keys the page in the API response.
    page_key = article.disqus_id.split('#')[0]
    text = response['query']['pages'][page_key]['revisions'][0]['*']

    # If there isn't a closing statement, the RfC could exist as a
    # subsection of another section, with the closing statement in the
    # parent section.
    # Example: https://en.wikipedia.org/wiki/Talk:Alexz_Johnson#Lead_image
    if not re.search(_CLOSE_COMMENT_RE, text):
        text = find_outer_section(title, text, page_key)

    import wikichatter as wc
    parsed = wc.parse(text.encode('ascii', 'ignore'))

    sections_to_import = parsed['sections']
    if len(title) > 1:
        wanted_heading = title[1].encode('ascii', 'ignore')
        match = get_section(parsed['sections'], wanted_heading)
        if match:
            sections_to_import = match['subsections']
            total_count = import_wiki_talk_posts(
                match['comments'], article, None, current_task, total_count)

    import_wiki_sessions(sections_to_import, article, None, current_task,
                         total_count)