def getData(part):
    res = {}
    res.update(part)
    if 'submitted_by' not in res:
        res['submitted_by'] = {
            'year': int(part['part_entered'][:4]),
            'team_name': 'Unknown',
        }

    part_pages = []
    for pageid in pageids[part['part_name']]:
        try:
            p = pages[str(pageid['pageid'])]
        except:
            continue
        raw_page = partinfo_re.sub('', p['revisions'][0]['*'])
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e:  # parse error
            content = remove_html_tags(raw_page)  # give up parsing
        part_pages.append({
            'content': content,
            'title': pageid['title'],
            'revid': p['revisions'][0]['revid'],
            'timestamp': p['revisions'][0]['timestamp'],
            'user': p['revisions'][0]['user'],
            'pageid': pageid['pageid'],
        })

    res['pages'] = part_pages

    s = score[part['part_name']]
    res['reliability'] = s['reliability']
    res['num_teams_used'] = s['num_teams_used']

    return res
def import_wiki_sessions(sections, article, reply_to, current_task, total_count):
    for section in sections:
        heading = section.get('heading', None)
        if heading:
            parsed_text = parse(heading)
            comment_author = CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True)
            comments = Comment.objects.filter(article=article, author=comment_author, text=parsed_text)
            if comments.count() > 0:
                comment_wikum = comments[0]
            else:
                comment_wikum = Comment.objects.create(article=article,
                                                       author=comment_author,
                                                       text=parsed_text,
                                                       reply_to_disqus=reply_to,
                                                       text_len=len(parsed_text),
                                                       )
                comment_wikum.save()
                comment_wikum.disqus_id = comment_wikum.id
                comment_wikum.save()

                total_count += 1
                if current_task and total_count % 3 == 0:
                    current_task.update_state(state='PROGRESS', meta={'count': total_count})
        else:
            comment_wikum = reply_to

        if len(section['comments']) > 0:
            total_count = import_wiki_talk_posts(section['comments'], article, comment_wikum.disqus_id, current_task, total_count)
        if len(section['subsections']) > 0:
            total_count = import_wiki_sessions(section['subsections'], article, comment_wikum.disqus_id, current_task, total_count)

    return total_count
def get_table(title, site='en.wikipedia.org', table_idx=0):
    """
    Given an article title, a site, and the index of a table on that page,
    return a dataframe constructed from that table.

    Parameters:
      title (str)     : article title (not a URL)
      site (str)      : site base url (i.e. en.wikipedia.org)
      table_idx (int) : index of the table on the page (zero-indexed, so the first table is idx=0)

    Returns:
      a pandas.DataFrame with the column names set to the column headers and a default index
    """
    assert 'http://' not in title, 'get_table expects a title, not a full URL'
    url = 'http://{0}/w/api.php?action=query&titles={1}&prop=revisions&rvprop=content&format=json'.format(site, title)
    logger.info('getting url: %s', url)
    res = json.loads(urllib.urlopen(url).read())
    wiki = res['query']['pages'].values()[0]['revisions'][0]['*']
    html = wikimarkup.parse(wiki)
    dom = soup.BeautifulSoup(html)
    table = itertools.islice(dom.findAll('table'), table_idx, table_idx + 1).next()
    row_tags = table.findAll('tr')
    header = [e.text.strip() for e in row_tags[0].findAll('th')]
    rows = [[e.text.strip() for e in row.findAll('td')] for row in row_tags[1:]]
    df = pd.DataFrame(rows, columns=header)
    return df
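# Hypothetical usage of get_table above. The article title and the printed
# column list are illustrative assumptions only; the call relies on the same
# modules get_table already uses (urllib, json, wikimarkup, BeautifulSoup as
# soup, itertools, pandas as pd, and a configured logger).
# df = get_table('List_of_sovereign_states', site='en.wikipedia.org', table_idx=0)
# print df.columns.tolist()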
def result(task_id):
    task = processarticle.AsyncResult(task_id)
    if task.state == 'SUCCESS':
        content = task.result['article']
        html = parse(content, False)
        return render_template("result.html", content=content, html=html, info=task.result['info'])
    else:
        return "Be more patient!"
def get_tables_1(wiki):
    html = PyQuery(wikimarkup.parse(wiki))
    frames = []
    for table in html('table'):
        data = [[x.text.strip() for x in row] for row in table.getchildren()]
        df = pd.DataFrame(data[1:], columns=data[0])
        frames.append(df)
    return frames
def get_tables(url):
    html = PyQuery(wikimarkup.parse(get_wiki_raw(url)))
    frames = pd.DataFrame()
    for table in html('table'):
        data = [[x.text.strip() for x in row] for row in table.getchildren()]
        df = pd.DataFrame(data[1:], columns=data[0])
        if np.prod(df.shape) > np.prod(frames.shape):
            frames = df
    return frames
def get_most_active_users_wiki_df(url):
    res = json.loads(urllib.urlopen(url).read())
    wiki = res['query']['pages'].values()[0]['revisions'][0]['*']
    html = wikimarkup.parse(wiki)
    dom = soup.BeautifulSoup(html)
    row_tags = dom.findAll('tr')
    header = [e.text.strip() for e in row_tags[0].findAll('th')]
    rows = [[e.text.strip() for e in row.findAll('td')] for row in row_tags[1:]]
    df = pd.DataFrame(rows, columns=header)
    df.User = df.User.apply(extract_username)
    return df[~df.User.isnull()]
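# Self-contained sketch of the wikitext-table -> HTML -> DataFrame pattern the
# snippets above rely on. The sample markup is an assumption for illustration,
# and pandas.read_html is used here as a shortcut (a later snippet uses it too)
# rather than the manual th/td extraction shown above. Assumes wikimarkup
# renders the {| ... |} table markup to an HTML <table>, as these examples expect.
import pandas as pd
import wikimarkup

sample_wikitext = u"""{| class="wikitable"
! Unit !! Capacity (MW)
|-
| Plant A || 600
|-
| Plant B || 1000
|}"""

sample_html = wikimarkup.parse(sample_wikitext, showToc=False)
sample_tables = pd.read_html(sample_html, header=0)
print sample_tables[0]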
def show(self, c):
    from wikimarkup import parse
    f = os.path.join(self.runpath, 'data', '%s.wiki' % c)
    if os.path.exists(f):
        wiki = open(f).read()
        wiki = reg.sub(self._pic, wiki)
        wiki = reg1.sub(' (', wiki)
        wiki = reg2.sub(')', wiki)
        html = parse(wiki, showToc=False)
        self.load_html_string(html, 'file:///')
    else:
        self.open(os.path.join(self.runpath, 'err.html'))
def import_wiki_talk_posts(comments, article, reply_to, current_task, total_count):
    for comment in comments:
        text = ''
        for block in comment['text_blocks']:
            t = parse(block)
            if t.strip() != '':
                text += '<P>%s</P>' % t

        author = comment.get('author')
        if author:
            comment_author = import_wiki_authors([author], article)[0]
        else:
            comment_author = CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True)

        comments = Comment.objects.filter(article=article, author=comment_author, text=text)
        if comments.count() > 0:
            comment_wikum = comments[0]
        else:
            time = None
            if comment.get('time_stamp'):
                time = datetime.datetime.strptime(comment['time_stamp'], '%H:%M, %d %B %Y (%Z)')

            cosigners = comment['cosigners']
            comment_cosigners = import_wiki_authors(cosigners, article)

            comment_wikum = Comment.objects.create(article=article,
                                                   author=comment_author,
                                                   text=text,
                                                   reply_to_disqus=reply_to,
                                                   text_len=len(text),
                                                   )
            if time:
                comment_wikum.created_at = time
                comment_wikum.save()

            comment_wikum.disqus_id = comment_wikum.id
            comment_wikum.save()

            for signer in comment_cosigners:
                comment_wikum.cosigners.add(signer)

            total_count += 1
            if current_task and total_count % 3 == 0:
                current_task.update_state(state='PROGRESS', meta={'count': total_count})

        replies = comment['comments']
        total_count = import_wiki_talk_posts(replies, article, comment_wikum.disqus_id, current_task, total_count)

    return total_count
def wikified_content(self):
    # TODO: check memcache for rendered page?
    # replacements here
    transforms = [
        AutoLink(),
        WikiWords(),
        HideReferers(),
    ]
    content = self.content
    content = wikimarkup.parse(content)
    for transform in transforms:
        content = transform.run(content, self)
    return content
def save_org_description(request):
    if request.user.is_anonymous():
        raise Http404
    text = request.POST['text']
    org_id = request.POST['id']
    field = request.POST['field']
    data_format = request.POST['data_format']
    org = models.Organization.objects.get(id=org_id)
    setattr(org, field, text)
    org.save()  # save the description
    if data_format == 'mediawiki':
        data = simplejson.dumps({'text': parse(text), 'id': org_id})
    else:
        data = simplejson.dumps({'text': text, 'id': org_id})
    return http.HttpResponse(data, mimetype="application/json")
def getData(team):
    res = {}
    res.update(results[team])
    ##res.update(wikis[team])
    res['year'] = YEAR

    idx = 0
    team_pages = []
    for pageid in pageids[team]:
        # idx += 1
        # if(idx > 79):
        #     break
        try:
            p = pages[str(pageid['pageid'])]
        except:
            continue
        raw_page = p['revisions'][0]['*']
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e:  # parse error
            content = remove_html_tags(raw_page)  # give up parsing
            print('give up')
        content = '<{<{<wikititle>}>}>' + pageid['title'] + '<{<{</wikititle>}>}>' + content  # add title to content header
        team_pages.append({
            'content': content,
            'title': pageid['title'],
            'revid': p['revisions'][0]['revid'],
            'timestamp': p['revisions'][0]['timestamp'],
            'user': p['revisions'][0]['user'],
            'pageid': pageid['pageid'],
        })
        # print(str(idx) + ':' + str(pageid))
        # pprint.pprint(content)

    res['pages'] = team_pages
    # res['school'] = teaminfo[team]['school']
    # res['description'] = teaminfo[team]['description']
    # res['title'] = teaminfo[team]['title']
    # res['track'] = teaminfo[team]['track']
    # res['abstract'] = teaminfo[team]['abstract']
    res.update(teaminfo[team])

    return res
def save_illustrations_page(page):
    def insert_images(markup):
        def replace_image(match):
            image = match.group(1)
            image = mwpage.Page.normalize_title(image)
            image_jobs.append(gevent.spawn(download_image, image))
            return '<img src="{0}" alt="{0}"/>'.format(image)

        images = re.compile(r'Image\:(.+?)(?:\n|\|.+)', re.IGNORECASE)
        markup = re.sub(images, replace_image, markup)
        return markup

    markup = wikimarkup.parse(page.edit().encode('utf-8'), showToc=False)
    markup = clean_page(markup)
    markup = re.sub(r'<gallery>', '', markup)
    markup = re.sub(r'</gallery>', '', markup)
    markup = insert_images(markup)
    return page, markup
def parse_wiki(self):
    for index in self._query_hash:
        query = self._query_hash[index]
        content = self.retrieve(query)
        if content:
            doc = json.load(content)
            ## get the content of the markup
            ## thanks to http://goo.gl/wDPha
            text = doc['query']['pages'].itervalues().next()['revisions'][0]['*']

            global g_current_query
            g_current_query = query

            html = parse(text)
            print 'Query processed: %s' % query

            ## wait for 1 second to avoid unnecessary banning from WikiPedia server
            time.sleep(1)
        else:
            print 'Skipping query %s' % query
def parse_json(json_file):
    try:
        with open(json_file) as f:
            print 'Loading %s' % (json_file)
            dump_json = json.load(f)
    except IOError as e:
        print 'Failed to open file: %s' % json_file

    global g_cur_idx
    global g_dist_hash
    global g_timestamp
    global g_rel_ent_dist_db

    sorted_index = dump_json.keys()
    sorted_index.sort(key=lambda x: int(x))

    for index in sorted_index:
        g_cur_idx = index
        g_dist_hash[g_cur_idx] = {}
        g_rev_hash[g_cur_idx] = {}

        query = dump_json[index]['query']
        g_rel_ent_dist_db.hset(RedisDB.query_ent_hash, index, query)
        #print 'Query: %s %s' % (index, query)
        #continue

        for rev_id in dump_json[index]['revisions']:
            revid = dump_json[index]['revisions'][rev_id]['revid']
            g_timestamp = dump_json[index]['revisions'][rev_id]['timestamp']
            text = dump_json[index]['revisions'][rev_id]['text']
            #print '%s %s %s' % (query, revid, timestamp)

            g_rev_hash[g_cur_idx][g_timestamp] = revid

            try:
                html = parse(text)
            # catch all other exceptions in the parse process,
            # print out the traceback, and move on without interruption
            except:
                print "Exception in parse_json()"
                print '-' * 60
                traceback.print_exc(file=sys.stdout)
                print '-' * 60
                pass

        g_rel_ent_dist_db.rpush(RedisDB.query_ent_list, index)

        sorted_ts = g_dist_hash[index].keys()
        # sort the timestamps in chronological order
        sorted_ts.sort(key=lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))

        # for each month, save the number of related entities
        # as well as the related entities
        last_num = 0
        last_revid = 0
        last_ts = ''
        last_dt_str = datetime.datetime.strptime(sorted_ts[0], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')

        for ts in sorted_ts:
            revid = g_rev_hash[index][ts]
            dt_str = datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')
            if dt_str != last_dt_str:
                hash_key = 'query-%s' % index
                # save the number of related entities for last month
                g_rel_ent_dist_db.hset(hash_key, last_dt_str, last_num)
                print '%s %s %s %s %d' % (index, query, last_dt_str, last_revid, last_num)

                # save the related entities for last month
                ent_list = []
                for ent in g_dist_hash[index][last_ts]:
                    ent_list.append(ent)
                ent_str = '='.join(ent_list)
                hash_key = 'query-rel-ent-%s' % index
                g_rel_ent_dist_db.hset(hash_key, last_dt_str, ent_str)

            last_num = len(g_dist_hash[index][ts].keys())
            last_revid = revid
            last_dt_str = dt_str
            last_ts = ts
def markdown(s):
    wikimarkup.registerInternalLinkHook(None, link_hook)
    return wikimarkup.parse(s, showToc=True)
def render(content, *args, **kwargs):
    return wikimarkup.parse(content, *args, **kwargs)
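# Minimal, self-contained sketch of calling wikimarkup.parse directly, the call
# every wrapper in these examples ultimately makes. The sample markup is an
# illustrative assumption; showToc mirrors the keyword argument used elsewhere
# in these snippets (show, markdown, save_page).
from wikimarkup import parse

sample_markup = u"== Heading ==\nSome '''bold''' text and a [[Wiki link]]."
sample_html = parse(sample_markup, showToc=False)
print sample_html  # rendered HTML; Python 2 print to match the surrounding snippets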
def recurse_viz(parent, posts, replaced, article, is_collapsed):
    children = []
    hid_children = []
    replace_children = []

    pids = [post.disqus_id for post in posts]

    if replaced:
        num_subtree_children = 0
    else:
        num_subtree_children = len(pids)

    reps = Comment.objects.filter(reply_to_disqus__in=pids, article=article).select_related()

    for post in posts:
        if post.json_flatten == '':
        #if True:
            if post.author:
                if post.author.anonymous:
                    author = "Anonymous"
                else:
                    author = post.author.username
            else:
                author = ""

            v1 = {'size': post.points,
                  'd_id': post.id,
                  'parent': parent.id if parent else None,
                  'author': author,
                  'replace_node': post.is_replacement,
                  'collapsed': is_collapsed,
                  'tags': [(tag.text, tag.color) for tag in post.tags.all()]
                  }

            if 'https://en.wikipedia.org/wiki/' in article.url:
                v1['name'] = parse(post.text)
                v1['wikitext'] = post.text

                if post.summary.strip() == '':
                    v1['summary'] = ''
                else:
                    v1['summary'] = parse(post.summary)
                v1['sumwiki'] = post.summary

                if post.extra_summary.strip() == '':
                    v1['extra_summary'] = ''
                else:
                    v1['extra_summary'] = parse(post.extra_summary)
                v1['extrasumwiki'] = post.extra_summary
            else:
                v1['name'] = post.text
                v1['summary'] = post.summary
                v1['extra_summary'] = post.extra_summary

            c1 = reps.filter(reply_to_disqus=post.disqus_id).order_by('-points')

            if c1.count() == 0:
                vals = []
                hid = []
                rep = []
                num_subchildren = 0
            else:
                replace_future = replaced or post.is_replacement
                vals, hid, rep, num_subchildren = recurse_viz(post, c1, replace_future, article, is_collapsed or post.is_replacement)

            v1['children'] = vals
            v1['hid'] = hid
            v1['replace'] = rep

            post.json_flatten = json.dumps(v1)
            if not post.is_replacement:
                post.num_subchildren = num_subchildren
            else:
                post.num_subchildren = 0
            post.save()

            if not post.is_replacement:
                num_subtree_children += num_subchildren
        else:
            v1 = json.loads(post.json_flatten)

        if post.hidden:
            hid_children.append(v1)
        elif parent and parent.is_replacement:
            replace_children.append(v1)
        else:
            children.append(v1)

    return children, hid_children, replace_children, num_subtree_children
def parse(self, input_type, output_format, text):
    return parse(text)
def parse_json(json_file):
    try:
        with open(json_file) as f:
            print 'Loading %s' % (json_file)
            dump_json = json.load(f)
    except IOError as e:
        print 'Failed to open file: %s' % json_file

    global g_cur_idx
    global g_dist_hash
    global g_timestamp
    global g_rel_ent_dist_db

    sorted_index = dump_json.keys()
    sorted_index.sort(key=lambda x: int(x))

    for index in sorted_index:
        g_cur_idx = index
        g_dist_hash[g_cur_idx] = {}
        g_rev_hash[g_cur_idx] = {}

        query = dump_json[index]['query']
        g_rel_ent_dist_db.rpush(RedisDB.query_ent_list, index)
        g_rel_ent_dist_db.hset(RedisDB.query_ent_hash, index, query)
        print 'Query: %s %s' % (index, query)
        #continue

        for rev_id in dump_json[index]['revisions']:
            revid = dump_json[index]['revisions'][rev_id]['revid']
            g_timestamp = dump_json[index]['revisions'][rev_id]['timestamp']
            text = dump_json[index]['revisions'][rev_id]['text']
            #print '%s %s %s' % (query, revid, timestamp)

            g_rev_hash[g_cur_idx][g_timestamp] = revid

            try:
                html = parse(text)
            # catch all other exceptions in the parse process,
            # print out the traceback, and move on without interruption
            except:
                print "Exception in parse_json()"
                print '-' * 60
                traceback.print_exc(file=sys.stdout)
                print '-' * 60
                pass

        sorted_ts = g_dist_hash[index].keys()
        # sort the timestamps in chronological order
        sorted_ts.sort(key=lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))

        # for each month, save the number of related entities
        # as well as the related entities
        last_revid = 0
        last_ts = ''
        last_dt_str = datetime.datetime.strptime(sorted_ts[0], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')

        rel_ent_hash = {}
        irrel_ent_hash = {}

        for ts in sorted_ts:
            revid = g_rev_hash[index][ts]
            dt_str = datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')
            if dt_str != last_dt_str:
                # save the related entities for last month
                ent_list = []
                for ent in g_dist_hash[index][last_ts]:
                    # check the cache first
                    if ent in rel_ent_hash:
                        ent_list.append(ent)
                        continue
                    if ent in irrel_ent_hash:
                        continue
                    # update the cache accordingly
                    if in_rel_doc(query, ent):
                        ent_list.append(ent)
                        rel_ent_hash[ent] = 1
                    else:
                        irrel_ent_hash[ent] = 1

                # save the number of related entities for last month
                hash_key = 'query-rel-ent-num-%s' % index
                ent_num = len(ent_list)
                g_rel_ent_dist_db.hset(hash_key, last_dt_str, ent_num)
                print '%s %s %s %s %d' % (index, query, last_dt_str, last_revid, ent_num)

            last_revid = revid
            last_dt_str = dt_str
            last_ts = ts
def summarize_comment(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)

        id = request.POST['id']
        summary = request.POST['comment']
        top_summary, bottom_summary = get_summary(summary)

        req_user = request.user if request.user.is_authenticated() else None

        c = Comment.objects.get(id=id)

        from_summary = c.summary + '\n----------\n' + c.extra_summary

        c.summary = top_summary
        c.extra_summary = bottom_summary
        c.save()

        if from_summary != '':
            action = 'edit_sum'
            explanation = 'edit summary'
        else:
            action = 'sum_comment'
            explanation = 'initial summary'

        h = History.objects.create(user=req_user,
                                   article=a,
                                   action=action,
                                   from_str=from_summary,
                                   to_str=summary,
                                   explanation=explanation)
        h.comments.add(c)

        recurse_up_post(c)

        if 'wikipedia.org' in a.url:
            res = {}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            res['top_summary_wiki'] = top_summary
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            res['bottom_summary_wiki'] = bottom_summary
            return JsonResponse(res)
        else:
            return JsonResponse({'top_summary': top_summary, 'bottom_summary': bottom_summary})
    except Exception, e:
        print e
        return HttpResponseBadRequest()
def summarize_selected(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)

        ids = request.POST.getlist('ids[]')

        children_ids = request.POST.getlist('children[]')
        children_ids = [int(x) for x in children_ids]

        child_id = request.POST['child']

        delete_nodes = request.POST.getlist('delete_nodes[]')

        summary = request.POST['comment']
        top_summary, bottom_summary = get_summary(summary)

        req_user = request.user if request.user.is_authenticated() else None

        comments = Comment.objects.filter(id__in=ids)
        children = [c for c in comments if c.id in children_ids]
        child = Comment.objects.get(id=child_id)

        new_id = random_with_N_digits(10)

        new_comment = Comment.objects.create(article=a,
                                             is_replacement=True,
                                             reply_to_disqus=child.reply_to_disqus,
                                             summary=top_summary,
                                             extra_summary=bottom_summary,
                                             disqus_id=new_id,
                                             points=child.points,
                                             text_len=len(summary))

        h = History.objects.create(user=req_user,
                                   article=a,
                                   action='sum_selected',
                                   to_str=summary,
                                   explanation='initial summary of group of comments')

        for c in children:
            c.reply_to_disqus = new_id
            c.save()

        for c in comments:
            h.comments.add(c)

        for node in delete_nodes:
            delete_node(node)

        recurse_up_post(new_comment)
        recurse_down_num_subtree(new_comment)

        make_vector(new_comment, a)

        if 'wikipedia.org' in a.url:
            res = {'d_id': new_comment.id}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            res['top_summary_wiki'] = top_summary
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            res['bottom_summary_wiki'] = bottom_summary
            return JsonResponse(res)
        else:
            return JsonResponse({'d_id': new_comment.id,
                                 'top_summary': top_summary,
                                 'bottom_summary': bottom_summary})
    except Exception, e:
        print e
        return HttpResponseBadRequest()
def summarize_comments(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)

        id = request.POST['id']
        summary = request.POST['comment']
        top_summary, bottom_summary = get_summary(summary)

        delete_nodes = request.POST.getlist('delete_nodes[]')

        req_user = request.user if request.user.is_authenticated() else None

        c = Comment.objects.get(id=id)

        if not c.is_replacement:
            new_id = random_with_N_digits(10)

            new_comment = Comment.objects.create(article=a,
                                                 is_replacement=True,
                                                 reply_to_disqus=c.reply_to_disqus,
                                                 summary=top_summary,
                                                 extra_summary=bottom_summary,
                                                 disqus_id=new_id,
                                                 points=c.points,
                                                 text_len=len(summary))

            c.reply_to_disqus = new_id
            c.save()

            h = History.objects.create(user=req_user,
                                       article=a,
                                       action='sum_nodes',
                                       to_str=summary,
                                       explanation='initial summary of subtree')

            d_id = new_comment.id

            recurse_down_num_subtree(new_comment)
        else:
            from_summary = c.summary + '\n----------\n' + c.extra_summary

            c.summary = top_summary
            c.extra_summary = bottom_summary
            c.save()

            h = History.objects.create(user=req_user,
                                       article=a,
                                       action='edit_sum_nodes',
                                       from_str=from_summary,
                                       to_str=summary,
                                       explanation='edit summary of subtree')

            d_id = c.id
            new_comment = c

        for node in delete_nodes:
            delete_node(node)

        h.comments.add(c)

        recurse_up_post(c)

        make_vector(new_comment, a)

        if 'wikipedia.org' in a.url:
            res = {'d_id': d_id}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            res['top_summary_wiki'] = top_summary
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            res['bottom_summary_wiki'] = bottom_summary
            return JsonResponse(res)
        else:
            return JsonResponse({'d_id': d_id,
                                 'top_summary': top_summary,
                                 'bottom_summary': bottom_summary})
    except Exception, e:
        print e
        import traceback
        print traceback.format_exc()
        return HttpResponseBadRequest()
def galleryTagHook(parser_env, body, attributes={}):
    widths = attributes.get('widths')
    if widths:
        widths = re.sub('px', '', widths)
        gal_width = int(widths)
    else:
        gal_width = 155

    heights = attributes.get('heights')
    if heights:
        heights = re.sub('px', '', heights)
        def_image = int(heights)
    else:
        def_image = 120

    start_text = ''

    if attributes.get('mode', None) == 'packed':
        start_text = '<ul class="gallery mw-gallery-packed">'
        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()
                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlheight': 131
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values()[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values()[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values()[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values()[0]['imageinfo'][0]['thumbheight']
                except:
                    continue

                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    float(int(width)) + 1.496, float(int(width)) + 1.496)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:0px auto;">' % (
                    float(int(width)) + 0.496)
                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'

                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()

                text += parse(inner_text)
                text += '</p></div></li>'

                start_text += text

    elif attributes.get('mode', None) == 'nolines':
        start_text = '<ul class="gallery mw-gallery-nolines">'
        if not attributes.get('widths'):
            gal_width = 125
        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()
                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlwidth': gal_width - 5
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values()[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values()[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values()[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values()[0]['imageinfo'][0]['thumbheight']
                except:
                    continue

                ratio = float(float(width) / float(height))
                if height > def_image:
                    height = def_image
                    width = ratio * def_image

                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    gal_width, gal_width)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:0px auto;">' % (
                    gal_width - 5)
                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'

                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()

                text += parse(inner_text)
                text += '</p></div></li>'

                start_text += text

    else:
        start_text = '<ul class="gallery mw-gallery-traditional">'
        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()
                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlwidth': gal_width - 35
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values()[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values()[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values()[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values()[0]['imageinfo'][0]['thumbheight']
                except:
                    continue

                ratio = float(float(width) / float(height))
                if height > def_image:
                    height = def_image
                    width = ratio * def_image

                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    gal_width, gal_width)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:%spx auto;">' % (
                    gal_width - 5, float(gal_width - height) / 2.0)
                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'

                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()

                text += parse(inner_text)
                text += '</p></div></li>'

                start_text += text

    start_text += '</ul>'
    return start_text
longitude_number = float(lng)
new_location = pw.LocationObject(description=pw.NO_DATA_UNICODE,
                                 latitude=latitude_number,
                                 longitude=longitude_number)
new_plant = pw.PowerPlant(idnr, name,
                          plant_country=COUNTRY_NAME,
                          plant_location=new_location,
                          plant_fuel=set([u'Coal']),
                          plant_source=SOURCE_NAME,
                          plant_source_url=SOURCE_URL,
                          plant_cap_year=2017)
plants_dictionary[idnr] = new_plant

# use wikimarkup to detect the main table and transform to html code
html = str(PyQuery(wikimarkup.parse(wiki)))

# read html code into pandas dataframe
frames = pd.read_html(html, header=0)
df = frames[0]

# make changes to Unit column to include only the plant name, not the unit
for i, unit in enumerate(frames[0]['Unit']):
    df.set_value(i, 'Unit', unit.strip('[]').split('|')[0])

for plant in plants_dictionary.values():
    plant.capacity = df.groupby(['Unit']).sum().loc[plant.name].values[0]
    plant.nat_lang = df.loc[df['Unit'] == plant.name, 'Chinese Name'].values[0]
    # print list(set(df.loc[df['Unit'] == plant.name, 'Sponsor'].values))
    owner = list(set(df.loc[df['Unit'] == plant.name, 'Sponsor'].values))
    if len(owner) == 1:
def save_page(page):
    markup = wikimarkup.parse(page.edit().encode('utf-8'), showToc=False)
    markup = clean_page(markup)
    markup = put_images(markup)
    return page, markup
        p1_score += score

    scores[p1_name] = p1_score
    total_score[p1_name][0] += p1_score
    total_score[p1_name].append(p1_score)
    # total_score[p1_name].append("%s <span style='color: gray;'>(%s)</span>" % (p1_score,))

    place = 1
    for name, score in sorted(scores.items(), key=lambda x: -x[1]):
        color = colors[by_name[name]]
        wiki_table += u'# <span style="color: %s">%s</span> → <b>%s</b>\n' \
            % (color, name, score)
        total_score_places[name].append(place)
        place += 1

    print wiki_table

    html = parse(wiki_table)
    save_file(stats_folder + '/' + u'word_%s.html' % word, html)
    save_file(u'stats/latest/word_%s.html' % word, html)

    content = ''
    for name, score in sorted(scores.items(), key=lambda x: x[0]):
        content += u'%s\t%s\n' % (name, score)
    save_file(stats_folder + u'/score_%s.html' % word, content)
    save_file(u'stats/latest/score_%s.html' % word, content)

total = u'{| style="text-align: center" \n'
total += u'! rowspan=2 | Место || rowspan=2 | Участник || rowspan=2 | Всего ' \
         u'|| colspan=%s | Матчи \n' % len(words)
total += u'|-\n'
total += u'! %s\n' % u' || '.join(["[/balda/%s/ %s]" % (word, word) for word in words])
total += u'|-\n'
# coding: utf8
import json
from wikimarkup import parse

fh = open("ciccio.txt", "r")
content = fh.read()
fh.close()

obj = json.loads(content)
obj = obj['query']['pages']['25458']['revisions'][0]['*']
obj = obj[0:12000]
#print(obj)

print parse(obj)