Example #1
def getData(part):
    res = {}
    res.update(part)

    if "submitted_by" not in res:
        res["submitted_by"] = {"year": int(part["part_entered"][:4]), "team_name": "Unknown"}
    part_pages = []
    for pageid in pageids[part["part_name"]]:
        try:
            p = pages[str(pageid["pageid"])]
        except:
            continue
        raw_page = partinfo_re.sub("", p["revisions"][0]["*"])
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e:  # parse error
            content = remove_html_tags(raw_page)  # give up parsing
        part_pages.append(
            {
                "content": content,
                "title": pageid["title"],
                "revid": p["revisions"][0]["revid"],
                "timestamp": p["revisions"][0]["timestamp"],
                "user": p["revisions"][0]["user"],
                "pageid": pageid["pageid"],
            }
        )
    res["pages"] = part_pages
    s = score[part["part_name"]]
    res["reliability"] = s["reliability"]
    res["num_teams_used"] = s["num_teams_used"]
    return res
Example #2
def getData(part):
    res = {}
    res.update(part)

    if 'submitted_by' not in res:
        res['submitted_by'] = {
            'year': int(part['part_entered'][:4]),
            'team_name': 'Unknown',
        }
    part_pages = []
    for pageid in pageids[part['part_name']]:
        try:
            p = pages[str(pageid['pageid'])]
        except:
            continue
        raw_page = partinfo_re.sub('', p['revisions'][0]['*'])
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e:  # parse error
            content = remove_html_tags(raw_page)  # give up parsing
        part_pages.append({
            'content': content,
            'title': pageid['title'],
            'revid': p['revisions'][0]['revid'],
            'timestamp': p['revisions'][0]['timestamp'],
            'user': p['revisions'][0]['user'],
            'pageid': pageid['pageid'],
        })
    res['pages'] = part_pages
    s = score[part['part_name']]
    res['reliability'] = s['reliability']
    res['num_teams_used'] = s['num_teams_used']
    return res
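Examples #1 and #2 lean on several module-level objects that are not shown (pageids, pages, score, partinfo_re, remove_html_tags). A minimal, hypothetical sketch of the shapes the code above appears to assume, with invented sample values:
import re

# hypothetical pattern; the real partinfo_re is not shown
partinfo_re = re.compile(r'\{\{partinfo.*?\}\}', re.DOTALL)

def remove_html_tags(text):
    # hypothetical helper: strip anything that looks like an HTML tag
    return re.sub(r'<[^>]+>', '', text)

# part_name -> list of page descriptors, as returned by a MediaWiki API query
pageids = {'ExamplePart': [{'pageid': 1001, 'title': 'Part:ExamplePart'}]}

# str(pageid) -> page dict with a 'revisions' list (latest revision first)
pages = {'1001': {'revisions': [{'*': "raw ''wikitext''",
                                 'revid': 1,
                                 'timestamp': '2015-01-01T00:00:00Z',
                                 'user': 'someuser'}]}}

# part_name -> usage statistics read at the end of getData()
score = {'ExamplePart': {'reliability': 0.9, 'num_teams_used': 3}}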
Example #3
def import_wiki_sessions(sections, article, reply_to, current_task, total_count):
    for section in sections:
        heading = section.get('heading', None)
        if heading:
            parsed_text = parse(heading)
            comment_author = CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True)
            
            comments = Comment.objects.filter(article=article, author=comment_author, text=parsed_text)
            if comments.count() > 0:
                comment_wikum = comments[0]
            else:
                comment_wikum = Comment.objects.create(article = article,
                                                       author = comment_author,
                                                       text = parsed_text,
                                                       reply_to_disqus = reply_to,
                                                       text_len = len(parsed_text),
                                                       )
                comment_wikum.save()
                comment_wikum.disqus_id = comment_wikum.id
                comment_wikum.save()
                
            total_count += 1
            
            if current_task and total_count % 3 == 0:
                current_task.update_state(state='PROGRESS',
                                          meta={'count': total_count})
            
        else:
            comment_wikum = reply_to
        if len(section['comments']) > 0:
            total_count = import_wiki_talk_posts(section['comments'], article, comment_wikum.disqus_id, current_task, total_count)
        if len(section['subsections']) > 0:
            total_count = import_wiki_sessions(section['subsections'], article, comment_wikum.disqus_id, current_task, total_count)
    return total_count
Example #4
def get_table(title, site='en.wikipedia.org', table_idx=0):
    """
    Given an article title, site, and the index of that table within that page
    returns a dataframe constructed from that table.
    
    Parameters:
      title (str)     : article title (not a URL)
      site  (str)     : site base url (i.e. en.wikipedia.org)
      table_idx (int) : index of table on page (zero-indexed, so first table is idx=0) 
      
    Returns:
      a pandas.DataFrame with the column names set to the column headers and a default
      index
    """
    assert 'http://' not in title, 'get_table expects a title, not a full URL'
    url = 'http://{0}/w/api.php?action=query&titles={1}&prop=revisions&rvprop=content&format=json'.format(site, title)
    logger.info('getting url: %s', url)
    res = json.loads(urllib.urlopen(url).read())
    wiki = res['query']['pages'].values()[0]['revisions'][0]['*']
    html = wikimarkup.parse(wiki)
    dom = soup.BeautifulSoup(html)
    table = itertools.islice(dom.findAll('table'),table_idx, table_idx + 1).next()
    row_tags = table.findAll('tr')
    header = [e.text.strip() for e in row_tags[0].findAll('th')]
    rows = [[e.text.strip() for e in row.findAll('td')] for row in row_tags[1:]]
    df = pd.DataFrame(rows, columns = header)
    return df
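A hedged usage sketch for get_table; the article title and table index below are placeholders, and the call assumes the imports used above (json, urllib, wikimarkup, BeautifulSoup as soup, pandas as pd, itertools) plus a configured logger:
df = get_table('List_of_sovereign_states', site='en.wikipedia.org', table_idx=0)
print(df.columns)
print(df.head())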
Example #5
def result(task_id):
	task = processarticle.AsyncResult(task_id)
	if task.state == 'SUCCESS':
		content = task.result['article']
		html = parse(content, False)
		return render_template("result.html", content = content, html = html, info = task.result['info'])
	else:
		return "Be more patient!"
Example #6
def get_tables_1(wiki):
    html = PyQuery(wikimarkup.parse(wiki))
    frames = []
    for table in html('table'):
        data = [[x.text.strip() for x in row]
                for row in table.getchildren()]
        df = pd.DataFrame(data[1:], columns=data[0])
        frames.append(df)
    return frames
Example #7
def get_tables(url):
    html = PyQuery(wikimarkup.parse(get_wiki_raw(url)))
    frames = pd.DataFrame()
    for table in html('table'):
        data = [[x.text.strip() for x in row]
                for row in table.getchildren()]
        df = pd.DataFrame(data[1:], columns=data[0])
        if np.prod(df.shape) > np.prod(frames.shape):
            frames = df
    return frames
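Example #7 calls a get_wiki_raw helper that is not shown. A hedged sketch of what it might look like, reusing the same MediaWiki revisions-API pattern as Examples #4 and #8 (Python 2, like the surrounding code):
import json
import urllib

def get_wiki_raw(url):
    # hypothetical helper: fetch the API response and return the raw wikitext
    # of the first page's latest revision
    res = json.loads(urllib.urlopen(url).read())
    return res['query']['pages'].values()[0]['revisions'][0]['*']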
Example #8
def get_most_active_users_wiki_df(url):
    res = json.loads(urllib.urlopen(url).read())
    wiki = res['query']['pages'].values()[0]['revisions'][0]['*']
    html = wikimarkup.parse(wiki)
    dom = soup.BeautifulSoup(html)
    row_tags = dom.findAll('tr')
    header = [e.text.strip() for e in row_tags[0].findAll('th')]
    rows = [[e.text.strip() for e in row.findAll('td')] for row in row_tags[1:]]
    df = pd.DataFrame(rows, columns = header)
    df.User = df.User.apply(extract_username)
    return df[~df.User.isnull()]
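The extract_username helper used in Example #8 is not shown either. A hypothetical sketch, assuming the rendered User cells look like 'SomeUser (talk | contribs)'; empty cells map to None so the final isnull() filter can drop them:
def extract_username(cell):
    # hypothetical: keep the text before any "(talk | contribs)" style suffix
    name = cell.split('(')[0].strip()
    return name if name else None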
Example #9
def show(self, c):
    from wikimarkup import parse
    f = os.path.join(self.runpath, 'data', '%s.wiki' % c)
    if os.path.exists(f):
        wiki = open(f).read()
        wiki = reg.sub(self._pic, wiki)
        wiki = reg1.sub(' (', wiki)
        wiki = reg2.sub(')', wiki)
        html = parse(wiki, showToc=False)
        self.load_html_string(html, 'file:///')
    else:
        self.open(os.path.join(self.runpath, 'err.html'))
Example #11
def import_wiki_talk_posts(comments, article, reply_to, current_task, total_count):
    
    for comment in comments:
        text = ''
        for block in comment['text_blocks']:
            t = parse(block)
            if t.strip() != '':
                text += '<P>%s</P>' % t
       
        author = comment.get('author')
        if author:
            comment_author = import_wiki_authors([author], article)[0]
        else:
            comment_author = CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True)
            
        comments = Comment.objects.filter(article=article, author=comment_author,text=text)
        if comments.count() > 0:
            comment_wikum = comments[0]
        else:
            time = None
            if comment.get('time_stamp'):
                time = datetime.datetime.strptime(comment['time_stamp'], '%H:%M, %d %B %Y (%Z)')

            cosigners = comment['cosigners']
            comment_cosigners = import_wiki_authors(cosigners, article)

            comment_wikum = Comment.objects.create(article = article,
                                                   author = comment_author,
                                                   text = text,
                                                   reply_to_disqus = reply_to,
                                                   text_len = len(text),
                                                   )
            if time:
                comment_wikum.created_at = time
            
            comment_wikum.save()
            comment_wikum.disqus_id = comment_wikum.id
            comment_wikum.save()
            
            for signer in comment_cosigners:
                comment_wikum.cosigners.add(signer)
        
        total_count += 1
        
        if current_task and total_count % 3 == 0:
            current_task.update_state(state='PROGRESS',
                                      meta={'count': total_count})
        
        replies = comment['comments']
        total_count = import_wiki_talk_posts(replies, article, comment_wikum.disqus_id, current_task, total_count)
    
    return total_count
Example #12
    def wikified_content(self):
        # TODO: check memcache for rendered page?

        # replacements here
        transforms = [
            AutoLink(),
            WikiWords(),
            HideReferers(),
        ]
        content = self.content
        content = wikimarkup.parse(content)
        for transform in transforms:
            content = transform.run(content, self)
        return content
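AutoLink, WikiWords, and HideReferers in Example #12 are application classes that are not shown. A hypothetical sketch of the interface they appear to share, i.e. a run(content, page) method that rewrites the HTML produced by wikimarkup.parse:
import re

class WikiWords(object):
    """Hypothetical transform: turn CamelCase words into page links."""

    def run(self, content, page):
        # 'page' is the entity being rendered; a real transform would also
        # avoid rewriting text inside existing tags and attributes
        return re.sub(r'\b([A-Z][a-z]+(?:[A-Z][a-z]+)+)\b',
                      r'<a href="/\1">\1</a>', content)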
Example #13
def save_org_description(request):
    if request.user.is_anonymous():
        raise Http404
    text = request.POST['text']
    org_id = request.POST['id']
    field = request.POST['field']
    data_format = request.POST['data_format']
    org = models.Organization.objects.get(id = org_id)
    setattr(org, field, text)
    org.save()
    # save the description
    if data_format == 'mediawiki':
        data = simplejson.dumps({'text': parse(text), 'id': org_id })
    else:
        data = simplejson.dumps({'text': text, 'id': org_id })
    return http.HttpResponse(data, mimetype="application/json")
Example #14
def getData(team):
    res = {}



    res.update(results[team])
    ##res.update(wikis[team])
    res['year'] = YEAR
    
    idx=0
    
    team_pages = []
    for pageid in pageids[team]:
#        idx += 1
#        if(idx > 79):
#            break
        try:
            p = pages[str(pageid['pageid'])]
        except:
            continue
        raw_page = p['revisions'][0]['*']
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e: # parse error
            content = remove_html_tags(raw_page) # give up parsing
            print ('give up')
        content = '<{<{<wikititle>}>}>' + pageid['title'] + '<{<{</wikititle>}>}>' + content  # add title to content header
        team_pages.append({
            'content': content,
            'title': pageid['title'],
            'revid': p['revisions'][0]['revid'],
            'timestamp': p['revisions'][0]['timestamp'],
            'user': p['revisions'][0]['user'],
            'pageid': pageid['pageid'],
            })
#        print(str(idx) + ':' + str(pageid))
#        pprint.pprint(content)
    res['pages'] = team_pages
#    res['school'] = teaminfo[team]['school']
#    res['description'] = teaminfo[team]['description']
#    res['title'] = teaminfo[team]['title']
#    res['track'] = teaminfo[team]['track']
#    res['abstract'] = teaminfo[team]['abstract']
    res.update(teaminfo[team])
    

    return res
Example #15
def getData(team):
    res = {}

    res.update(results[team])
    res["year"] = YEAR

    #    idx=0

    team_pages = []
    for pageid in pageids[team]:
        #        idx += 1
        #        if(idx > 79):
        #            break
        try:
            p = pages[str(pageid["pageid"])]
        except:
            continue
        raw_page = p["revisions"][0]["*"]
        try:
            content = remove_html_tags(wikimarkup.parse(raw_page))
        except Exception as e:  # parse error
            content = remove_html_tags(raw_page)  # give up parsing
            print ("give up")
        content = (
            "<{<{<wikititle>}>}>" + pageid["title"] + "<{<{</wikititle>}>}>" + content
        )  # add title to content header
        team_pages.append(
            {
                "content": content,
                "title": pageid["title"],
                "revid": p["revisions"][0]["revid"],
                "timestamp": p["revisions"][0]["timestamp"],
                "user": p["revisions"][0]["user"],
                "pageid": pageid["pageid"],
            }
        )
    #        print(str(idx) + ':' + str(pageid))
    #        pprint.pprint(content)
    res["pages"] = team_pages
    #    res['school'] = teaminfo[team]['school']
    #    res['description'] = teaminfo[team]['description']
    #    res['title'] = teaminfo[team]['title']
    #    res['track'] = teaminfo[team]['track']
    #    res['abstract'] = teaminfo[team]['abstract']
    res.update(teaminfo[team])

    return res
Example #16
    def save_illustrations_page(page):
        def insert_images(markup):
            def replace_image(match):
                image = match.group(1)
                image = mwpage.Page.normalize_title(image)
                image_jobs.append(gevent.spawn(download_image, image))
                return '<img src="{0}" alt="{0}"/>'.format(image)

            images = re.compile(r'Image\:(.+?)(?:\n|\|.+)', re.IGNORECASE)
            markup = re.sub(images, replace_image, markup)
            return markup

        markup = wikimarkup.parse(page.edit().encode('utf-8'), showToc=False)
        markup = clean_page(markup)
        markup = re.sub(r'&lt;gallery&gt;', '', markup)
        markup = re.sub(r'&lt;/gallery&gt;', '', markup)
        markup = insert_images(markup)
        return page, markup
Example #17
def parse_wiki(self):
    for index in self._query_hash:
        query = self._query_hash[index]
        content = self.retrieve(query)
        if content:
            doc = json.load(content)
            ## get the content of the markup
            ## thanks to http://goo.gl/wDPha
            text = doc['query']['pages'].itervalues().next()['revisions'][0]['*']
            global g_current_query
            g_current_query = query
            html = parse(text)
            print 'Query processed: %s' % query
            ## wait for 1 second to avoid being banned by the Wikipedia server
            time.sleep(1)
        else:
            print 'Skipping query %s' % query
Example #18
def parse_json(json_file):
  try:
    with open(json_file) as f:
      print 'Loading %s' % (json_file)
      dump_json = json.load(f)

  except IOError as e:
    print 'Failed to open file: %s' % json_file

  global g_cur_idx
  global g_dist_hash
  global g_timestamp
  global g_rel_ent_dist_db

  sorted_index = dump_json.keys()
  sorted_index.sort(key=lambda x: int(x))

  for index in sorted_index:
    g_cur_idx = index
    g_dist_hash[g_cur_idx] = {}
    g_rev_hash[g_cur_idx] = {}

    query = dump_json[index]['query']
    g_rel_ent_dist_db.hset(RedisDB.query_ent_hash, index, query)
    #print 'Query: %s %s' % (index, query)
    #continue

    for rev_id in dump_json[index]['revisions']:
      revid = dump_json[index]['revisions'][rev_id]['revid']
      g_timestamp = dump_json[index]['revisions'][rev_id]['timestamp']
      text = dump_json[index]['revisions'][rev_id]['text']
      #print '%s %s %s' % (query, revid, timestamp)

      g_rev_hash[g_cur_idx][g_timestamp] = revid

      try:
        html = parse(text)
      # catch all other exceptions in the parse process,
      # print out the traceback, and move on without interruption
      except:
        print "Exception in parse_json()"
        print '-' * 60
        traceback.print_exc(file=sys.stdout)
        print '-' * 60
        pass

    g_rel_ent_dist_db.rpush(RedisDB.query_ent_list, index)

    sorted_ts = g_dist_hash[index].keys()
    # sort the timestamps in chronological order
    sorted_ts.sort(key=lambda x: datetime.datetime.strptime(x,
      '%Y-%m-%dT%H:%M:%SZ'))

    # for each month, save the number of related entities
    # as well as the related entities
    last_num = 0
    last_revid = 0
    last_ts = ''
    last_dt_str = datetime.datetime.strptime(sorted_ts[0],
        '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')

    for ts in sorted_ts:
      revid = g_rev_hash[index][ts]

      dt_str = datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')
      if dt_str != last_dt_str:
        hash_key = 'query-%s' % index

        # save the number of related entities for last month
        g_rel_ent_dist_db.hset(hash_key, last_dt_str, last_num)
        print '%s %s %s %s %d' % (index, query, last_dt_str, last_revid, last_num)

        # save the related entities for last month
        ent_list = []
        for ent in g_dist_hash[index][last_ts]:
          ent_list.append(ent)

        ent_str = '='.join(ent_list)
        hash_key = 'query-rel-ent-%s' % index
        g_rel_ent_dist_db.hset(hash_key, last_dt_str, ent_str)

      last_num = len(g_dist_hash[index][ts].keys())
      last_revid = revid
      last_dt_str = dt_str
      last_ts = ts
Example #19
def markdown(s):
    wikimarkup.registerInternalLinkHook(None, link_hook)
    return wikimarkup.parse(s, showToc=True)
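The link_hook registered in Example #19 is not shown. A hedged sketch of one possible implementation, assuming wikimarkup's internal-link hook signature of (parser_env, namespace, body) and a local /wiki/ URL scheme:
def link_hook(parser_env, namespace, body):
    # body is the text between [[ and ]]; an optional |label may follow the target
    target, _, label = body.partition('|')
    target = target.strip()
    label = label.strip() or target
    return '<a href="/wiki/%s">%s</a>' % (target.replace(' ', '_'), label)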
Example #20
def render(content, *args, **kwargs):
    return wikimarkup.parse(content, *args, **kwargs)
Example #21
def recurse_viz(parent, posts, replaced, article, is_collapsed):
    children = []
    hid_children = []
    replace_children = []
    
    pids = [post.disqus_id for post in posts]
    
    if replaced:
        num_subtree_children = 0
    else:
        num_subtree_children = len(pids)
    
    reps = Comment.objects.filter(reply_to_disqus__in=pids, article=article).select_related()
    for post in posts:
        if post.json_flatten == '':
        #if True:
            if post.author:
                if post.author.anonymous:
                    author = "Anonymous"
                else:
                    author = post.author.username
            else:
                author = ""
                
            v1 = {'size': post.points,
                  'd_id': post.id,
                  'parent': parent.id if parent else None,
                  'author': author,
                  'replace_node': post.is_replacement,
                  'collapsed': is_collapsed,
                  'tags': [(tag.text, tag.color) for tag in post.tags.all()]
                  }

            if 'https://en.wikipedia.org/wiki/' in article.url:
                v1['name'] = parse(post.text)
                v1['wikitext'] = post.text
                
                if post.summary.strip() == '':
                    v1['summary'] = ''
                else:
                    v1['summary'] = parse(post.summary)
                
                v1['sumwiki'] = post.summary
                    
                if post.extra_summary.strip() == '':
                    v1['extra_summary'] = ''
                else:
                    v1['extra_summary'] = parse(post.extra_summary)
                
                v1['extrasumwiki'] = post.extra_summary
                
            else:
                v1['name'] = post.text
                v1['summary'] = post.summary
                v1['extra_summary'] = post.extra_summary
                
            
            
            c1 = reps.filter(reply_to_disqus=post.disqus_id).order_by('-points')
            if c1.count() == 0:
                vals = []
                hid = []
                rep = []
                num_subchildren = 0
            else:
                replace_future = replaced or post.is_replacement
                vals, hid, rep, num_subchildren = recurse_viz(post, c1, replace_future, article, is_collapsed or post.is_replacement)
            v1['children'] = vals
            v1['hid'] = hid
            v1['replace'] = rep
            post.json_flatten = json.dumps(v1)
            if not post.is_replacement:
                post.num_subchildren = num_subchildren
            else:
                post.num_subchildren = 0
            post.save()
            if not post.is_replacement:
                num_subtree_children += num_subchildren
        else:
            v1 = json.loads(post.json_flatten)
        
        if post.hidden:
            hid_children.append(v1)
        elif parent and parent.is_replacement:
            replace_children.append(v1)
        else:
            children.append(v1)
            
    return children, hid_children, replace_children, num_subtree_children
Example #22
def parse(self, input_type, output_format, text):
    return parse(text)
Example #23
def parse_json(json_file):
  try:
    with open(json_file) as f:
      print 'Loading %s' % (json_file)
      dump_json = json.load(f)

  except IOError as e:
    print 'Failed to open file: %s' % json_file

  global g_cur_idx
  global g_dist_hash
  global g_timestamp
  global g_rel_ent_dist_db

  sorted_index = dump_json.keys()
  sorted_index.sort(key=lambda x: int(x))

  for index in sorted_index:
    g_cur_idx = index
    g_dist_hash[g_cur_idx] = {}
    g_rev_hash[g_cur_idx] = {}

    query = dump_json[index]['query']
    g_rel_ent_dist_db.rpush(RedisDB.query_ent_list, index)
    g_rel_ent_dist_db.hset(RedisDB.query_ent_hash, index, query)
    print 'Query: %s %s' % (index, query)
    #continue

    for rev_id in dump_json[index]['revisions']:
      revid = dump_json[index]['revisions'][rev_id]['revid']
      g_timestamp = dump_json[index]['revisions'][rev_id]['timestamp']
      text = dump_json[index]['revisions'][rev_id]['text']
      #print '%s %s %s' % (query, revid, timestamp)

      g_rev_hash[g_cur_idx][g_timestamp] = revid

      try:
        html = parse(text)
      # catch all other exceptions in the parse process,
      # print out the traceback, and move on without interruption
      except:
        print "Exception in parse_json()"
        print '-' * 60
        traceback.print_exc(file=sys.stdout)
        print '-' * 60
        pass

    sorted_ts = g_dist_hash[index].keys()
    # sort the timestamps in chronological order
    sorted_ts.sort(key=lambda x: datetime.datetime.strptime(x,
      '%Y-%m-%dT%H:%M:%SZ'))

    # for each month, save the number of related entities
    # as well as the related entities
    last_revid = 0
    last_ts = ''
    last_dt_str = datetime.datetime.strptime(sorted_ts[0],
        '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')

    rel_ent_hash = {}
    irrel_ent_hash = {}
    for ts in sorted_ts:
      revid = g_rev_hash[index][ts]

      dt_str = datetime.datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m')
      if dt_str != last_dt_str:
        # save the related entities for last month
        ent_list = []
        for ent in g_dist_hash[index][last_ts]:
          # check the cache first
          if ent in rel_ent_hash:
            ent_list.append(ent)
            continue

          if ent in irrel_ent_hash:
            continue

          # update the cache accordingly
          if in_rel_doc(query, ent):
            ent_list.append(ent)
            rel_ent_hash[ent] = 1
          else:
            irrel_ent_hash[ent] = 1

        # save the number of related entities for last month
        hash_key = 'query-rel-ent-num-%s' % index
        ent_num = len(ent_list)
        g_rel_ent_dist_db.hset(hash_key, last_dt_str, ent_num)
        print '%s %s %s %s %d' % (index, query, last_dt_str, last_revid, ent_num)

      last_revid = revid
      last_dt_str = dt_str
      last_ts = ts
Example #24
def summarize_comment(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)
        id = request.POST['id']
        summary = request.POST['comment']
        
        top_summary, bottom_summary = get_summary(summary)
        
        req_user = request.user if request.user.is_authenticated() else None
        
        c = Comment.objects.get(id=id)
        from_summary = c.summary + '\n----------\n' + c.extra_summary
        c.summary = top_summary
        c.extra_summary = bottom_summary
        c.save()
        
        if from_summary != '':
            action = 'edit_sum'
            explanation = 'edit summary'
        else:
            action = 'sum_comment'
            explanation = 'initial summary'
            

        h = History.objects.create(user=req_user, 
                                   article=a,
                                   action=action,
                                   from_str=from_summary,
                                   to_str=summary,
                                   explanation=explanation)
        
        h.comments.add(c)
        recurse_up_post(c)
        
        
        if 'wikipedia.org' in a.url:
            res = {}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            
            res['top_summary_wiki'] = top_summary
            
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            
            res['bottom_summary_wiki'] = bottom_summary
                
            return JsonResponse(res)
        else:
            return JsonResponse({'top_summary': top_summary,
                                 'bottom_summary': bottom_summary})
        
        
    except Exception, e:
        print e
        return HttpResponseBadRequest()
Example #25
def summarize_selected(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)
        ids = request.POST.getlist('ids[]')
        children_ids = request.POST.getlist('children[]')
        children_ids = [int(x) for x in children_ids]
        child_id = request.POST['child']
        
        delete_nodes = request.POST.getlist('delete_nodes[]')
        
        summary = request.POST['comment']
        
        top_summary, bottom_summary = get_summary(summary)
        
        req_user = request.user if request.user.is_authenticated() else None
        
        comments = Comment.objects.filter(id__in=ids)
        children = [c for c in comments if c.id in children_ids]
        child = Comment.objects.get(id=child_id)

        new_id = random_with_N_digits(10);
            
        new_comment = Comment.objects.create(article=a, 
                                             is_replacement=True, 
                                             reply_to_disqus=child.reply_to_disqus,
                                             summary=top_summary,
                                             extra_summary=bottom_summary,
                                             disqus_id=new_id,
                                             points=child.points,
                                             text_len=len(summary))

        h = History.objects.create(user=req_user, 
                                   article=a,
                                   action='sum_selected',
                                   to_str=summary,
                                   explanation='initial summary of group of comments') 
       
        for c in children:
            c.reply_to_disqus = new_id
            c.save()
            
        for c in comments:
            h.comments.add(c)
            
                
        for node in delete_nodes:
            delete_node(node)
        
        
        recurse_up_post(new_comment)
        
        recurse_down_num_subtree(new_comment)
        
        make_vector(new_comment, a)
        
        
        if 'wikipedia.org' in a.url:
            res = {'d_id': new_comment.id}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            
            res['top_summary_wiki'] = top_summary
            
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            
            res['bottom_summary_wiki'] = bottom_summary
                
            return JsonResponse(res)
        else:
            return JsonResponse({'d_id': new_comment.id,
                                 'top_summary': top_summary,
                                 'bottom_summary': bottom_summary})
        
    except Exception, e:
        print e
        return HttpResponseBadRequest()  
Example #26
def summarize_comments(request):
    try:
        article_id = request.POST['article']
        a = Article.objects.get(id=article_id)
        id = request.POST['id']
        summary = request.POST['comment']
        
        top_summary, bottom_summary = get_summary(summary)

        delete_nodes = request.POST.getlist('delete_nodes[]')
        
        req_user = request.user if request.user.is_authenticated() else None
        
        c = Comment.objects.get(id=id)
        
        if not c.is_replacement:
            new_id = random_with_N_digits(10);
            
            new_comment = Comment.objects.create(article=a, 
                                                 is_replacement=True, 
                                                 reply_to_disqus=c.reply_to_disqus,
                                                 summary=top_summary,
                                                 extra_summary=bottom_summary,
                                                 disqus_id=new_id,
                                                 points=c.points,
                                                 text_len=len(summary))
        
            c.reply_to_disqus = new_id
            c.save()

            h = History.objects.create(user=req_user, 
                                       article=a,
                                       action='sum_nodes',
                                       to_str=summary,
                                       explanation='initial summary of subtree')
            
            d_id = new_comment.id
            
            recurse_down_num_subtree(new_comment)
            
        else:
            from_summary = c.summary + '\n----------\n' + c.extra_summary
            c.summary = top_summary
            c.extra_summary=bottom_summary
            c.save()
            
            h = History.objects.create(user=req_user, 
                           article=a,
                           action='edit_sum_nodes',
                           from_str=from_summary,
                           to_str=summary,
                           explanation='edit summary of subtree')
            
            d_id = c.id
            
            new_comment = c
        
        
        
        for node in delete_nodes:
            delete_node(node)
        
        h.comments.add(c)
        recurse_up_post(c)
        
        
        make_vector(new_comment, a)
        if 'wikipedia.org' in a.url:
            res = {'d_id': d_id}
            if top_summary.strip() != '':
                res['top_summary'] = parse(top_summary)
            else:
                res['top_summary'] = ''
            
            res['top_summary_wiki'] = top_summary
            
            if bottom_summary.strip() != '':
                res['bottom_summary'] = parse(bottom_summary)
            else:
                res['bottom_summary'] = ''
            
            res['bottom_summary_wiki'] = bottom_summary
                
            return JsonResponse(res)
        else:
            return JsonResponse({'d_id': d_id,
                                 'top_summary': top_summary,
                                 'bottom_summary': bottom_summary})
        
    except Exception, e:
        print e
        
        import traceback
        print traceback.format_exc()
        
        return HttpResponseBadRequest()  
Example #28
def galleryTagHook(parser_env, body, attributes={}):
    widths = attributes.get('widths')
    if widths:
        widths = re.sub('px', '', widths)
        gal_width = int(widths)
    else:
        gal_width = 155

    heights = attributes.get('heights')
    if heights:
        heights = re.sub('px', '', heights)
        def_image = int(heights)
    else:
        def_image = 120

    start_text = ''
    if attributes.get('mode', None) == 'packed':
        start_text = '<ul class="gallery mw-gallery-packed">'
        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()

                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')

                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlheight': 131
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbheight']
                except:
                    continue
                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    float(int(width)) + 1.496, float(int(width)) + 1.496)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:0px auto;">' % (
                    float(int(width)) + 0.496)

                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'
                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()
                text += parse(inner_text)
                text += '</p></div></li>'
                start_text += text
    elif attributes.get('mode', None) == 'nolines':
        start_text = '<ul class="gallery mw-gallery-nolines">'

        if not attributes.get('widths'):
            gal_width = 125

        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()

                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlwidth': gal_width - 5
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbheight']
                except:
                    continue

                ratio = float(float(width) / float(height))

                if height > def_image:
                    height = def_image
                    width = ratio * def_image

                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    gal_width, gal_width)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:0px auto;">' % (
                    gal_width - 5)

                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'
                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()
                text += parse(inner_text)
                text += '</p></div></li>'
                start_text += text
    else:
        start_text = '<ul class="gallery mw-gallery-traditional">'
        files = body.split('\n')
        for file in files:
            if file.strip() != '':
                res = file.split('|')
                filename = res[0].strip()

                site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
                params = {
                    'action': 'query',
                    'titles': filename,
                    'prop': 'imageinfo',
                    'iiprop': 'url|thumbmime',
                    'iiurlwidth': gal_width - 35
                }
                request = api.APIRequest(site, params)
                result = request.query()
                try:
                    url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumburl']
                    desc_url = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['descriptionurl']
                    width = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbwidth']
                    height = result['query']['pages'].values(
                    )[0]['imageinfo'][0]['thumbheight']
                except:
                    continue

                ratio = float(float(width) / float(height))

                if height > def_image:
                    height = def_image
                    width = ratio * def_image

                text = '<li class="gallerybox" style="width: %spx"><div style="width: %spx">' % (
                    gal_width, gal_width)
                text += '<div class="thumb" style="width: %spx;"><div style="margin:%spx auto;">' % (
                    gal_width - 5, float(gal_width - height) / 2.0)

                text += '<a href="%s" class="image"><img src="%s" width="%s" height="%s"></a>' % (
                    desc_url, url, width, height)
                text += '</div></div></div><div class="gallerytext"><p>'
                if res[1] == 'thumb':
                    inner_text = '|'.join(res[2:]).strip()
                else:
                    inner_text = '|'.join(res[1:]).strip()
                text += parse(inner_text)
                text += '</p></div></li>'
                start_text += text
    start_text += '</ul>'
    return start_text
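A hedged sketch of how the gallery hook above might be wired up, assuming a module-level registerTagHook that mirrors the registerInternalLinkHook call in Example #19 (registration APIs differ between wikimarkup versions, so treat this as an assumption):
# assumption: wikimarkup exposes registerTagHook alongside registerInternalLinkHook
wikimarkup.registerTagHook('gallery', galleryTagHook)
html = wikimarkup.parse(u'<gallery mode="packed">\nFile:Example.jpg|thumb|A caption\n</gallery>')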
Example #29
        longitude_number = float(lng)
        new_location = pw.LocationObject(description=pw.NO_DATA_UNICODE,
                                         latitude=latitude_number,
                                         longitude=longitude_number)
        new_plant = pw.PowerPlant(idnr,
                                  name,
                                  plant_country=COUNTRY_NAME,
                                  plant_location=new_location,
                                  plant_fuel=set([u'Coal']),
                                  plant_source=SOURCE_NAME,
                                  plant_source_url=SOURCE_URL,
                                  plant_cap_year=2017)
        plants_dictionary[idnr] = new_plant

# use wikimarkup to detect the main table and transform to html code
html = str(PyQuery(wikimarkup.parse(wiki)))

# read html code into pandas dataframe
frames = pd.read_html(html, header=0)
df = frames[0]

# modify the Unit column so it contains only the plant name, not the unit
for i, unit in enumerate(frames[0]['Unit']):
    df.set_value(i, 'Unit', unit.strip('[]').split('|')[0])

for plant in plants_dictionary.values():
    plant.capacity = df.groupby(['Unit']).sum().loc[plant.name].values[0]
    plant.nat_lang = df.loc[df['Unit'] == plant.name, 'Chinese Name'].values[0]
    # print list(set(df.loc[df['Unit'] == plant.name, 'Sponsor'].values))
    owner = list(set(df.loc[df['Unit'] == plant.name, 'Sponsor'].values))
    if len(owner) == 1:
Example #30
    def save_page(page):
        markup = wikimarkup.parse(page.edit().encode('utf-8'), showToc=False)
        markup = clean_page(markup)
        markup = put_images(markup)

        return page, markup
Example #31
                    p1_score += score
            scores[p1_name] = p1_score
            total_score[p1_name][0] += p1_score
            total_score[p1_name].append(p1_score)
            # total_score[p1_name].append("%s <span style='color: gray;'>(%s)</span>" % (p1_score,))

        place = 1
        for name, score in sorted(scores.items(), key=lambda x: -x[1]):
            color = colors[by_name[name]]
            wiki_table += u'# <span style="color: %s">%s</span> → <b>%s</b>\n' \
                          % (color, name, score)
            total_score_places[name].append(place)
            place += 1

        print wiki_table
        html = parse(wiki_table)
        save_file(stats_folder + '/' + u'word_%s.html' % word, html)
        save_file(u'stats/latest/word_%s.html' % word, html)

        content = ''
        for name, score in sorted(scores.items(), key=lambda x: x[0]):
            content += u'%s\t%s\n' % (name, score)
        save_file(stats_folder + u'/score_%s.html' % word, content)
        save_file(u'stats/latest/score_%s.html' % word, content)

    total = u'{| style="text-align: center" \n'
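    # Russian column headers below: Место = Place, Участник = Participant, Всего = Total, Матчи = Matches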
    total += u'! rowspan=2 | Место || rowspan=2 | Участник || rowspan=2 | Всего ' \
             u'|| colspan=%s | Матчи \n' % len(words)
    total += u'|-\n'
    total += u'! %s\n' % u' || '.join(["[/balda/%s/ %s]" % (word, word) for word in words])
    total += u'|-\n'
Example #32
# coding: utf8
import json
from wikimarkup import parse
fh = open("ciccio.txt", "r")
content = fh.read()
fh.close()
obj = json.loads(content)  # the standard-library json module exposes loads(), not read()
obj = obj['query']['pages']['25458']['revisions'][0]['*']
obj = obj[0:12000]
#print(obj)
print parse(obj)