def test_equality(self):
    u1 = user.User(self.site, "GoodUsername", check=True)
    u2 = user.User(self.site, "GoodUsername", check=False)
    self.assertEqual(u1, u2)
    site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    u3 = user.User(site2, "GoodUsername")
    self.assertNotEqual(u1, u3)

def queries():
    if args.queryfile:
        for line in open(args.queryfile):
            yield line.strip()
    elif args.query:
        yield args.query
    elif args.category and not HAS_WIKITOOLS:
        sys.exit("-cat option given, but wikitools package is not present, "
                 "see <https://github.com/alexz-enwp/wikitools>")
    elif args.category and HAS_WIKITOOLS:
        site = wiki.Wiki("https://commons.wikimedia.org/w/api.php")
        params = {
            'action': 'query',
            'prop': 'imageinfo',
            'iiprop': 'url',
            'generator': 'categorymembers',
            'gcmtitle': 'Category:' + args.category,
            'gcmnamespace': '6',
            'gcmprop': 'title',
        }
        req = api.APIRequest(site, params)
        for data in req.queryGen():
            for key in data['query']['pages']:
                url = data['query']['pages'][key]['imageinfo'][0]['url']
                yield re.sub("https://upload.wikimedia.org", "", url)
    else:
        sys.exit("No query given")

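# Context sketch for queries(): the module-level names it relies on (args,
# HAS_WIKITOOLS, and the wikitools imports) are assumed to be set up roughly
# like this; the exact flag names other than -cat are inferred, not confirmed.
import argparse
import re
import sys

try:
    from wikitools import wiki, api
    HAS_WIKITOOLS = True
except ImportError:
    HAS_WIKITOOLS = False

parser = argparse.ArgumentParser()
parser.add_argument('--query', help='a single query string')
parser.add_argument('--queryfile', help='file with one query per line')
parser.add_argument('-cat', '--category', dest='category',
                    help='Commons category to enumerate (requires wikitools)')
args = parser.parse_args()
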
def fileHook(parser_env, namespace, body):
    (file_name, pipe, size) = body.partition('|')
    site = wiki.Wiki('https://en.wikipedia.org/w/api.php')
    params = {
        'action': 'query',
        'titles': 'File:' + file_name,
        'prop': 'imageinfo',
        'iiprop': 'url|thumbmime',
        'iiurlwidth': size,
    }
    request = api.APIRequest(site, params)
    result = request.query()
    try:
        imageinfo = result['query']['pages'].values()[0]['imageinfo'][0]
        url = imageinfo['thumburl']
        desc_url = imageinfo['descriptionurl']
        width = imageinfo['thumbwidth']
        height = imageinfo['thumbheight']
    except (KeyError, IndexError):
        # Fall back to the bare file name if the API returned no thumbnail data
        return file_name
    text = '<a href="%s" class="image">' % desc_url
    text += '<img alt="%s" src="%s" width="%s" height="%s"></a>' % (
        file_name, url, width, height)
    return text

def genotype_getter(my_filtered_snps):
    site = wiki.Wiki("http://snpedia.com/api.php")
    genotypes = {}
    for single_snp in my_filtered_snps:
        type_counter = 1
        wikipage = page.Page(site, single_snp.name)
        snp_page = wikipage.getWikiText()
        # Collect the parenthesised genotype after every "genoN" key on the page
        while snp_page.find("geno" + str(type_counter)) != -1:
            type_start = snp_page.find("geno" + str(type_counter))
            type_start = snp_page.find("(", type_start)
            type_stop = snp_page.find(")", type_start)
            genotype = str(snp_page[type_start:type_stop + 1])
            genotypes.setdefault(single_snp.name, []).append(genotype)
            type_counter += 1
        print "Got genotypes for " + str(single_snp.name)
    # Cache the results so later runs can skip the wiki round-trips
    genotype_outfile = open("genotypes.data", "wb")
    pickle.dump(genotypes, genotype_outfile)
    genotype_outfile.close()
    return genotypes

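# For reference, the loop above assumes SNPedia wikitext roughly of this
# shape (an illustrative fragment, not fetched from the live site), where
# each "genoN" key is followed by a parenthesised genotype:
#
#   {{Rsnum
#   | rsid  = 53576
#   | geno1 = (A;A)
#   | geno2 = (A;G)
#   | geno3 = (G;G)
#   }}
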
def loadData(self):
    from wikitools import wiki
    from wikitools import category
    wikiobj = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    wikicat = category.Category(wikiobj, title="2016_films")
    self.wikipages = wikicat.getAllMembers()

def extlinks_extraction(self, lang, title):
    links = []
    site = wiki.Wiki("https://" + lang + ".wikipedia.org/w/api.php")
    params = {
        'action': 'query',
        'titles': title,
        'prop': 'extlinks',
        'ellimit': 500,
    }
    req = api.APIRequest(site, params)
    # queryGen() follows API continuations, so every batch of links is seen
    for res in req.queryGen():
        for pidkey in res['query']['pages']:
            if 'extlinks' in res['query']['pages'][pidkey]:
                links.extend(res['query']['pages'][pidkey]['extlinks'])
    return links

def getBLPs():
    site = wiki.Wiki()
    site.login(settings.bot, settings.botpass)
    site.setMaxlag(-1)
    date = datetime.datetime.utcnow() + datetime.timedelta(days=5)
    table = date.strftime('pop_%b%y')
    db = MySQLdb.connect(host="sql-s1-user",
                         read_default_file="/home/alexz/.my.cnf")
    cursor = db.cursor()
    insertquery = 'INSERT INTO u_alexz.' + table + ' (title, project_assess) VALUES( %s, %s )'
    updatequery = 'UPDATE u_alexz.' + table + ' SET project_assess=CONCAT(project_assess,",",%s) WHERE title=%s'
    selectquery = """SELECT page_title
        FROM enwiki_p.page
        JOIN enwiki_p.categorylinks ON page_id=cl_from
        WHERE cl_to='Living_people' AND page_namespace=0 AND page_is_redirect=0"""
    cursor.execute(selectquery)
    pagesincat = cursor.fetchall()
    project_assess = "'wpblp':(None,None)"
    # titlelist is a module-level set shared with setupProject()
    for title in pagesincat:
        realtitle = title[0].decode('utf8').encode('utf8')
        if realtitle in titlelist:
            cursor.execute(updatequery, (project_assess, realtitle))
        else:
            titlelist.add(realtitle)
            cursor.execute(insertquery, (realtitle, project_assess))
    db.close()

def getcontent(title):
    site = wiki.Wiki("http://wiki.chinahpo.org/api.php?")
    pagehandle = page.Page(site, title)  # title is the name of each SNP
    snp_page = pagehandle.getWikiText()  # fetch the raw wikitext of the page
    title = title.replace("/", "&")  # '/' would be treated as a path separator
    with open('./CHPO/%s' % title, 'w+') as f:
        f.write(snp_page)  # write into file

def wikipedia_query(query_params):
    """ An extremely basic wrapper for the wikitools api. """
    site = wiki.Wiki()  # This defaults to en.wikipedia.org
    request = api.APIRequest(site, query_params)
    result = request.query()
    return result[query_params['action']]

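# A minimal usage sketch for wikipedia_query; the parameters are illustrative,
# any valid MediaWiki 'query' request works the same way:
result = wikipedia_query({
    'action': 'query',
    'titles': 'Python (programming language)',
    'prop': 'info',
})
print(result['pages'])
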
def wiki_login(w_url, w_user, w_pwd):
    print("Login ...")
    w_site = wiki.Wiki(w_url)
    w_site.login(w_user, w_pwd)
    print("... done.")
    return w_site

def get_site(user='******', api_site='http://wiki.travellerrpg.com/api.php',
             password=False):
    site = wiki.Wiki(api_site)
    access = site.login(user, password=password, remember=True)
    if not access:
        logger.error('Unable to log in')
    return site

def search_snpedia(snp):
    """ http://snpedia.com/index.php/Bulk """
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    pagehandle = page.Page(site, snp)
    snp_page = pagehandle.getWikiText()
    return snp_page

def __init__(self, project, dumpspath):
    self.basic, self.fixes, self.paths = loadPathsAndLibs(project, dumpspath)
    self.project = project
    self.wiki = wiki.Wiki(self.paths['siteurl'])
    if self.paths:
        self.test()

def __init__(self):
    for site_lang_code in self.lang_codes:
        self.sites[site_lang_code] = wiki.Wiki(
            'https://' + site_lang_code + '.wikipedia.org/w/api.php')
    self.deserialize_progress_tracker()

def get_article(url, source_id, rfc_DB):
    cmd = 'select id, disqus_id, section_index, title from website_article where url = %s'
    article_result = rfc_DB.fetch_one(cmd, (urllib2.unquote(url), ))
    if article_result is not None:
        article_id, disqus_id, section_index, title = article_result
        return article_id, disqus_id, section_index, title
    elif 'wikipedia.org/wiki/' in url:
        # Split the URL into namespace prefix, page name, and optional anchor
        url_parts = url.split('/wiki/')
        wiki_sub = url_parts[1].split(':')
        wiki_parts = ':'.join(wiki_sub[1:]).split('#')
        wiki_page = wiki_parts[0]
        section = None
        if len(wiki_parts) > 1:
            section = wiki_parts[1]
        from wikitools import wiki, api
        site = wiki.Wiki(_DOMAIN + '/w/api.php')
        page = urllib2.unquote(
            str(wiki_sub[0]) + ':' + wiki_page.encode('ascii', 'ignore'))
        params = {
            'action': 'parse',
            'prop': 'sections',
            'page': page,
            'redirects': 'yes',
        }
        try:
            request = api.APIRequest(site, params)
            result = request.query()
            disqus_id = str(result['parse']['pageid'])
            section_title = None
            section_index = None
            if section:
                # If the anchor matches a section, qualify the id and title with it
                for s in result['parse']['sections']:
                    if s['anchor'] == section:
                        disqus_id = str(disqus_id) + '#' + str(s['index'])
                        section_title = s['line']
                        section_index = s['index']
            title = result['parse']['title']
            if section_title is not None:
                title = title + ' - ' + section_title
            link = urllib2.unquote(url)
            article_insert_command = """insert into website_article
                (disqus_id, title, url, source_id, section_index)
                values (%s, %s, %s, %s, %s)"""
            article_id = rfc_DB.insert(
                article_insert_command,
                (disqus_id, title, link, source_id, section_index))
            return article_id, disqus_id, section_index, title
        except api.APIError as e:
            print e

def __init__(self, language='en'):
    self.language = language
    self.site_url = "https://{:s}.wikipedia.org/w/api.php".format(language)
    self.site = wiki.Wiki(self.site_url)

def import_wiki_authors(authors, rfc_DB):
    found_authors = set()
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.add(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)
    from wikitools import wiki, api
    site = wiki.Wiki(_DOMAIN + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json',
    }
    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        comment_author_id = None
        try:
            author_id = user['userid']
            # First check if the author already exists, keyed on the username
            command = "select id from website_commentauthor where username = %s"
            (comment_author_id, ) = rfc_DB.fetch_one(command, (user['name'], ))
            # If no author exists with the same username, insert a full record
            if comment_author_id is None:
                author_insert_command = """insert into website_commentauthor
                    (username, disqus_id, joined_at, edit_count, gender, groups, is_wikipedia)
                    values (%s, %s, %s, %s, %s, %s, %s)"""
                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                insert_params = (user['name'], author_id, joined_at,
                                 user['editcount'], user['gender'],
                                 ','.join(user['groups']), 1)
                comment_author_id = rfc_DB.insert(author_insert_command,
                                                  insert_params)
        except Exception:
            # Deleted or IP users lack 'userid'; store a minimal record instead
            command = """insert into website_commentauthor (username, is_wikipedia)
                values (%s, %s)"""
            comment_author_id = rfc_DB.insert(command, (user['name'], 1))
        if comment_author_id is not None:
            comment_authors.append(comment_author_id)
    if anonymous_exist:
        anonymous_id = rfc_DB.get_anonymous_id()
        comment_authors.append(anonymous_id)
    return comment_authors

def test_equality(self):
    p1 = page.Page(self.site, "Page", check=True)
    p2 = page.Page(self.site, "Page", check=False)
    self.assertEqual(p1, p2)
    site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    p3 = page.Page(site2, "Page")
    self.assertNotEqual(p1, p3)
    p4 = page.Page(self.site, "Talk:Page")
    self.assertNotEqual(p1, p4)

def setupProject(project, abbrv):
    site = wiki.Wiki()
    site.login(settings.bot, settings.botpass)
    site.setMaxlag(-1)
    date = datetime.datetime.utcnow() + datetime.timedelta(days=5)
    table = date.strftime('pop_%b%y')
    db = MySQLdb.connect(host="sql-s1-user",
                         read_default_file="/home/alexz/.my.cnf")
    cursor = db.cursor()
    projecttitles = set()
    project = project.replace(' ', '_')
    types = ['FA', 'FL', 'A', 'GA', 'B', 'C', 'start', 'stub', 'list', 'image',
             'portal', 'category', 'book', 'disambig', 'template', 'unassessed',
             'blank', 'non-article']
    insertquery = 'INSERT INTO u_alexz.' + table + ' (title, project_assess) VALUES( %s, %s )'
    updatequery = 'UPDATE u_alexz.' + table + ' SET project_assess=CONCAT(project_assess,",",%s) WHERE title=%s'
    selectquery = """SELECT page_namespace-1, page_title, SUBSTRING_INDEX(clB.cl_to, '-', 1)
        FROM enwiki_p.page
        JOIN enwiki_p.categorylinks AS clA ON page_id=clA.cl_from
        LEFT JOIN enwiki_p.categorylinks AS clB ON page_id=clB.cl_from
            AND clB.cl_to LIKE "%%-importance_""" + project + """_articles"
        WHERE clA.cl_to=%s AND page_is_redirect=0"""
    for type in types:
        if type == "unassessed":
            cat = "Category:Unassessed " + project + " articles"
        elif type == "non-article":
            cat = "Category:Non-article " + project + " pages"
        elif type == "blank":
            cat = "Category:" + project + " pages"
        else:
            cat = "Category:" + type + "-Class " + project + " articles"
        catpage = page.Page(site, cat)
        if not catpage.exists:
            continue
        catpage.setNamespace(0)
        catname = catpage.title.replace(' ', '_').encode('utf-8')
        print catname
        cursor.execute(selectquery, (catname,))
        pagesincat = cursor.fetchall()
        for title in pagesincat:
            # Assessment categories sit on talk pages (odd page_namespace),
            # so page_namespace-1 should be the even subject namespace
            if not title[0] % 2 == 0:
                continue
            realtitle = title[1].decode('utf8').encode('utf8')
            if title[0] != 0:
                p = page.Page(site, realtitle, check=False, namespace=title[0])
                realtitle = p.title.encode('utf8').replace(' ', '_')
            if realtitle in projecttitles:
                continue
            if title[2] is None:
                project_assess = "'%s':('%s',None)" % (abbrv, type)
            else:
                project_assess = "'%s':('%s','%s')" % (abbrv, type, title[2])
            projecttitles.add(realtitle)
            if realtitle in titlelist:
                cursor.execute(updatequery, (project_assess, realtitle))
            else:
                titlelist.add(realtitle)
                cursor.execute(insertquery, (realtitle, project_assess))
    del projecttitles
    db.close()

def get_snpedia_snp_names():
    site = wiki.Wiki('http://bots.snpedia.com/api.php')
    snps = category.Category(site, 'Is_a_snp')
    snpedia = set()
    for article in snps.getAllMembersGen(namespaces=[0]):
        snpedia.add(article.title.lower())
    return snpedia

def __init__(self, project, dumpspath, username='', password=''):
    self.basic, self.fixes, self.paths = loadPathsAndLibs(project, dumpspath)
    self.project = project
    self.wiki = wiki.Wiki(self.paths['siteurl'])
    self.username = username
    self.password = password
    if self.paths:
        self.test()
    print('UploadFixes initialised')

def get_drugs(fname):
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    drugs = category.Category(site, "Is_a_medicine")
    n = 0
    with open(fname, 'w') as f:
        for article in drugs.getAllMembersGen(namespaces=[0]):
            drug = _normalize_str(article.title.strip())
            f.write(drug + '\n')
            n += 1
    print 'drugs extracted:', n

def getWiki(configuration):
    """Create the wiki object from the configuration."""
    _wiki = wiki.Wiki(configuration.wiki_apiurl)
    _wiki.cookiepath = configuration.cookiejar
    if not _wiki.login(configuration.wiki_username,
                       configuration.wiki_password,
                       domain=configuration.wiki_domain,
                       remember=True):
        raise WikiLoginError("Login failed early")
    if not _wiki.isLoggedIn():
        raise WikiLoginError("Login failed")
    return _wiki

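# A minimal sketch of the configuration object getWiki() expects; the class
# name and every value below are hypothetical, only the attribute names are
# taken from the function above.
class Configuration(object):
    wiki_apiurl = 'https://en.wikipedia.org/w/api.php'
    wiki_username = 'ExampleBot'
    wiki_password = 'example-password'
    wiki_domain = None  # only needed for domain-based (e.g. LDAP) logins
    cookiejar = '/tmp/wikibot.cookies'

# site = getWiki(Configuration())
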
def crawl(url_param):
    # Fix eventual full URL
    url_param = unquote_plus(basename(url_param))
    # Generate query
    params = {
        'action': 'query',
        'prop': 'imageinfo|revisions',
        'iiprop': 'url|sha1|size',
        'rvprop': 'content',
        'rawcontinue': '',
    }
    url_type = get_url_type(url_param)
    if url_type == 'category':
        params['generator'] = 'categorymembers'
        params['gcmtitle'] = url_param
        params['gcmlimit'] = 'max'
    elif url_type == 'file':
        params['titles'] = url_param
    else:
        params['generator'] = 'images'
        params['titles'] = url_param
        params['gimlimit'] = 'max'
    # Call API
    site = wiki.Wiki(API_URL)
    request = api.APIRequest(site, params)
    print_verbose("Site: %s" % str(site), 2)
    print_verbose("Query: ", 2)
    pprint_verbose(params, 2)
    result = request.query(querycontinue=True)
    print_verbose("Result: ", 4)
    pprint_verbose(result, 4)
    # Check result
    if 'error' in result:
        raise Error(result['error'])
    if 'warnings' in result:
        sys.stderr.write(str(result['warnings']))
        return None
    if '-1' in result['query']['pages']:
        sys.stderr.write(str(result['query']['pages']['-1']))
        return None
    return result['query']['pages']

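# crawl() depends on get_url_type() and API_URL, which are defined elsewhere.
# A plausible sketch of the helper, assuming it only inspects the title's
# namespace prefix (the real implementation may differ):
def get_url_type(title):
    lowered = title.lower()
    if lowered.startswith('category:'):
        return 'category'
    if lowered.startswith('file:') or lowered.startswith('image:'):
        return 'file'
    return 'page'
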
def import_wiki_authors(authors, article):
    found_authors = []
    anonymous_exist = False
    for author in authors:
        if author:
            found_authors.append(author)
        else:
            anonymous_exist = True
    authors_list = '|'.join(found_authors)
    from wikitools import wiki, api
    domain = article.url.split('/wiki/')[0]
    site = wiki.Wiki(domain + '/w/api.php')
    params = {
        'action': 'query',
        'list': 'users',
        'ususers': authors_list,
        'usprop': 'blockinfo|groups|editcount|registration|emailable|gender',
        'format': 'json',
    }
    request = api.APIRequest(site, params)
    result = request.query()
    comment_authors = []
    for user in result['query']['users']:
        try:
            author_id = user['userid']
            comment_author = CommentAuthor.objects.filter(disqus_id=author_id)
            if comment_author.count() > 0:
                comment_author = comment_author[0]
            else:
                joined_at = datetime.datetime.strptime(user['registration'],
                                                       '%Y-%m-%dT%H:%M:%SZ')
                comment_author = CommentAuthor.objects.create(
                    username=user['name'],
                    disqus_id=author_id,
                    joined_at=joined_at,
                    edit_count=user['editcount'],
                    gender=user['gender'],
                    groups=','.join(user['groups']),
                    is_wikipedia=True)
        except Exception:
            # Deleted or IP users lack 'userid'; store a minimal record instead
            comment_author = CommentAuthor.objects.create(
                username=user['name'], is_wikipedia=True)
        comment_authors.append(comment_author)
    if anonymous_exist:
        comment_authors.append(
            CommentAuthor.objects.get(disqus_id='anonymous', is_wikipedia=True))
    return comment_authors

def toolbar_icon_clicked(self, widget, movie):
    import pprint  # Used for formatting the output for viewing, not necessary for most code
    from wikitools import wiki, api
    site = wiki.Wiki("http://de.wikipedia.org/w/api.php")
    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': 'rocky',
        'srprop': '',
        'srlimit': '50',
    }
    req = api.APIRequest(site, params)
    res = req.query(querycontinue=False)
    pprint.pprint(res)

def test_parseJSON_maxlag(self):
    site = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    params = {"action": "query"}
    req = api.APIRequest(site, params)
    req.changeParam("maxlag", "-1")
    warnings.filterwarnings("error", category=UserWarning, module="wikitools.api")
    with self.assertRaises(UserWarning):
        req.query(False)
    warnings.filterwarnings("default", category=UserWarning, module="wikitools.api")

def snpedia_getter():
    site = wiki.Wiki("http://snpedia.com/api.php")  # open snpedia
    snps = category.Category(site, "Is_a_snp")
    snpedia = {}
    for article in snps.getAllMembersGen(namespaces=[0]):  # get all snp-names
        snpedia[article.title.lower()] = "in snpedia"
        print article.title
    snpedia_outfile = open("snpedia.data", "wb")  # save all snps to cache
    pickle.dump(snpedia, snpedia_outfile)
    snpedia_outfile.close()
    return snpedia

def listFromCategory(project, dumpspath, categorytitle, namespaces=None,
                     username=None, password=None):
    mdlfixes, paths = loadPathsAndLibs(project, dumpspath)
    site = wiki.Wiki(paths['siteurl'], username, password)
    c = category.Category(site, categorytitle)
    titles = c.getAllMembers(namespaces)
    with open(paths['list'], 'wt', encoding='utf_8') as ftitles:
        ftitles.write('\n'.join(titles))
    print('titles written')

def __init__(self, config, config_section, wiki_name, callback):
    self.config = config
    self.config_section = config_section
    self.wiki_name = wiki_name
    self.wiki = _wiki.Wiki('https://%s/w/api.php' % self.wiki_name)
    self.username = config.get(self.config_section, 'wiki_user')
    self.password = config.get(self.config_section, 'wiki_password')
    self.callback = callback
    if self.callback.__self__.log:
        # We could use .getChild(), but then %(name)s would be 'bot.wiki';
        # we want 'bot:wiki' instead
        self.log = logging.getLogger('%s:%s' % (self.callback.__self__.nickname,
                                                self.wiki_name))
    self.load_wiki_configuration()
    self.loop = LoopingCall(self.fetch_log)