def test_equality(self):
    u1 = user.User(self.site, "GoodUsername", check=True)
    u2 = user.User(self.site, "GoodUsername", check=False)
    self.assertEqual(u1, u2)
    site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    u3 = user.User(site2, "GoodUsername")
    self.assertNotEqual(u1, u3)

def test_equality(self):
    p1 = page.Page(self.site, "Page", check=True)
    p2 = page.Page(self.site, "Page", check=False)
    self.assertEqual(p1, p2)
    site2 = wiki.Wiki("https://en.wikipedia.org/w/api.php")
    p3 = page.Page(site2, "Page")
    self.assertNotEqual(p1, p3)
    p4 = page.Page(self.site, "Talk:Page")
    self.assertNotEqual(p1, p4)

def setupProject(project, abbrv):
    site = wiki.Wiki()
    site.login(settings.bot, settings.botpass)
    site.setMaxlag(-1)
    date = datetime.datetime.utcnow() + datetime.timedelta(days=5)
    table = date.strftime('pop_%b%y')
    db = MySQLdb.connect(host="sql-s1-user", read_default_file="/home/alexz/.my.cnf")
    cursor = db.cursor()
    projecttitles = set()
    project = project.replace(' ', '_')
    types = ['FA', 'FL', 'A', 'GA', 'B', 'C', 'start', 'stub', 'list', 'image',
             'portal', 'category', 'book', 'disambig', 'template', 'unassessed',
             'blank', 'non-article']
    insertquery = 'INSERT INTO u_alexz.' + table + ' (title, project_assess) VALUES( %s, %s )'
    updatequery = 'UPDATE u_alexz.' + table + ' SET project_assess=CONCAT(project_assess,",",%s) WHERE title=%s'
    selectquery = """SELECT page_namespace-1, page_title, SUBSTRING_INDEX(clB.cl_to, '-', 1)
        FROM enwiki_p.page
        JOIN enwiki_p.categorylinks AS clA ON page_id=clA.cl_from
        LEFT JOIN enwiki_p.categorylinks AS clB ON page_id=clB.cl_from
            AND clB.cl_to LIKE "%%-importance_""" + project + """_articles"
        WHERE clA.cl_to=%s AND page_is_redirect=0"""
    for type in types:
        if type == "unassessed":
            cat = "Category:Unassessed " + project + " articles"
        elif type == "non-article":
            cat = "Category:Non-article " + project + " pages"
        elif type == "blank":
            cat = "Category:" + project + " pages"
        else:
            cat = "Category:" + type + "-Class " + project + " articles"
        catpage = page.Page(site, cat)
        if not catpage.exists:
            continue
        catpage.setNamespace(0)
        catname = catpage.title.replace(' ', '_').encode('utf-8')
        print catname
        cursor.execute(selectquery, (catname,))
        pagesincat = cursor.fetchall()
        for title in pagesincat:
            # page_namespace-1 maps talk pages to their subject namespace;
            # an odd result means the page was not a talk page, so skip it
            if not title[0] % 2 == 0:
                continue
            realtitle = title[1].decode('utf8').encode('utf8')
            if title[0] != 0:
                p = page.Page(site, realtitle, check=False, namespace=title[0])
                realtitle = p.title.encode('utf8').replace(' ', '_')
            if realtitle in projecttitles:
                continue
            if title[2] is None:
                project_assess = "'%s':('%s',None)" % (abbrv, type)
            else:
                project_assess = "'%s':('%s','%s')" % (abbrv, type, title[2])
            projecttitles.add(realtitle)
            # titlelist is presumably a module-level set shared across project runs
            if realtitle in titlelist:
                bits = (project_assess, realtitle)
                cursor.execute(updatequery, bits)
            else:
                titlelist.add(realtitle)
                bits = (realtitle, project_assess)
                cursor.execute(insertquery, bits)
    del projecttitles
    db.close()

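# Illustrative note on the table above (an assumption inferred from the queries,
# not from a documented schema): each row's project_assess column accumulates
# one fragment of the form
#   'abbrv':('class','importance')
# per WikiProject, comma-joined by the UPDATE's CONCAT. After two hypothetical
# projects tag the same title, it might hold:
#   'mil':('B','High'),'hist':('start',None)
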
def test_constructor_section_priority(self):
    p1 = page.Page(self.site, "Page#Section 2", sectionnumber=0, section="Section 1")
    self.assertIs(p1.section, 0)
    p1 = page.Page(self.site, "Page#Section 2", section="Section 1")
    self.assertIs(p1.section, 1)
    p1 = page.Page(self.site, "Page#Section 2")
    self.assertIs(p1.section, 2)

def test_getWikiText(self):
    p1 = page.Page(self.site, "Page")
    p2 = page.Page(self.site, "Page#Section 1")
    api.logging = True
    p1.getWikiText()
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertNotIn("rvsection", log)
    p2.getWikiText()
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertIn("rvsection", log)
    self.assertNotEqual(p1.lastedittime, "")

def get_text(self, name):
    try:
        try:
            normalized_name = (name[0].upper() + name[1:]).replace(' ', '_')
            # very special cases
            if name[0] != 'i' and name[0:2] != 'rs' and name[0:2] != 'gs':
                splits = name.split(' ')
                splits[0] = splits[0].upper()
                normalized_name = '_'.join(splits)
            print 'Using normalized name %s' % normalized_name
            pagehandle = page.Page(self.site, normalized_name, False, False)
            try:
                return pagehandle.getWikiText(
                    getrequest=self.config.use_get_requests)
            except TypeError:
                # For unpatched wikitools versions lacking the getrequest keyword
                return pagehandle.getWikiText()
        except NoPage:
            print 'ERROR : page %s is not found!' % normalized_name
            return None
    except Exception as error:
        print 'ERROR : wikitools exception while getting page %s!' % normalized_name
        print error.message
        return None

def getcontent(title):
    site = wiki.Wiki("http://wiki.chinahpo.org/api.php?")
    pagehandle = page.Page(site, title)  # title is the name of each SNP
    snp_page = pagehandle.getWikiText()  # wiki page parse
    #print snp_page.encode('u8')
    title = title.replace("/", "&")
    with open('./CHPO/%s' % title, 'w+') as f:  # write into file
        f.write(snp_page)

def snp_info(self, snp):
    """get all the data for a snp"""
    pagehandle = page.Page(self.site, snp)
    wikitext = pagehandle.getWikiText()
    return self.snp_info_from_wikitext(snp, wikitext)

def test_protect(self):
    self.site.login("GoodUsername", "goodpassword")
    p1 = page.Page(self.site, "Talk:Page")
    api.logging = True
    r = {"edit": "autoconfirmed", "move": "sysop"}
    e = {"edit": "1 week"}
    res = p1.protect(restrictions=r, expirations=e, reason="test_protect")
    log1 = api.querylog.pop()
    self.assertEqual(log1["action"], "query")
    self.assertEqual(log1["meta"], "tokens")
    log2 = api.querylog.pop()
    ebits = log2["expiry"].split("|")
    pbits = log2["protections"].split("|")
    self.assertIn("edit=autoconfirmed", pbits)
    self.assertIn("move=sysop", pbits)
    i = pbits.index("edit=autoconfirmed")
    self.assertEqual(ebits[i], "1 week")
    i = pbits.index("move=sysop")
    self.assertEqual(ebits[i], "indefinite")
    self.assertNotIn("cascade", log2)
    res = p1.protect(restrictions={"edit": "all", "move": "all"},
                     reason="test_protect")
    self.assertEqual(len(res["protect"]["protections"]), 2)
    for prot in res["protect"]["protections"]:
        if "edit" in prot:
            self.assertEqual(prot["edit"], "")
        else:
            self.assertEqual(prot["move"], "")

def test_getHistory_no_revs(self):
    p1 = page.Page(self.site, "Page")
    api.logging = True
    hist = p1.getHistory(content=False, user="******")
    self.assertIs(len(api.querylog), 1)
    self.assertIs(hist[0], None)
    self.assertIs(len(hist), 1)

def tryedit(self, pagetitle, oldtext, newtext, summary, watchlist='watch', minor=False):
    #print('in tryedit', pagetitle)
    #print('self.wiki', self.wiki)
    #print('self.username', self.username)
    if not self.wiki.isLoggedIn(self.username):
        print('not logged in. trying to login...')
        print("self.username, self.password", self.username, self.password)
        print('self.wiki', self.wiki)
        ok = self.wiki.login(self.username, self.password)
        if not ok:
            print('not logged in')
            return False, CANTLOGIN
    print('in tryedit 2', pagetitle)
    thepage = page.Page(self.wiki, pagetitle)
    wikitext = thepage.getWikiText()
    ts = thepage.lastedittime
    print('ts', ts)
    #minor = basic.fixes.minor
    #print('minor', minor)
    if oldtext == wikitext:
        editlemma = thepage.edit(text=newtext, summary=summary, basetimestamp=ts,
                                 watchlist=watchlist, minor=minor)
        #print('editlemma', editlemma)
        return True, ''
    print('oldtext >< wikitext')
    return False, PAGECHANGEDFROMOLD

def test_getHistory(self):
    p1 = page.Page(self.site, "Page")
    api.logging = True
    p1.getHistory(content=False)
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertNotIn("content", log["rvprop"])

def genotype_getter(my_filtered_snps):
    site = wiki.Wiki("http://snpedia.com/api.php")
    genotypes = {}
    for single_snp in my_filtered_snps:
        type_counter = 1
        wikipage = page.Page(site, single_snp.name)
        snp_page = wikipage.getWikiText()
        while snp_page.find("geno" + str(type_counter)) != -1:
            # grab the parenthesized genotype after the "geno<N>" parameter
            type_start = snp_page.find("geno" + str(type_counter))
            type_start = snp_page.find("(", type_start)
            type_stop = snp_page.find(")", type_start)
            genotype = str(snp_page[type_start:type_stop + 1])
            if single_snp.name in genotypes:
                genotypes[single_snp.name].append(genotype)
            else:
                genotypes[single_snp.name] = [genotype]
            type_counter += 1
        print "Got genotypes for " + str(single_snp.name)
    genotype_outfile = open("genotypes.data", "wb")
    pickle.dump(genotypes, genotype_outfile)
    genotype_outfile.close()
    return genotypes

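# For reference, the SNPedia wikitext that genotype_getter() scans looks roughly
# like this (illustrative excerpt, not fetched from the live site):
#   {{Rsnum
#   |rsid=7412
#   |geno1=(C;C)
#   |geno2=(C;T)
#   |geno3=(T;T)
#   ...}}
# so finding "geno<N>" and then the next "(...)" span yields genotype strings
# such as ["(C;C)", "(C;T)", "(T;T)"].
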
def search_snpedia(snp):
    """ http://snpedia.com/index.php/Bulk """
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    pagehandle = page.Page(site, snp)
    snp_page = pagehandle.getWikiText()
    return snp_page

def test_toggleTalk(self):
    p1 = page.Page(self.site, "Page")
    p2 = p1.toggleTalk()
    self.assertIs(p1.namespace, 0)
    self.assertGreater(p1.pageid, 0)
    self.assertEqual(p2.title, "Talk:Page")
    self.assertEqual(p2.unprefixedtitle, "Page")
    self.assertEqual(p2.urltitle, "Talk%3APage")

def test_getLogsGen(self):
    p1 = page.Page(self.site, "File:Test1.jpg")
    api.logging = True
    for log in p1.getLogsGen(logtype="upload", limit=5):
        pass
    self.assertGreater(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertNotIn("leuser", log)

def test_getHistoryGen(self):
    p1 = page.Page(self.site, "Page")
    api.logging = True
    for rev in p1.getHistoryGen():
        pass
    self.assertGreater(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertIn("content", log["rvprop"])

def test_constructor_separate_ns(self):
    p1 = page.Page(self.site, "page", namespace=1)
    self.assertEqual(p1.title, "Talk:Page")
    self.assertEqual(p1.unprefixedtitle, "Page")
    self.assertEqual(p1.urltitle, "Talk%3APage")
    self.assertTrue(p1.exists)
    self.assertIs(p1.namespace, 1)
    self.assertGreater(p1.pageid, 0)

def get_wikitext(self, snp):
    """get the wikitext for a snp"""
    pagehandle = page.Page(self.site, snp)
    try:
        wikitext = pagehandle.getWikiText()
        return {'snp': snp, 'wikitext': wikitext}
    except page.NoPage:
        return {'snp': snp, 'wikitext': ""}

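# Minimal usage sketch for get_wikitext (hypothetical client object; the
# enclosing class is not shown here):
#   result = client.get_wikitext("Rs7412")   # "Rs7412" is an illustrative rsid
#   if result['wikitext'] == "":
#       pass  # the SNP has no SNPedia page
# Returning an empty wikitext instead of letting page.NoPage propagate keeps a
# bulk crawl loop simple.
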
def test_getLogs(self):
    p1 = page.Page(self.site, "File:Test1.jpg")
    api.logging = True
    log = p1.getLogs(logtype="upload", limit=10)
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertEqual(log["letitle"], "File:Test1.jpg")
    self.assertEqual(log["letype"], "upload")
    self.assertNotIn("leuser", log)

def test_constructor_check_false(self):
    api.logging = True
    p1 = page.Page(self.site, "talk:page", check=False)
    self.assertIs(p1.exists, None)
    self.assertIs(p1.pageid, 0)
    self.assertIs(p1.namespace, 1)
    self.assertEqual(p1.unprefixedtitle, "Page")
    self.assertTrue(p1.followRedir)
    self.assertIs(len(api.querylog), 0)

def test_constructor(self):
    api.logging = True
    p1 = page.Page(self.site, "talk:page")
    self.assertEqual(p1.title, "Talk:Page")
    self.assertEqual(p1.unprefixedtitle, "Page")
    self.assertEqual(p1.urltitle, "Talk%3APage")
    self.assertTrue(p1.exists)
    self.assertIs(p1.namespace, 1)
    self.assertGreater(p1.pageid, 0)
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertIn("redirects", log)

def test_delete(self):
    self.site.login("GoodUsername", "goodpassword")
    p1 = page.Page(self.site, "Page to delete")
    p1.edit(text="text")
    api.logging = True
    res = p1.delete(reason="test_delete")
    log1 = api.querylog.pop()
    self.assertEqual(log1["action"], "query")
    self.assertEqual(log1["meta"], "tokens")
    log2 = api.querylog.pop()
    self.assertIn("reason", log2)
    self.assertIn("watchlist", log2)
    self.assertIn("delete", res)

def test_setNamespace(self):
    p1 = page.Page(self.site, "Page")
    p1_id = p1.pageid
    api.logging = True
    p1.setNamespace(1)
    self.assertNotEqual(p1_id, p1.pageid)
    self.assertGreater(p1.pageid, 0)
    self.assertEqual(p1.title, "Talk:Page")
    self.assertEqual(p1.unprefixedtitle, "Page")
    self.assertEqual(p1.urltitle, "Talk%3APage")
    self.assertIs(len(api.querylog), 1)
    log = api.querylog.pop()
    self.assertIn("redirects", log)

def test_move(self):
    self.site.login("GoodUsername", "goodpassword")
    p1 = page.Page(self.site, "Anotherpage")
    api.logging = True
    res = p1.move(mvto="User:Anotherpage", reason="test_move")
    log1 = api.querylog.pop()
    self.assertEqual(log1["action"], "query")
    self.assertEqual(log1["meta"], "tokens")
    log2 = api.querylog.pop()
    self.assertIn("reason", log2)
    self.assertIn("watchlist", log2)
    self.assertEqual(p1.namespace, 2)
    self.assertEqual(p1.title, "User:Anotherpage")
    self.assertEqual(p1.unprefixedtitle, "Anotherpage")
    self.assertEqual(p1.urltitle, "User%3AAnotherpage")
    res = p1.move(mvto="Anotherpage", reason="test_move")
    self.assertIn("move", res)

def crawl(folder):
    n = 0
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    for result in query({'cmtitle': 'Category:Is_a_genotype'}):
        for item in result.values()[0]:
            snp = item['title']
            if not (snp.startswith('I') or snp.startswith('R')):
                continue
            pagehandle = page.Page(site, snp)
            snp_page = pagehandle.getWikiText()
            with open(folder + '/' + snp + '.txt', 'w') as f:
                f.write(snp_page)
            print n, snp
            time.sleep(0.5)
            n += 1

def crawl(folder, db_session):
    # n = 0
    # for file in os.listdir(folder):
    #     if not file.endswith('.txt'): continue
    #     with open(folder + '/' + file) as f:
    #         snp_name = file[:-4]
    #         wikicode = mwparserfromhell.parse(f.read())
    #         templates = wikicode.filter_templates(recursive=False)
    site = wiki.Wiki("http://bots.snpedia.com/api.php")
    snp_name = "Rs7412(C;C)"
    pagehandle = page.Page(site, snp_name)
    snp_page = pagehandle.getWikiText()
    wikicode = mwparserfromhell.parse(snp_page)
    templates = wikicode.filter_templates(recursive=True)
    # get SNP
    for t in templates:
        tname = t.name.strip()
        if tname == 'Genotype':
            rs_id = _parse_entry(t, 'rsid')
            allele1 = _parse_entry(t, 'allele1')
            allele2 = _parse_entry(t, 'allele2')
            magnitude = _parse_entry(t, 'magnitude', out_type=float)
            repute = _parse_entry(t, 'repute')
            summary = _normalize_str(_parse_entry(t, 'summary'))
            genotype = allele1 + allele2
            # get snp
            snp = db_session.query(SNP).filter(SNP.rs_id == rs_id).first()
            if not snp:
                snp = SNP(rs_id=rs_id)
            # create association
            db_session.add(
                Association(snp=snp, genotype=genotype, magnitude=magnitude,
                            repute=repute, description=summary,
                            source='snpedia'))
    db_session.commit()

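# A plausible sketch of the _parse_entry helper used in crawl() above; the real
# implementation is not shown in this file and may differ. It assumes only the
# standard mwparserfromhell Template API (has/get, Parameter.value).
def _parse_entry(template, key, out_type=str):
    """Read a named template parameter, returning None when absent or empty."""
    if not template.has(key):
        return None
    value = str(template.get(key).value).strip()
    if not value:
        return None
    try:
        return out_type(value)
    except ValueError:
        return None
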
def description_getter(genotypes):
    site = wiki.Wiki("http://snpedia.com/api.php")
    genotype_descriptions = {}
    for single_type in genotypes:
        for variant in genotypes[single_type]:
            genotype_name = str(single_type) + str(variant)
            print genotype_name
            wikipage = page.Page(site, genotype_name)
            if wikipage.exists:
                genotype_page = wikipage.getWikiText()
                if genotype_page.find("summary=") != -1:
                    summary_start = genotype_page.find("summary=") + 8
                    summary_stop = genotype_page.find("\n", summary_start)
                    print genotype_page[summary_start:summary_stop]
                    genotype_descriptions[genotype_name] = \
                        genotype_page[summary_start:summary_stop]
    description_outfile = open("description.data", "wb")
    pickle.dump(genotype_descriptions, description_outfile)
    description_outfile.close()
    return genotype_descriptions

def fill_article_table(num_art_want, multi):
    """
    Write content to file and save paths etc to the database.
    When multi is true we're building the dataset for the multi-class
    classification task. Binary task otherwise.
    """
    site = wiki.Wiki(u"https://en.wikipedia.org/w/api.php")
    MIN_PID = 10  # Page ids start at 10
    MAX_PID = 50401510  # Set this to whatever within range
    NS = 0  # Main article namespace. https://goo.gl/Sa3yBC
    # Get num_art_want number of random articles
    count = 0
    art_dict = {}
    while count < num_art_want:
        rand_pid = random.randint(MIN_PID, MAX_PID)
        if rand_pid not in art_dict:
            try:
                p = page.Page(site=site, namespace=NS, pageid=rand_pid)
                # Ensuring that it's not a page from a different namespace.
                # This shouldn't happen according to what the API
                # documentation says. IDK why it's happening.
                p.getCategories()
                # Overwriting the page object's list of templates with just
                # the infobox templates; seems like a bad thing to do but idk.
                p.templates = getInfoboxes(p.getTemplates())
                if isWanted(p, multi=multi):
                    count += 1
                    print "dl:", p.title
                    rand_pid = unicode(str(rand_pid), 'utf-8')
                    data = save_all_article_data(p, rand_pid)
                    art_dict[rand_pid] = data
            except wiki.WikiError:
                pass
    populate_tables(art_dict)

def run(self):
    cat = category.Category(self.wiki, self.categoryname)
    self.overviewpage = page.Page(self.wiki, u"VEIDs")
    self.veidlist = {}
    for article in cat.getAllMembersGen(namespaces=[0]):
        self.collect_page_detail(article)
    try:
        oldtext = self.overviewpage.getWikiText()
    except page.NoPage:
        oldtext = ""
    newtext = self.build_new_overviewpage_text()
    # only save if something was changed
    if newtext == oldtext:
        return
    self.overviewpage.edit(text=newtext, skipmd5=True, bot=True,
                           summary=u"Regenerated list.")