def __processArtists(self):
    """Handle the next pending entry of the artist queue.

    One entry is taken off ``self.__artistQueue`` with ``pop()``.  The entry
    is dropped when the artist is already registered (unless
    ``self.expansiveArtistGraph`` is set) or when its recursion level exceeds
    ``self.recursionLimit``.  Otherwise the artist is registered with the band
    manager and, if the entry carries a link, the linked page is parsed and
    every related act that is not yet a known band is pushed onto
    ``self.__bandQueue`` one recursion level deeper.
    """
    entry = self.__artistQueue.pop()
    name = entry["artist"]
    depth = entry["recursion"]

    # Only an expansive graph revisits artists that are already registered.
    if not self.expansiveArtistGraph and self.bm.artistExists(name):
        return

    indent = "\t" * depth
    if depth > self.recursionLimit:
        return

    if self.printDebug:
        label = str(name).strip()
        print (indent + "[a] - " + label + " <" + entry["parent"] + ">")

    self.bm.addArtist(Artist(name))

    # Without a link there is no page to follow.
    if not entry["link"].strip():
        return

    related = WikiParser(entry["link"]).getRelatedActs()
    for act in related:
        if self.bm.bandExists(act["band"]):
            continue
        child_depth = depth + 1
        if child_depth > self.recursionLimit:
            continue
        # Tag the act with its depth and origin, then schedule it.
        act["recursion"] = child_depth
        act["parent"] = "a." + name
        self.__bandQueue.appendleft(act)
def __processBands(self):
    """Handle the next pending entry of the band queue.

    One entry is taken off ``self.__bandQueue`` with ``pop()``.  Bands whose
    name is already in ``self.__bandProcessedList`` and entries without a
    link are skipped.  Otherwise the linked page is parsed, its members
    (and, when ``self.showFormerMembers`` is set, its former members) are
    pushed onto ``self.__artistQueue`` and linked to the band in the band
    manager, the band itself is registered, and related acts that are not
    yet known bands are pushed back onto ``self.__bandQueue``.

    A page that yields neither members nor former members is re-queued as
    an artist entry instead of being registered as a band.
    """
    tempBand = self.__bandQueue.pop() #get top element
    #skip bands that were already handled; otherwise remember this one
    if(str(tempBand["band"]) in self.__bandProcessedList):
        return
    else:
        self.__bandProcessedList.append(str(tempBand["band"]))
    #if has no follow link, break function
    if tempBand["link"].strip() == "":
        return
    parser = WikiParser(tempBand["link"])
    members = parser.getBandMembers()
    #presumably the past line-up of the band; verify against WikiParser
    formerMembers = parser.getBandMembers("former")
    #indentation prefix for the debug output, one tab per recursion level
    tabs = "\t" * tempBand["recursion"]
    #entries deeper than the limit are dropped (the band stays marked as processed)
    if tempBand["recursion"] > self.recursionLimit:
        return
    for member in members:
        #add member artists to artist stack
        #NOTE: unlike the former-member loop below, no recursion-limit check is made here
        member["recursion"] = tempBand["recursion"] + 1
        member["parent"] = "b." + tempBand["band"]
        self.__artistQueue.appendleft(member)
        #link band with artist
        self.bm.link( Artist(member["artist"]), Band(tempBand["band"]))
    if(self.showFormerMembers):
        for member in formerMembers:
            #add member artists to artist stack
            recursionValue = tempBand["recursion"] + 1
            #former members past the recursion limit are neither queued nor linked
            if recursionValue > self.recursionLimit:
                continue
            member["recursion"] = recursionValue
            member["parent"] = "b." + tempBand["band"]
            self.__artistQueue.appendleft(member)
            #link band with artist; the extra True is passed only for former members
            self.bm.link( Artist(member["artist"]), Band(tempBand["band"]),True)
    #if an artist accidentally ends up in the band queue, move it to the artist queue
    if not members and not formerMembers:
        #the artist entry keeps the band entry's recursion level and parent
        recursionValue = tempBand["recursion"]
        if recursionValue < self.recursionLimit:
            artistEntry = dict()
            artistEntry["artist"] = tempBand["band"]
            artistEntry["link"] = tempBand["link"]
            artistEntry["recursion"] = recursionValue
            artistEntry["parent"] = tempBand["parent"]
            self.__artistQueue.appendleft(artistEntry)
        #a page without any members is not registered as a band
        return
    if self.printDebug:
        print (tabs + "[b] - " + str(tempBand["band"]).strip() + " <" + tempBand["parent"] +">")
    self.bm.addBand(tempBand["band"])
    associatedActs = parser.getRelatedActs()
    #add all associated acts to end of queue
    for act in associatedActs :
        #only acts the band manager does not know yet are scheduled
        if not self.bm.bandExists(act["band"]):
            recursionValue = tempBand["recursion"] + 1
            if recursionValue > self.recursionLimit:
                continue
            act["recursion"] = recursionValue
            act["parent"] = "b." + tempBand["band"]
            self.__bandQueue.appendleft(act)
def addtoQueue(self, link, recursionValue=1):
    """Seed the crawl with the page behind *link*.

    The page is parsed once.  If it lists band members, a ``"band"`` entry is
    pushed onto the band queue; otherwise an ``"artist"`` entry is pushed onto
    the artist queue.  Either way the entry records the link, the given
    recursion level and ``"root"`` as its parent.
    """
    page = WikiParser(link)

    # A page that lists members is treated as a band, anything else as an artist.
    if page.getBandMembers():
        name_key, target = "band", self.__bandQueue
    else:
        name_key, target = "artist", self.__artistQueue

    target.appendleft({
        name_key: page.getName(),
        "link": link,
        "recursion": recursionValue,
        "parent": "root",
    })
def map_reader(fd, size, url, params):
    """Yield the rows produced by ``WikiParser.regex_reader`` for each <page>.

    The stream is first scanned for the first line containing ``<page>``,
    then rewound and advanced to the start of that line, so everything
    before the first page is skipped before the regex reader takes over.

    Args:
        fd: Seekable file-like object supporting ``readline()`` and ``seek()``.
        size: Passed through unchanged to ``WikiParser.regex_reader``.
        url: Passed through unchanged to ``WikiParser.regex_reader``.
        params: Unused here; kept so the signature stays as callers expect.

    Yields:
        Each row produced by ``WikiParser.regex_reader``.
    """
    from wikiparser import WikiParser

    # Count the lines that precede the first "<page>" line.  readline()
    # returns an empty string at end of input, so stop there too; without
    # that check an input with no "<page>" line would loop forever.
    preamble_lines = 0
    line = fd.readline()
    while line and "<page>" not in line:
        line = fd.readline()
        preamble_lines += 1

    # Rewind and skip the preamble so reading resumes at the "<page>" line
    # (or at end of input when no page was found).
    fd.seek(0)
    for _ in range(preamble_lines):
        fd.readline()

    # Raw string with the same value as the original pattern literal.
    page_pattern = r"\s\s<page>([\s\S]*?)</page>\n"
    for row in WikiParser.regex_reader(page_pattern, fd, size, url):
        yield row
@author: wanjia """ from libZotero import zotero from urllib import urlopen import datetime from handler import Handler import os from wikiparser import WikiParser import hashlib #https://api.zotero.org/users/3949286/items?page=3&key=yrQKEJNQsAKekW9GOgGVzCBG # handler = Handler() wikiparser = WikiParser() zlib = zotero.Library('user', '3949286', '<null>', 'yrQKEJNQsAKekW9GOgGVzCBG') print zlib # retrieve the first five top-level items. #items = zlib.fetchItemsTop({'limit': 6, 'content': 'json,bib,coins'}) #for item in items: # print 'Item Type: %s | Key: %s | Title: %s ' % (item.itemType,item.itemKey, item.title) term = "Blinded_experiment" url = "https://en.wikipedia.org/wiki/" + term req = urlopen(url) #.readlines() filename = os.getcwd() + "/urlcontent.html" file = open(filename, "w") file.write(req.read())
statement = {i:j for i, j in self._pairwise(a['m'])} if statement != None: try: toyield1 = (title, str(statement['value'])) value = str(statement['wikibase-entityid']['numeric-id']) if 'wikibase-entityid' in statement else statement['string'] toyield2 = (title, str(statement['value']) + "----" + value) except KeyError: toyield1 = toyield2 = None yield toyield1 yield toyield2 except KeyError: pass def _pairwise(self, iterable): from itertools import izip a = iter(iterable) return izip(a, a) if __name__ == '__main__': from wikiparser import WikiParser if(len(sys.argv) < 2): print "USAGE: python", sys.argv[0], "<ddfs tag:name> [<output file path>]" print "You may omit the output file; it's stdout by default.\n" sys.exit() job = WikiParser().run(input=[sys.argv[1]]) outf = sys.stdout if len(sys.argv) < 3 else open(sys.argv[2], "w") for a, b in result_iterator(job.wait(show=True)): outf.write(a.encode('utf-8') + "," + b.encode('utf-8') + "\n")