def wikixml2graph(src, dst, distrust=False, threshold=0, downloadlists=True, verbose=False):
    """Convert a Wikipedia XML dump into a network cached in *dst*.

    Per the original documentation, this is the only function of the
    module meant to be called directly; the others are its helpers.

    The following information is read from the file name of *src*:

    1. the language of the network (the text before the first ``wiki``),
       which must be a key of ``i18n``;
    2. the snapshot date (the ``wiki-YYYYMMDD-`` part);
    3. the kind of dump (``current`` or ``history``), which selects the
       SAX content handler.

    The dump is then decompressed if needed, parsed, and the resulting
    network is saved in *dst* together with the weights found at the 90th
    and 95th percentile of the weight-sorted edge list. When the handler
    reports a distrust graph, it is saved in ``graphDistrust.c2`` in the
    same directory as *dst*.

    Parameters:
        src: path of the dump. Files ending in ``.gz``, ``.bz2`` or
            ``.7z`` are decompressed; ``.bz2`` falls back to the external
            ``bunzip2`` program when ``BZ2`` is false.
        dst: output path; must end with ``.c2``. Its directory is created.
        distrust: forwarded to the handler as ``forcedistrust``.
        threshold: forwarded to the handler (documented originally as the
            minimum edge weight: lighter edges are deleted). It is added
            to the cache key only when greater than 1.
        downloadlists: when true, the lists returned by ``get_list_users``
            (bots, blocked users) and the number of users are saved too.
        verbose: forwarded to the handler; forced to False when the input
            is read through GzipFile, BZ2File or SevenzipFile.

    Raises:
        IOError: ``i18n`` is empty.
        AssertionError: *dst* does not end with ``.c2``, the language is
            unsupported, the date is invalid, or ``save`` reports failure.
        ValueError: the file name has no ``wiki`` marker or no date.
        Exception: the parsed network has no nodes or no edges.
        SystemExit: a required decompressor is not available or failed.
    """
    # NOTE(review): a second definition of wikixml2graph appears later in
    # this file and replaces this one when the module is imported.

    def save_or_fail(key, value, path):
        # save() must run even under "python -O", so it cannot live inside
        # an assert statement; AssertionError is kept for compatibility.
        if not save(key, value, path):
            raise AssertionError("cannot save %r in %s" % (key, path))

    if not i18n:
        raise IOError(
            os.path.join(os.environ['HOME'], 'shared_datasets',
                         'WikiNetwork', 'languageparameters.c2')
            + " does not exists! you have to sync your current directory (with sync_trustlet)")

    if not dst.endswith('.c2'):
        raise AssertionError("dst must end with '.c2': %r" % (dst,))

    basename = os.path.basename(src)

    if 'current' in basename:
        handler_class = WikiCurrentContentHandler
    elif 'history' in basename:
        handler_class = WikiHistoryContentHandler
    else:
        # NOTE(review): Error is not defined in this function; presumably
        # it is a module-level exception class -- confirm.
        raise Error(
            "I cannot understand the type of network (current or history?)")

    size = os.stat(src).st_size

    lang = basename[:basename.index('wiki')]
    if lang not in i18n:
        raise AssertionError(
            "The lang " + lang + " is not supported! \n(you can add it using the function addWikiLanguage in this package)")

    match = re.search(r'wiki-(\d{4})(\d{2})(\d{2})-', basename)
    if match is None:
        raise ValueError(
            "cannot find the snapshot date (wiki-YYYYMMDD-) in " + basename)
    date = '-'.join(match.groups())
    if not isdate(date):
        raise AssertionError("invalid snapshot date: " + date)

    opened = None      # decompressing stream opened here, closed at the end
    temp_path = None   # file produced by bunzip2, removed at the end

    # Support compressed file
    if isinstance(src, str):
        if src.endswith('.gz'):
            verbose = False
            src = opened = GzipFile(src)
        elif src.endswith('.bz2'):
            if BZ2:
                src = opened = BZ2File(src)
                verbose = False
            else:
                # Argument list instead of a shell string: paths with
                # spaces or shell metacharacters are passed through intact.
                import subprocess
                try:
                    failed = subprocess.call(
                        ['bunzip2', '-q', '-k', '-f', '--', src])
                except OSError:
                    # bunzip2 could not be started at all
                    failed = True
                if failed:
                    print('an error has occourred! possible reason:')
                    print('1. install bz2')
                    print('2. no space left on device (in order to decompress your bzip)')
                    print('NB: consider install python-bz2')
                    raise SystemExit(1)
                src = src[:-4]  # drop the four characters of ".bz2"
                temp_path = src
        elif src.endswith('.7z'):
            verbose = False
            if SevenzipFile:
                src = opened = SevenzipFile(src)
            else:
                print('Install p7zip')
                raise SystemExit(1)

    try:
        mkpath(os.path.dirname(dst))

        ch = handler_class(lang,
                           xmlsize=size,
                           inputfilename=basename,
                           forcedistrust=distrust,
                           threshold=threshold,
                           verbose=verbose)
        sax.parse(src, ch)
    finally:
        # Release the stream and the temporary file even if parsing fails.
        if opened is not None:
            close = getattr(opened, 'close', None)
            if close is not None:
                close()
        if temp_path is not None:
            os.remove(temp_path)

    pynet = del_ips(ch.getPyNetwork())

    if not pynet[0] or not pynet[1]:
        raise Exception(
            "Conversion failed! \nno edges or no nodes in this network, you might check the line in the i18n corresponding to the "
            + i18n[lang] + " language")

    cachedict = {'network': 'Wiki', 'lang': lang, 'date': date}
    if threshold > 1:
        cachedict['threshold'] = threshold

    # x^th percentile: the edge list is sorted in place by its third field
    # (the weight), so the network saved below carries the sorted list.
    edges = pynet[1]
    edges.sort(key=lambda edge: edge[2])
    perc90 = edges[len(edges) * 9 // 10][2]
    perc95 = edges[len(edges) * 95 // 100][2]

    save_or_fail(cachedict, pynet, dst)
    for percent, weight in ((90, perc90), (95, perc95)):
        cachedict['%'] = percent
        save_or_fail(cachedict, weight, dst)
    del cachedict['%']

    if getattr(ch, 'distrust', False):
        net = ch.getDistrustGraph()
        isolated = set(net.nodes())
        distrust_edges = net.edges()
        # a node that appears in an edge does not need to be listed again
        for edge in distrust_edges:
            isolated.discard(edge[0])
            isolated.discard(edge[1])

        save_or_fail({'network': 'DistrustWiki', 'lang': lang, 'date': date},
                     (list(isolated), distrust_edges),
                     os.path.join(os.path.dirname(dst), 'graphDistrust.c2'))

    if not downloadlists:
        return

    users, bots, blockedusers = get_list_users(
        lang,
        os.path.join(os.environ['HOME'], 'shared_datasets', 'WikiNetwork'))

    save_or_fail({'lang': lang, 'list': 'bots'}, bots, dst)
    save_or_fail({'lang': lang, 'list': 'blockedusers'}, blockedusers, dst)
    save_or_fail({'lang': lang, 'info': 'number of users'}, len(users), dst)
def wikixml2graph(src,dst,distrust=False,threshold=0,downloadlists=True,verbose=False):
    """Convert a Wikipedia XML dump into a network cached in *dst*.

    Per the original documentation, this is the only function of the
    module meant to be called directly; the others are its helpers.

    The following information is read from the file name of *src*:

    1. the language of the network (the text before the first ``wiki``),
       which must be a key of ``i18n``;
    2. the snapshot date (the ``wiki-YYYYMMDD-`` part);
    3. the kind of dump (``current`` or ``history``), which selects the
       SAX content handler.

    The dump is then decompressed if needed, parsed, and the resulting
    network is saved in *dst* together with the weights found at the 90th
    and 95th percentile of the weight-sorted edge list. When the handler
    reports a distrust graph, it is saved in ``graphDistrust.c2`` in the
    same directory as *dst*.

    Parameters:
        src: path of the dump. Files ending in ``.gz``, ``.bz2`` or
            ``.7z`` are decompressed; ``.bz2`` falls back to the external
            ``bunzip2`` program when ``BZ2`` is false.
        dst: output path; must end with ``.c2``. Its directory is created.
        distrust: forwarded to the handler as ``forcedistrust``.
        threshold: forwarded to the handler (documented originally as the
            minimum edge weight: lighter edges are deleted). It is added
            to the cache key only when greater than 1.
        downloadlists: when true, the lists returned by ``get_list_users``
            (bots, blocked users) and the number of users are saved too.
        verbose: forwarded to the handler; forced to False when the input
            is read through GzipFile, BZ2File or SevenzipFile.

    Raises:
        IOError: ``i18n`` is empty.
        AssertionError: *dst* does not end with ``.c2``, the language is
            unsupported, the date is invalid, or ``save`` reports failure.
        ValueError: the file name has no ``wiki`` marker or no date.
        Exception: the parsed network has no nodes or no edges.
        SystemExit: a required decompressor is not available or failed.
    """
    # NOTE(review): an earlier definition with the same name precedes this
    # one in the file; this later one is the one bound at import time.

    def _store(key, value, path):
        # Keep save() outside of assert so it still runs under "python -O";
        # AssertionError is raised explicitly to stay backward-compatible.
        if not save(key, value, path):
            raise AssertionError("cannot save %r in %s" % (key, path))

    if not i18n:
        missing = os.path.join(os.environ['HOME'], 'shared_datasets',
                               'WikiNetwork', 'languageparameters.c2')
        raise IOError(
            missing
            + " does not exists! you have to sync your current directory (with sync_trustlet)")

    if not dst.endswith('.c2'):
        raise AssertionError("dst must end with '.c2': %r" % (dst,))

    dump_name = os.path.basename(src)

    if 'current' in dump_name:
        handler_class = WikiCurrentContentHandler
    elif 'history' in dump_name:
        handler_class = WikiHistoryContentHandler
    else:
        # NOTE(review): Error is not defined in this function; presumably
        # it is a module-level exception class -- confirm.
        raise Error("I cannot understand the type of network (current or history?)")

    size = os.stat(src).st_size

    lang = dump_name[:dump_name.index('wiki')]
    if lang not in i18n:
        raise AssertionError(
            "The lang "+lang+" is not supported! \n(you can add it using the function addWikiLanguage in this package)")

    found = re.search(r'wiki-(\d{4})(\d{2})(\d{2})-', dump_name)
    if found is None:
        raise ValueError(
            "cannot find the snapshot date (wiki-YYYYMMDD-) in " + dump_name)
    date = '-'.join(found.groups())
    if not isdate(date):
        raise AssertionError("invalid snapshot date: " + date)

    stream = None       # decompressing stream opened here, closed at the end
    extracted = None    # file produced by bunzip2, removed at the end

    # Support compressed file
    if isinstance(src, str):
        if src.endswith('.gz'):
            verbose = False
            src = stream = GzipFile(src)
        elif src.endswith('.bz2'):
            if BZ2:
                src = stream = BZ2File(src)
                verbose = False
            else:
                # An argument list (no shell) keeps paths containing spaces
                # or shell metacharacters intact.
                import subprocess
                try:
                    status = subprocess.call(
                        ['bunzip2', '-q', '-k', '-f', '--', src])
                except OSError:
                    # bunzip2 could not be started at all
                    status = 1
                if status:
                    print('an error has occourred! possible reason:')
                    print('1. install bz2')
                    print('2. no space left on device (in order to decompress your bzip)')
                    print('NB: consider install python-bz2')
                    raise SystemExit(1)
                src = src[:-4]  # drop the four characters of ".bz2"
                extracted = src
        elif src.endswith('.7z'):
            verbose = False
            if not SevenzipFile:
                print('Install p7zip')
                raise SystemExit(1)
            src = stream = SevenzipFile(src)

    try:
        mkpath(os.path.dirname(dst))

        ch = handler_class(lang,
                           xmlsize=size,
                           inputfilename=dump_name,
                           forcedistrust=distrust,
                           threshold=threshold,
                           verbose=verbose)
        sax.parse(src, ch)
    finally:
        # Release the stream and the temporary file even if parsing fails.
        if stream is not None:
            closer = getattr(stream, 'close', None)
            if closer is not None:
                closer()
        if extracted is not None:
            os.remove(extracted)

    pynet = del_ips(ch.getPyNetwork())

    if not pynet[0] or not pynet[1]:
        raise Exception(
            "Conversion failed! \nno edges or no nodes in this network, you might check the line in the i18n corresponding to the "
            + i18n[lang] + " language")

    cachedict = {'network': 'Wiki', 'lang': lang, 'date': date}
    if threshold > 1:
        cachedict['threshold'] = threshold

    # x^th percentile: the edge list is sorted in place by its third field
    # (the weight), so the network saved below carries the sorted list.
    edges = pynet[1]
    edges.sort(key=lambda edge: edge[2])
    perc90 = edges[len(edges) * 9 // 10][2]
    perc95 = edges[len(edges) * 95 // 100][2]

    _store(cachedict, pynet, dst)
    for percent, weight in ((90, perc90), (95, perc95)):
        cachedict['%'] = percent
        _store(cachedict, weight, dst)
    del cachedict['%']

    if getattr(ch, 'distrust', False):
        net = ch.getDistrustGraph()
        lonely = set(net.nodes())
        distrust_edges = net.edges()
        # a node that appears in an edge does not need to be listed again
        for edge in distrust_edges:
            lonely.discard(edge[0])
            lonely.discard(edge[1])

        _store({'network': 'DistrustWiki', 'lang': lang, 'date': date},
               (list(lonely), distrust_edges),
               os.path.join(os.path.dirname(dst), 'graphDistrust.c2'))

    if not downloadlists:
        return

    users, bots, blockedusers = get_list_users(
        lang,
        os.path.join(os.environ['HOME'], 'shared_datasets', 'WikiNetwork'))

    _store({'lang': lang, 'list': 'bots'}, bots, dst)
    _store({'lang': lang, 'list': 'blockedusers'}, blockedusers, dst)
    _store({'lang': lang, 'info': 'number of users'}, len(users), dst)