Example #1
0
def wikixml2graph(src,
                  dst,
                  distrust=False,
                  threshold=0,
                  downloadlists=True,
                  verbose=False):
    '''
    Convert a Wikipedia XML dump into a trust-network c2 file.

    This is the only function in this module meant to be called directly;
    the other ones are helpers.  It accepts a gzipped, 7zipped or bzipped
    xml downloaded from www.download.wikimedia.org and infers from the
    filename:
    1. the lang of the network
    2. the date of the snapshot
    3. whether the dump is 'current' or 'history'
    4. whether a distrust-graph can be created (only for xml with pages)
    It then unzips and parses the dump and creates, in the right folder
    (your_home/shared_datasets/WikiNetwork/lang/date/), the c2 file with
    the network (and, if possible, the c2 with the distrust graph).

    src           -- path of the (possibly compressed) xml dump
    dst           -- destination c2 filename, must end with '.c2'
    distrust      -- force creation of the distrust graph
    threshold     -- the minimum weight on edges (edges with
                     weight < threshold will be deleted)
    downloadlists -- also download the list of bots and of blocked users
    verbose       -- progress output (turned off for compressed input)
    '''

    # i18n is the language-parameters table loaded elsewhere in this module;
    # without it the lang extracted from the filename cannot be validated.
    if not i18n:
        raise IOError(
            os.path.join(os.environ['HOME'], 'shared_datasets', 'WikiNetwork',
                         'languageparameters.c2') +
            " does not exists! you have to sync your current directory (with sync_trustlet)"
        )

    assert dst.endswith('.c2')
    srcname = os.path.split(src)[1]  # basename only, directory removed

    # Pick the SAX content handler from the dump flavour encoded in the name
    # ('current' = latest revisions only, 'history' = all revisions).
    if 'current' in srcname:
        WikiContentHandler = WikiCurrentContentHandler
    elif 'history' in srcname:
        WikiContentHandler = WikiHistoryContentHandler
    else:
        # NOTE(review): `Error` must be defined elsewhere in this module.
        raise Error(
            "I cannot understand the type of network (current or history?)")

    filename = srcname
    size = os.stat(src).st_size  # passed to the handler as xmlsize

    # Filename convention is <lang>wiki-YYYYMMDD-...: extract lang and date.
    lang = srcname[:srcname.index('wiki')]
    assert lang in i18n, "The lang " + lang + " is not supported! (you can add it using the function addWikiLanguage in this package)"
    res = re.search(r'wiki-(\d{4})(\d{2})(\d{2})-', srcname)
    date = '-'.join([res.group(x) for x in xrange(1, 4)])  # 'YYYY-MM-DD'
    assert isdate(date)

    deleteafter = False
    # Support compressed files by opening them transparently.  verbose is
    # disabled for compressed input -- presumably because the on-disk size
    # computed above would not match the bytes actually parsed.
    if isinstance(src, str):
        if src.endswith('.gz'):
            verbose = False
            src = GzipFile(src)
        elif BZ2 and src.endswith('.bz2'):
            src = BZ2File(src)
            verbose = False
        elif not BZ2 and src.endswith('.bz2'):
            # No python bz2 module: fall back to the external bunzip2 tool
            # (-k keeps the archive, -f overwrites a stale copy).
            if os.system("bunzip2 -q -k -f " + src):
                print 'an error has occourred! possible reason:'
                print '1. install bz2'
                print '2. no space left on device (in order to decompress your bzip)'
                print 'NB: consider install python-bz2'
                exit(1)

            src = src[:-4]  # strip the '.bz2' extension (four chars)
            deleteafter = True  # remember to delete the decompressed copy

        elif src.endswith('.7z'):
            verbose = False
            if SevenzipFile:
                src = SevenzipFile(src)
            else:
                print 'Install p7zip'
                exit(1)

    # Make sure the destination directory exists.
    mkpath(os.path.split(dst)[0])

    ch = WikiContentHandler(lang,
                            xmlsize=size,
                            inputfilename=filename,
                            forcedistrust=distrust,
                            threshold=threshold,
                            verbose=verbose)

    sax.parse(src, ch)

    # Remove the temporary file produced by the bunzip2 fallback above.
    if deleteafter:
        os.remove(src)

    # pynet is a (nodes, edges) pair; del_ips drops anonymous (IP) users.
    pynet = del_ips(ch.getPyNetwork())

    if not pynet[0] or not pynet[1]:
        raise Exception(
            "Conversion failed! no edges or no nodes in this network, you might check the line in the i18n corresponding to the "
            + i18n[lang] + " language")

    cachedict = {'network': 'Wiki', 'lang': lang, 'date': date}
    if threshold > 1:
        cachedict['threshold'] = threshold

    # 90th/95th percentile of the edge weights (weight is the third field).
    # NB: Python 2 integer division is intentional in the index expressions.
    edges = pynet[1]
    edges.sort(key=lambda e: e[2])  # ascending by weight; stable, same order
    perc90 = edges[len(edges) * 9 / 10][2]
    perc95 = edges[len(edges) * 95 / 100][2]

    assert save(cachedict, pynet, dst)

    # Save the two percentiles under the same key plus a '%' entry.
    cachedict['%'] = 90
    assert save(cachedict, perc90, dst)
    cachedict['%'] = 95
    assert save(cachedict, perc95, dst)
    del cachedict['%']

    if hasattr(ch, 'distrust') and ch.distrust:
        net = ch.getDistrustGraph()

        nodes = set(net.nodes())
        edges = net.edges()

        # If a node appears in some edge it is not useful to also keep it in
        # the isolated-nodes set.
        for e in edges:
            nodes.discard(e[0])
            nodes.discard(e[1])

        assert save({
            'network': 'DistrustWiki',
            'lang': lang,
            'date': date
        }, (list(nodes), edges),
                    os.path.join(os.path.split(dst)[0], 'graphDistrust.c2'))

    if not downloadlists:
        return

    # Also fetch the user/bot/blocked-user lists for this language.
    users, bots, blockedusers = get_list_users(
        lang, os.path.join(os.environ['HOME'], 'shared_datasets',
                           'WikiNetwork'))

    assert save({'lang': lang, 'list': 'bots'}, bots, dst)
    assert save({'lang': lang, 'list': 'blockedusers'}, blockedusers, dst)

    assert save({'lang': lang, 'info': 'number of users'}, len(users), dst)
# NOTE(review): duplicate definition -- an identical wikixml2graph appears
# earlier in this file; being defined later, this copy shadows that one at
# import time.
def wikixml2graph(src,dst,distrust=False,threshold=0,downloadlists=True,verbose=False):
    '''
    This function is the only one that have to be called directly in this module.
    The other ones are only helpers for this one.
    This function is able to take a gzipped, 7zipped, or bzipped xml downloaded
    from www.download.wikimedia.org understand:
    1. what is the lang of the network (from the filename)
    2. what is the data in which the snapshot would be taken (from the filename)
    3. if it is current or history (from the filename)
    4. if is possible to create a distrust-graph (only for xml with pages)
    and unzip it, parse it, and create in the right folder (your_home/shared_datasets/WikiNetwork/lang/date/ )
    the c2 file with the network. (and if possible, the c2 with the distrust graph)
    threshold = the minimum weight on edges (the edges with weight < threshold, will be deleted)
    downloadlist = download the list of bots, and the list of blockedusers
    '''

    # i18n is the language-parameters table loaded elsewhere in this module;
    # without it the lang extracted from the filename cannot be validated.
    if not i18n:
        raise IOError( os.path.join( os.environ['HOME'], 'shared_datasets', 'WikiNetwork', 'languageparameters.c2' )+" does not exists! you have to sync your current directory (with sync_trustlet)")


    assert dst.endswith('.c2')
    srcname = os.path.split(src)[1]

    # Pick the SAX content handler from the dump flavour encoded in the name
    # ('current' = latest revisions only, 'history' = all revisions).
    if 'current' in srcname:
        WikiContentHandler = WikiCurrentContentHandler

    elif 'history' in srcname:
        WikiContentHandler = WikiHistoryContentHandler
    else:
        # NOTE(review): `Error` must be defined elsewhere in this module.
        raise Error("I cannot understand the type of network (current or history?)")

    filename = os.path.split(src)[1] # basename only, directory removed
    size = os.stat(src).st_size  # passed to the handler as xmlsize

    # Filename convention is <lang>wiki-YYYYMMDD-...: extract lang and date.
    s = os.path.split(src)[1]
    lang = s[:s.index('wiki')]
    assert lang in i18n, "The lang "+lang+" is not supported! (you can add it using the function addWikiLanguage in this package)"
    res = re.search('wiki-(\d{4})(\d{2})(\d{2})-',s)
    date = '-'.join([res.group(x) for x in xrange(1,4)])  # 'YYYY-MM-DD'
    assert isdate(date)

    deleteafter = False
    # Support compressed file: open it transparently.  verbose is disabled
    # for compressed input -- presumably because the on-disk size computed
    # above would not match the bytes actually parsed.
    if type(src) is str:
        if src.endswith('.gz'):
            verbose = False
            src = GzipFile(src)
        elif BZ2 and src.endswith('.bz2'):
            src = BZ2File(src)
            verbose = False
        elif not BZ2 and src.endswith('.bz2'):
            # No python bz2 module: fall back to the external bunzip2 tool
            # (-k keeps the archive, -f overwrites a stale copy).
            if os.system( "bunzip2 -q -k -f "+src ):
                print 'an error has occourred! possible reason:'
                print '1. install bz2'
                print '2. no space left on device (in order to decompress your bzip)'
                print 'NB: consider install python-bz2'
                exit(1)

            src = src[:-4] # strip the '.bz2' extension (four chars)
            deleteafter = True  # remember to delete the decompressed copy

        elif src.endswith('.7z'):
            verbose = False
            if SevenzipFile:
                src = SevenzipFile(src)
            else:
                print 'Install p7zip'
                exit(1)

    # Make sure the destination directory exists.
    mkpath(os.path.split(dst)[0])


    ch = WikiContentHandler(lang,xmlsize=size,
                            inputfilename=filename,
                            forcedistrust=distrust,
                            threshold=threshold,
                            verbose=verbose)

    sax.parse(src,ch)
    # Remove the temporary file produced by the bunzip2 fallback above.
    if deleteafter:
        os.remove( src )

    # pynet is a (nodes, edges) pair; del_ips drops anonymous (IP) users.
    pynet = del_ips(ch.getPyNetwork())

    if not pynet[0] or not pynet[1]:
        raise Exception( "Conversion failed! no edges or no nodes in this network, you might check the line in the i18n corresponding to the "+i18n[lang]+" language" )

    cachedict = {'network':'Wiki','lang':lang,'date':date}
    if threshold>1:
        cachedict['threshold'] = threshold

    # 90th/95th percentile of the edge weights (weight is the third field).
    # NB: Python 2 integer division is intentional in the index expressions.
    edges = pynet[1]
    edges.sort(lambda x,y: cmp(x[2],y[2]))
    perc90 = edges[len(edges)*9/10][2]
    perc95 = edges[len(edges)*95/100][2]

    assert save(cachedict,pynet,dst)

    # Save the two percentiles under the same key plus a '%' entry.
    cachedict['%'] = 90
    assert save(cachedict,perc90,dst)
    cachedict['%'] = 95
    assert save(cachedict,perc95,dst)
    del cachedict['%']

    if hasattr(ch,'distrust') and ch.distrust:
        net = ch.getDistrustGraph()

        nodes = set(net.nodes())
        edges = net.edges()

        # If a node appears in some edge it is not useful to also keep it in
        # the isolated-nodes set.
        for e in edges:
            nodes.discard(e[0])
            nodes.discard(e[1])

        assert save({'network':'DistrustWiki','lang':lang,'date':date},
                    (list(nodes),edges),
                    os.path.join(os.path.split(dst)[0],'graphDistrust.c2'))

    if not downloadlists:
        return

    # Also fetch the user/bot/blocked-user lists for this language.
    users,bots,blockedusers = get_list_users(lang,
                                             os.path.join(os.environ['HOME'],'shared_datasets','WikiNetwork'))

    assert save({'lang':lang,'list':'bots'},bots,dst)
    assert save({'lang':lang,'list':'blockedusers'},blockedusers,dst)

    lenusers = len(users)
    assert save({'lang':lang,'info':'number of users'},lenusers,dst)