Example #1
import zipfile
import networkx as nx
from io import BytesIO
import csv
import graph_info_csv_helpers as utils

__author__ = "Henry Carscadden"
__email__ = '*****@*****.**'
"""
This file downloads networks from Alex Arena's website and reads into the a buffer;
from the buffer, we extract node attributes and build a graph. The attributes and graph are written to 
file.
"""
data_url = "http://deim.urv.cat/~alexandre.arenas/data/"
base_url = "http://deim.urv.cat/~alexandre.arenas/data/welcome.htm"
parsed_html = utils.soupify(base_url)
for link in parsed_html.find_all('a'):
    href = link.get('href')
    # Guard against anchors that have no href attribute.
    if href is not None and 'zip' in href:
        url = data_url + href
        pajek_lines = []
        graph_zipped = utils.get_zip_fp(url)
        for file in graph_zipped.infolist():
            ext = file.filename[-3:].lower()
            if ext == "net" or ext == "paj":
                pajek_lines = graph_zipped.read(file.filename).decode('utf-8')
                if 'jazz' in file.filename:
                    # The jazz network file uses irregular whitespace, so
                    # collapse runs of spaces and tabs before parsing.
                    pajek_lines = "\n".join(
                        " ".join(line.split())
                        for line in pajek_lines.splitlines())
                # The listing is truncated here in the source; a plausible
                # next step is building the graph from the Pajek text.
                G = nx.parse_pajek(pajek_lines)
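
All of these scripts lean on the shared graph_info_csv_helpers module, whose implementation is not shown. Below is a minimal sketch of the two fetch helpers used above, assuming requests and BeautifulSoup as the underlying libraries; the real module may differ.

# Hypothetical sketch of the fetch helpers assumed by these scripts.
import io
import zipfile

import requests
from bs4 import BeautifulSoup


def soupify(url):
    # Fetch a page and parse its HTML into a BeautifulSoup tree.
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def get_zip_fp(url):
    # Download a zip archive and expose it as an in-memory ZipFile.
    response = requests.get(url)
    response.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(response.content))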
Example #2

import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

edge_list_path = '../snap_networks/edge_lists/'
node_id_path = '../snap_networks/node_id_mappings/'
snap_data_url = "https://sparse.tamu.edu/SNAP?per_page=All"
bytes_limit = 20000000

index_page_parsed = utils.soupify(snap_data_url)

rows = index_page_parsed.find_all('table')[1].find_all('tr')
for i in range(1, len(rows)):
    row = rows[i]
    row_data = row.find_all('td')
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":
            with urllib.request.urlopen(dataset_url) as tarred_mtx:
                # The listing is truncated here in the source; buffer the
                # archive bytes for extraction (see the sketch below).
                tar_bytes = io.BytesIO(tarred_mtx.read())
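
The .gz branch above is truncated in the source. Each SuiteSparse .tar.gz archive contains a Matrix Market (.mtx) file, so a plausible continuation turns the buffered bytes into a networkx graph; the function name graph_from_tarred_mtx is illustrative, and from_scipy_sparse_array assumes networkx 2.7 or later.

# Illustrative continuation: pull the .mtx member out of the tarball and
# build a graph from its adjacency matrix.
import io
import tarfile

import networkx as nx
import scipy.io


def graph_from_tarred_mtx(tar_bytes):
    # tar_bytes: an io.BytesIO holding the downloaded .tar.gz archive.
    with tarfile.open(fileobj=tar_bytes, mode='r:gz') as tar:
        for member in tar.getmembers():
            if member.name.endswith('.mtx'):
                matrix = scipy.io.mmread(tar.extractfile(member))
                return nx.from_scipy_sparse_array(matrix)
    return None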
Example #3
import graph_info_csv_helpers as utils
import urllib.request
import igraph
import os

base_url = "http://vlado.fmf.uni-lj.si/pub/networks/data/ucinet/"

ucinet_parsed = utils.soupify(base_url + "ucidata.htm")

for link in ucinet_parsed.find_all('a'):
    link_href = link.get('href')
    if link_href is not None:
        ext = link_href[-3:].lower()
        if ext == 'dat':
            with urllib.request.urlopen(base_url +
                                        link_href.split('/')[-1]) as dat_fp:
                file_data = dat_fp.read().decode('utf-8')
            with open('../dl_files/' + link_href.split('/')[-1] + '.dl',
                      'w',
                      newline='') as tmp_fp:
                tmp_fp.write(file_data)
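
igraph is imported above but unused in the visible portion, so the truncated remainder presumably parses the saved DL files. A minimal usage sketch, assuming python-igraph's UCINET DL reader; the path is illustrative.

# Illustrative: parse one of the saved UCINET DL files with python-igraph.
import igraph

g = igraph.Graph.Read_DL('../dl_files/example.dat.dl')
print(g.summary())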
Example #4
import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

base_dir = '../gleich_networks/'
edge_list_path = base_dir + 'edge_lists/'
node_id_path = base_dir + 'node_id_mappings/'
base_url = "https://sparse.tamu.edu/Gleich"
bytes_limit = 10000000

index_page_parsed = utils.soupify(base_url)

rows = index_page_parsed.find_all('table')[1].find_all('tr')
for i in range(1, len(rows)):
    row = rows[i]
    row_data = row.find_all('td')
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":
Example #5

import csv
import json

import scipy.io
import networkx as nx
import graph_info_csv_helpers as utils

__author__ = "Henry Carscadden"
__email__ = '*****@*****.**'
"""
This file downloads networks from the Network Repository collection.
"""
base_site = "http://networkrepository.com/"
base_url = "http://networkrepository.com/networks.php"
edge_list_path = '../network_repo_networks/edge_lists/'
node_id_path = '../network_repo_networks/node_id_mappings/'
parsed_networks_page = utils.soupify(base_url)


def node_id_write(G, url, edge_list_path, node_id_path, name):
    # Relabel the nodes with consecutive integers and record the mapping
    # from each new integer id back to the original node label.
    old_attributes = list(G.nodes)
    G = nx.convert_node_labels_to_integers(G)
    node_list = list(G.nodes)
    id_mapping = []
    for i in range(len(node_list)):
        id_mapping.append([str(node_list[i]), old_attributes[i]])
    with open(node_id_path + name + '.csv', 'w', newline='') as mapping_file:
        mapping_file_writer = csv.writer(mapping_file)
        mapping_file_writer.writerow(['id', 'name'])
        for row in id_mapping:
            mapping_file_writer.writerow(row)
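
A small usage example for node_id_write; the toy graph and name are illustrative.

# Illustrative usage: relabel a toy graph and write its id mapping.
G = nx.Graph()
G.add_edges_from([('alice', 'bob'), ('bob', 'carol')])
node_id_write(G, base_site, edge_list_path, node_id_path, 'toy_graph')
# Writes ../network_repo_networks/node_id_mappings/toy_graph.csv, mapping
# each integer id to the original node label.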
Example #6

import os
import io
import numpy as np
import graph_info_csv_helpers as utils
import networkx as nx
import tarfile
import urllib.request

base_dir = '../vanheukelum_networks/'
edge_list_path = base_dir + 'edge_lists/'
node_id_path = base_dir + 'node_id_mappings/'
van_heukelum_url = "https://sparse.tamu.edu/vanHeukelum"
bytes_limit = 10000000

index_page_parsed = utils.soupify(van_heukelum_url)

rows = index_page_parsed.find_all('table')[1].find_all('tr')
for i in range(1, len(rows)):
    row = rows[i]
    row_data = row.find_all('td')
    name = row_data[1].string
    multigraph = 'multigraph' in row_data[6].string.lower()
    dataset_url = row.find_all('a')[-1].get('href')
    site = urllib.request.urlopen(dataset_url)
    metadata = site.info()
    if int(metadata['Content-Length']) > bytes_limit:
        file_size = metadata['Content-Length']
        utils.insert_into_undownloaded_db(name, dataset_url, 0, file_size)
    else:
        ext = dataset_url[-3:].lower()
        if ext == ".gz":