Code example #1
0
File: createTable.py  Project: lucacasarotti/HBOLD
def countExt():
    """Count the Lodex endpoints that have at least one stored extraction."""
    return sum(
        1 for endpoint in mongo.getAllEndopoinLodex()
        if len(mongo.getExtById(endpoint['_id'])) != 0
    )
Code example #2
0
File: createTable.py  Project: lucacasarotti/HBOLD
def countExt_month_year(month, year):
    """Count endpoints whose last extraction run happened in the given month/year.

    Only endpoints that have at least one stored extraction are considered.
    """
    total = 0
    for endpoint in mongo.getAllEndopoinLodex():
        # Skip endpoints without any stored extraction.
        if len(mongo.getExtById(endpoint['_id'])) == 0:
            continue
        last_run = mongo.getLastRunById(endpoint['_id'])
        if last_run['date'].month == month and last_run['date'].year == year:
            total += 1
    return total
Code example #3
0
def automaticExtraction(argv):
    """Run schema extraction for one endpoint or for all known endpoints.

    argv[0] is either the literal string 'all' (extract every endpoint
    stored in Mongo) or a single endpoint URL. Any other value only
    prints an error message.
    """
    if argv[0] == 'all':
        for end in mongo.getAllEndopoinLodex():
            endpointExtraction(end['_id'])

            print("Generating schema summary")
            # NOTE(review): this branch passes the raw id, while the URL
            # branch below passes [str(id)] — confirm which form
            # generateSS/generateCS actually expect.
            generateSS(end['_id'])
            generateCS(end['_id'])

    elif isinstance(argv[0], str):
        url = argv[0]
        end = mongo.getEndopointByUrl(url)
        # BUG FIX: removed unused local `p = mongo.getExtById(end['_id'])`
        # (a pure read whose result was never used).

        endpointExtraction(end['_id'])

        print("Generating schema summary ")
        generateSS([str(end['_id'])])
        generateCS([str(end['_id'])])
    else:
        print("Something awful happened")
Code example #4
0
# Start one schema-extraction thread per Lodex endpoint whose last run has
# not reached the 'finish' phase, keeping at most ~10 extractions in flight.
from extractor import SchemaExtractorTestV3 as se
import threading
import time
from extractor.util import mongo
import pymongo as pm

threads = []

# TODO choose number of thread

for a in mongo.getAllEndopoinLodex():
    # Collect the phases logged by the endpoint's last run (empty if none).
    e = mongo.getLastRunById(a['_id'])
    logs = set()
    if e is not None:
        logs = {l['phase'] for l in e['log']}

    if 'finish' not in logs:
        print('------------------')
        print(len(threads))

        thread = threading.Thread(target=se.ExtractSchema, args=(a, False))
        thread.start()
        threads.append(thread)

        # Throttle: wait while more than 10 extractions are in flight.
        # BUG FIX: the original loop only slept and never removed entries
        # from `threads`, so once 11 threads had been started it spun
        # forever. Prune finished threads on each poll.
        while len(threads) > 10:
            time.sleep(1)
            threads = [t for t in threads if t.is_alive()]
Code example #5
0
def downloadDataset(argv):
    """Discover SPARQL endpoints from a known portal, or test a single endpoint.

    argv[0] is either one of three known open-data portal SPARQL endpoints
    (queried for the endpoints they catalogue) or an arbitrary endpoint URL
    (connection-tested and, if new, registered in MongoDB and extracted).

    NOTE(review): the bare `return` in the portal branch makes the whole
    dataset-registration section after it unreachable — this looks like a
    debugging leftover; confirm before relying on portal ingestion.
    """
    if argv[0] in [
            "https://www.europeandataportal.eu/sparql",
            "https://io.datascience-paris-saclay.fr/sparql",
            "http://data.europa.eu/euodp/sparqlep"
    ]:
        # Known portal: pick the portal-specific discovery query.
        sparql = SPARQLWrapper(argv[0])
        q = util.queryGenerator.QueryGenerator()
        if argv[0] == "https://www.europeandataportal.eu/sparql":
            sparql.setQuery(q.EuDownload().query)
        elif argv[0] == "https://io.datascience-paris-saclay.fr/sparql":
            sparql.setQuery(q.dataScienceParisDownload().query)
        elif argv[0] == "http://data.europa.eu/euodp/sparqlep":
            sparql.setQuery(q.dataEuDownload().query)
        sparql.setReturnFormat(XML)
        """print(sparql)"""
        #print("Extraction endpoints")
        results = sparql.queryAndConvert()
        #print("Parsing results\n")
        pprint(results)
        pprint(
            se.parseResponseForDatasetExtr(None, results, "test_connection",
                                           False))
        print("-----")

        # NOTE(review): unconditional return — everything below in this
        # branch is dead code.
        return

        if se.parseResponseForDatasetExtr(None, results, "test_connection",
                                          False):
            # NOTE(review): endArr is never used; 'endDIct' typo kept as-is.
            endArr = []
            endDIct = {}

            # Group discovered endpoints by URL, collecting their titles.
            for end in se.parseResponseForDatasetExtr(
                    None, results, "test_connection", False
            ):  # end is an object with the endpoint's 'dataset', 'title' and 'url'
                if 'title' in end:
                    if end['url'] in endDIct:
                        tmp = endDIct[end['url']]
                        tmp['name'].append(end['title'])
                        endDIct[end['url']] = tmp
                    else:
                        endDIct[end['url']] = {'name': [end['title']]}

            datasets = []
            urls = []
            count = mongo.getLastIdEndpointsLodex()
            copy = False

            # Keep only the endpoints whose URL is not already in Mongo.
            for key in endDIct:
                endpoint = mongo.getAllEndopoinLodex()
                for e in endpoint:
                    if e["url"] == key:
                        copy = True

                if copy == False:
                    ds = {}
                    ds = {
                        'url': key,
                        '_id': count,
                        'name': endDIct[key]['name'][0]
                    }
                    urls.append(key)
                    count = count + 1
                    ds['datasets'] = [{'name': endDIct[key]['name'][0]}]
                    datasets.append(ds)

                copy = False

            # String for parsing
            print("Ricerca di nuovi dataset sul portale " + argv[0])
            print("Trovati " + str(len(datasets)) + " nuovi datasets")
            print(datasets)
            if len(datasets) > 0:
                # Persist the new endpoints, then extract each one.
                mongo.inserLodexDatasets(datasets)
                for i in range(0, len(datasets)):
                    url = urls[i]
                    automaticExtraction([url])

    else:
        # Arbitrary endpoint URL: connection-test it, register if new.
        url = argv[0]
        sparql = SPARQLWrapper(url)
        q = util.queryGenerator.QueryGenerator()
        # NOTE(review): 'id' shadows the builtin; rename when refactoring.
        id = mongo.startTestNew(url)
        print(id)
        """in runInfo, id è il numero dentro a ObjectId"""
        # (string above: "in runInfo, id is the number inside the ObjectId")
        copy = False
        count = mongo.getLastIdEndpointsLodex()
        datasets = []

        if se.testConnection(url, q, sparql, id):
            # Check whether this URL is already registered in Mongo.
            endpoint = mongo.getAllEndopoinLodex()
            for e in endpoint:
                if e["url"] == url:
                    copy = True

            if copy == False:
                ds = {}
                # Derive a display name from the URL: part after the scheme.
                trash, name = itemgetter(0, 1)(url.split('//', 1))
                ds = {'url': url, '_id': count, 'name': name}
                datasets.append(ds)
            else:
                print("-----")
                print(url + " e' un endpoint valido, ")
                print("ma e' gia' presente in MongoDB; non lo aggiungo.")
                print("L'estrazione viene evitata in quanto sarebbe inutile.")

        else:
            print("-----")
            print(
                url +
                " non e' un endpoint valido o non e' al momento raggiungibile."
            )
            print("Estrazione fallita.")

        if len(datasets) > 0:
            # New valid endpoint: store it, reset its extraction, extract.
            mongo.inserLodexDatasets(datasets)
            mongo.deleteExtById(count)
            print(datasets)
            automaticExtraction([argv[0]])
            print("-----")
            print(url + " e' un endpoint valido, ")
            print("non presente su MongoDB; lo aggiungo.")
            print("Estrazione andata a buon fine.")
Code example #6
0
File: createTable.py  Project: lucacasarotti/HBOLD
def countDataset():
    """Return the total number of Lodex endpoints stored in Mongo.

    Replaces a hand-rolled counter loop with sum(); the cursor is
    consumed exactly once, as before.
    """
    return sum(1 for _ in mongo.getAllEndopoinLodex())
Code example #7
0
from extractor import SchemaExtractorTestV3 as se
import threading
import time
from extractor.util import mongo
import pymongo as pm

threads = []

# TODO choose number of thread

for a in mongo.getAllEndopoinLodex():
    #     time.sleep(1)

    e = mongo.getLastRunById(a['_id'])
    logs = set()
    if e is not None:
        logs = set([l['phase'] for l in e['log']])

    if 'finish' not in logs:

        print '------------------'
        print len(threads)

        thread = threading.Thread(target=se.ExtractSchema, args=(a, False))
        thread.start()

        threads.append(thread)

        while len(threads) > 10:
            time.sleep(1)
            for t in threads: