Example #1
def main(argv):
    # Read dataset descriptions from datasets.json, or from the file whose
    # path is passed as the first argument.
    file_name = "datasets.json"
    if len(argv) > 0:
        file_name = argv[0]

    with open(file_name) as data_file:
        data = json.load(data_file)["datasets"]
        for dat in data:
            # Store each dataset in the Lodex collection, using the id
            # returned by the mongo helper as the MongoDB _id.
            id = mongo.getLastIdEndpointsLodex()
            ds = {'_id': id, 'name': dat['name'], 'url': dat['url']}
            mongo.inserLodexDatasets(ds)
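The function above expects the JSON file to carry a top-level "datasets" array whose entries have at least a name and a url. A minimal sketch of such a file and of a hypothetical command-line entry point follows; the sys.argv wiring and the sample field values are assumptions for illustration, and mongo is the project's own MongoDB helper module, not shown here.

# datasets.json (illustrative structure only; values are made up):
# {
#     "datasets": [
#         {"name": "example portal", "url": "https://www.europeandataportal.eu/sparql"},
#         {"name": "example endpoint", "url": "https://io.datascience-paris-saclay.fr/sparql"}
#     ]
# }

import json
import sys

if __name__ == "__main__":
    # Hypothetical wiring: forward any command-line argument (an alternative
    # JSON path) to main().
    main(sys.argv[1:])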
Example #2
def main(argv):
    file_name = "datasets.json"
    if len(argv) > 0:
        file_name = argv[0]

    with open(file_name) as data_file:
        data = json.load(data_file)["datasets"]
        for dat in data:
            id = mongo.getLastIdEndpointsLodex()
            ds = {'_id': id, 'name': dat['name'], 'url': dat['url']}
            mongo.inserLodexDatasets(ds)
Example #3
def downloadDataset(argv):
    # argv[0] is the SPARQL endpoint to query: the portals listed below get a
    # dedicated download query, while any other URL is connection-tested first
    # and then registered in MongoDB before extraction.
    if argv[0] in [
            "https://www.europeandataportal.eu/sparql",
            "https://io.datascience-paris-saclay.fr/sparql",
            "http://data.europa.eu/euodp/sparqlep"
    ]:
        sparql = SPARQLWrapper(argv[0])
        q = util.queryGenerator.QueryGenerator()
        if argv[0] == "https://www.europeandataportal.eu/sparql":
            sparql.setQuery(q.EuDownload().query)
        elif argv[0] == "https://io.datascience-paris-saclay.fr/sparql":
            sparql.setQuery(q.dataScienceParisDownload().query)
        elif argv[0] == "http://data.europa.eu/euodp/sparqlep":
            sparql.setQuery(q.dataEuDownload().query)
        sparql.setReturnFormat(XML)
        """print(sparql)"""
        #print("Extraction endpoints")
        results = sparql.queryAndConvert()
        #print("Parsing results\n")
        pprint(results)
        pprint(
            se.parseResponseForDatasetExtr(None, results, "test_connection",
                                           False))
        print("-----")

        return

        if se.parseResponseForDatasetExtr(None, results, "test_connection",
                                          False):
            endArr = []
            endDIct = {}

            for end in se.parseResponseForDatasetExtr(
                    None, results, "test_connection", False
            ):  # end è un oggetto con 'dataset', 'title' e 'url' dell'endpoint
                if 'title' in end:
                    if end['url'] in endDIct:
                        tmp = endDIct[end['url']]
                        tmp['name'].append(end['title'])
                        endDIct[end['url']] = tmp
                    else:
                        endDIct[end['url']] = {'name': [end['title']]}

            datasets = []
            urls = []
            count = mongo.getLastIdEndpointsLodex()
            copy = False

            for key in endDIct:
                endpoint = mongo.getAllEndopoinLodex()
                for e in endpoint:
                    if e["url"] == key:
                        copy = True

                if not copy:
                    ds = {
                        'url': key,
                        '_id': count,
                        'name': endDIct[key]['name'][0]
                    }
                    urls.append(key)
                    count = count + 1
                    ds['datasets'] = [{'name': endDIct[key]['name'][0]}]
                    datasets.append(ds)

                copy = False

            # String for parsing
            print("Searching for new datasets on the portal " + argv[0])
            print("Found " + str(len(datasets)) + " new datasets")
            print(datasets)
            if len(datasets) > 0:
                mongo.inserLodexDatasets(datasets)
                for url in urls:
                    automaticExtraction([url])

    else:

        url = argv[0]
        sparql = SPARQLWrapper(url)
        q = util.queryGenerator.QueryGenerator()
        id = mongo.startTestNew(url)
        print(id)
        """in runInfo, id è il numero dentro a ObjectId"""
        copy = False
        count = mongo.getLastIdEndpointsLodex()
        datasets = []

        if se.testConnection(url, q, sparql, id):
            endpoint = mongo.getAllEndopoinLodex()
            for e in endpoint:
                if e["url"] == url:
                    copy = True

            if not copy:
                # Use the part of the URL after the scheme as the endpoint name.
                name = url.split('//', 1)[1]
                ds = {'url': url, '_id': count, 'name': name}
                datasets.append(ds)
            else:
                print("-----")
                print(url + " e' un endpoint valido, ")
                print("ma e' gia' presente in MongoDB; non lo aggiungo.")
                print("L'estrazione viene evitata in quanto sarebbe inutile.")

        else:
            print("-----")
            print(
                url +
                " is not a valid endpoint or is currently unreachable."
            )
            print("Extraction failed.")

        if len(datasets) > 0:
            mongo.inserLodexDatasets(datasets)
            mongo.deleteExtById(count)
            print(datasets)
            automaticExtraction([argv[0]])
            print("-----")
            print(url + " e' un endpoint valido, ")
            print("non presente su MongoDB; lo aggiungo.")
            print("Estrazione andata a buon fine.")
Example #4
        if 'name' in end:
            if end['u'] in endDIct:
                tmp = endDIct[end['u']]
                tmp['name'].append(end['name'])
                tmp['label'].append(end['label'])
                if 'desc' in end:
                    tmp['desc'].append(end['desc'])
                else:
                    tmp['desc'].append(' ')
                endDIct[end['u']] = tmp
            else:
                endDIct[end['u']] = {'name': [end['name']],
                                     'label': [end['label']],
                                     'desc': [end['desc']] if 'desc' in end else [' ']}

    datasets = []
    count = mongo.getLastIdEndpointsLodex()
    for key in endDIct:
        ds = {}
        if len(endDIct[key]['label']) > 1:
            labelSet = set(endDIct[key]['label'])
            globalName = reduce(lambda a, b: a if a > b else b, endDIct[key]['name'])
            ds = {'url': key, '_id': count}
            count = count + 1
            ds['datasets'] = []
            for a in labelSet:
                desc = ""
                name = ""
                for b in range(len(endDIct[key]['label'])):
                    if endDIct[key]['label'][b] == a:
                        if 'desc' in endDIct[key] and len(endDIct[key]['desc'][b]) > len(desc):
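The fragment above is cut off in the source, but its grouping step is self-contained: rows are accumulated into endDIct keyed by endpoint URL (end['u']) with parallel name/label/desc lists, and the truncated inner loop appears to pick, for each distinct label, the longest description seen. A minimal standalone sketch of that accumulation pattern, with made-up rows and without the mongo or reduce dependencies:

from collections import defaultdict

# Hypothetical parsed rows; only the keys used by the fragment are shown.
rows = [
    {'u': 'http://example.org/sparql', 'name': 'ds1', 'label': 'People', 'desc': 'Short'},
    {'u': 'http://example.org/sparql', 'name': 'ds1', 'label': 'People', 'desc': 'A longer description'},
    {'u': 'http://example.org/sparql', 'name': 'ds2', 'label': 'Places'},  # no 'desc'
]

grouped = defaultdict(lambda: {'name': [], 'label': [], 'desc': []})
for end in rows:
    entry = grouped[end['u']]
    entry['name'].append(end['name'])
    entry['label'].append(end['label'])
    entry['desc'].append(end.get('desc', ' '))

# For each URL, keep the longest description seen for every distinct label.
for url, entry in grouped.items():
    for label in set(entry['label']):
        desc = ""
        for i, lab in enumerate(entry['label']):
            if lab == label and len(entry['desc'][i]) > len(desc):
                desc = entry['desc'][i]
        print(url, label, repr(desc))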