Esempio n. 1
0
def import_gadm():

    total = 0
    limit = 2000
    f_path = "C:\Users\Al\PycharmProjects\AlignmentUI\src\UploadedFiles\gadm.ttl"
    b_path = "C:\Users\Al\PycharmProjects\AlignmentUI\src\UploadedFiles\gadm{}".format(
        Ut.batch_extension())

    # CREATE THE WRITERS
    writer = codecs.open(f_path, "wb", "utf-8")
    batch_writer = codecs.open(b_path, "wb", "utf-8")

    # GENERATING THE BATCH FILE TEXT
    graph = "{}gadm".format(Ns.dataset)
    stardog_path = '' if Ut.OPE_SYS == "windows" else Svr.settings[
        St.stardog_path]
    load_text = """echo "Loading data"
                {}stardog data add {} -g {} "{}"
                """.format(stardog_path, Svr.settings[St.database], graph,
                           f_path)
    batch_writer.write(to_unicode(load_text))
    batch_writer.close()

    print "1. GET THE TOTAL NUMBER OF TRIPLES TO LOAD"
    count_query = import_gadm_query(is_count=True)
    # print count_query
    count_res = Qry.virtuoso_request(count_query)
    result = count_res['result']
    if result is None:
        print "NO RESULT FOR THIS ENRICHMENT."
        return count_res

    print "2. PROCESSING THE COUNT RESULT"
    g = rdflib.Graph()
    g.parse(data=result, format="turtle")
    attribute = rdflib.URIRef("http://www.w3.org/2005/sparql-results#value")
    for subject, predicate, obj in g.triples((None, attribute, None)):
        total = int(obj)
    iterations = total / limit if total % limit == 0 else total / limit + 1
    print "\tTOTAL TRIPLES TO RETREIVE  : {} \n\tTOTAL NUMBER OF ITERATIONS : {}\n".format(
        total, iterations)

    # RUN THE ITERATIONS
    try:
        for i in range(0, iterations):

            offset = i * limit + 1
            print "ROUND: {} OFFSET: {}".format(i, offset)

            print "\tRUNNING THE QUERY"
            import_query = import_gadm_query(limit=limit,
                                             offset=offset,
                                             is_count=False)
            response = Qry.virtuoso_request(import_query)

            print "RESPONSE SIZE: ".format(response["result"])

            print "\tWRITING THE RESULT TO FILE"
            writer.write(response["result"])

            break

    except Exception as err:
        print str(err.message)

    # CLOSE THE IMPORT WRITER
    writer.close()
    print "4. RUNNING THE BATCH FILE"
    print "THE DATA IS BEING LOADED OVER HTTP POST." if Svr.settings[St.split_sys] is True \
        else "THE DATA IS BEING LOADED AT THE STARDOG LOCAL HOST FROM BATCH."
    print "PATH:", b_path
    os.system(b_path)
    print "JOB DONE!!!"
Esempio n. 2
0
def enrich(specs, directory, endpoint):

    # TODO RUN IT IF THERE IS NOT GRAPH ENRICHED WITH THE SAME NAME

    # specs[St.graph] = "http://grid.ac/20170712"
    print "ENRICHING DATA/GRAPH FROM EXPORT-ALIGNMENT"
    print "GRAPH:", specs[St.graph]
    print "ENTITY TYPE:", specs[St.entity_datatype]
    print "LAT PREDICATE:", specs[St.long_predicate]
    print "LONG PREDICATE:", specs[St.lat_predicate]
    print "FILE DIRECTORY:", directory
    name = Ut.get_uri_local_name(specs[St.graph])

    print endpoint
    data_1 = Qry.virtuoso_request(
        "ask {{ GRAPH <{}> {{ ?x ?y ?z . }} }}".format(specs[St.graph]),
        endpoint)
    data_1 = regex.findall("rs:boolean[ ]*(.*)[ ]*\.", data_1["result"])
    if len(data_1) > 0:
        data_1 = data_1[0].strip() == "true"
        if data_1 is False:
            print "GRAPH: {} {}".format(
                specs[St.graph], "DOES NOT EXIST AT THE REMOTE VIRTUOSO SITE.")

    # CHECKING WHETHER BOTH DATASETS ARE AT THE VIRTUOSO TRIPLE STORE
    data_2 = Qry.virtuoso_request(
        "ask {GRAPH <http://geo.risis.eu/gadm>{ ?x ?y ?z . }}", endpoint)
    data_2 = regex.findall("rs:boolean[ ]*(.*)[ ]*\.", data_2["result"])
    if len(data_2) > 0:
        data_2 = data_2[0].strip() == "true"
        if data_2 is False:
            print "GRAPH: {} {}".format(
                specs[St.graph], "DOES NOT EXIST AT THE REMOTE VIRTUOSO SITE.")

    if data_1 is False or data_2 is False:
        message = "BECAUSE BOTH DATASETS NEED TO BE PRESENT AT OUR TRIPLES STORE, WE ARE UNABLE TO EXECUTE THE REQUEST."
        return {
            St.message:
            message,
            St.result:
            'The dataset {} '
            'cannot be enriched with GADM boundary  at the moment.'.format(
                specs[St.graph])
        }

    total = 0
    limit = 20000
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    f_path = "{0}{1}{1}{2}_enriched_{3}.ttl".format(directory, os.path.sep,
                                                    name, date)
    b_path = "{0}{1}{1}{2}_enriched_{3}{4}".format(directory, os.path.sep,
                                                   name, date,
                                                   Ut.batch_extension())

    # MAKE SURE THE FOLDER EXISTS
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError as err:
        print "\n\t[utility_LOAD_TRIPLE_STORE:]", err
        return

    print "\n1. GETTING THE TOTAL NUMBER OF TRIPLES."
    count_query = enrich_query(specs, limit=0, offset=0, is_count=True)
    print count_query
    count_res = Qry.virtuoso_request(count_query, endpoint)
    result = count_res['result']

    # GET THE TOTAL NUMBER OF TRIPLES
    if result is None:
        print "NO RESULT FOR THIS ENRICHMENT."
        return count_res

    g = rdflib.Graph()
    g.parse(data=result, format="turtle")
    attribute = rdflib.URIRef("http://www.w3.org/2005/sparql-results#value")
    for subject, predicate, obj in g.triples((None, attribute, None)):
        total = int(obj)

    # NUMBER OF REQUEST NEEDED
    iterations = total / limit if total % limit == 0 else total / limit + 1
    print "\n2. TOTAL TRIPLES TO RETREIVE  : {} \n\tTOTAL NUMBER OF ITERATIONS : {}\n".format(
        total, iterations)

    writer = codecs.open(f_path, "wb", "utf-8")
    batch_writer = codecs.open(b_path, "wb", "utf-8")

    print "3. GENERATING THE BATCH FILE TEXT"
    enriched_graph = "{}_enriched".format(specs[St.graph])
    stardog_path = '' if Ut.OPE_SYS == "windows" else Svr.settings[
        St.stardog_path]

    load_text = """echo "Loading data"
            {}stardog data add {} -g {} "{}"
            """.format(stardog_path, Svr.settings[St.database], enriched_graph,
                       f_path)

    batch_writer.write(to_unicode(load_text))
    batch_writer.close()

    # RUN THE ITERATIONS
    for i in range(0, iterations):

        offset = i * 20000 + 1
        print "\tROUND: {} OFFSET: {}".format(i + 1, offset)

        # print "\t\t1. GENERATING THE ENRICHMENT QUERY"
        virtuoso = enrich_query(specs,
                                limit=limit,
                                offset=offset,
                                is_count=False)
        # print virtuoso
        # exit(0)
        # print Qry.virtuoso(virtuoso)["result"]

        # print "\t\t2. RUNNING THE QUERY + WRITE THE RESULT TO FILE"
        writer.write(Qry.virtuoso_request(virtuoso, endpoint)["result"])

    writer.close()
    print "\n4. RUNNING THE BATCH FILE"
    print "\tTHE DATA IS BEING LOADED OVER HTTP POST." if Svr.settings[St.split_sys] is True \
        else "\tTHE DATA IS BEING LOADED AT THE STARDOG LOCAL HOST FROM BATCH."
    # os.system(b_path)

    # RUN THE BATCH FILE
    print "\tFILE: {}".format(f_path)
    print "\tBATCH: {}\n".format(b_path)
    os.chmod(b_path, 0o777)
    Ut.batch_load(b_path)
    if os.path.exists(b_path) is True:
        os.remove(b_path)

    # TODO 1. REGISTER THE DATASET TO BE ENRICHED IF NOT YET REGISTER
    # TODO 2. ADD THE ENRICHED DATASET TO THE RESEARCH QUESTION (REGISTER).
    # TODO 3. MAYBE, CREATE THE LINKSET BETWEEN THE SOURCE AND THE RESULTING

    size = Qry.get_namedgraph_size(enriched_graph)

    print "JOB DONE...!!!!!!"

    return {
        St.message:
        "The select dataset was enriched with the GADM boundary as {}. "
        "{} triples were created.".format(enriched_graph, size),
        St.result:
        enriched_graph
    }