Example #1
    def __init__(self):
        '''
        Constructor
        '''

        # SPARQL endpoints for DBpedia and Wikidata
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

        # Entity lookup service
        self.lookup = Lookup()
Example #2
def apply_endpoint(entity):
    '''
    Return the types for an entity: query the endpoint by entity id first,
    falling back to the types stored in the entity object.
    '''
    ep = DBpediaEndpoint()
    entity_id = entity.getIdstr()
    types = ep.getTypesForEntity(entity_id)  # limited to 5 types
    if len(types) == 0:  # back-up: types from the entity object (ontology)
        types = entity.getTypes()
    return types
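A minimal usage sketch for apply_endpoint, under the assumption that the entity wrapper exposes getIdstr() and getTypes() as above; the Entity stub below is hypothetical and not part of the original code:

class Entity:
    # Hypothetical stand-in for the entity wrapper used above.
    def __init__(self, uri, types=None):
        self.uri = uri
        self.types = types if types is not None else []

    def getIdstr(self):
        return self.uri

    def getTypes(self):
        return self.types


print(apply_endpoint(Entity('http://dbpedia.org/resource/Ana_Popović')))
# e.g. a list of DBpedia class URIs, or [] if nothing is found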
Example #3
    def __init__(self):
        '''
        Constructor
        '''
        # Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()

        # prop -> True if it is an object property
        self.propertyType = dict()

        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()
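Per the comment above, propertyType maps each property to True when it is an object property. An illustrative sketch (the example URIs are assumptions, not taken from the original code):

propertyType = dict()
propertyType['http://dbpedia.org/ontology/birthPlace'] = True   # object property: links two resources
propertyType['http://dbpedia.org/ontology/birthDate'] = False   # datatype property: links to a literal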
Example #4
    def __init__(self):  # KGraph=KG.DBpedia
        '''
        Constructor
        '''
        # Return types from this knowledge graph
        #self.KGraph = KGraph

        # Ontologies used when resolving types
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)

        # SPARQL endpoints
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
Example #5
def apply_endpoint_list(entity_list):
    '''
    Return one list of types per input entity, keeping the output aligned
    with entity_list.
    '''
    types_list = []
    ep = DBpediaEndpoint()
    for entity in entity_list:
        print(entity)
        # entity is either the placeholder ['N/A'] or an entity object such as:
        # < id: http://dbpedia.org/resource/Ana_Popović, label: Ana Popović,
        #   description: None, types: set(), source: DBpedia >
        if entity != ['N/A']:
            entity_id = entity.getIdstr()
            types = ep.getTypesForEntity(entity_id)  # limited to 5 types
            if len(types) == 0:  # back-up: types from the entity object
                types = entity.getTypes()
            types_list.append(types)  # append exactly once per entity
        else:
            types_list.append([])
    return types_list
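A hypothetical call, reusing the Entity stub sketched after Example #2 (the URI is illustrative):

print(apply_endpoint_list([Entity('http://dbpedia.org/resource/Ana_Popović'), ['N/A']]))
# e.g. [[...types...], []] - one entry per input entity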
Example #6
def tablesToChallengeFormat(folder_gt, folder_tables, file_out_gt,
                            file_out_redirects_gt, file_out_gt_target,
                            max_tables):

    #To process every table in the folder:
    #csv_file_names = [f for f in listdir(folder_gt) if isfile(join(folder_gt, f))]

    #For now, a single test table is processed
    csv_file_names = []
    csv_file_names.append("2014_Tour_of_Turkey#6.csv")

    f_out = open(file_out_gt, "w+")
    f_out_redirects = open(file_out_redirects_gt, "w+")
    f_out_target = open(file_out_gt_target, "w+")

    n_tables = 0
    wrong_entities = 0
    panda_errors = 0

    dbpedia_ep = DBpediaEndpoint()

    for csv_file_name in csv_file_names:

        #print(csv_file_name)

        with open(join(folder_gt, csv_file_name)) as csv_file:

            try:
                #Try to open with pandas; if it fails, discard the file
                pd.read_csv(join(folder_tables, csv_file_name),
                            sep=',',
                            quotechar='"',
                            escapechar="\\")
            except Exception:
                panda_errors += 1
                continue

            table_id = csv_file_name.replace(".csv", "")

            csv_reader = csv.reader(csv_file,
                                    delimiter=',',
                                    quotechar='"',
                                    escapechar="\\")

            for row in csv_reader:

                #URI, text, row number
                #http://dbpedia.org/resource/Ahmet_%C3%96rken, Ahmet A\u0096rken, 1
                if len(row) < 3:
                    continue

                entity_uri = row[0]
                row_id = row[2]

                entity_mention = row[1]

                column_id = getColumnEntityMention(
                    join(folder_tables, csv_file_name), entity_mention)

                entities = set()
                new_entities = set()

                ##Consider redirects:
                #entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                #entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))

                #for e in entities:
                #    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                #    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))

                entities.add(entity_uri)
                #entities.update(new_entities)

                if column_id >= 0:
                    #Output
                    #table id,column id, row id and DBPedia entity
                    #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
                    line_str = '\"%s\",\"%s\",\"%s\"' % (table_id, column_id,
                                                         row_id)
                    f_out_target.write(line_str + '\n')

                    #f_out.write('\"%s\",\"%s\",\"%s\",\"%s\"\n' % (table_id, column_id, row_id, entity_uri))

                    f_out.write(line_str + ',\"%s\"\n' % entity_uri)

                    #for ent in entities:
                    #    line_str += ',\"'+ ent + '\"'
                    line_str += ',\"' + " ".join(entities) + '\"'

                    f_out_redirects.write(line_str + '\n')

                    #TODO
                    #Read with pandas: https://www.datacamp.com/community/tutorials/pandas-read-csv
                    #There are some errors with "\"
                    #writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
                    #writer.writerow()

                    #print('\"%s\",\"%s\",\"%s\",\"%s\"' % (table_id, column_id, row_id, entity_uri))
                else:
                    wrong_entities += 1

        ##Small dataset: keep only approx. 20k of the >400k tables
        if n_tables > max_tables:  #e.g. 200000
            break
        n_tables += 1

    print("Panda errors: %d" % panda_errors)
    print("Wrong entities: %d" % wrong_entities)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
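As the comments above note, each target line holds a quoted table id, column id and row id. A minimal sketch for reading a generated target file back (the file name is a placeholder):

import csv

with open('gt_target.csv') as f:
    for table_id, column_id, row_id in csv.reader(f):
        print(table_id, column_id, row_id)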
Example #7
def extensionWithWikiRedirects(file_gt, folder_tables, file_out_gt,
                               file_out_redirects_gt, file_out_gt_target,
                               max_rows):
    #print(csv_file_name)

    f_out = open(file_out_gt, "a+")
    f_out_redirects = open(file_out_redirects_gt, "a+")
    f_out_target = open(file_out_gt_target, "a+")

    n_rows = 0
    panda_errors = 0

    dbpedia_ep = DBpediaEndpoint()

    table_id = ""

    dict_entities = dict()

    #READ CURRENT CACHE
    #init dict_entities with current state of file_out_redirects_gt
    with open(file_out_redirects_gt) as csv_file_redirections:

        csv_reader = csv.reader(csv_file_redirections,
                                delimiter=',',
                                quotechar='"',
                                escapechar="\\")

        #"1","0","1","http://dbpedia.org/resource/Uno_Von_Troil http://dbpedia.org/resource/Uno_von_Troil"
        for row in csv_reader:

            entity_list = row[3].split(" ")

            for e in entity_list:

                if e not in dict_entities:
                    dict_entities[e] = set(entity_list)

    with open(file_gt) as csv_file:

        csv_reader = csv.reader(csv_file,
                                delimiter=',',
                                quotechar='"',
                                escapechar="\\")

        for row in csv_reader:

            #file, col, row, URI
            #note that in Oktie's GT the order is file, row, col
            #1,1,0,http://dbpedia.org/resource/Uno_von_Troil

            if len(row) < 4:
                continue

            #entity_uri = row[3]
            entity_uri = row[3].replace("\"", "%22")

            #To avoid cases from "http://sws.geonames.org/"
            #if entity_uri.startswith("http://sws.geonames.org/"):
            same_as_resources = set()
            if not entity_uri.startswith("http://dbpedia.org/resource/"):
                #print(getFilteredResources(dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia))
                same_as_resources = getFilteredResources(
                    dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia)
                #print(row[0])
                #print(row[1])
                #print(row[2])
                #print(entity_uri)

                if len(same_as_resources) == 0:
                    print("No dbpedia entity for: %s, %s, %s, %s" %
                          (row[0], row[1], row[2], entity_uri))
                else:
                    #Keep only one of the same_as dbpedia resources
                    for r in same_as_resources:
                        entity_uri = r

                    ##The remaining same_as resources are considered later
                    same_as_resources.remove(entity_uri)


            #if int(row[0])<1000: #Jiaoyan starts from table file 1,000
            #if int(row[0])>=1000: #ernesto
            #if int(row[0])>=3: #ernesto
            #if int(row[0])<587 or int(row[0])>=1000:
            #    continue

            if not table_id == row[0]:

                #On a change of table, close and reopen the output files so that intermediate results are flushed to disk
                f_out.close()
                f_out_redirects.close()
                f_out_target.close()

                table_id = row[0]
                print(table_id)

                f_out = open(file_out_gt, "a+")
                f_out_redirects = open(file_out_redirects_gt, "a+")
                f_out_target = open(file_out_gt_target, "a+")

            col_id = row[2]  #Reversed with respect to the input order (see comment above)
            row_id = row[1]

            csv_file_name = table_id + ".csv"

            try:
                #Try to open with pandas; if it fails, discard the file
                pd.read_csv(join(folder_tables, csv_file_name),
                            sep=',',
                            quotechar='"',
                            escapechar="\\")
            except Exception:
                panda_errors += 1
                continue

            entities = set()

            ##Keep an index (cache) to avoid unnecessary queries
            if entity_uri in dict_entities:
                entities.update(dict_entities[entity_uri])
            else:
                #entities=set()
                new_entities = set()

                ##Consider redirects:
                entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                entities.update(same_as_resources)  #in case there was more than one

                #second hop: redirects of the redirects
                for e in entities:
                    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))

                entities.add(entity_uri)
                entities.update(new_entities)

                dict_entities[entity_uri] = set()
                dict_entities[entity_uri].update(entities)

            #Output
            #table id, column id, row id and DBPedia entity
            #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
            line_str = '\"%s\",\"%s\",\"%s\"' % (table_id, col_id, row_id)
            f_out_target.write(line_str + '\n')

            #f_out.write('\"%s\",\"%s\",\"%s\",\"%s\"\n' % (table_id, column_id, row_id, entity_uri))

            f_out.write(line_str + ',\"%s\"\n' % entity_uri)

            #for ent in entities:
            #    line_str += ',\"'+ ent + '\"'
            line_str += ',\"' + " ".join(entities) + '\"'

            f_out_redirects.write(line_str + '\n')

            ##Stop after max_rows rows
            if n_rows > max_rows:  #e.g. 200000
                break
            n_rows += 1

    print("Panda errors: %d" % panda_errors)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
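The dict_entities cache above ensures redirects are queried at most once per URI. The same pattern in isolation, as a minimal sketch (dbpedia_ep is assumed to be a DBpediaEndpoint instance, as in the excerpt):

redirect_cache = dict()

def redirect_group(dbpedia_ep, uri):
    # Return the URI together with its known redirects, querying only on a cache miss.
    if uri not in redirect_cache:
        group = {uri}
        group.update(dbpedia_ep.getWikiPageRedirect(uri))
        group.update(dbpedia_ep.getWikiPageRedirectFrom(uri))
        redirect_cache[uri] = group
    return redirect_cache[uri]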
Example #8
from ontology.onto_access import DBpediaOntology
from kg.endpoints import DBpediaEndpoint
from kg.EB_classes import unpickle, get_last  # get_last assumed to live alongside unpickle
import pandas as pd

# unpickle
# load = unpickle('training_vectors/final_original_training_vectors')  # when training data from the task is available for evaluation
load = unpickle('training_vectors/final_original_training_vectors_minus_tests')  # own test data created by splitting the training set
df_positive = pd.DataFrame(load)
df_positive['polarity'] = "1"
'''create more positive samples
by retrieving a different but similar entity via the SPARQL endpoint
'''

onto_access = DBpediaOntology()
onto_access.loadOntology(True)
ep = DBpediaEndpoint()  # getEntitiesForType


def get_alt_entities(entity_types):
    lis = []
    for ls in entity_types:
        enty = get_last(ls)  # only use the finest type
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100)  # slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
        except Exception:
            pass
    return lis
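A hypothetical call, assuming each inner list is ordered from coarsest to finest type (the type URI is illustrative):

alts = get_alt_entities([['http://dbpedia.org/ontology/Agent',
                          'http://dbpedia.org/ontology/MusicalArtist']])
print(alts)  # e.g. one list of up to 10 entities per input type list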
Example #9
    def __init__(self):

        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()
''' author: Eleanor Bill @eljne '''
''' create vectors for additional training data - positive samples - CONTINUED '''
''' takes about four to five hours with 10 samples per question - 180,000 '''

from kg.EB_classes import pickl, unpickle, nouns_list, noun_phrases_list
import pandas as pd
from kg.endpoints import DBpediaEndpoint

pos = unpickle('training_vectors/11_train_new_positive_samples')
new_positive_samples = pd.DataFrame(pos)
print('unpickled')
ep = DBpediaEndpoint()
# print(new_positive_samples.head)


def get_nouns(entity):
    labels = ep.getEnglishLabelsForEntity(entity)
    nouns = nouns_list(labels)
    print('.')
    return nouns


def get_nps(entity):
    labels = ep.getEnglishLabelsForEntity(entity)
    nps = noun_phrases_list(labels)
    print('..')
    return nps
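These helpers could then be applied over the unpickled DataFrame, for example (the column name 'entity' is an assumption, not taken from the original code):

# new_positive_samples['nouns'] = new_positive_samples['entity'].apply(get_nouns)
# new_positive_samples['noun_phrases'] = new_positive_samples['entity'].apply(get_nps)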


def apply_endpoint_alt(entity):  # find types
    types = ep.getTypesForEntity(entity)  # limited to 5 types
    return types
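A minimal illustrative call (the URI is a placeholder):

print(apply_endpoint_alt('http://dbpedia.org/resource/Ana_Popović'))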