Example #1
class GerbilNIFCollection:
    def __init__(self, context: str, mention: str):
        self.collection = NIFCollection()
        self.context = context
        self.mention = mention
        self.phrases = self.collection.add_context(self.context, self.mention)

    @property
    def turtle(self) -> str:
        return self.collection.dumps(format='turtle')
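
A minimal usage sketch for the wrapper above (pynif's NIFCollection must be importable; the document URI, text, and Wikidata ID are only illustrative):

coll = GerbilNIFCollection('http://example.org/doc/1',
                           'Berlin is the capital of Germany.')
# 'phrases' holds the NIFContext returned by add_context, so annotations
# can be attached to it directly.
coll.phrases.add_phrase(beginIndex=0, endIndex=6,
                        taIdentRef='http://www.wikidata.org/entity/Q64')
print(coll.turtle)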
Example #2
def gerbil_handler():
    def extract_string() -> Tuple[str, str]:
        for triple in nif.triples():
            if 'isString' in triple[1]:
                return str(triple[0]), str(triple[2])

    nif = NIFCollection.loads(request.data.decode('utf-8'))
    hid = request.args['handler_id']

    if 'annotator' not in request.args:
        with lck:
            queries[hid]['test']['context'], queries[hid]['test'][
                'query'] = extract_string()

        a = _wait(lambda: queries[hid]['test']['answer'])

        with lck:
            queries[hid]['test']['answer'] = None

        return a

    else:
        with lck:
            an = queries[hid]['experiment']['annotators'][
                request.args['annotator']]
            an['context'], an['query'] = extract_string()

        a = _wait(lambda: an['answer'])

        with lck:
            an['answer'] = None

        return a
Example #3
def main(mode, infile, outfile, format):
    """
    Conversion utility for NIF files.

    This converts the identifiers used to annotate mentions in documents
    across knowledge bases. For instance, the following will convert
    a NIF file with DBpedia identifiers to a NIF file with Wikidata identifiers:

       nifconverter --mode dbr:wd -i dbpedia_nif.ttl -o wikidata_nif.ttl

    """

    converter = registered_converters.get(mode)
    if converter is None:
        raise click.BadParameter('Invalid mode. Supported modes are: ' +
                                 get_allowed_modes())

    translator = NIFTranslator(converter)

    with click.open_file(infile) as f:
        nif = NIFCollection.loads(f.read())

    translator.translate_collection(nif)

    with click.open_file(outfile, 'w') as out:
        out.write(nif.dumps())
Example #4
def train_classifier(collection, bow, pagerank, dataset, output, max_iter):
    """
    Trains a tag classifier on a NIF dataset.
    """
    if output is None:
        output = 'trained_classifier.pkl'
    b = BOWLanguageModel()
    b.load(bow)
    graph = WikidataGraph()
    graph.load_pagerank(pagerank)
    tagger = Tagger(collection, b, graph)
    d = NIFCollection.load(dataset)
    clf = SimpleTagClassifier(tagger)
    max_iter = int(max_iter)

    parameter_grid = []
    for max_distance in [50, 75, 150, 200]:
        for similarity, beta in [('one_step', 0.2), ('one_step', 0.1),
                                 ('one_step', 0.3)]:
            for C in [10.0, 1.0, 0.1]:
                for smoothing in [0.8, 0.6, 0.5, 0.4, 0.3]:
                    parameter_grid.append({
                        'nb_steps': 4,
                        'max_similarity_distance': max_distance,
                        'C': C,
                        'similarity': similarity,
                        'beta': beta,
                        'similarity_smoothing': smoothing,
                    })

    best_params = clf.crossfit_model(d, parameter_grid, max_iter=max_iter)
    print('#########')
    print(best_params)
    clf.save(output)
Example #5
def processQueryNif():
    print("inside")
    content_format = request.headers.get('Content') or 'application/x-turtle'
    nif_body = request.data.decode("utf-8")
    print(nif_body)
    try:
        nif_doc = NIFCollection.loads(nif_body, format='turtle')
        #print(nif_doc)
        for context in nif_doc.contexts:
            vectors = v.vectorise(context.mention)
            entities = p.link(vectors)
            s = set()
            for idx, entityarr in entities.items():
                for ent in entityarr:
                    s.add(ent[0])
            for entity in s:
                context.add_phrase(
                    beginIndex=0,
                    endIndex=1,
                    taIdentRef='http://www.wikidata.org/entity/' + entity)
        resp = Response(nif_doc.dumps())
        print(nif_doc.dumps())
        resp.headers['content-type'] = content_format
        return resp
    except Exception as e:
        print(e)
        return ''
Example #6
def main(converter, target, infile, outfile, format):
    """
    Conversion utility for NIF files.

    This converts the identifiers used to annotate mentions in documents
    across knowledge bases. For instance, the following will convert
    a NIF file with DBpedia identifiers to a NIF file with Wikidata identifiers,
    using the default converter (which uses the DBpedia SameThing service):

       nifconverter -i dbpedia_nif.ttl -o wikidata_nif.ttl

    """

    converter_impl = registered_converters.get(converter)
    if converter_impl is None:
        raise click.BadParameter(
            'Invalid converter "{}". Supported converters are: {}'.format(
                converter, get_available_converters()))

    translator = NIFTranslator(converter_impl(target_prefix=target))

    with click.open_file(infile) as f:
        nif = NIFCollection.loads(f.read())

    translator.translate_collection(nif)

    with click.open_file(outfile, 'w') as out:
        out.write(nif.dumps())
Example #7
def annotation2nif(collection_name, tweet):
    collection = NIFCollection(uri=collection_name)
    context_name = collection_name + str(tweet.idTweet)
    context = collection.add_context(uri=context_name, mention=tweet.text)
    if len(tweet.mentions) > 0:
        for i, mention in enumerate(tweet.mentions):
            if tweet.entities[i] != 'NIL':
                entity = tweet.entities[i].replace(
                    'dbr:', 'http://dbpedia.org/resource/')
            else:
                entity = 'http://optic.ufsc.br/resource/NIL/'
            context.add_phrase(beginIndex=int(mention[2]),
                               endIndex=int(mention[3]),
                               annotator='http://optic.ufsc.br',
                               taIdentRef=entity)
    nif = collection.dumps(format='turtle')
    return nif
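
A hedged sketch of calling annotation2nif: Tweet is not defined in this excerpt, so a hypothetical stand-in is used below, with the attribute names (idTweet, text, mentions, entities) and the begin/end offsets at positions 2 and 3 of each mention tuple taken from the function body above.

class FakeTweet:
    # Hypothetical stand-in for the Tweet object expected by annotation2nif.
    idTweet = '42'
    text = 'Obama visited Paris'
    mentions = [('Obama', None, 0, 5), ('Paris', None, 14, 19)]
    entities = ['dbr:Barack_Obama', 'NIL']

print(annotation2nif('http://optic.ufsc.br/', FakeTweet()))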
Example #8
def nif2json(lang="en"):
    paths = ["./VoxEL/rVoxEL-{}.ttl", "./VoxEL/sVoxEL-{}.ttl"]
    prefix = ["r", "s"]

    for path, p in zip(paths, prefix):
        with open(path.format(lang)) as f:
            data = NIFCollection.loads(f.read(), format='turtle')
        out = nif2dict(data)
        with open("./{}_{}.json".format(p, lang), "w") as f:
            json.dump(out, f, indent=4)
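
nif2dict is not shown in this excerpt; the sketch below is one plausible shape for it, built only from pynif attributes that appear in the other examples here (the exact output layout is an assumption, not the original helper).

def nif2dict(collection):
    # Hypothetical helper: flatten a NIFCollection into plain dictionaries
    # so the result can be serialised with json.dump.
    docs = []
    for context in collection.contexts:
        docs.append({
            'uri': context.uri,
            'text': context.mention,
            'annotations': [{
                'begin': phrase.beginIndex,
                'end': phrase.endIndex,
                'entity': phrase.taIdentRef,
            } for phrase in context.phrases],
        })
    return docs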
Example #9
def nif_2_annotations(nif_collection):
    annotations = defaultdict(list)
    temp_annotations = defaultdict(list)
    keys = []

    parsed_collection = NIFCollection.loads(nif_collection, format='turtle')
    for context in parsed_collection.contexts:
        for phrase in context.phrases:
            id_annotation = phrase.context.rsplit('/', 1)[-1]
            entity = phrase.taIdentRef
            keys.append(int(id_annotation))
            temp_annotations[int(id_annotation)].append(entity)
    keys.sort()
    for key in keys:
        annotations[key] = temp_annotations[key]
    return annotations
Example #10
def d2kb():
    data = request.data.strip()
    nif_post = NIFCollection.loads(data.decode('utf-8'), format='turtle')
    mentions = []
    for context in nif_post.contexts:
        tweet = Tweet()
        tweet.mentions = []
        tweet.idTweet = context.uri
        tweet.text = context.mention
        try:
            for phrase in context.phrases:
                single_mention = (phrase.mention, phrase.beginIndex,
                                  phrase.endIndex)
                mentions.append(single_mention)
        except Exception:
            print('no mentions')
        if len(mentions) > 0:
            if VERBOSE == 'yes':
                print('\n\n:::: PREPROCESSING ::::\n\n')
            start = time.time()
            tweet = preprocessing_d2kb(tweet, mentions, VERBOSE)
            end = time.time()
            if VERBOSE == 'yes':
                print('Running time: {}'.format(end - start))
            if VERBOSE == 'yes':
                print('\n\n:::: ENTITY SELECTION ::::\n\n')
            start = time.time()
            tweet.candidates = select_candidates(tweet, vocab2idx, TYPE, MAX,
                                                 BOOST, VERBOSE)
            end = time.time()
            if VERBOSE == 'yes':
                print('Running time: {}'.format(end - start))
            if VERBOSE == 'yes':
                print('\n\n:::: DISAMBIGUATION ::::\n\n')
            start = time.time()
            tweet.entities = disambiguate_mentions(tweet, THRESHOLD, model,
                                                   device, vocab2idx, WS,
                                                   EXTRA, VERBOSE)
            end = time.time()
            if VERBOSE == 'yes':
                print('Running time: {}'.format(end - start))
        collection_name = "http://optic.ufsc.br/"
        nif = annotation2nif(collection_name, tweet)
    return nif
Example #11
def nif_api(*args, **kwargs):
    content_format = request.headers.get('Content') or 'application/x-turtle'
    content_type_to_format = {
        'application/x-turtle': 'turtle',
        'text/turtle': 'turtle',
    }
    nif_body = request.body.read()
    nif_doc = NIFCollection.loads(
        nif_body, format=content_type_to_format.get(content_format, 'turtle'))
    for context in nif_doc.contexts:
        logger.debug(context.mention)
        mentions = classifier.create_mentions(context.mention)
        classifier.classify_mentions(mentions)
        for mention in mentions:
            mention.add_phrase_to_nif_context(context)

    response.set_header('content-type', content_format)
    return nif_doc.dumps()
Example #12
    def setUpClass(cls):
        cls.testdir = os.path.dirname(os.path.abspath(__file__))

        # Load dummy bow
        bow_fname = os.path.join(cls.testdir, 'data/sample_bow.pkl')
        cls.bow = BOWLanguageModel()
        cls.bow.load(bow_fname)

        # Load dummy graph
        graph_fname = os.path.join(cls.testdir,
                                   'data/sample_wikidata_items.npz')
        pagerank_fname = os.path.join(cls.testdir,
                                      'data/sample_wikidata_items.pgrank.npy')
        cls.graph = WikidataGraph()
        cls.graph.load_from_matrix(graph_fname)
        cls.graph.load_pagerank(pagerank_fname)

        # Load dummy profile
        cls.profile = IndexingProfile.load(
            os.path.join(cls.testdir, 'data/all_items_profile.json'))

        # Setup solr index (TODO delete this) and tagger
        cls.tf = TaggerFactory()
        cls.collection_name = 'wd_test_collection'
        try:
            cls.tf.create_collection(cls.collection_name)
        except CollectionAlreadyExists:
            pass
        cls.tf.index_stream(
            cls.collection_name,
            WikidataDumpReader(
                os.path.join(cls.testdir,
                             'data/sample_wikidata_items.json.bz2')),
            cls.profile)
        cls.tagger = Tagger(cls.collection_name, cls.bow, cls.graph)

        # Load NIF dataset
        cls.nif = NIFCollection.load(
            os.path.join(cls.testdir, 'data/five-affiliations.ttl'))

        cls.classifier = SimpleTagClassifier(cls.tagger,
                                             max_similarity_distance=10,
                                             similarity_smoothing=2)
Example #13
    # ******************************** #
    # Start OPTIC
    count = 0

    # Read directory with tweets to be annotated
    inputs = set()
    for nif_temp in os.listdir(INPUT_PATH):
        # Initially, we work only with the RDF Turtle standard
        if fnmatch.fnmatch(nif_temp, '*.ttl'):
            inputs.add(nif_temp)

    for nif_input in inputs:
        nif_file = ''
        with open(INPUT_PATH + nif_input, 'r') as f:
            nif_file = f.read()
        nif_post = NIFCollection.loads(nif_file, format='turtle')
        for context in nif_post.contexts:
            tweet = Tweet()
            tweet.idTweet = context.uri
            tweet.text = context.mention
            tweet.mentions = []

            # A2KB Mode
            # TODO
            if MODE == 'a2kb':
                continue

            # D2KB Mode
            else:
                mentions = []
                try:
Example #14
from pynif import NIFCollection
import json


d = json.loads(open('input/webqsp.test.entities.with_classes.json').read())

collection = NIFCollection(uri="http://sda.tech/webquestions")

for item in d:
    if not item['utterance']:
        continue
    uid = item['question_id']
    context = collection.add_context(uri="http://sda.tech/webquestions/%s"%uid,
    mention=item['utterance'])
    beg = 0
    for entity in item['entities']:
        if entity is None:
            continue
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                           beginIndex=beg, endIndex=beg + 1)
        beg += 1

generated_nif = collection.dumps(format='turtle')
f = open('webqsp.test.nif','w')
f.write(generated_nif)
f.close()

Example #15
from pynif import NIFCollection
import json, re

d = json.loads(open('test.json').read())

collection = NIFCollection(uri="http://sda.tech/lcquadv2")

for item in d:
    if not item['question']:
        continue
    uid = item['uid']
    context = collection.add_context(uri="http://sda.tech/lcquadv2/%s" % uid,
                                     mention=item['question'])
    entities = re.findall(r'wd:([Q][0-9]*)', item['sparql_wikidata'])
    beg = 0
    for entity in entities:
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' +
                           entity,
                           beginIndex=beg,
                           endIndex=beg + 1)
        beg += 1

generated_nif = collection.dumps(format='turtle')
f = open('lcquad2.0.test.nif', 'w')
f.write(generated_nif)
f.close()
Example #16
from pynif import NIFCollection



gold = []
f = open('annotated_wd_data_train.txt')
for line in f.readlines():
    line = line.strip()
    s, p, o, q = line.split('\t')
    gold.append((s, q))

collection = NIFCollection(uri="http://sda.tech/simplequestions")

for idx, item in enumerate(gold):
    question = item[1]
    entity = item[0]
    uid = idx
    context = collection.add_context(uri="http://sda.tech/simplequestions/%s" % uid,
                                     mention=question)
    context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' + entity,
                       beginIndex=0, endIndex=1)

generated_nif = collection.dumps(format='turtle')
f = open('simplequestions.train.nif','w')
f.write(generated_nif)
f.close()
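
The script above assumes each line of annotated_wd_data_train.txt holds four tab-separated fields (subject entity, predicate, object entity, question text), matching the split('\t') call; a schematic line with placeholder values would look like:

Q123	P456	Q789	what is an example question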

Example #17
    def setUpClass(cls):
        testdir = os.path.dirname(os.path.abspath(__file__))
        cls.dbpedia_nif = NIFCollection.load(
            os.path.join(testdir, 'data/sample_dbpedia.ttl'))
        cls.wikipedia_nif = NIFCollection.load(
            os.path.join(testdir, 'data/sample_wikipedia.ttl'))
Example #18
    def __init__(self, context: str, mention: str):
        self.collection = NIFCollection()
        self.context = context
        self.mention = mention
        self.phrases = self.collection.add_context(self.context, self.mention)
Example #19
from pynif import NIFCollection
import json

d = json.loads(open('input/webqsp.test.entities.with_classes.json').read())

collection = NIFCollection(uri="http://lc-quad2.sda.tech")

for item in d:
    if not item['utterance']:
        continue
    uid = item['question_id']
    context = collection.add_context(uri="http://webqsp.sda.tech/%s" % uid,
                                     mention=item['utterance'])
    for entity in item['entities']:
        if entity is None:
            continue
        print(entity)
        context.add_phrase(taIdentRef='http://www.wikidata.org/entity/' +
                           entity,
                           beginIndex=0,
                           endIndex=1)

generated_nif = collection.dumps(format='turtle')
f = open('webqsp.test.nif', 'w')
f.write(generated_nif)
f.close()