def export_entity(
        concept,
        wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
        appended_words=None,
        facet_dictionary_is_tempfile=False,
        commit=True):
    """Export a thesaurus concept to the entities index and the OCR wordlist.

    The concept is added through ``Entity_Manager`` with its preferred label
    plus all alternate and hidden labels. Every single word of those labels
    (and its upper-case form) that is not yet in ``appended_words`` is
    appended to the wordlist file.

    Args:
        concept: Concept to export; ``facet``, ``id``, ``pk`` and
            ``prefLabel`` are read from it.
        wordlist_configfilename: Path of the wordlist file opened in append
            mode. A falsy value skips the wordlist step entirely.
        appended_words: List of words already written to the wordlist. It is
            extended in place. When omitted, a new empty list is used for
            this call.
        facet_dictionary_is_tempfile: Passed through unchanged to
            ``Entity_Manager.add``.
        commit: If true, request a reload of the entities core afterwards.

    Returns:
        The list of appended words, including the ones added by this call.
    """
    # A list literal as default would be created once and shared (and grown)
    # across all calls that rely on the default.
    if appended_words is None:
        appended_words = []

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = [
        alternate.altLabel
        for alternate in Alternate.objects.filter(concept=concept.id)
    ]
    altLabels.extend(
        hidden.hiddenLabel
        for hidden in Hidden.objects.filter(concept=concept.id))

    entity_manager = Entity_Manager()

    uri = reverse('thesaurus:detail', args=[concept.pk])
    entity_manager.add(
        id=uri,
        types=[facet],
        preferred_label=concept.prefLabel,
        prefLabels=[concept.prefLabel],
        labels=altLabels,
        dictionary=facet,
        facet_dictionary_is_tempfile=facet_dictionary_is_tempfile)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        # The context manager closes the file even if a write fails
        with open(wordlist_configfilename, 'a',
                  encoding="UTF-8") as wordlist_file:
            for label in labels:
                for word in label.split():
                    word = word.strip("(),")
                    if word and word not in appended_words:
                        upper_word = word.upper()
                        appended_words.append(word)
                        appended_words.append(upper_word)
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(upper_word + "\n")

    if commit:
        # reload changed dictionary matcher dictionaries in schema of entities index
        urlopen(
            os.getenv('ONTO_TAGGER_SOLR_ENTITIES_URL',
                      default='http://localhost:8983/solr/') +
            'admin/cores?action=RELOAD&core=' +
            os.getenv('ONTO_TAGGER_SOLR_CORE_ENTITIES',
                      default='opensemanticsearch-entities'))

    return appended_words
Exemple #2
0
def export_entity(
        concept,
        wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
        appended_words=None):
    """Export a thesaurus concept to the entities index and the OCR wordlist.

    The concept is added through ``Entity_Manager`` under its primary key,
    with its preferred label plus all alternate and hidden labels. If
    ``get_taxonomy(concept)`` returns a truthy value, it is stored in the
    ``skos_broader_taxonomy_prefLabel_ss`` field. Every single word of the
    labels (and its upper-case form) that is not yet in ``appended_words``
    is appended to the wordlist file.

    Args:
        concept: Concept to export; ``facet``, ``id``, ``pk`` and
            ``prefLabel`` are read from it.
        wordlist_configfilename: Path of the wordlist file opened in append
            mode. A falsy value skips the wordlist step entirely.
        appended_words: List of words already written to the wordlist. It is
            extended in place. When omitted, a new empty list is used for
            this call.

    Returns:
        The list of appended words, including the ones added by this call.
    """
    # A list literal as default would be created once and shared (and grown)
    # across all calls that rely on the default.
    if appended_words is None:
        appended_words = []

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = [
        alternate.altLabel
        for alternate in Alternate.objects.filter(concept=concept.id)
    ]
    altLabels.extend(
        hidden.hiddenLabel
        for hidden in Hidden.objects.filter(concept=concept.id))

    entity_manager = Entity_Manager()

    fields = {}

    # taxonomy
    taxonomy = get_taxonomy(concept)

    if taxonomy:
        fields['skos_broader_taxonomy_prefLabel_ss'] = taxonomy

    entity_manager.add(id=concept.pk,
                       types=[facet],
                       preferred_label=concept.prefLabel,
                       prefLabels=[concept.prefLabel],
                       labels=altLabels,
                       fields=fields)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        # The context manager closes the file even if a write fails
        with open(wordlist_configfilename, 'a',
                  encoding="UTF-8") as wordlist_file:
            for label in labels:
                for word in label.split():
                    word = word.strip("(),")
                    if word and word not in appended_words:
                        upper_word = word.upper()
                        appended_words.append(word)
                        appended_words.append(upper_word)
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(upper_word + "\n")

    return appended_words
Exemple #3
0
def export_entity(
        concept,
        wordlist_configfilename="/etc/opensemanticsearch/ocr/dictionary.txt",
        appended_words=None):
    """Export a thesaurus concept to the entities index and the OCR wordlist.

    The concept is added through ``Entity_Manager`` under the URL of its
    ``thesaurus:detail`` view, with its preferred label plus all alternate
    and hidden labels. Every single word of those labels (and its upper-case
    form) that is not yet in ``appended_words`` is appended to the wordlist
    file.

    Args:
        concept: Concept to export; ``facet``, ``id``, ``pk`` and
            ``prefLabel`` are read from it.
        wordlist_configfilename: Path of the wordlist file opened in append
            mode. A falsy value skips the wordlist step entirely.
        appended_words: List of words already written to the wordlist. It is
            extended in place. When omitted, a new empty list is used for
            this call.

    Returns:
        The list of appended words, including the ones added by this call.
    """
    # A list literal as default would be created once and shared (and grown)
    # across all calls that rely on the default.
    if appended_words is None:
        appended_words = []

    facet = "tag_ss"
    if concept.facet:
        facet = concept.facet.facet

    altLabels = [
        alternate.altLabel
        for alternate in Alternate.objects.filter(concept=concept.id)
    ]
    altLabels.extend(
        hidden.hiddenLabel
        for hidden in Hidden.objects.filter(concept=concept.id))

    entity_manager = Entity_Manager()

    uri = reverse('thesaurus:detail', args=[concept.pk])
    entity_manager.add(id=uri,
                       types=[facet],
                       preferred_label=concept.prefLabel,
                       prefLabels=[concept.prefLabel],
                       labels=altLabels)

    # Append single words of concept labels to wordlist of OCR word dictionary
    labels = [concept.prefLabel]
    labels.extend(altLabels)

    if wordlist_configfilename:
        # The context manager closes the file even if a write fails
        with open(wordlist_configfilename, 'a',
                  encoding="UTF-8") as wordlist_file:
            for label in labels:
                for word in label.split():
                    word = word.strip("(),")
                    if word and word not in appended_words:
                        upper_word = word.upper()
                        appended_words.append(word)
                        appended_words.append(upper_word)
                        wordlist_file.write(word + "\n")
                        wordlist_file.write(upper_word + "\n")

    return appended_words
Exemple #4
0
    def import_entities(self,
                        filename,
                        types=None,
                        dictionary=None,
                        facet_dictionary_is_tempfile=False,
                        encoding="utf-8"):
        """Import every non-empty line of a plaintext file as an entity.

        Each stripped line is used as the entity id, as its preferred label
        and as its only entry in ``prefLabels``.

        Args:
            filename: Path of the plaintext file, one entity per line.
            types: Types passed to ``Entity_Manager.add`` for every entity.
                Defaults to an empty list.
            dictionary: Dictionary name passed to ``Entity_Manager.add``.
                When falsy, the first entry of ``types`` is used; if that is
                missing or falsy as well, ``filename`` is used.
            facet_dictionary_is_tempfile: Passed through unchanged to
                ``Entity_Manager.add``.
            encoding: Text encoding used to read the file.
        """
        # Avoid a shared mutable default; each call gets its own list
        if types is None:
            types = []

        entity_manager = Entity_Manager()
        entity_manager.verbose = self.verbose

        # Guard the index access: with empty types the filename fallback
        # below must be reached instead of raising IndexError
        if not dictionary and types:
            dictionary = types[0]
        if not dictionary:
            dictionary = filename

        # open and read plaintext file line for line; the context manager
        # closes the file even if adding an entity fails
        with open(filename, encoding=encoding) as entities_file:
            for line in entities_file:
                value = line.strip()
                if not value:
                    continue

                if self.verbose:
                    print("Import entity {}".format(value))

                entity_manager.add(
                    id=value,
                    types=types,
                    preferred_label=value,
                    prefLabels=[value],
                    dictionary=dictionary,
                    facet_dictionary_is_tempfile=facet_dictionary_is_tempfile)
Exemple #5
0
    def import_entities(self, filename, types=None, encoding="utf-8"):
        """Import every non-empty line of a plaintext file as an entity.

        Each stripped line is used as the entity id, as its preferred label
        and as its only entry in ``prefLabels``.

        Args:
            filename: Path of the plaintext file, one entity per line.
            types: Types passed to ``Entity_Manager.add`` for every entity.
                Defaults to an empty list.
            encoding: Text encoding used to read the file.
        """
        # Avoid a shared mutable default; each call gets its own list
        if types is None:
            types = []

        entity_manager = Entity_Manager()
        entity_manager.verbose = self.verbose

        # open and read plaintext file line for line; the context manager
        # closes the file even if adding an entity fails
        with open(filename, encoding=encoding) as entities_file:
            for line in entities_file:
                value = line.strip()
                if not value:
                    continue

                if self.verbose:
                    print("Import entity {}".format(value))

                entity_manager.add(id=value,
                                   types=types,
                                   preferred_label=value,
                                   prefLabels=[value])
    def test(self):
        """Add one test entity and check how it is linked from full text.

        The entity must be found by its preferred label, by its additional
        preferred label and by each of its (alternate) labels, always
        reporting the preferred label as name. It must not be found by only
        a part of a multi-word label.
        """
        entity_id = 'http://entity-unittest.local/entities/1'
        preferred_label = 'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'

        def named_entity_in(resultdata):
            # True if the test entity is in the results under its preferred label
            return is_in_resultdata(resultdata=resultdata,
                                    entity_id=entity_id,
                                    fieldname='name',
                                    value=preferred_label)

        # add test entity to entities index
        manager = Entity_Manager()
        manager.add(
            id="http://entity-unittest.local/entities/1",
            types=['entity-unittest_type_one', 'entity-unittest_type_two'],
            preferred_label="entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two",
            prefLabels=["entity-unittest_preferredLabels"],
            labels=[
                "entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two",
                "entity-unittest_labels_two",
                "entity-unittest_labels_umlaut_äöüß",
            ])

        # extracts and normalizes/links all known entities/names/labels
        linker = Entity_Linker()

        # check if entity is found by preferred label
        results = linker.entities(
            text="I want to extract the id of entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two from a full text.")
        self.assertTrue(named_entity_in(results))

        # check if is_in_resultdata works ok and does not return true even on not existing id
        unknown_found = is_in_resultdata(
            resultdata=results,
            entity_id='http://entity-unittest.local/entities/notexistententityid',
            fieldname='name',
            value='notexistant entity')
        self.assertFalse(unknown_found)

        # check returned types of returned entity id
        types_found = is_in_resultdata(
            resultdata=results,
            entity_id=entity_id,
            fieldname='type',
            value=['entity-unittest_type_one', 'entity-unittest_type_two'])
        self.assertTrue(types_found)

        # check if entity is found by another preferred label, by the
        # (alternate) labels and by an alternate label with special chars
        matching_texts = (
            "I want to extract the id of entity-unittest_preferredLabels from a full text.",
            "I want to extract the id of entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two from a full text.",
            "I want to extract the id of entity-unittest_labels_two from a full text.",
            "I want to extract the id of entity-unittest_labels_umlaut_äöüß from a full text.",
        )
        for matching_text in matching_texts:
            results = linker.entities(text=matching_text)
            self.assertTrue(named_entity_in(results))

        # entity should not be linked by only a part of the label
        results = linker.entities(
            text="I dont want to extract the id of entity-unittest_labels_one_part_one (missing second part of name) from a full text.")
        self.assertFalse(named_entity_in(results))