Example #1
import os

_SINGLETON_WE_GENERATOR = None  # module-level cache for the singleton generator


def get_word_embeddings_feature_generator(model_location=None,
                                          additive=None,
                                          multiplicative=None):
    """
    :returns: nalaf.features.embeddings.WordEmbeddingsFeatureGenerator
    """
    global _SINGLETON_WE_GENERATOR

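    # Build the generator only once; later calls return the cached instance
    # (and therefore ignore any newly passed arguments).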
    if _SINGLETON_WE_GENERATOR is None:
        additive = 0 if additive is None else additive
        multiplicative = 1 if multiplicative is None else multiplicative

        import tarfile

        import pkg_resources
        import requests
        from nalaf.features.embeddings import WordEmbeddingsFeatureGenerator
        from nalaf import print_verbose, print_warning

        if model_location is None:
            # D=100, no discretization, epoch=1, window=10
            last_model = "word_embeddings_2016-03-28"
            we_model = pkg_resources.resource_filename(
                'nala.data', os.path.join(last_model, 'word_embeddings.model'))
            if not os.path.exists(we_model):
                print_warning(
                    'Downloading Word Embeddings Model (this may take a long time). Expected path: '
                    + we_model)
                # TODO requests does not support FTP; ideally fetch from: ftp://rostlab.org/jmcejuela/...last_model...
                tar = '{}.tar.gz'.format(last_model)
                model_url = '{}/{}'.format('https://rostlab.org/~cejuela', tar)
                we_model_tar_gz = pkg_resources.resource_filename(
                    'nala.data', tar)

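                # Stream the tarball to disk in chunks, printing a dot per chunk as progress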
                response = requests.get(url=model_url, stream=True)
                with open(we_model_tar_gz, 'wb') as file:
                    for chunk in response.iter_content(8048):
                        if chunk:
                            print('.', end="", flush=True)
                            file.write(chunk)
                    print()
                # Unpack the model
                print_verbose('Extracting')

                with tarfile.open(we_model_tar_gz) as tar_file:
                    tar_file.extractall(
                        path=pkg_resources.resource_filename('nala.data', ''))
            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                we_model, additive, multiplicative)
        else:
            _SINGLETON_WE_GENERATOR = WordEmbeddingsFeatureGenerator(
                model_location, additive, multiplicative)

    return _SINGLETON_WE_GENERATOR
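
A minimal usage sketch (not part of the original example): assuming a nalaf Dataset built elsewhere and the usual nalaf feature-generator interface, where generators expose a generate(dataset) method, the singleton can be obtained once and applied to a dataset. Note that the additive/multiplicative arguments only take effect on the very first call.

# Hypothetical usage sketch; `dataset` is assumed to be a nalaf Dataset built elsewhere,
# and generate(dataset) is assumed to be the usual nalaf feature-generator entry point.
we_generator = get_word_embeddings_feature_generator(additive=2, multiplicative=3)
we_generator.generate(dataset)  # adds word-embedding features to the tokens of `dataset`
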
Example #2
    def __read_annjson(self, reader, filename, dataset):
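        """
        Parse one .ann.json file and attach its entities and relations to the
        matching document in `dataset`. Returns the document id.
        """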
        try:
            doc_id = os.path.basename(filename).replace('.ann.json', '').replace('.json', '')
            if not self.whole_basename_as_docid and '-' in doc_id:
                doc_id = doc_id.split('-')[-1]

            ann_json = json.load(reader)

            try:
                document = dataset.documents[doc_id]
            except KeyError:
                print_warning("The annjson with docid={} was not in the whole plain dataset.".format(doc_id))
                return doc_id

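            # Delete the document if its annotation is not marked complete
            # (and we are not reading predictions and incomplete docs should be dropped).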
            if not (ann_json['anncomplete'] or self.is_predicted) and self.delete_incomplete_docs:
                del dataset.documents[doc_id]

            else:

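                # Attach each entity annotation (optionally filtered by class id) to its part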
                for e in ann_json['entities']:

                    if self.read_only_class_id is None or e['classId'] in self.read_only_class_id:

                        part = document.parts[e['part']]

                        try:
                            normalizations = {key: obj['source']['id'] for key, obj in e['normalizations'].items()}
                        except KeyError as err:
                            print_warning("The normalization is badly formatted: (docid={}) {}".format(doc_id, str(e['normalizations'])))
                            normalizations = None

                        entity = Entity(
                            e['classId'],
                            e['offsets'][0]['start'],
                            e['offsets'][0]['text'],
                            e['confidence']['prob'],
                            norms=normalizations)

                        if self.is_predicted:
                            part.predicted_annotations.append(entity)
                        else:
                            part.annotations.append(entity)

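                # Optionally read binary relations between entity pairs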
                if self.read_relations:
                    for relation in ann_json['relations']:
                        # Note: no distinction with predicted_relations yet

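                        # Each relation endpoint is encoded as "partId|start,end"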
                        part = document.parts[relation['entities'][0].split('|')[0]]

                        e1_start = int(relation['entities'][0].split('|')[1].split(',')[0])
                        e2_start = int(relation['entities'][1].split('|')[1].split(',')[0])

                        rel_id = relation['classId']

                        e1 = part.get_entity(e1_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)
                        e2 = part.get_entity(e2_start, use_pred=False, raise_exception_on_incosistencies=self.raise_exception_on_incosistencies)

                        if (not self.raise_exception_on_incosistencies and (e1 is None or e2 is None)):
                            continue

                        rel = Relation(rel_id, e1, e2)

                        part.relations.append(rel)

                # delete parts that are not annotatable
                annotatable_parts = set(ann_json['annotatable']['parts'])
                part_ids_to_del = []
                for part_id, part in document.parts.items():
                    if part_id not in annotatable_parts:
                        part_ids_to_del.append(part_id)
                for part_id in part_ids_to_del:
                    del document.parts[part_id]

            return doc_id

        except Exception:
            if self.raise_exception_on_incosistencies:
                raise
            # otherwise skip this annotation file silently
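
For reference (not part of the original example), the shape of the ann.json payload this method expects can be read off the keys it accesses; the sketch below uses made-up identifiers, offsets and texts. In nala/nalaf this private method is normally driven through the reader's public annotate(dataset) entry point rather than called directly.

# Made-up illustration of the ann.json structure consumed by __read_annjson;
# every id, offset and text below is invented for the sketch.
ann_json_example = {
    "anncomplete": True,
    "annotatable": {"parts": ["s1p1"]},  # parts not listed here are deleted
    "entities": [
        {"classId": "e_1", "part": "s1p1",
         "offsets": [{"start": 10, "text": "BRCA1"}],
         "confidence": {"prob": 1.0},
         "normalizations": {"n_1": {"source": {"id": "P38398"}}}},
        {"classId": "e_1", "part": "s1p1",
         "offsets": [{"start": 30, "text": "breast cancer"}],
         "confidence": {"prob": 1.0},
         "normalizations": {}},
    ],
    "relations": [
        # each endpoint is "partId|start,end" of an entity defined above
        {"classId": "r_1", "entities": ["s1p1|10,15", "s1p1|30,43"]},
    ],
}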