def _extract_from_master_node(node, relationships_set):
    """Build a ``DiscogsMasterEntity`` from a ``<master>`` XML node.

    Genres and styles are merged into a single token set. Master-to-artist
    links are accumulated as ``(master_id, artist_id)`` pairs into
    *relationships_set* as a side effect.

    :param node: XML element for one Discogs master release
    :param relationships_set: set collecting master-artist id pairs
    :return: the populated entity
    """
    entity = DiscogsMasterEntity()
    entity.catalog_id = node.attrib['id']
    genres = set()
    for child in node:
        if child.tag == 'main_release':
            entity.main_release_id = child.text
        elif child.tag == 'genres':
            for genre in child:
                genres.update(text_utils.tokenize(genre.text))
        elif child.tag == 'styles':
            for style in child:
                genres.update(text_utils.tokenize(style.text))
        elif child.tag == 'title':
            entity.name = child.text
            entity.name_tokens = ' '.join(text_utils.tokenize(child.text))
        elif child.tag == 'data_quality':
            entity.data_quality = child.text.lower()
        elif child.tag == 'year':
            try:
                entity.born = date(year=int(child.text), month=1, day=1)
                entity.born_precision = 9
            # TypeError covers an empty <year/> tag, where `child.text`
            # is None and `int(None)` raises TypeError, not ValueError
            except (TypeError, ValueError):
                LOGGER.debug(
                    'Master with id %s has an invalid year: %s',
                    entity.catalog_id,
                    child.text,
                )
        elif child.tag == 'artists':
            for artist in child:
                relationships_set.add(
                    (entity.catalog_id, artist.find('id').text))
    entity.genres = ' '.join(genres)
    return entity
 def _denormalize_name_variation_entities(self,
                                          main_entity: DiscogsArtistEntity,
                                          name_variation_nodes):
     entity_class = type(main_entity)
     for node in name_variation_nodes:
         name_variation = node.text
         if not name_variation:
             LOGGER.debug(
                 'Artist %s: skipping empty <name> tag in <namevariations>',
                 main_entity.catalog_id,
             )
             continue
         variation_entity = entity_class()
         variation_entity.catalog_id = main_entity.catalog_id
         variation_entity.name = name_variation
         name_tokens = text_utils.tokenize(name_variation)
         if name_tokens:
             variation_entity.name_tokens = ' '.join(name_tokens)
         variation_entity.real_name = main_entity.real_name
         variation_entity.data_quality = main_entity.data_quality
         self.total_entities += 1
         if 'Musician' in entity_class.__name__:
             self.musicians += 1
         else:
             self.bands += 1
         yield variation_entity
Beispiel #3
0
    def _alias_entities(entity: BaseEntity, aliases_class, aliases: list):
        """Yield one alias entity per label in *aliases*, copying dates and
        places from the main *entity*.

        :param entity: the main entity the aliases refer to
        :param aliases_class: the entity class to instantiate for each alias
        :param aliases: the alias labels
        """
        # NOTE: the original annotation was the literal `[]`, which evaluates
        # a fresh empty list as the annotation object; `list` is the intent.
        for alias_label in aliases:
            alias_entity = aliases_class()
            # Copy the biographical fields shared with the main entity
            alias_entity.catalog_id = entity.catalog_id
            alias_entity.born = entity.born
            alias_entity.born_precision = entity.born_precision
            alias_entity.died = entity.died
            alias_entity.died_precision = entity.died_precision
            alias_entity.birth_place = entity.birth_place
            alias_entity.death_place = entity.death_place

            # The alias only differs by its name
            alias_entity.name = alias_label
            name_tokens = text_utils.tokenize(alias_label)
            if name_tokens:
                alias_entity.name_tokens = ' '.join(name_tokens)
            yield alias_entity
 def _populate_nlp_entity(self, entity_array, infos: dict, entity_class):
     """Append a textual (NLP) entity built from the artist profile to
     *entity_array*, bumping the relevant counters; do nothing when the
     profile is missing or empty."""
     profile = infos.get('profile')
     if not profile:
         LOGGER.debug('Artist %s has an empty <profile/> tag',
                      infos['identifier'])
         return
     nlp_entity = entity_class()
     nlp_entity.catalog_id = infos['identifier']
     nlp_entity.description = profile
     tokens = text_utils.tokenize(profile)
     if tokens:
         nlp_entity.description_tokens = ' '.join(tokens)
     entity_array.append(nlp_entity)
     self.total_entities += 1
     if 'Musician' in entity_class.__name__:
         self.musician_nlp += 1
     else:
         self.band_nlp += 1
Beispiel #5
0
 def _fill_entity(self, entity, info, areas):
     """Populate *entity* from the *info* record: name and tokens, birth and
     death dates with their precision, and places looked up in *areas*."""
     entity.catalog_id = info['gid']
     label = info['label']
     entity.name = label
     entity.tokens = " ".join(text_utils.tokenize(label))
     born = self._get_date_and_precision(
         info['b_year'], info['b_month'], info['b_day'])
     died = self._get_date_and_precision(
         info['d_year'], info['d_month'], info['d_day'])
     entity.born = born[0]
     entity.born_precision = born[1]
     entity.died = died[0]
     entity.died_precision = died[1]
     # KeyError may come from either the info record or the areas lookup;
     # in both cases the place is simply unknown
     try:
         entity.birth_place = areas[info['b_place']]
     except KeyError:
         entity.birth_place = None
     try:
         entity.death_place = areas[info['d_place']]
     except KeyError:
         entity.death_place = None
Beispiel #6
0
def similar_name_match(source, target, tokenize) -> dict:
    """Given a ``{person_name: identifier}`` dictionary, a ``BaseEntity``
    and a tokenization function, match similar names and return a dataset
    ``{source_id: [target_ids]}``.

    This strategy only applies to people names.
    """
    matches = defaultdict(list)

    for label, qid in source.items():
        if not label:
            continue

        tokenized = tokenize(label)
        # NOTICE: token sets of size 1 are always excluded
        if len(tokenized) <= 1:
            continue

        # Look for token sets equal to or bigger than ours
        for res in data_gathering.tokens_fulltext_search(
                target, True, tokenized):
            matches[qid].append(res.catalog_id)
        # Look for token sets contained in our set of tokens
        for res in data_gathering.tokens_fulltext_search(
                target, False, tokenized):
            res_tokenized = text_utils.tokenize(res.tokens)
            if len(res_tokenized) > 1 and res_tokenized.issubset(tokenized):
                matches[qid].append(res.catalog_id)

        # Deduplicate the hits; drop the key if nothing matched, since
        # `matches[qid]` above has created a (possibly empty) entry
        if matches[qid]:
            matches[qid] = list(set(matches[qid]))
        else:
            del matches[qid]

    return matches
 def _fill_entity(entity: DiscogsArtistEntity, infos):
     """Populate the base, real name, and data quality fields of *entity*
     from the *infos* dictionary, logging empty optional tags."""
     identifier = infos['identifier']
     # Base fields
     entity.catalog_id = identifier
     entity.name = infos['name']
     tokens = text_utils.tokenize(infos['name'])
     if tokens:
         entity.name_tokens = ' '.join(tokens)
     # Real name
     if infos['realname']:
         entity.real_name = infos['realname']
     else:
         LOGGER.debug('Artist %s has an empty <realname/> tag',
                      identifier)
     # Data quality
     if infos['data_quality']:
         entity.data_quality = infos['data_quality']
     else:
         LOGGER.debug(
             'Artist %s has an empty <data_quality/> tag',
             identifier,
         )
Beispiel #8
0
    def _fill_entity(self, entity, info, areas):
        """Populate *entity* from the *info* record: name and tokens, birth
        and death dates, and — for artist/band entities only — birth and
        death places resolved through the *areas* mapping."""
        entity.catalog_id = info['gid']
        entity.name = info['label']
        name_tokens = text_utils.tokenize(info['label'])
        if name_tokens:
            entity.name_tokens = ' '.join(name_tokens)

        # Birth and death dates: a missing info key means no date at all
        for prefix, date_attr, precision_attr in (
                ('b', 'born', 'born_precision'),
                ('d', 'died', 'died_precision'),
        ):
            try:
                result = self._get_date_and_precision(
                    info[prefix + '_year'],
                    info[prefix + '_month'],
                    info[prefix + '_day'])
                value, precision = result[0], result[1]
            except KeyError:
                value, precision = None, None
            setattr(entity, date_attr, value)
            setattr(entity, precision_attr, precision)

        # Only people-like entities carry places; a KeyError from either
        # the info record or the areas lookup means the place is unknown
        if isinstance(entity,
                      (MusicbrainzArtistEntity, MusicbrainzBandEntity)):
            for info_key, place_attr in (('b_place', 'birth_place'),
                                         ('d_place', 'death_place')):
                try:
                    place = areas[info[info_key]]
                except KeyError:
                    place = None
                setattr(entity, place_attr, place)
Beispiel #9
0
    def extract_and_populate(self, dump_file_paths: List[str],
                             resolve: bool) -> None:
        """
        Extracts the data in the dumps (person and movie) and processes them.
        It then proceeds to add the appropriate data to the database.

        See
        :ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy
        definition of the entities we use to save IMDB data.

        :param dump_file_paths: the absolute paths of the already downloaded
        dump files.
        :param resolve: not used in this importer; presumably kept for
        interface compatibility with the base importer — TODO confirm.
        """

        # the order of these files is specified in `self.get_dump_download_urls`
        person_file_path = dump_file_paths[0]
        movies_file_path = dump_file_paths[1]

        LOGGER.debug('Path to movie info dump: %s', movies_file_path)
        LOGGER.debug('Path to person info dump: %s', person_file_path)

        start = datetime.datetime.now()

        tables = [
            imdb_entity.ImdbActorEntity,
            imdb_entity.ImdbDirectorEntity,
            imdb_entity.ImdbMovieEntity,
            imdb_entity.ImdbMusicianEntity,
            imdb_entity.ImdbProducerEntity,
            imdb_entity.ImdbWriterEntity,
            imdb_entity.ImdbMoviePersonRelationship,
        ]

        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)

        # Start from a clean slate on every import run
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )

        LOGGER.info('Starting import of movies ...')

        # Here we open the movie dump file, and add everything to the DB
        for movie_info, entity_array in self._loop_through_entities(
                movies_file_path):

            # create the movie SQLAlchemy entity and populate it
            movie_entity = imdb_entity.ImdbMovieEntity()
            movie_entity.catalog_id = movie_info.get('tconst')
            movie_entity.title_type = movie_info.get('titleType')
            if movie_info.get('primaryTitle') is not None:
                movie_entity.name = movie_info.get('primaryTitle')
                movie_entity.name_tokens = ' '.join(
                    text_utils.tokenize(movie_info.get('primaryTitle')))
            # the comparison already yields a bool; no ternary needed
            movie_entity.is_adult = movie_info.get('isAdult') == '1'
            # TypeError: missing year (None); ValueError: non-numeric year
            # string (e.g. a placeholder for "unknown")
            try:
                movie_entity.born = datetime.date(year=int(
                    movie_info.get('startYear')),
                                                  month=1,
                                                  day=1)
                movie_entity.born_precision = 9
            except (KeyError, TypeError, ValueError):
                LOGGER.debug('No start year value for %s', movie_entity)
            try:
                movie_entity.died = datetime.date(year=int(
                    movie_info.get('endYear')),
                                                  month=1,
                                                  day=1)
                movie_entity.died_precision = 9
            except (KeyError, TypeError, ValueError):
                LOGGER.debug('No end year value for %s', movie_entity)
            movie_entity.runtime_minutes = movie_info.get('runtimeMinutes')

            if movie_info.get('genres'):  # if movie has a genre specified
                movie_entity.genres = ' '.join(
                    text_utils.tokenize(movie_info.get('genres')))

            # Creates entity for alias
            alias = movie_info.get('originalTitle')
            if alias is not None and movie_entity.name != alias:
                alias_entity = copy.deepcopy(movie_entity)
                alias_entity.name = alias
                alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias))
                entity_array.append(alias_entity)

            entity_array.append(movie_entity)

            self.n_movies += 1

        # mark end for movie import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Movie import completed in %s. '
            'Total movies imported: %d',
            end - start,
            self.n_movies,
        )

        LOGGER.info('Starting import of people ...')

        # reset timer for persons import
        start = datetime.datetime.now()

        for person_info, entity_array in self._loop_through_entities(
                person_file_path):

            # IMDb saves the list of professions as a comma separated
            # string
            professions = person_info.get('primaryProfession')

            # if person has no professions then ignore it
            if not professions:
                LOGGER.debug('Person %s has no professions',
                             person_info.get('nconst'))
                continue

            professions = professions.split(',')

            # each person can be added to multiple tables in the DB,
            # each table stands for one of the main professions
            types_of_entities = []

            if 'actor' in professions or 'actress' in professions:
                self.n_actors += 1
                types_of_entities.append(imdb_entity.ImdbActorEntity())

            if 'director' in professions:
                self.n_directors += 1
                types_of_entities.append(imdb_entity.ImdbDirectorEntity())

            if 'producer' in professions:
                self.n_producers += 1
                types_of_entities.append(imdb_entity.ImdbProducerEntity())

            if any(prof in [
                    'sound_department',
                    'composer',
                    'music_department',
                    'soundtrack',
            ] for prof in professions):
                self.n_musicians += 1
                types_of_entities.append(imdb_entity.ImdbMusicianEntity())

            if 'writer' in professions:
                self.n_writers += 1
                types_of_entities.append(imdb_entity.ImdbWriterEntity())

            # if the only profession a person has is `miscellaneous` then we
            # add it to all tables
            if professions == ['miscellaneous']:
                self.n_misc += 1
                types_of_entities = [
                    imdb_entity.ImdbActorEntity(),
                    imdb_entity.ImdbDirectorEntity(),
                    imdb_entity.ImdbMusicianEntity(),
                    imdb_entity.ImdbProducerEntity(),
                    imdb_entity.ImdbWriterEntity(),
                ]

            # add person to every matching table
            for etype in types_of_entities:
                self._populate_person(etype, person_info, entity_array)

            # if person is known for any movies then add these to the
            # database as well
            if person_info.get('knownForTitles'):
                self.n_person_movie_links += 1
                self._populate_person_movie_relations(person_info,
                                                      entity_array)

            self.n_persons += 1

        # mark the end time for the person import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Person import completed in %s. '
            'Total people imported: %d - '
            'Actors: %d - Directors: %d - Musicians: %d - '
            'Producers: %d - Writers: %d - Misc: %d',
            end - start,
            self.n_persons,
            self.n_actors,
            self.n_directors,
            self.n_musicians,
            self.n_producers,
            self.n_writers,
            self.n_misc,
        )
Beispiel #10
0
    def _populate_person(
        self,
        person_entity: imdb_entity.ImdbPersonEntity,
        person_info: Dict,
        entity_array: object,
    ) -> None:
        """
        Given an instance of
        :ref:`soweego.importer.models.imdb_entity.ImdbPersonEntity`
        this function populates its attributes according to
        the provided `person_info` dictionary. It then adds
        said instance to the SQLAlchemy session.

        :param person_entity: the entity which we want to populate
        :param person_info: the data we want to populate the
        entity with
        :param entity_array: an external array to which we'll add the
        entity once it is populated.
        """

        person_entity.catalog_id = person_info.get('nconst')
        person_entity.name = person_info.get('primaryName')
        person_entity.name_tokens = ' '.join(
            text_utils.tokenize(person_entity.name))

        # `primaryProfession` is a comma separated string of professions;
        # fetch it once and guard against it being missing (None), which
        # previously crashed the gender check below with a TypeError
        professions = person_info.get('primaryProfession')

        # If either `actor` or `actress` is in the professions
        # then we can distinguish the gender
        if professions and any(prof in professions
                               for prof in ['actor', 'actress']):
            person_entity.gender = ('male' if 'actor' in professions
                                    else 'female')

        # IMDb only provides us with the birth and death year of
        # a person, so this is the only one we'll take into
        # account. Month and Day are set by default to 1. The
        # base `ImdbPersonEntity` defines a precision of 9 for the
        # birth and death dates, which (according to
        # `vocab.DATE_PRECISION`) means that only the year is correct.
        born_year = person_info.get('birthYear')
        if born_year:
            # datetime.date(year, month, day)
            person_entity.born = datetime.date(int(born_year), 1, 1)

        death_year = person_info.get('deathYear')
        if death_year:
            person_entity.died = datetime.date(int(death_year), 1, 1)

        # The array of primary professions gets translated to a list
        # of the QIDs that represent said professions in Wikidata
        if professions:
            # get QIDs of occupations for person
            translated_occupations = self._translate_professions(
                professions.split(','))

            # only save those occupations which are not the main
            # occupation of the entity type (ie, for ActorEntity
            # don't include 'actor' occupation since it is implicit)
            person_entity.occupations = ' '.join(
                occ for occ in translated_occupations
                if occ != person_entity.table_occupation)

        entity_array.append(person_entity)