import logging

import click

from soweego.commons import constants, target_database, utils
from soweego.linker import train

LOGGER = logging.getLogger(__name__)


# Let the user pass extra kwargs to the classifier.
# This is for development purposes only, and is not explicitly documented
@click.command(
    context_settings={'ignore_unknown_options': True, 'allow_extra_args': True}
)
@click.argument('classifier', type=click.Choice(constants.CLASSIFIERS))
@click.argument('catalog', type=click.Choice(target_database.supported_targets()))
@click.argument('entity', type=click.Choice(target_database.supported_entities()))
@click.option('-k', '--k-folds', default=5, help="Number of folds, default: 5.")
@click.option(
    '-s',
    '--single',
    is_flag=True,
    help='Compute a single evaluation over all k folds, instead of k '
    'evaluations.',
)
@click.option(
    '-n',
import logging

import click
import requests
from pandas import read_csv
from sqlalchemy.exc import SQLAlchemyError
from tqdm import tqdm

from soweego.commons import keys, target_database
from soweego.commons.constants import SUPPORTED_ENTITIES
from soweego.commons.db_manager import DBManager
from soweego.importer.models import mix_n_match
from soweego.wikidata.vocabulary import HUMAN_QID

LOGGER = logging.getLogger(__name__)

SUPPORTED_TARGETS = set(target_database.supported_targets()) ^ {keys.TWITTER}
INPUT_CSV_HEADER = (keys.QID, keys.TID, keys.CONFIDENCE)
COMMIT_EVERY = 10_000  # DB entity batch size

MNM_DB = 's51434__mixnmatch_p'
MNM_API_URL = 'https://tools.wmflabs.org/mix-n-match/api.php'
MNM_API_ACTIVATION_PARAMS = {
    'query': 'update_overview',
    'catalog': None,  # To be filled by activate_catalog
}

TIMESTAMP_FORMAT = '%Y%m%d%H%M%S'  # 20190528131053
NOTE_FIELD = 'Uploaded by soweego'
SEARCH_WP_FIELD = 'en'
EXT_DESC_FIELD = 'soweego confidence score: {}'
USER_FIELD = 0  # Stands for 'automatically matched'
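
# Illustrative sketch only, not part of the original module: it shows how the
# MNM_API_ACTIVATION_PARAMS above could be sent to the Mix'n'match API via the
# imported `requests` library. The function name and the use of a GET request
# are assumptions; the actual activation logic is the activate_catalog routine
# referenced in the comment above.
def _activation_request_sketch(catalog_id: int) -> None:
    # Fill the 'catalog' slot left as None in MNM_API_ACTIVATION_PARAMS
    params = dict(MNM_API_ACTIVATION_PARAMS, catalog=catalog_id)
    response = requests.get(MNM_API_URL, params=params)
    # Fail loudly if the API replied with an HTTP error status
    response.raise_for_status()
    LOGGER.info(
        "Asked Mix'n'match to update its overview for catalog %d", catalog_id
    )
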
import logging

import click

from soweego.commons import constants, keys, target_database
from soweego.importer.discogs_dump_extractor import DiscogsDumpExtractor
from soweego.importer.imdb_dump_extractor import IMDbDumpExtractor
from soweego.importer.musicbrainz_dump_extractor import MusicBrainzDumpExtractor

LOGGER = logging.getLogger(__name__)

DUMP_EXTRACTOR = {
    keys.DISCOGS: DiscogsDumpExtractor,
    keys.IMDB: IMDbDumpExtractor,
    keys.MUSICBRAINZ: MusicBrainzDumpExtractor,
}

ROTTEN_URLS_FNAME = '{catalog}_{entity}_rotten_urls.csv'


@click.command()
@click.argument('catalog', type=click.Choice(target_database.supported_targets()))
@click.option(
    '--url-check',
    is_flag=True,
    help=(
        'Check for rotten URLs while importing. Default: no. '
        'WARNING: this will dramatically increase the import time.'
    ),
)
@click.option(
    '--dir-io',
    type=click.Path(file_okay=False),
    default=constants.WORK_DIR,
    help=f'Input/output directory, default: {constants.WORK_DIR}.',
)
def import_cli(catalog: str, url_check: bool, dir_io: str) -> None:
# (retrieved, TIMESTAMP) reference object
TODAY = date.today()
TIMESTAMP = pywikibot.WbTime(
    site=REPO,
    year=TODAY.year,
    month=TODAY.month,
    day=TODAY.day,
    precision='day',
)
RETRIEVED_REFERENCE = pywikibot.Claim(
    REPO, vocabulary.RETRIEVED, is_reference=True
)
RETRIEVED_REFERENCE.setTarget(TIMESTAMP)

# We also support Twitter
SUPPORTED_TARGETS = target_database.supported_targets() ^ {TWITTER}


@click.command()
@click.argument('catalog', type=click.Choice(SUPPORTED_TARGETS))
@click.argument('entity', type=click.Choice(target_database.supported_entities()))
@click.argument('invalid_identifiers', type=click.File())
@click.option(
    '-s',
    '--sandbox',
    is_flag=True,
    help='Perform all edits on the Wikidata sandbox item Q4115189.',
)
def delete_cli(catalog, entity, invalid_identifiers, sandbox):
    """Delete invalid identifiers.