Ejemplo n.º 1
0
    def testQuerySelectFull(self, mock_method):
        """Test SELECT query with full data.

        With ``full_data=True`` the values of a result row are expected to
        be ``sparql.URI`` / ``sparql.Literal`` objects. Their type and
        ``repr`` (and ``getID()`` for the URI) are checked on the first row.

        :param mock_method: mock whose ``return_value`` is replaced with a
            canned ``Container`` response (presumably injected by a patch
            decorator that is outside this view -- confirm).
        """
        mock_method.return_value = Container(
            SQL_RESPONSE_CONTAINER % ("%s, %s" % (ITEM_Q498787, ITEM_Q677525)))
        q = sparql.SparqlQuery()
        res = q.select('SELECT * WHERE { ?x ?y ?z }', full_data=True)
        self.assertIsInstance(res, list, 'Result is not a list')
        self.assertEqual(len(res), 2)

        # Only the first row is inspected in detail.
        row = res[0]

        self.assertIsInstance(row['cat'], sparql.URI, 'Wrong type for URI')
        self.assertEqual(repr(row['cat']),
                         '<http://www.wikidata.org/entity/Q498787>',
                         'Wrong URI representation')
        self.assertEqual(row['cat'].getID(), 'Q498787', 'Wrong URI ID')

        self.assertIsInstance(row['catLabel'], sparql.Literal,
                              'Wrong type for Literal')
        self.assertEqual(repr(row['catLabel']), 'Muezza@en',
                         'Wrong literal representation')

        self.assertIsInstance(row['d'], sparql.Literal,
                              'Wrong type for Literal')
        # 'd' is a Literal, so a failure here must be reported as a literal
        # problem rather than a URI problem.
        self.assertEqual(
            repr(row['d']),
            '1955-01-01T00:00:00Z^^http://www.w3.org/2001/XMLSchema#dateTime',
            'Wrong literal representation')
Ejemplo n.º 2
0
def query_projects(filter: Optional[str] = None):
    """
    Query for all software projects and return them as a list of simplified dicts.

    The SPARQL text is read from ``Settings.sparql_file``. Each result
    binding is flattened to ``{variable: value}``; bindings whose ``repo``
    does not match ``Settings.repo_regex`` are logged and dropped.

    :param filter: optional substring; when given, only projects whose
        ``projectLabel`` contains it are kept. (The name shadows the builtin
        but is kept because it is part of the public signature.)
    :return: list of the flattened project dicts that passed all filters
    """
    wikdata_sparql = sparql.SparqlQuery()
    # Read inside a context manager so the file handle is closed promptly.
    with open(Settings.sparql_file) as query_file:
        sparql_free_software_items = query_file.read()
    response = wikdata_sparql.query(sparql_free_software_items)
    bindings = response["results"]["bindings"]

    projects = []
    logger.info("{} projects were found by the sparql query".format(
        len(bindings)))
    for binding in bindings:
        # Remove bloating type information: keep only each cell's "value".
        project = {key: cell["value"] for key, cell in binding.items()}

        if filter and filter not in project["projectLabel"]:
            continue

        if not Settings.repo_regex.match(project["repo"]):
            logger.info(" - Removing {}: {} {}".format(project["projectLabel"],
                                                       project["project"],
                                                       project["repo"]))
            continue

        projects.append(project)

    logger.info("{} projects remained after filtering".format(len(projects)))

    return projects
Ejemplo n.º 3
0
def processQuery(
    sparqlQuery,
    itemVar=None,
):
    """
    Return (<item page generator>, <bindings>) for <sparqlQuery>.

    :param sparqlQuery: SPARQL SELECT text passed to ``SparqlQuery.select``
        with ``full_data=True``.
    :param itemVar: name of the query variable holding the item; it is
        passed through ``removeQMark`` before being used as a row key.
        Required despite the ``None`` default.
    :return: tuple of a generator of ``pywikibot.ItemPage`` objects (one per
        collected ID accepted by ``ItemPage.is_valid_id``) and a dict mapping
        each item ID to the list of result rows for that item.
    :raises AssertionError: if ``itemVar`` is empty or missing.
    """
    assert itemVar, 'itemVar is required'

    site = pywikibot.Site()
    repo = site.data_repository()
    query = sparql.SparqlQuery(repo=repo)
    rowKey = removeQMark(itemVar)
    bindings = {}
    for binding in query.select(sparqlQuery, full_data=True):
        # Look up and store under the same key (the item ID). Looking up by
        # the item object while storing by ID never found the previous rows,
        # so only the last row per item survived.
        itemId = binding[rowKey].getID()
        bindings.setdefault(itemId, []).append(binding)

    # Snapshot the keys so later changes to ``bindings`` by the caller do not
    # disturb the lazily evaluated generator.
    itemsPages = (pywikibot.ItemPage(repo, itemId)
                  for itemId in list(bindings)
                  if pywikibot.ItemPage.is_valid_id(itemId))
    for rows in bindings.values():
        # pywikibot.output() writes the text itself; wrapping it in a print
        # statement only printed its return value on an extra line.
        pywikibot.output(rows)
    return (itemsPages, bindings)
Ejemplo n.º 4
0
    def query_to_lookup(query, item_label='item', value_label='value',
                        props=None):
        """
        Fetch sparql result and return it as a lookup table for wikidata id.

        If props are not provided the returned dict simply consists of
        value_label:item_label pairs. If props are provided the returned dict
        becomes value_label:{'wd':item_label, other props}

        :param query: the SPARQL SELECT query to run
        :param item_label: the label of the selected wikidata id
        :param value_label: the label of the selected lookup key
        :param props: dict of other properties to save from the results using
            the format label_in_sparql:key_in_output.
        :raises pywikibot.Error: if two rows produce the same lookup key
        :return: dict
        """
        wdqs = sparql.SparqlQuery()
        result = wdqs.select(query, full_data=True)
        lookup = {}
        for entry in result:
            # The table is keyed by the string form of the value, so the
            # uniqueness check must use that same string. Testing the raw
            # result object against the str keys never detected duplicates.
            key = str(entry[value_label])
            if key in lookup:
                raise pywikibot.Error('Non-unique value in lookup')
            qid = entry[item_label].getID()
            if not props:
                lookup[key] = qid
                continue

            record = {'wd': qid}
            for prop, label in props.items():
                value = entry[prop]
                # Values without a type are stored as their repr();
                # missing values and typed values are stored unchanged.
                if value and not value.type:
                    value = repr(value)
                record[label] = value
            lookup[key] = record
        return lookup
Ejemplo n.º 5
0
    def load_creator_items():
        """Store all nsid people in Wikidata."""
        query = u'''\
# Nationalmuseum import
SELECT ?item ?itemLabel ?nsid
 (group_concat(distinct ?creator_template;separator="|") as ?creator_templates)
 (group_concat(distinct ?commons_cat;separator="|") as ?commons_cats)
 (group_concat(distinct ?death_date;separator="|") as ?death_dates)
WHERE
{
  ?item wdt:P2538 ?nsid .
  OPTIONAL {
    ?item wdt:P1472 ?creator_template .
  }
  OPTIONAL {
    ?item wdt:P373 ?commons_cat .
  }
  OPTIONAL {
    ?item wdt:P570 ?death_date .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv" }
}
group by ?item ?itemLabel ?nsid
'''
        s = sparql.SparqlQuery()
        data = s.select(query)
        pywikibot.output("Loaded %d artists from wikidata" % len(data))
        return NatmusInfo.clean_sparql_output(data, 'nsid')
Ejemplo n.º 6
0
def get_wd_items_using_prop(prop):
    """
    Collect the WD items that already carry a value for a unique-ID property.

    This is useful even when no such items exist before a run starts:
    an interrupted upload can be resumed, and items can be enhanced later.
    When matching, these results should take precedence over any
    hardcoded matching files.

    :param prop: property ID string that is spliced into the SPARQL query
    :return: dictionary mapping each ID value to its item, e.g.
        {'4420': 'Q28936211', '2041': 'Q28933898'}
    """
    print("WILL NOW DOWNLOAD WD ITEMS THAT USE " + prop)
    query_parts = [
        "SELECT DISTINCT ?item ?value  WHERE {?item p:",
        prop,
        "?statement. OPTIONAL { ?item wdt:",
        prop,
        " ?value. }}",
    ]
    rows = sparql.SparqlQuery().select("".join(query_parts))

    id_to_item = {}
    for row in rows:
        # Clean the item first, then file it under the row's value.
        item = sanitize_wdqs_result(row['item'])
        id_to_item[row['value']] = item
    print("FOUND {} WD ITEMS WITH PROP {}".format(len(id_to_item), prop))
    return id_to_item
Ejemplo n.º 7
0
def main():
    """
    Command-line entry point.

    Parses the arguments, configures logging and the GitHub authorization
    header, loads the SPDX-to-license mapping and the black/white lists into
    ``Settings``, then fetches GitHub data for every queried project and,
    when ``Settings.do_update_wikidata`` is set, writes it to Wikidata.

    A failed GitHub request for one project is logged and skipped; a failed
    Wikidata update is logged and re-raised, which aborts the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--filter", default="")
    parser.add_argument("--github-oauth-token")
    parser.add_argument("--debug-http", action="store_true")
    parser.add_argument("--ignore-blacklist", action="store_true")
    parser.add_argument("--quiet",
                        action="store_true",
                        help="Do not log to stdout/stderr")
    args = parser.parse_args()

    configure_logging(args.quiet, args.debug_http)
    # https://www.wikidata.org/wiki/Wikidata:Edit_groups/Adding_a_tool#For_custom_bots
    edit_group_hash = "{:x}".format(random.randrange(0, 2**48))

    # The command-line token wins; otherwise fall back to config.json.
    if args.github_oauth_token:
        github_oauth_token = args.github_oauth_token
    else:
        with open("config.json") as config:
            github_oauth_token = json.load(config)["github-oauth-token"]
    Settings.cached_session.headers.update(
        {"Authorization": "token " + github_oauth_token})

    # Read inside a context manager so the file handle is closed promptly.
    with open(Settings.license_sparql_file) as license_query_file:
        sparql_license_items = license_query_file.read()
    response = sparql.SparqlQuery().query(sparql_license_items)
    # Map SPDX identifier -> license value with its first 31 characters
    # removed (presumably the "http://www.wikidata.org/entity/" prefix,
    # which is 31 characters long -- confirm against the query).
    Settings.licenses = {
        row["spdx"]["value"]: row["license"]["value"][31:]
        for row in response["results"]["bindings"]
    }

    Settings.blacklist = get_filter_list(Settings.blacklist_page)
    Settings.whitelist = get_filter_list(Settings.whitelist_page)

    logger.info("# Querying Projects")
    projects = query_projects(args.filter, args.ignore_blacklist)
    logger.info("{} projects were found".format(len(projects)))

    logger.info("# Processing projects")
    for project in projects:
        logger.info("## " + project["projectLabel"] + ": " +
                    project["project"])

        try:
            properties = get_data_from_github(project["repo"], project)
        except requests.exceptions.HTTPError as e:
            logger.error("HTTP request for {} failed: {}".format(
                project["projectLabel"], e))
            continue

        if Settings.do_update_wikidata:
            try:
                update_wikidata(properties, edit_group_hash)
            except Exception as e:
                logger.error("Failed to update {}: {}".format(
                    properties.project, e))
                # Bare raise re-raises the active exception as-is.
                raise

    logger.info("# Finished successfully")
0
def get_candidates(query_name):
    """
    Run the named query and return its rows as (item id, librised) tuples.

    :param query_name: name handed to ``get_query`` to obtain the SPARQL
        text; it is also used in the log message.
    :return: list of tuples holding the last ``/``-separated segment of
        each row's ``item`` value and the row's ``librised`` value.
    """
    query_text = get_query(query_name)
    rows = sparql.SparqlQuery().select(query_text)
    candidates = [
        (row["item"].split("/")[-1], row["librised"]) for row in rows
    ]
    logging.info("{}: Retrieved {} candidates.".format(query_name,
                                                       len(candidates)))
    return candidates
Ejemplo n.º 9
0
 def testGetItems(self, mock_method):
     """Check get_items() for the default result type and for list.

     The canned response holds three bindings, two of them identical, so
     the default result is compared as a two-element set and the list
     result is expected to keep the repeated ID.
     """
     sparql_text = 'SELECT * WHERE { ?x ?y ?z }'
     bindings_json = '{}, {}, {}'.format(
         ITEM_Q498787, ITEM_Q677525, ITEM_Q677525)
     mock_method.return_value = Container(
         SQL_RESPONSE_CONTAINER % bindings_json)

     query = sparql.SparqlQuery()

     default_items = query.get_items(sparql_text, 'cat')
     self.assertSetEqual(default_items, {'Q498787', 'Q677525'})

     listed_items = query.get_items(sparql_text, 'cat', result_type=list)
     self.assertEqual(listed_items, ['Q498787', 'Q677525', 'Q677525'])
    def testQueryAsk(self, mock_method):
        """Check that ask() is truthy for RESPONSE_TRUE, falsy for RESPONSE_FALSE."""
        ask_text = 'ASK { ?x ?y ?z }'

        # Positive answer first; the query object is created after the
        # mocked response is in place.
        mock_method.return_value = Container(RESPONSE_TRUE)
        query = sparql.SparqlQuery()
        self.assertTrue(query.ask(ask_text))

        # Same query object, now with the negative canned response.
        mock_method.return_value = Container(RESPONSE_FALSE)
        self.assertFalse(query.ask(ask_text))
Ejemplo n.º 11
0
    def testQueryAsk(self, mock_method):
        """Check that ask() is truthy for RESPONSE_TRUE, falsy for RESPONSE_FALSE.

        Construction of the query object is wrapped in ``skipping`` for
        ``pywikibot.exceptions.TimeoutError``.
        """
        ask_text = 'ASK { ?x ?y ?z }'

        mock_method.return_value = Container(RESPONSE_TRUE)
        with skipping(pywikibot.exceptions.TimeoutError):
            query = sparql.SparqlQuery()
        self.assertTrue(query.ask(ask_text))

        # Same query object, now with the negative canned response.
        mock_method.return_value = Container(RESPONSE_FALSE)
        self.assertFalse(query.ask(ask_text))
Ejemplo n.º 12
0
    def load_painting_items():
        """Store all natmus paintings in Wikidata."""
        query = u'''\
# Nationalmuseum import
SELECT ?item ?obj_id
  (group_concat(distinct ?type;separator="|") as ?types)
  (group_concat(distinct ?creator;separator="|") as ?creators)
  (group_concat(distinct ?creator_template;separator="|") as ?creator_templates)
  (group_concat(distinct ?creator_cat;separator="|") as ?creator_cats)
  (group_concat(distinct ?death_date;separator="|") as ?death_dates)
  (group_concat(distinct ?commons_cat;separator="|") as ?commons_cats)
  (group_concat(distinct ?depicted_person;separator="|") as ?depicted_persons)
  (group_concat(distinct ?depicted_cat;separator="|") as ?depicted_cats)
WHERE
{
  ?item wdt:P2539 ?obj_id .
  OPTIONAL {
    ?item wdt:P31 ?type .
  }
  OPTIONAL {
    ?item wdt:P170 ?creator .
    OPTIONAL {
      ?creator wdt:P570 ?death_date .
    }
    OPTIONAL {
      ?creator wdt:P1472 ?creator_template .
    }
    OPTIONAL {
      ?creator wdt:P373 ?creator_cat .
    }
  }
  OPTIONAL {
    ?item wdt:P373 ?commons_cat .
  }
  OPTIONAL {
    ?item wdt:P180 ?depicted_person .
    ?depicted_person wdt:P31 wd:Q5 .
    OPTIONAL {
      ?depicted_person wdt:P373 ?depicted_cat .
    }
  }
}
group by ?item ?obj_id
'''
        s = sparql.SparqlQuery()
        data = s.select(query)
        pywikibot.output("Loaded %d paintings from wikidata" % len(data))
        return NatmusInfo.clean_sparql_output(data, 'obj_id')
    def testQuerySelect(self, mock_method):
        """Check that a plain select() returns one flat dict per binding."""
        bindings_json = "%s, %s" % (ITEM_Q498787, ITEM_Q677525)
        mock_method.return_value = Container(
            SQL_RESPONSE_CONTAINER % bindings_json)

        query = sparql.SparqlQuery()
        rows = query.select('SELECT * WHERE { ?x ?y ?z }')
        self.assertIsInstance(rows, list, 'Result is not a list')
        self.assertEqual(len(rows), 2)

        # Expected rows, in the order of the canned bindings.
        expected_rows = [
            {'cat': 'http://www.wikidata.org/entity/Q498787',
             'catLabel': 'Muezza', 'd': '1955-01-01T00:00:00Z'},
            {'cat': 'http://www.wikidata.org/entity/Q677525',
             'catLabel': 'Orangey', 'd': '2015-06-22T00:00:00Z'},
        ]
        for row, expected in zip(rows, expected_rows):
            self.assertDictEqual(row, expected, 'Bad result')
Ejemplo n.º 14
0
    def load_local_nsid_commonscats(qids):
        """
        Get commonscats for the locally loaded list of qids.

        qids: list of qids
        """
        query = u'''\
# Nationalmuseum import
SELECT ?item ?commons_cat WHERE {
  ?item wdt:P373 ?commons_cat .
  VALUES ?item { wd:%s } .
}
''' % ' wd:'.join(qids)
        s = sparql.SparqlQuery()
        data = s.select(query)
        pywikibot.output("Loaded %d cats via wikidata from local mappings" %
                         len(data))
        return NatmusInfo.clean_sparql_output(data, 'item')
Ejemplo n.º 15
0
def query_projects(project_filter: Optional[str] = None,
                   ignore_blacklist: bool = False) -> List[Dict[str, str]]:
    """
    Query for all software projects and return them as a list of simplified dicts.

    The SPARQL text is read from ``Settings.sparql_file``. Each binding is
    reduced to its ``projectLabel``, ``project`` and ``repo`` values and is
    dropped when it fails the label filter, is blacklisted, or has a
    ``repo`` that does not match ``Settings.repo_regex``.

    :param project_filter: optional substring; only projects whose label
        contains it (case-insensitively) are kept
    :param ignore_blacklist: when true, blacklisted projects are kept
    :return: list of the simplified project dicts that passed all filters
    """
    wikdata_sparql = sparql.SparqlQuery()
    # Read inside a context manager so the file handle is closed promptly.
    with open(Settings.sparql_file) as query_file:
        sparql_free_software_items = query_file.read()
    response = wikdata_sparql.query(sparql_free_software_items)
    bindings = response["results"]["bindings"]

    projects = []
    logger.info("{} projects were found by the sparql query".format(
        len(bindings)))
    for binding in bindings:
        # Remove bloating type information
        project = {
            "projectLabel": binding["projectLabel"]["value"],
            "project": binding["project"]["value"],
            "repo": binding["repo"]["value"],
        }

        if (project_filter and project_filter.lower()
                not in project["projectLabel"].lower()):
            continue

        # Drop the first 31 characters of the project value (presumably the
        # "http://www.wikidata.org/entity/" prefix -- confirm) to get the
        # form that is compared against the blacklist.
        project_id = project["project"][31:]
        if project_id in Settings.blacklist and not ignore_blacklist:
            logger.info(
                f"{project['projectLabel']} ({project_id}) is blacklisted"
            )
            continue

        if not Settings.repo_regex.match(project["repo"]):
            logger.info(" - Removing {}: {} {}".format(project["projectLabel"],
                                                       project["project"],
                                                       project["repo"]))
            continue

        projects.append(project)

    logger.info("{} projects remained after filtering".format(len(projects)))

    return projects
Ejemplo n.º 16
0
def query_projects():
    """
    Query for all software projects and return them as a list of simplified dicts.

    The SPARQL text is read from ``Settings.sparql_file``. No filtering or
    splitting is done here; every result binding is returned.

    :return: list with one ``{variable: value}`` dict per result binding
    """
    wikdata_sparql = sparql.SparqlQuery()
    # Read inside a context manager so the file handle is closed promptly.
    with open(Settings.sparql_file) as query_file:
        sparql_free_software_items = query_file.read()
    response = wikdata_sparql.query(sparql_free_software_items)

    # Remove bloating type information: keep only each cell's "value".
    return [
        {key: cell["value"] for key, cell in binding.items()}
        for binding in response["results"]["bindings"]
    ]
Ejemplo n.º 17
0
    def testQuerySelect(self, mock_method):
        """Check that a plain select() returns one flat dict per binding.

        Construction of the query object is wrapped in ``skipping`` for
        ``pywikibot.exceptions.TimeoutError``.
        """
        bindings_json = '{}, {}'.format(ITEM_Q498787, ITEM_Q677525)
        mock_method.return_value = Container(
            SQL_RESPONSE_CONTAINER % bindings_json)
        with skipping(pywikibot.exceptions.TimeoutError):
            query = sparql.SparqlQuery()

        rows = query.select('SELECT * WHERE { ?x ?y ?z }')
        self.assertIsInstance(rows, list, 'Result is not a list')
        self.assertLength(rows, 2)

        # Expected rows, in the order of the canned bindings.
        expected_rows = [
            {
                'cat': 'http://www.wikidata.org/entity/Q498787',
                'catLabel': 'Muezza',
                'd': '1955-01-01T00:00:00Z'
            },
            {
                'cat': 'http://www.wikidata.org/entity/Q677525',
                'catLabel': 'Orangey',
                'd': '2015-06-22T00:00:00Z'
            },
        ]
        for row, expected in zip(rows, expected_rows):
            self.assertDictEqual(row, expected, 'Bad result')
Ejemplo n.º 18
0
def single_bbox_only_id_query(sw_x, sw_y, ne_x, ne_y):
    """
    Select every ``?place`` with a P625 location inside a bounding box.

    The four coordinates are interpolated into the ``Point(x y)`` literals
    of the ``wikibase:box`` service corners.

    :param sw_x: South-West x coordinate
    :param sw_y: South-West y coordinate
    :param ne_x: North-East x coordinate
    :param ne_y: North-East y coordinate
    :return: whatever ``SparqlQuery.select`` returns for the query; the
        original comment describes it as a list whose first element is the
        first item
    """
    bbox_query = f"""
    SELECT ?place
    WHERE{{
        SERVICE wikibase:box {{
            ?place wdt:P625 ?location . 
            bd:serviceParam wikibase:cornerSouthWest "Point({sw_x} {sw_y})"^^geo:wktLiteral.
            bd:serviceParam wikibase:cornerNorthEast "Point({ne_x} {ne_y})"^^geo:wktLiteral.
        }}
    }}
    """
    site = pywikibot.Site("wikidata", "wikidata")
    return spq.SparqlQuery(repo=site).select(bbox_query)
Ejemplo n.º 19
0
 def testGetItems(self, mock_method):
     """Check that get_items() yields the expected set of item IDs."""
     mock_method.return_value = TestContainer(SQL_RESPONSE)
     query = sparql.SparqlQuery()
     found = query.get_items('SELECT * WHERE { ?x ?y ?z }', 'cat')
     expected = {'Q498787', 'Q677525'}
     self.assertSetEqual(found, expected)
Ejemplo n.º 20
0

if __name__ == '__main__':
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()
    # Query template with two %s slots: a property after "wdt:" and an item
    # after "wd:". It is not filled in within this part of the script.
    query = """
    SELECT ?item ?itemLabel
    WHERE
    {
    ?item wdt:%s wd:%s.
    SERVICE wikibase:label { bd:serviceParam wikibase:language
    "[AUTO_LANGUAGE],en". }
    }
    """
    # Explicit None for both arguments of SparqlQuery.
    dependencies = {'endpoint': None, 'entity_url': None}
    query_object = sparql.SparqlQuery(**dependencies)
    # The pattern is built from two adjacent raw strings. A backslash-newline
    # inside a single raw string does not continue the line: it put a literal
    # newline and the indentation into the pattern, so it could never match
    # text on one line.
    science = re.compile(
        r'(field|stud|academic|discipline|science)'
        r'(y\b|ies\b|ied\b|\b|s\b)', re.IGNORECASE)
    # Shortest leading text that is followed by " (", ":" or "-".
    subject = re.compile(r'.+?(?= \(|:|-)')
    # Parenthesised "left-right" or "left:right" groups.
    infos = re.compile(r'\((.+?)(-|:)(.+?)\)')
    # "left - right" or "left : right" with surrounding spaces.
    sc = re.compile(r'(.+?)( - | : )(.+)')
    connection = MongoClient('localhost', 27017)
    db = connection.CNRS
    coll = db.annuaire
    # NOTE(review): re.search() returns None for a line without a match, in
    # which case .group(0) raises AttributeError -- confirm every line of
    # 'cnrs_annuaire' matches.
    titres = [
        re.search(subject, x).group(0) for x in read_file('cnrs_annuaire')
    ]
    print('Total titres: {}'.format(len(titres)))
    total = 0
    set_titre = set()
Ejemplo n.º 21
0
def convert(path_in,
            path_out,
            pathway_iri,
            wp_id,
            pathway_version,
            scale=100,
            theme="plain"):
    """Convert from GPML to another format like SVG.

    The output format is chosen from the extension of path_out. When the
    input is not in the latest GPML version and the output is not GPML, the
    input file is first renamed and converted to the latest GPML in place.

    Keyword arguments:
    path_in -- path in, e.g., ./WP4542_103412.gpml
    path_out -- path out, e.g., ./WP4542_103412.svg
    pathway_iri -- e.g., http://identifiers.org/wikipathways/WP4542
    wp_id -- passed through to the JSON/SVG converters (presumably the
        WikiPathways ID, e.g., WP4542 -- confirm)
    pathway_version -- e.g., 103412
    scale -- scale to use when converting to PNG (default 100)
    theme -- theme (plain or dark) to use when converting to SVG (default plain)

    Returns True when path_out already exists (nothing is done), otherwise
    None. Raises Exception when path_in is missing, is not a *.gpml file,
    has no usable root element, or when the output extension is not
    supported."""
    if not path.exists(path_in):
        raise Exception(f"Missing file '{path_in}'")

    if path.exists(path_out):
        print(f"File {path_out} already exists. Skipping.")
        return True

    dir_in = path.dirname(path_in)
    base_in = path.basename(path_in)
    # example base_in: 'WP4542.gpml'
    [stub_in, ext_in_with_dot] = path.splitext(base_in)
    # getting rid of the leading dot, e.g., '.gpml' to 'gpml'
    ext_in = LEADING_DOT_RE.sub("", ext_in_with_dot)

    if ext_in != "gpml":
        # TODO: how about *.gpml.xml?
        raise Exception("Currently only accepting *.gpml for path_in")
    gpml_f = path_in

    dir_out = path.dirname(path_out)
    # example base_out: 'WP4542.svg'
    base_out = path.basename(path_out)
    [_, ext_out_with_dot] = path.splitext(base_out)
    # getting rid of the leading dot, e.g., '.svg' to 'svg'
    ext_out = LEADING_DOT_RE.sub("", ext_out_with_dot)

    tree = ET.parse(gpml_f, parser=parser)
    root = tree.getroot()

    if root is None:
        raise Exception("no root element")
    if root.tag is None:
        raise Exception("no root tag")

    gpml_version = re.sub(r"{http://pathvisio.org/GPML/(\w+)}Pathway", r"\1",
                          root.tag)
    if ext_out != "gpml" and gpml_version != LATEST_GPML_VERSION:
        # Keep the old-version file under a versioned name and regenerate
        # gpml_f from it, so the rest of this call works on the new file.
        old_f = f"{dir_in}/{stub_in}.{gpml_version}.gpml"
        rename(gpml_f, old_f)
        # Forward theme as well so the nested call sees the same options.
        convert(old_f, gpml_f, pathway_iri, wp_id, pathway_version, scale,
                theme)

    # trying to get wd ids via sparql via pywikibot
    site = pywikibot.Site("wikidata", "wikidata")
    repo = site.data_repository()  # this is a DataSite object
    wd_sparql = sparql.SparqlQuery(
        endpoint="https://query.wikidata.org/sparql", repo=repo)
    # SparqlQuery signature, for reference:
    # (self, endpoint=None, entity_url=None, repo=None, max_retries=None,
    #  retry_wait=None)

    if ext_out in ["gpml", "owl", "pdf", "pwf", "txt"]:
        # Arguments are passed as a list so that paths containing spaces or
        # quotes reach pathvisio unchanged. The exit status is not checked.
        subprocess.run(["pathvisio", "convert", path_in, path_out])
    elif ext_out == "png":
        # TODO: look at using --scale as an option (instead of an argument),
        #       for both pathvisio and gpmlconverter.
        # TODO: move the setting of a default value for scale into
        # pathvisio instead of here.
        subprocess.run(
            ["pathvisio", "convert", path_in, path_out, str(scale)])
        # Use interlacing? See https://github.com/PathVisio/pathvisio/issues/78
        # It's probably not worthwhile. If we did it, we would need to install
        # imagemagick and then run this:
        #     mv "$path_out" "$path_out.noninterlaced.png"
        #     convert -interlace PNG "$path_out.noninterlaced.png" "$path_out"
    elif ext_out in ["json", "jsonld"]:
        gpml2json(path_in, path_out, pathway_iri, wp_id, pathway_version,
                  wd_sparql)
    elif ext_out in ["svg", "pvjssvg"]:
        #############################
        # SVG
        #############################

        # The SVG is rendered from an intermediate JSON file next to the
        # output; it is created only when it does not exist yet.
        json_f = f"{dir_out}/{stub_in}.json"
        if not path.isfile(json_f):
            gpml2json(path_in, json_f, pathway_iri, wp_id, pathway_version,
                      wd_sparql)

        json2svg(json_f, path_out, pathway_iri, wp_id, pathway_version, theme)
    else:
        raise Exception(f"Invalid output extension: '{ext_out}'")