Exemple #1
0
        def sparql_query(fallback: bool = False):
            """Fetch the senses and glosses of the current lexeme.

            The lexeme id (``self.lexeme_id``) and the usage example
            (``usage_example``) are read from the enclosing scope.

            :param fallback: when truthy, ask for English (``"en"``) glosses
                instead of glosses in the language of the usage example.
            :return: whatever ``execute_sparql_query`` returns for the query.
            """
            # The two variants of the query differ only in the language tag,
            # so pick the tag and build the query once.  The usage example's
            # language is only evaluated when it is actually needed.
            gloss_language = (
                "en" if fallback
                else usage_example.record.language_code.value
            )
            return execute_sparql_query(f'''
                        SELECT
                        ?sense ?gloss
                        WHERE {{
                          VALUES ?l {{wd:{self.lexeme_id}}}.
                          ?l ontolex:sense ?sense.
                          ?sense skos:definition ?gloss.
                          # Keep only glosses in the requested language
                          FILTER(LANG(?gloss) = "{gloss_language}")
                          # Exclude lexemes without a linked QID from at least one sense
                          # ?sense wdt:P5137 [].
                        }}''')
Exemple #2
0
 def fetch_forms_without_an_example(self):
     """Fetch forms lacking a usage example and store them on the instance.

     Runs a SPARQL query for forms of lexemes in ``self.language_qid`` that
     have at least one sense with P5137, wraps every result row in a
     ``Form`` and stores the list in ``self.forms_without_an_example``.

     :raises ValueError: if the response lacks the ``results`` or
         ``bindings`` keys.
     :raises SystemExit: (after printing a message) if no forms came back.
     """
     logger = logging.getLogger(__name__)
     # title:Forms that have no example demonstrating them and that have at least
     # one sense with P5137 (item for this sense)
     random_offset = random.randint(20, 1000)
     logger.info(f"random offset:{random_offset}")
     # NOTE(review): ?form_with_example in the MINUS clause is never tied to
     # ?form, so the clause shares only ?lexeme with the outer pattern and
     # drops every form of a lexeme that has such an example - confirm that
     # this is intended.
     results = execute_sparql_query(f'''
         select ?lexeme ?form ?form_representation ?category  
         (group_concat(distinct ?feature; separator = ",") as ?grammatical_features)
         WHERE {{
             ?lexeme dct:language wd:{self.language_qid.value};
                     wikibase:lemma ?lemma;
                     wikibase:lexicalCategory ?category;
                     ontolex:lexicalForm ?form;
                     ontolex:sense ?sense.
             ?sense wdt:P5137 [].
             ?form ontolex:representation ?form_representation;
             wikibase:grammaticalFeature ?feature.
             MINUS {{
             ?lexeme p:P5831 ?statement.
             ?statement ps:P5831 ?example;
                      pq:P6072 [];
                      pq:P5830 ?form_with_example.
             }}
         }}
         group by ?lexeme ?form ?form_representation ?category
         offset {random_offset}
         limit {config.number_of_forms_to_fetch}''',
                                    debug=False)
     self.forms_without_an_example = []
     # Validate the shape of the response up front instead of nesting
     if "results" not in results:
         raise ValueError("Got no results dict from WD")
     if "bindings" not in results["results"]:
         raise ValueError("Got no bindings dict from WD")
     forms = results["results"]["bindings"]
     logger.info(f"Got {len(forms)} lexemes")
     for entry in forms:
         form = Form(entry)
         logger.info(f"appending {form} to list of forms")
         self.forms_without_an_example.append(form)
     if not self.forms_without_an_example:
         console.print(
             "Got no forms from Wikidata to work on for this language "
             "if you think this is a bug, please open an issue here "
             f"{tui.issue_url()}")
         # exit() is a helper injected by the site module for interactive
         # use; raising SystemExit directly ends the program the same way
         # (exit status 0) without depending on it.
         raise SystemExit
     logger.info(
         f"Got {len(self.forms_without_an_example)} "
         f"forms from WDQS for language {self.language_code.name.title()}"
     )
 def calculate_total_lexemes(self):
     """Count the lexical entries in Wikidata and store the number.

     The count extracted from the query result is assigned to
     ``self.total_lexemes``; nothing is returned.
     """
     # No values are interpolated, so a plain string literal is enough
     query = '''
     SELECT
     (COUNT(?l) as ?count)
     WHERE {
       ?l a ontolex:LexicalEntry.
     }'''
     response = execute_sparql_query(query)
     total: int = wdqs.extract_count(response)
     logging.debug(f"count:{total}")
     self.total_lexemes = total
Exemple #4
0
 def count_number_of_lexemes(self):
     """Count the lexemes whose language is ``self.language_qid``.

     :return: the count extracted from the query result by
         ``wdqs.extract_count``.
     """
     logger = logging.getLogger(__name__)
     result = execute_sparql_query(f'''
     SELECT
     (COUNT(?l) as ?count)
     WHERE {{
       ?l dct:language wd:{self.language_qid.value}.
     }}''')
     logger.debug(f"result:{result}")
     count: int = wdqs.extract_count(result)
     # Log through the module logger throughout; the module-level
     # logging.debug() goes to the root logger and may implicitly
     # configure it.
     logger.debug(f"count:{count}")
     return count
Exemple #5
0
 def lookup_qid(self):
     """Look up the QID, if any, for this document id.

     Queries for an item whose P8433 value equals ``self.id`` and stores
     the first ``item`` value of the result set in ``self.document_qid``.
     """
     query = f"""
         SELECT ?item
         WHERE 
         {{
           ?item wdt:P8433 "{self.id}".
         }}
         """
     response = execute_sparql_query(query)
     logging.info(f"result:{response}")
     qid = extract_the_first_wikibase_value_from_a_wdqs_result_set(
         response, "item")
     self.document_qid = qid
     logging.info(f"document_qid:{self.document_qid}")
Exemple #6
0
 def count_number_of_senses_with_P5137(self):
     """Count the senses of lexeme ``self.id`` that have a gloss and P5137.

     :return: the count extracted from the query result by
         ``wdqs.extract_count``.
     """
     logger = logging.getLogger(__name__)
     # DISTINCT is required: the pattern yields one row per combination of
     # sense, gloss and P5137 value, so a sense with several glosses or
     # several linked items would otherwise be counted more than once.
     result = execute_sparql_query(f'''
     SELECT
     (COUNT(DISTINCT ?sense) as ?count)
     WHERE {{
       VALUES ?l {{wd:{self.id}}}.
       ?l ontolex:sense ?sense.
       ?sense skos:definition ?gloss.
       # Exclude lexemes without a linked QID from at least one sense
       ?sense wdt:P5137 [].
     }}''')
     count: int = wdqs.extract_count(result)
     logger.debug(f"count:{count}")
     return count
Exemple #7
0
def get_records(
        form: Form = None,
        lexemes: Lexemes = None
) -> List[UsageExample]:
    """Search Wikisource for a form and turn the hits into usage examples.

    The search runs through the query service's MediaWiki API bridge against
    the Wikisource of ``lexemes.language_code``. Every hit is wrapped in a
    ``WikisourceRecord`` and the records are handed to ``process_records``.

    :param form: the form whose representation is searched for (required).
    :param lexemes: supplies the language code used for the search (required).
    :return: the result of ``process_records`` for the fetched records.
    :raises ValueError: if ``form`` or ``lexemes`` is ``None``.
    """
    logger = logging.getLogger(__name__)
    if form is None:
        raise ValueError("form was None")
    if lexemes is None:
        raise ValueError("lexemes was None")
    if lexemes.language_code in config.fast_nlp_languages:
        limit = config.wikisource_max_results_size_fast_nlp
    else:
        limit = config.wikisource_max_results_size_slow_nlp
    language_code = lexemes.language_code.value
    # The representation ends up inside a double-quoted SPARQL literal, so
    # backslashes and double quotes must be escaped to keep the query valid.
    search_term = (
        str(form.representation)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
    )
    logger.info(
        f"Fetching usage examples from the {lexemes.language_code.name.title()} Wikisource...")
    # search using sparql
    # borrowed from Scholia
    # thanks to Vigneron for the tip :)
    # The title URL uses the same Wikisource host as the search endpoint
    # (it used to be hard-coded to the Breton Wikisource).
    results = execute_sparql_query(f'''
 SELECT ?title ?titleUrl ?snippet WHERE {{
  SERVICE wikibase:mwapi {{
      bd:serviceParam wikibase:api "Search" .
      bd:serviceParam wikibase:endpoint "{language_code}.wikisource.org" .
      bd:serviceParam mwapi:srsearch "{search_term}" .
      bd:serviceParam mwapi:language "{language_code}" .
      ?title wikibase:apiOutput mwapi:title .
      ?snippet_ wikibase:apiOutput "@snippet" .
  }}
  hint:Prior hint:runFirst "true" .
  BIND(CONCAT("https://{language_code}.wikisource.org/wiki/", ENCODE_FOR_URI(?title)) AS ?titleUrl)
  BIND(REPLACE(REPLACE(?snippet_, '</span>', ''), '<span class="searchmatch">', '') AS ?snippet)
}}
LIMIT {limit}
''')
    logger.debug(f"results:{results}")
    records = [
        WikisourceRecord(json=item, lexemes=lexemes)
        for item in results["results"]["bindings"]
    ]
    logger.info(f"Got {len(records)} records")
    # Skip the per-record loop entirely unless debug output is wanted
    if logger.isEnabledFor(logging.DEBUG):
        for record in records:
            logger.debug(record)
    return process_records(form=form,
                           records=records,
                           lexemes=lexemes)
Exemple #8
0
 def count_number_of_senses_with_p5137(self):
     """Count senses with a gloss and P5137 among lexemes in ``self.language_qid``.

     :return: the count extracted from the query result by
         ``wdqs.extract_count``.
     """
     logger = logging.getLogger(__name__)
     # DISTINCT is required: the pattern yields one row per combination of
     # sense, gloss and P5137 value, so a sense with several glosses or
     # several linked items would otherwise be counted more than once.
     result = execute_sparql_query(f'''
     SELECT
     (COUNT(DISTINCT ?sense) as ?count)
     WHERE {{
       ?l dct:language wd:{self.language_qid.value}.
       ?l ontolex:sense ?sense.
       ?sense skos:definition ?gloss.
       # Exclude lexemes without a linked QID from at least one sense
       ?sense wdt:P5137 [].
     }}''')
     logger.debug(f"result:{result}")
     count: int = wdqs.extract_count(result)
     # Use the module logger consistently instead of the root logger
     logger.debug(f"count:{count}")
     return count
Exemple #9
0
 def count_number_of_forms_without_an_example(self):
     """Count forms of lexemes that have no usage example yet.

     Only lexemes in ``self.language_qid`` that have no P5831 statement and
     at least one sense with P5137 are considered. The count is stored in
     ``self.number_of_forms_without_an_example``; nothing is returned.
     """
     logger = logging.getLogger(__name__)
     # TODO (kept from the original): fix this to count all senses in a
     # given language
     # DISTINCT is required: every form is joined with every matching sense
     # of its lexeme, so a lexeme with several such senses would otherwise
     # have each of its forms counted once per sense.
     result = execute_sparql_query(f'''
     SELECT
     (COUNT(DISTINCT ?form) as ?count)
     WHERE {{
       ?l dct:language wd:{self.language_qid.value}.
       ?l ontolex:lexicalForm ?form.
       ?l ontolex:sense ?sense.
       # exclude lexemes that already have at least one example
       MINUS {{?l wdt:P5831 ?example.}}
       # Exclude lexemes without a linked QID from at least one sense
       ?sense wdt:P5137 [].
     }}''')
     count: int = wdqs.extract_count(result)
     logger.debug(f"count:{count}")
     self.number_of_forms_without_an_example = count