def analyse(url, fields, max_records, output_filesid):
    output_files = {}
    output_files["results"] = create_file(output_filesid + "-results.txt")
    output_files["concepts_graph"] = create_file(output_filesid + "-concepts_graph.txt")
    nb_results = sru.query2nbresults(url)
    metas = defaultdict(list)
    if nb_results < max_records:
        max_records = nb_results
    i = 1
    print("Total number of results:", nb_results)
    # The SRU API serves results in pages of 100 records; <= (rather than <)
    # so the last page is not skipped when max_records lands just past a
    # page boundary
    while i <= max_records:
        metas = page_of_results(url, fields, i, max_records, metas)
        i += 100
    analyse_corpus(metas, output_files, output_filesid)
    EOT([output_files[key] for key in output_files])
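# A minimal sketch of how analyse() might be invoked.  Everything below is
# illustrative: the exact URL shape expected by the local sru helpers, the
# field list and the output prefix are assumptions, not values from the repo.
if __name__ == "__main__":
    sru_url = ("https://catalogue.bnf.fr/api/SRU?version=1.2"
               "&operation=searchRetrieve&query=aut.type%20any%20RAM")
    analyse(sru_url, fields=["100$a", "100$m"], max_records=500,
            output_filesid="rapport_ram")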
def launch_analyse(output_filepath, directory_name):
    errors_list = []
    output_file = create_file(output_filepath)
    output_file.write(f"\nAnalysed directory: {directory_name}")
    errors_list = analyse_dir(directory_name, errors_list, output_file)
    if not errors_list:
        output_file.write("No errors found, everything is perfect")
    else:
        output_file.write(f"\nTotal: {len(errors_list)} naming error(s) found")
# coding: utf-8
from stdf import create_file, line2report, file2list, sparql2dict


def file2analyse(liste_libelles, report):
    for libelle in liste_libelles:
        check_libelle(libelle, report)


def check_libelle(libelle, report):
    # Find every Rameau concept whose preferred label contains
    # the subdivision " -- <libelle>"
    query = f"""
    PREFIX dcterms: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    select * where {{
        ?ark dcterms:isPartOf ?rameau;
             a skos:Concept;
             skos:prefLabel ?label.
        FILTER contains(?label, " -- {libelle}")
    }}
    """
    results = sparql2dict("http://data.bnf.fr/sparql", query, ["ark", "label"])
    for result in results:
        # Keep only the ark portion of the result URI, plus its label
        line2report([result[result.find("ark"):], results[result]["label"][0]],
                    report)


if __name__ == "__main__":
    liste_libelles_filename = input("Name of the file containing the labels: ")
    liste_libelles = file2list(liste_libelles_filename)
    report = create_file("Controles subdiv gf.txt")
    file2analyse(liste_libelles, report)
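# Note: check_libelle() splices `libelle` into the SPARQL query verbatim, so a
# label containing a double quote or backslash would break the FILTER clause.
# A minimal pre-escaping sketch (the helper name is hypothetical):
def escape_sparql_literal(text):
    # SPARQL string literals escape backslashes and double quotes with a
    # backslash; escape the backslash first to avoid double-escaping
    return text.replace("\\", "\\\\").replace('"', '\\"')
# e.g. FILTER contains(?label, " -- {escape_sparql_literal(libelle)}")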
def ark2key(arkA):
    # The opening of this function is truncated in the source; the signature
    # is inferred from the call site below, and this SRU lookup (mirroring
    # the pattern used elsewhere in these scripts) is an assumption about how
    # pep_record is obtained.
    pep_record = sru.SRU_result(f'aut.persistentid any "{arkA}"').dict_records
    for rec in pep_record:
        xml_rec = pep_record[rec]
        key = ""
        if xml_rec is not None:
            # Dedup key built from subfields 100$a and 100$m
            key = (sru.record2fieldvalue(xml_rec, "100$a")
                   + sru.record2fieldvalue(xml_rec, "100$m"))
            key = clean(key)
        print("key", arkA, key)
        return key


arkA_treated = []
dict_key2ark = defaultdict(list)
dict_arkA2row = defaultdict(list)
output_file = create_file("pep_homonymes.txt")

# input_filename is defined earlier in the (truncated) source
with open(input_filename, encoding="utf-8") as f:
    for row in f:
        arkA = row.split("\t")[2]
        if arkA not in arkA_treated:
            arkA_treated.append(arkA)
            key = ark2key(arkA)
            dict_key2ark[key].append(arkA)
            dict_arkA2row[arkA].append(row)

# A key shared by several arks means homonymous records: report every
# source row attached to each ark of the cluster
for key in dict_key2ark:
    if len(dict_key2ark[key]) > 1:
        print("homonyms", key, dict_key2ark[key])
        for arkA in dict_key2ark[key]:
            for row in dict_arkA2row[arkA]:
                # Loop body truncated in the source; writing the row to
                # the report is the presumable intent
                output_file.write(row)
import os

from stdf import create_file

# File names used to populate the test directory
liste = [
    "gme-10025-000_0.jpg", "gme-10026-000_0.jpg", "gme-10170-001_0.jpg",
    "gme-10314-003_0.jpg", "gme-10437-000_0.jpg", "gme-11010-000_0.jpg",
    "gme-11571-000_0.jpg", "gme-11817-000_0.jpg", "gme-11616-000_1725.jpg",
    "gme-11748-000_1719.jpg", "gme-12544-000_1712.jpg", "gme-12545-000_1715.jpg",
    "gme-12546-000_1720.jpg", "gme-12547-000_1711.jpg", "gme-5313-000_1727.jpg",
    "gme-7077-000_1707.jpg", "gme-7996-000_1706.jpg", "gml-5446-000_1720.jpg",
    "gml-6536-001_1722.jpg", "gml-7723-000_1716.jpg", "gmlc-723-001_1721.jpg",
    "gmt-11162-002_1699.jpg", "gmt-18189-000_1698.jpg", "gmt-18842-001_1713.jpg",
    "gmt-19615-003_1697.jpg", "gmt-24806-001_1718.jpg", "gmt-25521-000_1702.jpg",
    "gmt-26627-019_1704.jpg", "gmt-26631-002_1705.jpg", "gmt-26637-077_1723.jpg",
    "gmt-26637-083_1708.jpg", "gmt-27015-007_1709.jpg", "gmt-27578-013_1723.jpg",
    "gmt-28575-000_1726.jpg", "gmt-30821-001_1709.jpg", "gmt-30821-002_1703.jpg",
    "gmt-30821-003_1709.jpg", "gmt-30821-004_1709.jpg", "gmt-30821-012_1703.jpg",
    "gmt-30821-014_1708.jpg", "gmt-30821-023_1708.jpg", "gmt-7766-002_1700.jpg",
    "c-138-000_63 2003.jpg", "gme-11015-000_109 2003.jpg",
    "gme-12308-001_203 2003.jpg", "gme-13052-000_259 2003.jpg",
    "gme-13087-002_192 2003.jpg", "gme-13272-000_137.jpg"
]

# Create an empty test file for each name
for el in liste:
    output_filepath = os.path.join(
        r"C:\Users\Lully\Documents\Hélène\scripts\mn\verification_noms_de_fichiers\verification_noms_de_fichiers\tests",
        el)
    create_file(output_filepath)
sparql_query, ["uri_GEO", "type_GEO"]) for el in results: ark = el[el.find("ark"):] type_align = ark2type(ark) if type_align == "GEO": geo_id = ark if geo_id == "": geo_id = None return geo_id identifiant = input("Identifiant (nom du fichier) : ") sparql_query = """ select * where { ?ressource wdt:P268 ?idBnF; rdfs:label ?nom; wdt:P625 ?coordonnees_geo; wdt:P31 ?type_construction. ?type_construction wdt:P279+ wd:Q811979. FILTER (langMatches(lang(?nom), "FR")) } """ report = create_file(identifiant + ".txt") headers = ["ID Wikidata", "ARK", "Nom", "Coordonnées geo", "Conversion RAM > Geo ?", "ARK Rameau initial dans Wikidata"] line2report(headers, report) launch(sparql_query, report)
# coding: utf-8
from itertools import zip_longest

from stdf import create_file

i = 0
errors_file = create_file("errors_file.txt")

# Successive input batches; each assignment below supersedes the previous
# one, so only the last list is actually processed
liste_files = [
    'lot1_alb_clusters_dedupe.txt',
    'lot1_dnm_clusters_dedupe.txt',
    'lot1_gv_clusters_dedupe.txt',
    'lot1_jh_clusters_dedupe.txt',
    'lot2-oeuvres_sans_alignements_existants-clusters_sans_agregats.txt',
    'lot3_clusters_sans_alignement_sans_agregats_pour_dedupe.txt',
    'lot4_minhashing-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt',
    'lot5_minhashing-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt',
    'lot6-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt'
]
liste_files = ["tous_clusters_dedupe.txt"]
liste_files = [
    "tous_clusters_dedupe-nett-100000.txt",
    "tous_clusters_dedupe-nett-1000000.txt",
    "tous_clusters_dedupe-nett-1100000.txt",
    "tous_clusters_dedupe-nett-1200000.txt",
    "tous_clusters_dedupe-nett-1300000.txt",
    "tous_clusters_dedupe-nett-1400000.txt",
    "tous_clusters_dedupe-nett-1500000.txt",
    "tous_clusters_dedupe-nett-1600000.txt",
    "tous_clusters_dedupe-nett-1700000.txt",
        # End of query2reports(): the opening of the function (the SRU loop
        # that sets ark, xml_record, test166/test167 and the counter i) is
        # truncated in the source; the `if test166` guard mirrors the elif
        # below and is an assumption.
        if test166:
            line = [ark]
            line.extend(extract_labels(ark, xml_record, "166"))
            line2report(line, report_sujet_lieu, i)
        elif test167:
            # print(ark, "167 with a subject subdivision")
            line = [ark]
            line.extend(extract_labels(ark, xml_record, "167"))
            line2report(line, report_lieu_sujet, i)


def extract_labels(ark, xml_record, tag):
    # Intermarc field 166 corresponds to Unimarc 250, and 167 to 215
    intermarc2unimarc = {"166": "250", "167": "215"}
    intermarc_label = sru.record2fieldvalue(xml_record, tag)
    intermarc_subfields = ""
    for field in xml_record.xpath(f"*[@tag='{tag}']"):
        intermarc_subfields = sru.field2listsubfields(field)
    # Fetch the same authority record in Unimarc through the SRU API
    unimarc_record = sru.SRU_result(f'aut.persistentid any "{ark}"').dict_records[ark]
    unimarc_label = sru.record2fieldvalue(unimarc_record, intermarc2unimarc[tag])
    unimarc_subfields = ""
    for field in unimarc_record.xpath(f"*[@tag='{intermarc2unimarc[tag]}']"):
        unimarc_subfields = sru.field2listsubfields(field)
    uri = "http://data.bnf.fr/" + ark
    label = uri2label(uri)
    return [intermarc_subfields, intermarc_label,
            unimarc_subfields, unimarc_label, label]


if __name__ == "__main__":
    query = "aut.type any RAM"
    report_lieu_sujet = create_file("Notices_Lieu_Sujet_a_retourner.txt")
    report_sujet_lieu = create_file("Notices_Sujet_Lieu_a_conserver.txt")
    query2reports(query, report_lieu_sujet, report_sujet_lieu)