Example #1
def analyse(url, fields, max_records, output_filesid):
    output_files = {}
    output_files["results"] = create_file(output_filesid + "-results.txt")
    output_files["concepts_graph"] = create_file(output_filesid +
                                                 "-concepts_graph.txt")
    nb_results = sru.query2nbresults(url)
    metas = defaultdict(list)
    if nb_results < max_records:
        max_records = nb_results
    i = 1
    print("Nombre total de résultats : ", nb_results)
    while i <= max_records:  # "<=" so the final page is fetched when nb_results is 1, 101, 201...
        metas = page_of_results(url, fields, i, max_records, metas)
        i += 100
    analyse_corpus(metas, output_files, output_filesid)
    EOT([output_files[key] for key in output_files])
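All of these examples revolve around create_file from the local stdf helper module (Example #1 itself also relies on defaultdict, an sru helper, and functions such as page_of_results and analyse_corpus defined elsewhere in the same script). The snippets never show create_file's definition; they only pass it a path and then treat the return value as a writable file handle. A minimal sketch consistent with that usage could look as follows; the write mode and UTF-8 encoding are assumptions, not the actual stdf implementation.

# Hypothetical sketch of the helper these examples assume (not the real stdf code)
def create_file(filepath, encoding="utf-8"):
    # Assumption: reports are plain UTF-8 text files, overwritten on each run
    return open(filepath, "w", encoding=encoding)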
Example #2
def launch_analyse(output_filepath, directory_name):
    errors_list = []
    output_file = create_file(output_filepath)
    output_file.write(f"\nRépertoire analysé : {directory_name}")
 
    errors_list = analyse_dir(directory_name, errors_list, output_file)
    if not errors_list:
        output_file.write("\nAucune erreur trouvée, tout est parfait")
    else:
        output_file.write(f"\nTotal : {len(errors_list)} erreur(s) de nommage constatée(s)")
Example #3
# coding: utf-8

from stdf import create_file, line2report, file2list, sparql2dict
import csv

def file2analyse(liste_libelles, report):
    for libelle in liste_libelles:
        check_libelle(libelle, report)


def check_libelle(libelle, report):
    query = """
    PREFIX dcterms: <http://purl.org/dc/terms/>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    select * where {
    ?ark dcterms:isPartOf ?rameau; a skos:Concept;
    skos:prefLabel ?label.
    FILTER contains(?label, " -- """ + libelle + """\")
    }
    """
    results = sparql2dict("http://data.bnf.fr/sparql", query, ["ark", "label"])
    for result in results:
        line2report([result[result.find("ark"):], results[result]["label"][0]],
                    report)

if __name__ == "__main__":
    liste_libelles_filename = input("Nom du fichier contenant les libellés : ")
    liste_libelles = file2list(liste_libelles_filename)
    report = create_file("Controles subdiv gf.txt")
    file2analyse(liste_libelles, report)
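The way check_libelle consumes the query results (results[result]["label"][0], plus the result.find("ark") slice) suggests that sparql2dict returns a dictionary keyed by the value of the first requested variable, here the ark URI, with each entry mapping the remaining variable names to lists of values. That shape is inferred from this usage, not from any stdf documentation; the snippet below only illustrates the assumption with a dummy ARK.

# Assumed shape of the sparql2dict result, inferred from the indexing above
results = {
    "http://data.bnf.fr/ark:/12148/cb000000000": {"label": ["Littérature -- Exemple"]},
}
for result in results:
    ark = result[result.find("ark"):]       # "ark:/12148/cb000000000"
    label = results[result]["label"][0]     # "Littérature -- Exemple"
    print(ark, label)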
Example #4
def ark2key(arkA):
    # Header reconstructed from the call "ark2key(arkA)" further down; pep_record
    # is assumed to be fetched earlier in the original function (not shown here).
    xml_rec = None  # guard in case pep_record yields nothing
    for rec in pep_record:
        xml_rec = pep_record[rec]
    key = ""
    if xml_rec is not None:
        key = sru.record2fieldvalue(xml_rec, "100$a") + sru.record2fieldvalue(
            xml_rec, "100$m")
    key = clean(key)
    print("key", arkA, key)
    return key


arkA_treated = []
dict_key2ark = defaultdict(list)
dict_arkA2row = defaultdict(list)

output_file = create_file("pep_homonymes.txt")

with open(input_filename, encoding="utf-8") as f:
    for row in f:
        arkA = row.split("\t")[2]
        if arkA not in arkA_treated:
            arkA_treated.append(arkA)
            key = ark2key(arkA)
            dict_key2ark[key].append(arkA)
        dict_arkA2row[arkA].append(row)

for key in dict_key2ark:
    if len(dict_key2ark[key]) > 1:
        print("homonymes", key, arkA)
        for arkA in dict_key2ark[key]:
            for row in dict_arkA2row[arkA]:
Example #5
import os

from stdf import create_file

liste = [
    "gme-10025-000_0.jpg", "gme-10026-000_0.jpg", "gme-10170-001_0.jpg",
    "gme-10314-003_0.jpg", "gme-10437-000_0.jpg", "gme-11010-000_0.jpg",
    "gme-11571-000_0.jpg", "gme-11817-000_0.jpg", "gme-11616-000_1725.jpg",
    "gme-11748-000_1719.jpg", "gme-12544-000_1712.jpg",
    "gme-12545-000_1715.jpg", "gme-12546-000_1720.jpg",
    "gme-12547-000_1711.jpg", "gme-5313-000_1727.jpg", "gme-7077-000_1707.jpg",
    "gme-7996-000_1706.jpg", "gml-5446-000_1720.jpg", "gml-6536-001_1722.jpg",
    "gml-7723-000_1716.jpg", "gmlc-723-001_1721.jpg", "gmt-11162-002_1699.jpg",
    "gmt-18189-000_1698.jpg", "gmt-18842-001_1713.jpg",
    "gmt-19615-003_1697.jpg", "gmt-24806-001_1718.jpg",
    "gmt-25521-000_1702.jpg", "gmt-26627-019_1704.jpg",
    "gmt-26631-002_1705.jpg", "gmt-26637-077_1723.jpg",
    "gmt-26637-083_1708.jpg", "gmt-27015-007_1709.jpg",
    "gmt-27578-013_1723.jpg", "gmt-28575-000_1726.jpg",
    "gmt-30821-001_1709.jpg", "gmt-30821-002_1703.jpg",
    "gmt-30821-003_1709.jpg", "gmt-30821-004_1709.jpg",
    "gmt-30821-012_1703.jpg", "gmt-30821-014_1708.jpg",
    "gmt-30821-023_1708.jpg", "gmt-7766-002_1700.jpg", "c-138-000_63 2003.jpg",
    "gme-11015-000_109 2003.jpg", "gme-12308-001_203 2003.jpg",
    "gme-13052-000_259 2003.jpg", "gme-13087-002_192 2003.jpg",
    "gme-13272-000_137.jpg"
]

for el in liste:
    output_filepath = os.path.join(
        r"C:\Users\Lully\Documents\Hélène\scripts\mn\verification_noms_de_fichiers\verification_noms_de_fichiers\tests",
        el)
    create_file(output_filepath)
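Judging by the output path (…\verification_noms_de_fichiers\tests), Example #5 appears to generate empty fixture files whose deliberately varied names can then be checked by the directory analysis driven by launch_analyse in Example #2.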
Example #6
                          sparql_query, ["uri_GEO", "type_GEO"])
    
    for el in results:
        ark = el[el.find("ark"):]
        type_align = ark2type(ark)
        if type_align == "GEO":
            geo_id = ark
    if geo_id == "":
        geo_id = None
    return geo_id


identifiant = input("Identifiant (nom du fichier) : ")
sparql_query = """
select * where {
  ?ressource wdt:P268 ?idBnF;
             rdfs:label ?nom;
             wdt:P625 ?coordonnees_geo;
             wdt:P31 ?type_construction.
  ?type_construction wdt:P279+ wd:Q811979.
             
FILTER (langMatches(lang(?nom), "FR"))
             }
    """
report = create_file(identifiant + ".txt")
headers = ["ID Wikidata", "ARK", "Nom",
               "Coordonnées geo", "Conversion RAM > Geo ?",
               "ARK Rameau initial dans Wikidata"]
line2report(headers, report)
launch(sparql_query, report)
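Example #6 writes a header row with line2report(headers, report) before launching the query, and Example #3 passes it a list of values plus the open report file in the same way. The helper therefore presumably joins the values into a single delimited line; the tab separator below is a guess, not the stdf implementation, and Example #8 passes an extra counter argument that is ignored in this sketch.

# Hypothetical sketch of line2report (separator and newline handling are assumptions)
def line2report(values, report_file, counter=None):
    # counter mirrors the optional third argument seen in Example #8; unused here
    report_file.write("\t".join(str(val) for val in values) + "\n")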
Example #7
# coding: utf-8

from itertools import zip_longest

from stdf import create_file

i = 0

errors_file = create_file("errors_file.txt")

liste_files = [
    'lot1_alb_clusters_dedupe.txt', 'lot1_dnm_clusters_dedupe.txt',
    'lot1_gv_clusters_dedupe.txt', 'lot1_jh_clusters_dedupe.txt',
    'lot2-oeuvres_sans_alignements_existants-clusters_sans_agregats.txt',
    'lot3_clusters_sans_alignement_sans_agregats_pour_dedupe.txt',
    'lot4_minhashing-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt',
    'lot5_minhashing-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt',
    'lot6-oeuvres_sans_alignements_existants-clusters_sans_agregats-dedupe.txt'
]

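# Note: the two reassignments below discard the list defined above; only the
# last value of liste_files (the "-nett-" chunk files) is actually processed.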
liste_files = ["tous_clusters_dedupe.txt"]
liste_files = [
    "tous_clusters_dedupe-nett-100000.txt",
    "tous_clusters_dedupe-nett-1000000.txt",
    "tous_clusters_dedupe-nett-1100000.txt",
    "tous_clusters_dedupe-nett-1200000.txt",
    "tous_clusters_dedupe-nett-1300000.txt",
    "tous_clusters_dedupe-nett-1400000.txt",
    "tous_clusters_dedupe-nett-1500000.txt",
    "tous_clusters_dedupe-nett-1600000.txt",
    "tous_clusters_dedupe-nett-1700000.txt",
Example #8
        line = [ark]
        line.extend(extract_labels(ark, xml_record, "166"))
        line2report(line, report_sujet_lieu, i)
    elif test167:
        # print(ark, "167 avec subdivision sujet")
        line = [ark]
        line.extend(extract_labels(ark, xml_record, "167"))
        line2report(line, report_lieu_sujet, i)

def extract_labels(ark, xml_record, tag):
    intermarc2unimarc = {"166": "250", "167": "215"}
    intermarc_label = sru.record2fieldvalue(xml_record, tag)
    intermarc_subfields = ""
    for field in xml_record.xpath(f"*[@tag='{tag}']"):
        intermarc_subfields = sru.field2listsubfields(field)
    unimarc_record = sru.SRU_result(f'aut.persistentid any "{ark}"').dict_records[ark]
    unimarc_label = sru.record2fieldvalue(unimarc_record, intermarc2unimarc[tag])
    unimarc_subfields = ""
    for field in unimarc_record.xpath(f"*[@tag='{intermarc2unimarc[tag]}']"):
        unimarc_subfields = sru.field2listsubfields(field)

    uri = "http://data.bnf.fr/" + ark
    label = uri2label(uri)
    return [intermarc_subfields, intermarc_label, unimarc_subfields, unimarc_label, label]


if __name__ == "__main__":
    query = "aut.type any RAM"
    report_lieu_sujet = create_file("Notices_Lieu_Sujet_a_retourner.txt")
    report_sujet_lieu = create_file("Notices_Sujet_Lieu_a_conserver.txt")
    query2reports(query, report_lieu_sujet, report_sujet_lieu)