def clean_string(str_init, complet=False, clean_udecode=True):
    """Trim a string and optionally strip punctuation, spaces and accents.

    Args:
        str_init: Text to clean; ``None`` is treated as an empty string.
        complet: When true, every sign listed in the module-level
            ``ponctuation`` collection is removed, along with all space
            characters.
        clean_udecode: When true, the result is lower-cased and passed
            through ``udecode``.

    Returns:
        The cleaned string.

    NOTE(review): the original French docstring says the period is not
    removed; that depends on the content of ``ponctuation``, which is
    defined elsewhere -- confirm.
    """
    text = "" if str_init is None else str_init
    if complet:
        for mark in ponctuation:
            # Spaces are removed on each pass, alongside the current sign.
            text = text.replace(mark, "")
            text = text.replace(" ", "")
    text = text.strip()
    return udecode(text.lower()) if clean_udecode else text
def clean_title_field(field):
    """Normalise a title field so that it can be compared with other titles.

    The value is passed through ``clean_dollars`` and ``udecode``, then
    lower-cased. It is then cut at the first occurrence of every entry of
    ``skipwords``, filtered with ``clean_stopwords``, and finally has its
    runs of spaces collapsed.

    Args:
        field: Raw title value.

    Returns:
        The normalised title, with single spaces between tokens and no
        leading or trailing space.
    """
    field = udecode(clean_dollars(field)).lower()
    # One pass over the skip words is sufficient: truncating only ever
    # shortens the text, so a skip word that was cut away (or was absent)
    # cannot reappear after a later truncation.
    for skipword in skipwords:
        position = field.find(skipword)
        if position != -1:
            field = field[:position]
    field = clean_stopwords(field)
    # Splitting on a literal space (not on all whitespace) is deliberate:
    # only runs of " " are collapsed.
    return " ".join(token for token in field.split(" ") if token)
def extract_params(params):
    """Build a dict from ``key=value`` strings and normalise its ``query``.

    Each item is split on its first ``=``: the left part becomes the key
    and everything to the right (further ``=`` signs included) becomes the
    value. Items without any ``=`` are ignored. When a key appears several
    times, the last value wins.

    Args:
        params: Iterable of ``key=value`` strings.

    Returns:
        The parameter dict, whose ``"query"`` entry has been lower-cased
        and passed through ``udecode``.

    Raises:
        KeyError: If no ``query`` parameter was supplied.
    """
    parsed = {}
    for raw_param in params:
        name, separator, content = raw_param.partition("=")
        if separator:
            parsed[name] = content
    parsed["query"] = udecode(parsed["query"].lower())
    return parsed
def searchGFinQuery(query_element):
    """Tell whether a query mentions a label of ``referentielGF_libelles``.

    The query is cut into fragments on a fixed list of operators and
    delimiters. Each fragment is stripped, passed through ``udecode`` and
    lower-cased. The query matches as soon as one label is contained in one
    fragment; a fragment that is exactly equal to a label is simply a
    special case of that containment test.

    Args:
        query_element: Query string to inspect.

    Returns:
        ``True`` if at least one label occurs in a fragment, else ``False``.
    """
    separators = [
        " -- ",
        "--",
        '"',
        " or ",
        " and ",
        " any ",
        " all ",
        " adj ",
        " notice ",
        " dc.type ",
    ]
    for separator in separators:
        query_element = query_element.replace(separator, "¤")
    # Normalise every fragment once, instead of once per label.
    fragments = [
        udecode(part.strip()).lower()
        for part in query_element.split("¤")
        if part.strip()
    ]
    if not fragments:
        # Nothing left to compare against the reference labels.
        return False
    return any(
        label in fragment
        for label in referentielGF_libelles
        for fragment in fragments
    )
def clean_string(string, replaceSpaces=False, replaceTirets=False):
    """Normalise a string: accents, punctuation, case and whitespace.

    The text is lower-cased and passed through ``udecode``. Every sign of
    the module-level ``punctuation`` collection and every apostrophe is
    replaced by a space, and whitespace runs are then collapsed.

    Args:
        string: Text to clean; ``None`` or an empty value yields ``""``.
        replaceSpaces: When true, all whitespace is removed from the
            result instead of being collapsed to single spaces.
        replaceTirets: When true, hyphens are treated as separators too.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if not string:
        return ""
    string = udecode(string.lower())
    for sign in punctuation:
        string = string.replace(sign, " ")
    string = string.replace("'", " ")
    if replaceTirets:
        string = string.replace("-", " ")
    # str.split() without argument splits on any whitespace and already
    # discards empty tokens and leading/trailing blanks.
    words = string.split()
    # Joining with "" (rather than deleting " " beforehand) also removes
    # tabs and newlines when spaces are not wanted.
    joiner = "" if replaceSpaces else " "
    return joiner.join(words)
def searchLOCinQuery(query_element):
    """Tell whether a query mentions an entry of ``subdiv_lieu``.

    The query is cut into fragments on a fixed list of operators and
    delimiters. The non-empty fragments are re-joined with single spaces,
    lower-cased and passed through ``udecode``. The query matches as soon
    as one entry of ``subdiv_lieu`` is contained in that normalised text.

    A fragment that is exactly equal to an entry is a special case of the
    containment test, so no separate equality check is needed.

    Args:
        query_element: Query string to inspect.

    Returns:
        ``True`` if at least one entry occurs in the normalised query,
        else ``False``. A query without any usable fragment never matches.
    """
    separators = [
        " -- ",
        "--",
        '"',
        " or ",
        " and ",
        " all ",
        " adj ",
        " any ",
        " notice ",
        " dc.type ",
    ]
    for separator in separators:
        query_element = query_element.replace(separator, "¤")
    fragments = [
        part.strip() for part in query_element.split("¤") if part.strip()
    ]
    if not fragments:
        return False
    normalised_query = udecode(" ".join(fragments).lower())
    return any(place in normalised_query for place in subdiv_lieu)
def clean_content_field(field):
    """Normalise a content field.

    The value is passed through ``udecode`` and lower-cased, then handed
    to ``clean_dollars`` and ``clean_stopwords`` in that order.

    Args:
        field: Raw field value.

    Returns:
        The value returned by ``clean_stopwords``.
    """
    lowered = udecode(field).lower()
    return clean_stopwords(clean_dollars(lowered))
from lxml.html import fromstring from urllib import request import urllib.parse import urllib.error from udecode import udecode import re from aut2id_concepts import accesspoint2sru import SRUextraction as sru from stdf import * headers = "URL page,URL lien,Texte,Nb résultats,URL SRU Gallica,Requête initiale,A modifier ?,motif,Nouvelle requête GF,\ Nouvelle URL GF,Nouvelle requête Lieu,Nouvelle URL Lieu".split(",") referentielGF_libelles = file2list("referentiel_gf.txt") referentielGF_libelles = [udecode(el.lower()) for el in referentielGF_libelles] subdiv_lieu = file2list("subdiv_lieux.txt") subdiv_lieu = [udecode(el.lower()) for el in subdiv_lieu] query_done = {} entitesLieu = {} class Query: def __init__(self, url, text=""): self.url = url self.text = text self.params = url.split("?")[1].split("&") self.params = extract_params(self.params) self.query = urllib.parse.unquote(self.params["query"]) self.nb_results, self.sru_url = query2results(self.query) [self.gf_query, self.loc_query, self.hist_crit_query,