def clean_string(str_init, complet=False, clean_udecode=True):
    """Normalise a string, optionally removing punctuation and spaces.

    ``None`` is treated as the empty string.

    Args:
        str_init: The text to clean, or ``None``.
        complet: When true, for each entry of the module-level
            ``ponctuation`` collection that entry and every space
            character are removed, then the result is stripped.
        clean_udecode: When true, the text is lower-cased and passed
            through ``udecode``.

    Returns:
        The cleaned string.

    NOTE(review): the original docstring says the period is kept; that
    depends on the content of ``ponctuation``, which is defined outside
    this block -- confirm.
    """
    text = "" if str_init is None else str_init
    if complet:
        for mark in ponctuation:
            text = text.replace(mark, "")
            text = text.replace(" ", "")
        text = text.strip()
    if clean_udecode:
        text = udecode(text.lower())
    return text
Exemple #2
0
def clean_title_field(field):
    """Normalise a title field for comparison.

    The field is passed through ``clean_dollars`` and ``udecode`` and
    lower-cased. It is then cut at the first occurrence of each entry of
    ``skipwords`` it contains, passed through ``clean_stopwords``, and
    finally has runs of spaces collapsed to a single space.

    Args:
        field: The raw title text.

    Returns:
        The normalised title.
    """
    field = udecode(clean_dollars(field)).lower()
    # A single pass is enough: once the text is cut at a skipword, the
    # remaining prefix cannot contain that skipword any more, and later
    # cuts only shorten the prefix further.
    for skipword in skipwords:
        position = field.find(skipword)
        if position != -1:
            field = field[:position]
    field = clean_stopwords(field)
    # Splitting on a literal space and dropping empty pieces collapses
    # repeated spaces and trims leading/trailing ones.
    return " ".join(token for token in field.split(" ") if token)
Exemple #3
0
def extract_params(params):
    """Build a dict from a list of ``key=value`` strings.

    Each entry is split at its first ``=``; anything after it (including
    further ``=`` signs) is the value. Entries without ``=`` are
    ignored. The ``"query"`` value is then lower-cased and passed
    through ``udecode``.

    Args:
        params: Iterable of ``key=value`` strings.

    Returns:
        A dict mapping each key to its value, with a normalised
        ``"query"`` entry.

    Raises:
        KeyError: If no entry provides a ``query`` key.
    """
    parsed = {}
    for raw in params:
        name, separator, content = raw.partition("=")
        if separator:
            parsed[name] = content
    parsed["query"] = udecode(parsed["query"].lower())
    return parsed
Exemple #4
0
def searchGFinQuery(query_element):
    """Tell whether a query mentions a label from ``referentielGF_libelles``.

    The query is split on a fixed set of operators and delimiters. Each
    non-empty piece is stripped, passed through ``udecode`` and
    lower-cased. The result is true when a piece equals a reference
    label, or when a reference label occurs inside a piece.

    Args:
        query_element: The query text to inspect.

    Returns:
        ``True`` if a reference label is found, ``False`` otherwise.
    """
    split_criteres = [
        " -- ", "--", '"', " or ", " and ", " any ", " all ", " adj ",
        " notice ", " dc.type "
    ]
    # Replace every separator with one marker so a single split suffices.
    for critere in split_criteres:
        query_element = query_element.replace(critere, "¤")
    # Normalise each piece once, rather than once per reference label.
    elements = [
        udecode(part.strip()).lower()
        for part in query_element.split("¤")
        if part.strip()
    ]
    # Exact match first, then fall back to substring search.
    if any(element in referentielGF_libelles for element in elements):
        return True
    return any(
        libelle in element
        for libelle in referentielGF_libelles
        for element in elements
    )
Exemple #5
0
def clean_string(string, replaceSpaces=False, replaceTirets=False):
    """Clean a string: accents, punctuation, upper case.

    The text is lower-cased and passed through ``udecode``; every entry
    of ``punctuation`` and every apostrophe is replaced by a space; then
    whitespace runs are collapsed.

    Args:
        string: The text to clean.
        replaceSpaces: When true, the words are joined with no separator
            so the result contains no whitespace at all.
        replaceTirets: When true, hyphens are replaced by spaces before
            whitespace is collapsed.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    string = udecode(string.lower())
    for sign in punctuation:
        string = string.replace(sign, " ")
    string = string.replace("'", " ")
    if replaceTirets:
        string = string.replace("-", " ")
    # Split on any whitespace first, then pick the separator. Removing
    # only " " before the split would let tabs or newlines come back as
    # spaces in the joined result.
    words = string.split()
    separator = "" if replaceSpaces else " "
    return separator.join(words)
Exemple #6
0
def searchLOCinQuery(query_element):
    """Tell whether a query mentions an entry of ``subdiv_lieu``.

    The query is split on a fixed set of operators and delimiters. The
    result is true when a normalised piece equals an entry of
    ``subdiv_lieu``, or when an entry occurs anywhere in the normalised,
    space-joined pieces.

    Args:
        query_element: The query text to inspect.

    Returns:
        ``True`` if an entry of ``subdiv_lieu`` is found, ``False``
        otherwise.
    """
    split_criteres = [
        " -- ", "--", '"', " or ", " and ", " all ", " adj ", " any ",
        " notice ", " dc.type "
    ]
    # Replace every separator with one marker so a single split suffices.
    for critere in split_criteres:
        query_element = query_element.replace(critere, "¤")
    parts = [
        part.strip() for part in query_element.split("¤") if part.strip()
    ]
    # Exact match: the method must be called (``lower()``, not ``lower``),
    # and the piece is normalised the same way ``subdiv_lieu`` is built.
    if any(udecode(part.lower()) in subdiv_lieu for part in parts):
        return True
    # Fallback: look for any entry as a substring of the whole query.
    joined = udecode(" ".join(parts).lower())
    return any(lieu in joined for lieu in subdiv_lieu)
Exemple #7
0
def clean_content_field(field):
    """Normalise a content field.

    Applies, in order: ``udecode``, lower-casing, ``clean_dollars`` and
    ``clean_stopwords``.

    Args:
        field: The raw field text.

    Returns:
        The value returned by ``clean_stopwords``.
    """
    lowered = udecode(field).lower()
    without_dollars = clean_dollars(lowered)
    return clean_stopwords(without_dollars)
Exemple #8
0
from lxml.html import fromstring
from urllib import request
import urllib.parse
import urllib.error
from udecode import udecode

import re

from aut2id_concepts import accesspoint2sru
import SRUextraction as sru
from stdf import *

# Header labels (presumably the column titles of the output report --
# confirm against the code that writes them).
headers = [
    "URL page",
    "URL lien",
    "Texte",
    "Nb résultats",
    "URL SRU Gallica",
    "Requête initiale",
    "A modifier ?",
    "motif",
    "Nouvelle requête GF",
    "Nouvelle URL GF",
    "Nouvelle requête Lieu",
    "Nouvelle URL Lieu",
]
# Reference lists loaded from text files, lower-cased and passed through
# udecode so they can be compared with normalised query text.
referentielGF_libelles = [
    udecode(libelle.lower()) for libelle in file2list("referentiel_gf.txt")
]
subdiv_lieu = [
    udecode(lieu.lower()) for lieu in file2list("subdiv_lieux.txt")
]
# Start empty; populated elsewhere in the module (not visible here).
query_done = dict()
entitesLieu = dict()


class Query:
    def __init__(self, url, text=""):
        self.url = url
        self.text = text
        self.params = url.split("?")[1].split("&")
        self.params = extract_params(self.params)
        self.query = urllib.parse.unquote(self.params["query"])
        self.nb_results, self.sru_url = query2results(self.query)
        [self.gf_query, self.loc_query, self.hist_crit_query,