def make_helper_tables(): """creates regex table, regex_id map table basically creates the tables used later for reconstruction the matches they don't contain the scientific names""" for row in session.query( Mapping): #for each entry in the table mapping_cites # traitement common name entree = row.common_name.split( '; ') #create list with common names as entries for name in entree: #for each common name name.lower() #get letters to be in lowercase list_of_words = name.split( ' ') #create the a list of the words composing the common name for i in list_of_words: if len(i) <= 2: #let alone small words pass else: #big words - we do something with them: namely if type( i ) == str: #check if string, if not string we just sit idle res = word_to_regex(i) else: print('problem with input') res = None if len( i ) < 5: #for small words we have additional requirements before they are added to the list if ('\s' in res) or ('\w' in res) or (res == ''): print('exception, pass short error', res) res = None else: if session.query(Regex).filter( Regex.reg == res ).scalar( ) == None and res != None: #if there isn't already an entry and the regex isn't problematic entry = Regex(reg=res, word=word.lower().strip( ';')) #create entry entry.insertregex(session) #insert entry session.commit() #commit entry #for long words len>=5 elif session.query(Regex).filter(Regex.reg == res).scalar( ) == None and res != None: #if entry doesn't exist: create entry in regex database word = i.strip(';') entry = Regex(reg=res, word=word.lower().strip(';')) entry.insertregex(session) session.commit() else: #if no entry pass pass #next step: fillup Match_Regex_IdMap requested_re = session.query( Regex.id).filter(Regex.reg == res).scalar() request = session.query(Match_Regex_IdMap.id).filter_by( id_re=requested_re, id_map=row.id).scalar() if request == None and requested_re != None: #if there isn't an entry (request) and the regular_expression exists in Regex (requested_re) entry = Match_Regex_IdMap(id_re=requested_re, id_map=row.id) #create entry entry.insertMatch(session) #insert session.commit() #commit
def re_generator_species(): """Create a dictionary {id_species : {common name 1 : regex1, ....}}. These regexes will be used to multiples times, so we create them for once for each parsing. We can change the code here to adapt the way of create these regexes. The adaption used can be used to create different classification. The classification 2 checks the presence of the words of one common name in the text. The classification 3 considers the order of the word and the proximity.""" regex_classification_2 = {} regex_classification_3 = {} for row in session.query(Mapping): #List of common names (stop_names are the common names too general that might match with everything [e.g. Little blue macaw]) cns = [ _.strip(" ") for _ in row.common_name.split(";") if (len(_.strip(" ")) > 0 and _.strip(" ") not in stop_names) ] #Add the scientific name cns.append(row.scientific_name_cites) #List of list of termes included in common names without little words cns_decomposed = [[ str.lower(_) for _ in re.split(" |-|'", first) if (len(_) > 2) ] for first in cns if (len(first) > 0)] #re.split(" -", first) #Drop useless words to caracterize a specie (e.g. "and") cns_decomposed = [[word for word in l if (word not in useless_words)] for l in cns_decomposed] #Drop common bird denomination since several ads don't mention it (it's trivial) (e.g. "parrot") #Keep it if there remains only one word cns_decomposed = [ list(filter(lambda x: x not in too_common_words, l)) if len(list(filter(lambda x: x not in too_common_words, l))) >= 2 else l for l in cns_decomposed ] print(cns_decomposed) #Replace each letter with its mitigation in the mitigation dic miss_cns = map( lambda list_words: ("".join([ mp_mit_2[char] if (char in mp_mit_2.keys()) else char for char in list(word) ]) for word in list_words), cns_decomposed) #Interpret map object miss_cns = [list(_) for _ in list(miss_cns)] dict_regex_2 = {} dict_regex_3 = {} #Populate the dict with regex according to each name for name_decomposed, name in zip(miss_cns, cns): #Regex to only find the words in the text. Thus we don't want another letter before to avoid matching "reared" searching for "red" reg_2 = "".join( [f"(?=.*[^0-9a-zÀ-ÿ]{word})" for word in name_decomposed]) reg_2 = f"^{reg_2}.*" dict_regex_2[name] = reg_2 #Regex to find the words in a specific order in a specific proximity reg_3 = ".{0,10}".join([f"{word}" for word in name_decomposed]) reg_3 = f"^(?=.*({reg_3})).*" dict_regex_3[name] = reg_3 regex_classification_2[row.id] = (dict_regex_2, row.annex_number_CITES) regex_classification_3[row.id] = (dict_regex_3, row.annex_number_CITES) return (regex_classification_2, regex_classification_3)
def check_update(browser, session):
    """Return True if the first ad of the page is already in the database,
    i.e. the database is up to date for this country."""
    firstad = browser.driver.find_elements_by_xpath(
        '//div[@class="row clearfix"][@style]')[0]
    ad_number = firstad.find_element_by_xpath(
        ".//input[@type=\"checkbox\"]").get_attribute("name")
    country = map_country(browser.driver.current_url)
    resp = session.query(exists().where(
        and_(Urls_ads.ad_number == ad_number,
             Urls_ads.country_id == country))).scalar()
    return resp
def make_dictionnary(): """Creates dictionnary for small multiple (Script classification_1_visualisation), this dictonary contains the words corresponding to the regex used for classification 1""" result_dict = {} for row in session.query(Mapping): #for each ligne in table mapping result = [] #temporary list #add scientific name result.append( row.scientific_name_cites.lower()) #append scientific name to list #add common names entry = row.common_name.split( '; ') #create a list with each common name as an entry for name in entry: #for each name in the list created in the previous line name.lower() #get all letters to lowercase list_of_words = name.split( ' ' ) #split the common name into it's components [this creates again a list] for i in list_of_words: #for each entry given here try: res = word_to_regex( i) #try to make a regular expression from the word except: res = None if len(i) <= 2: #let alone small words pass elif len(i) < 5 and ( ('\s' in res) or ('\w' in res) or (res == '') ): #if there is a problem in the creation of the word, let it alone #and with problem are ment words smaller than 5 letters containing space caracters, commonplace caracters or empty fields pass else: #big words - we do something with them: namely if type( i ) == str: #check if string, if not string we just sit idle if res != None: result.append(i.lower().strip( ';')) #append lower case result without ';' result_dict[ row. id] = result #for each entry in mapping append the list of used words return result_dict #return dictionary
if __name__ == '__main__':
    # Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    path_result = './results/getCodes/codes/'
    # Create the directory
    os.makedirs(os.path.dirname("./results/parseCodes/documentation/"), exist_ok=True)
    # Iterate through the client codes
    for row in session.query(Ads_Codes).filter_by(status=0):  # status=0
        # Skip if the ad has already been parsed
        if session.query(exists().where(Parse_ads.ad_id == row.ad_id)).scalar():
            pass
        else:
            # Copy the global variable containing the fields of the ad
            dic_champs = dict_champ.copy()
            filename = row.client_code  # keep the file name to use later
            dic_champs["Ad Number"] = row.ad_number
            # Obtain the HTML object
            objet = lxml.html.parse(f"{path_result}{filename}").getroot()
            # The main function that parses the HTML object
            dic_champs = get_champs(dic_champs, objet, doc)
            entry = create_entry(dic_champs, row)
if __name__ == '__main__':
    if status_modified:
        # If the regex generation or the Mapping table changed, regenerate the helper tables
        make_helper_tables()
    res = make_dictionnary()
    # Generate list_scientific
    for row in session.query(Mapping):
        a = word_to_regex(row.scientific_name_cites)
        list_scientific.append(a)
    path_result = './results/parse/'
    # Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    # Create entries for Parsing_Psittaciformes (one per ad).
    # match_scientific and match_common only take the values 1 (match found) or 0 (no match).
    # list_common lists all the regular expressions found.
    for row in session.query(Parse_ads):  # for each parsed ad
        if session.query(Parsing_Psittaciformes_or_no.ad_id
def resume_extraction(browser, session, pages):
    """Check whether the first ad of the page is already in the database. If it is,
    skip ahead n pages at a time until the first new ad, then go back up to n pages.
    The purpose is to locate the interval of n pages where the script stopped.
    This function exists because there is no way to select a specific page on the
    website or to jump n pages forward (see the UPDATE WARNING in the main)."""
    # Obtain the total number of pages for this country at this moment,
    # to avoid running past the last page
    wait = WebDriverWait(browser.driver, 90)
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[@style][contains(text(),\"Number of ads: \")]")))
    raw_string = browser.driver.find_element_by_xpath(
        "//div[@style][contains(text(),\"Number of ads: \")]").text
    total_pages = ceil(
        int(re.findall(r"Number of ads: (\d*)\. .*", raw_string)[0]) / ADS_PER_PAGE)
    # Extract the first ad before the loop
    wait = WebDriverWait(browser.driver, 90)
    wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '//div[@class="row clearfix"][@style]')))
    firstad = browser.driver.find_elements_by_xpath(
        '//div[@class="row clearfix"][@style]')[0]
    ad_number = firstad.find_element_by_xpath(
        ".//input[@type=\"checkbox\"]").get_attribute("name")
    country = map_country(browser.driver.current_url)
    # Counter of the number of pages that have been skipped
    counter = 0
    while session.query(exists().where(
            and_(Urls_ads.ad_number == ad_number,
                 Urls_ads.country_id == country))).scalar() and not counter == total_pages - 1:
        for n in range(pages):
            time.sleep(random.uniform(3, 3.2))
            test = 0
            while not test:
                try:
                    wait = WebDriverWait(browser.driver, 90)
                    wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH, "//input[@name=\"button_hits_seen\"]")))
                    browser.driver.find_element_by_xpath(
                        "//input[@name=\"button_hits_seen\"]").click()
                    test = 1
                except WebDriverException as e:
                    print(f"{e}\n")
                    doc.adderrorlog(f"{e}\n")
                    # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                    with open(
                            f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                            'wb') as f:
                        f.write(str(doc).encode('utf-8'))
                    # The webdriver is on an error page, go back
                    browser.driver.back()
            counter += 1
            # Need to break here, otherwise an error pops up at the end when we try to
            # click a next button that no longer exists
            print(f"{country} - Skipped pages : {counter} / Total pages : {total_pages}\n")
            doc.addlog(f"{country} - Skipped pages : {counter} / Total pages : {total_pages}\n")
            if counter == total_pages - 1:
                break
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//div[@class="row clearfix"][@style]')))
                firstad = browser.driver.find_elements_by_xpath(
                    '//div[@class="row clearfix"][@style]')[0]
                ad_number = firstad.find_element_by_xpath(
                    ".//input[@type=\"checkbox\"]").get_attribute("name")
                test = 1
            except NoSuchWindowException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                # The webdriver is on an error page, go back
                browser.driver.back()
            except WebDriverException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                # The webdriver is on an error page, go back
                browser.driver.back()
    # Go back n pages if we are not on the first page; we check via the presence of the previous button
    if check_exists_by_xpath(browser.driver, "//input[@name=\"previous_hits_button\"]"):
        # Number of pages to go back, according to the number we have actually skipped
        back_pages = counter % pages if pages != counter else pages
        for n in range(back_pages + 1):
            time.sleep(random.uniform(0.2, 1))
            test = 0
            while not test:
                try:
                    wait = WebDriverWait(browser.driver, 90)
                    wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH, "//input[@name=\"previous_hits_button\"]")))
                    browser.driver.find_element_by_xpath(
                        "//input[@name=\"previous_hits_button\"]").click()
                    test = 1
                except WebDriverException as e:
                    print(f"{e}\n")
                    doc.adderrorlog(f"{e}\n")
                    # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                    with open(
                            f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                            'wb') as f:
                        f.write(str(doc).encode('utf-8'))
                    # The webdriver is on an error page, go back
                    browser.driver.back()
            counter -= 1
        print(f"{country} : Go back {back_pages} pages\n")
        doc.addlog(f"{country} : Go back {back_pages} pages\n")
    doc.addlog(
        f"{country} : To resume the extraction : {counter} have been passed per {pages} pages interval"
    )
    print(f"{country} : SUCCESS Resume")
def getads(browser, session, pages=20, update=True):
    """Go through all pages to collect the ads' URLs. `pages` is the number of pages
    used to search for the last stop. If there are recent new ads, the function updates
    the database rather than resuming the extraction."""
    added_ad = 0
    # Get the current country
    country = map_country(browser.driver.current_url)
    # Check whether we are updating
    print(f"{country} : Updating the ads") if not check_update(browser, session) else None
    doc.addlog(f"{country} : Updating the ads")
    # If the country has not been extracted yet, don't resume.
    # If the number of entries is smaller than the pages interval, don't resume either.
    nbr_entries_country = len(
        session.query(Urls_ads).filter(Urls_ads.country_id == country).all())
    if nbr_entries_country == 0 or nbr_entries_country < pages:
        pass
    else:
        # Resume the extraction
        print(f"{country} : Resuming extraction...")
        doc.addlog(f"{country} : Resuming extraction...")
        resume_extraction(browser, session, pages)
    # Just to avoid skipping the first page
    counter = 0
    # Allows breaking once we have updated the start of the ads and there are no more new ads
    counter_not_new = 0
    # No need to wait between requests, it is on the same page, just javascript.
    # Loop until the next button disappears at the end.
    while check_exists_by_xpath(browser.driver, "//input[@name=\"button_hits_seen\"]"):
        # Click on the "next" button, except on the first page
        time.sleep(random.uniform(2, 2.5))
        # Retry until it works or CTRL-C
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//input[@name=\"button_hits_seen\"]")))
                browser.driver.find_element_by_xpath(
                    "//input[@name=\"button_hits_seen\"]").click() if counter == 1 else None
                counter = 1
                test = 1
            except TimeoutException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                # Timeout because there is no more next button, leave the retry loop
                pass
            except WebDriverException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                # The webdriver is on an error page, go back
                browser.driver.back()
        test = 0
        while not test:
            try:
                wait = WebDriverWait(browser.driver, 90)
                wait.until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//div[@class="row clearfix"][@style]')))
                for ad in browser.driver.find_elements_by_xpath(
                        '//div[@class="row clearfix"][@style]'):
                    # The website is inconsistent, some tags contain no ad
                    ad_number = ad.find_element_by_xpath(
                        ".//input[@type=\"checkbox\"]").get_attribute("name")
                    url = ad.find_element_by_xpath(".//a").get_attribute("href")
                    # Avoid the stale element error
                    test = 1
                    # Check whether the entry already exists (by ad_number and country); if so do nothing
                    if session.query(exists().where(
                            and_(Urls_ads.ad_number == ad_number,
                                 Urls_ads.country_id == country))).scalar():
                        counter_not_new += 1
                    else:
                        # Reset the counter since an entry has been added
                        counter_not_new = 0
                        entry = Urls_ads(
                            url=url,
                            ad_id=f"{ad_number}_{get_abr_country(url)}",
                            ad_number=int(ad_number),
                            country_id=country)
                        # Add the entry to the database
                        entry.insertURL(session)
                        added_ad += 1
                        print(f"{country} : Ad added (Tot : {added_ad})\n")
                        doc.addlog(f"{country} : Ad added (Tot : {added_ad})\n")
            # Since the code runs for a long time, sometimes we get a "stale element" error
            except StaleElementReferenceException as e:
                print(f"{e}\n")
                doc.adderrorlog(f"{e}\n")
                # ~~~~~~~~~~~~~~~ Documentation - saving (overwritten) ~~~~~~~~~~~~~~~ #
                with open(
                        f'./results/getArticles/{date_extraction}_{filename_prefix}_documentation.json',
                        'wb') as f:
                    f.write(str(doc).encode('utf-8'))
                # The page is stale: refresh it
                print(f"{country} : Refreshing...\n")
                doc.addlog(f"{country} : Refreshing...")
                browser.driver.refresh()
        print(f"{country} :\n\tNext page\n\tNo new entry since {counter_not_new} entries\n")
        doc.addlog(f"{country} :\n\tNext page\n\tNo new entry since {counter_not_new} entries\n")
        if counter_not_new > (pages * ADS_PER_PAGE):
            # We have updated the country
            print(f"{country} : Ads have been updated")
            break
    print(f"{country} : No more ads\n")
    doc.addlog(f"{country} : No more ads\n")
#~~~~~~~~~~~~~~~ Catch'em all ~~~~~~~~~~~~~~~#
"""REMOVE STRING TO UPDATE THE COUNTRY
=> That's the only solution: you can't go to the end of the ads' list, you can't select a
specific page AND you can't sort by age (Advanced search doesn't work at the moment).
For this reason, the code contains a great amount of try/except, since we need to go across
all pages manually; this increases the chance of a bug and we need to handle them.

UPDATE WARNING: the code could be greatly improved because the page number is visible in the
URL when clicking the button to expand more ads. The code here did not take that into account.
It has not been improved because the crawling had already been done and the present code can
extract the data to be updated. Unfortunately, you must be sure to have all the ads up to the
last page before being able to monitor for new ones."""
completed_countries = []  #["UNITED STATES", "CANADA", "UNITED KINGDOM", "IRELAND", "AUSTRALIA", "NEW ZEALAND", "MALAYSIA", "INDONESIA", "HONG KONG", "INDIA", "SINGAPORE", "PHILIPPINES"] #REMOVE TO UPDATE
for row in session.query(Country).all():
    url = row.url
    info = getbirds(browser, url)
    doc.info['selenium'] = []
    doc.info['selenium'].append(info)
    # Skip the country if it has already been completed
    country = map_country(browser.driver.current_url)
    doc.addlog(f"{country} : info = getbirds(browser, url)")
    if country in completed_countries:
        print(f"{country} : Passed")
        doc.addlog(f"{country} : Passed")
        pass
    else:
if __name__ == '__main__':
    # Global variable which contains the expressions to match
    list_of_birds_test = ["bird", "brd", "amazon", "amazona", "parot", "prot",
                          "african grey", "macaw", "mcw", "macw", "mcaw", "macow",
                          "cockato", "winged", "paraket", "lovebird", "canary", "cnry"]
    # list_of_birds is the list of regular expressions created from the words in list_of_birds_test
    list_of_birds = create_regex_for_birds(list_of_birds_test)
    print(list_of_birds)
    # Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    path_result = './results/classification/'
    # Parse the database
    for row in session.query(Parse_ads):
        # If the ad (ad_id) has not been classified yet (0 or 1)
        if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() == None:
            # Step 1: search the title for each regular expression of list_of_birds
            for expression in list_of_birds:
                # res holds the match found in the title, if any
                res = re.search(str(expression), row.title)
                # If there is a match
                if res != None:
                    # And if there isn't already an entry
                    if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() == None:
                        # The entry is the ad_id with status 1
                        entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                        entry.insertParse_bird(session)
#Strategy: look in the title for words describing birds, using regular expressions
list_of_birds_test = ["bird", "brd", "amazon", "amazona", "parot", "prot", "african grey",
                      "macaw", "mcw", "macw", "mcaw", "macow", "cockato", "winged", "paraket"]
# Global variable which contains the regexes to match
list_of_birds = []
for i in list_of_birds_test:
    a = word_to_regex(i)
    list_of_birds.append(a)

if __name__ == '__main__':
    path_result = './results/parse/'
    # Documentation
    cT = datetime.datetime.now()
    date_parsing = f"{str(cT.year)}-{str(cT.month)}-{str(cT.day)}_{str(cT.hour)}-{str(cT.minute)}"
    doc = Documentation()
    # Parse the database
    c = 0  # counter to trace how many ads have status 1 (classified as bird)
    for row in session.query(Parse_ads):
        if session.query(Parsing_bird_or_no.ad_id).filter_by(ad_id=row.ad_id).scalar() == None:
            # Step 1: search in the title
            for expression in list_of_birds:  # for each defined regular expression
                res = re.search(expression, row.title)  # search in the title
                if res != None:  # if there is a match, go on
                    if session.query(Parsing_bird_or_no.status_bird).filter_by(ad_id=row.ad_id).scalar() == None:
                        # if there isn't already an entry
                        entry = Parsing_bird_or_no(ad_id=row.ad_id, status_bird=1)
                        entry.insertParse_bird(session)
                        session.commit()
                        c += 1
                    pass
            # Step 2: search in the description
            for expression in list_of_birds:
                if row.description != None:
    except:
        price_final_f = montant_f  # if an error occurs, fall back to the original amount
    return price_final_f, res_currency


def entry_ad_clean(row, id_vendor, price, currency, price_in_dollar):
    """Create an entry in the Ads_clean table."""
    entry = Ads_clean(ad_id=row.ad_id, ad_number=row.ad_number, id_vendor=id_vendor, title=row.title,
                      description=row.description, breed=row.breed, age=row.age, sex=row.sex,
                      primary_color=row.primary_color, secondary_color=row.secondary_color,
                      price=price, currency=currency, price_in_dollar=price_in_dollar,
                      payment_forms=row.payment_forms)
    entry.insertAds_clean(session)
    session.commit()


if __name__ == '__main__':
    for row in session.query(Parse_ads):
        # Get the status_parrot of the ad (whether it has been classified as a parrot)
        status_parrot = session.query(
            Classification_3_Ads.parrot).filter_by(ad_id=row.ad_id).scalar()
        # Get the email, website and phone number of the ad
        if row.description != None:  # if there is a description
            email = get_email(row)
            website = get_website(row)
            phone = get_phone(row)
        else:  # otherwise set them to None
            email = None
            website = None
            phone = None
        # Create a vendor entry if there isn't one already
        if session.query(Vendor_analyse).filter_by(pseudo=row.pseudo).scalar(
#Create the directory
os.makedirs(os.path.dirname("./results/classification_2_3/documentation/"),
            exist_ok=True)
#~~~~~~~~~~~~~~ Create Regexes ~~~~~~~~~~~~~~
dic_regexes = re_generator_species()
doc.info["regexes"] = dic_regexes
doc.addlog("Create regexes")
doc.info["cage_regex"] = re_hasCage
doc.info["isbird_regex"] = re_isBird
# Two dicts with the regexes for the two classifications
for dr, classification in zip(
        dic_regexes, (Classification_2_Ads, Classification_3_Ads)):
    for row in session.query(Parse_ads):
        # Skip if an entry already exists
        if session.query(exists().where(
                classification.ad_id == row.ad_id)).scalar():
            pass
        else:
            entry = search_re(ad=row, regexes=dr, classification=classification)
            print(f"{row.ad_id}...\n")
            doc.addlog(f"Search in ad {row.ad_id}")
            entry.insert(session)
        # Write the documentation several times so it is not lost if the script fails
        with open(
                f'./results/classification_2_3/documentation/{date_parsing}_documentation.json',
from ressources.documentation import Documentation  # documentation.py, located in the ressources folder
from ressources.db import session, updateURL, Url  # db.py, located in the ressources folder


def saveData(browser, filename_prefix='selenium'):
    '''Example function that saves the client code, the screenshot and the server code'''
    browser.clientCode('./results/html/' + filename_prefix + '_clientCode.html')
    browser.screenshot('./results/screenshots/' + filename_prefix + '_screenshot.png',
                       width=1080)  # the window width is fixed with width


if __name__ == '__main__':
    doc = Documentation()
    # ~~~~~~~~~~~~~~~ Start Selenium ~~~~~~~~~~~~~~~ #
    browser = Chrome(headless=True)  # or Chrome(...)
    # ~~~~~~~~~~~~~~~ Retrieve the URLs to visit ~~~~~~~~~~~~~ #
    for i in session.query(Url).filter_by(status=0):
        doc.info['selenium'] = []
        info = browser.get(i.url)
        doc.info['selenium'].append(info)
        saveData(browser, filename_prefix=str(i.id))
        with open('./results/documentation/' + str(i.id) + '_documentation.json',
                  'wb') as f:
            f.write(str(doc).encode('utf-8'))
        updateURL(session, i)  # update the status of the URL
        time.sleep(random.uniform(0.1, 0.2))  # short wait between two URLs