def doSearch(p_search, p_location, p_csvPathA, p_csvPathB, p_minDelay, p_maxDelay):
    """
    Run a keyword/location search and scrape every company listed in the result pages.

    The search URL is built from the module-level g_url. Each result page is loaded in a
    Firefox instance, the company links it contains are collected, and every link is
    passed to getOneCompany() together with the two open CSV files.

    :param p_search: search keyword(s); URL-quoted before being placed in the query string
    :param p_location: location string; URL-quoted before being placed in the query string
    :param p_csvPathA: path of the main output CSV file (overwritten)
    :param p_csvPathB: path of the secondary output CSV file (overwritten)
    :param p_minDelay: lower bound passed to CommonFunctions.randomWait() after each company
    :param p_maxDelay: upper bound passed to CommonFunctions.randomWait() after each company
    :return: number of company links passed to getOneCompany()
    """
    l_urlSearch = '{0}search?what={1}&where={2}'.format(
        g_url,
        urllib.parse.quote(p_search, safe=''),
        urllib.parse.quote(p_location, safe='')
    )

    l_count = 0

    # the context manager closes both CSV files even if the scraping raises
    with open(p_csvPathA, 'w') as l_fOutMain, open(p_csvPathB, 'w') as l_fOutSecondary:
        # header of the main output csv file
        l_fOutMain.write('ID;NAME;ADDRESS;CP;CITY;CREATION;SIRET;TYPE;COUNT;OWNER;' +
                         'TEL1;TEL2;TEL3;TEL4;MAIL;WEB1;WEB2;WEB3;WEB4;HOURS;BUSINESS;ADDITIONAL\n')

        # header of the secondary output csv file
        l_fOutSecondary.write('ID;TYPE;RAW;CLEAN;FLAG\n')

        # Create a new instance of the Firefox driver
        l_driver = webdriver.Firefox()
        try:
            # Resize the window
            l_driver.set_window_size(1500, 1500)

            # Move the window to position x/y
            l_driver.set_window_position(1000, 1000)

            # result page URLs already loaded, used to stop if the pagination loops back
            l_visited = set()

            l_finished = False
            while not l_finished:
                # go to the current result page
                l_visited.add(l_urlSearch)
                l_driver.get(l_urlSearch)

                # the page is considered loaded once its footer is present
                try:
                    WebDriverWait(l_driver, 10).until(EC.presence_of_element_located(
                        (By.XPATH, '//footer')))
                except EX.TimeoutException:
                    l_finished = True
                    continue

                # collect the links to all the companies listed on this page
                l_itemList = []
                for l_article in l_driver.find_elements_by_xpath('//div[@class="listwrapper"]' +
                                                                 '//div[@class="box-company"]/div/a'):
                    l_itemLink = l_article.get_attribute('href')
                    print('l_itemLink:', l_itemLink)
                    l_itemList.append(l_itemLink)

                # link carried by the last pagination item, if there is one
                l_nextLink = ''
                for l_next in l_driver.find_elements_by_xpath('//ul[@class="pagination "]/li[last()]/a'):
                    l_nextLink = l_next.get_attribute('href')
                    print('l_nextLink:', l_nextLink)

                # scrape the companies (this navigates the driver away from the result page,
                # which is why the links were all collected beforehand)
                for l_link in l_itemList:
                    getOneCompany(l_driver, l_fOutMain, l_fOutSecondary,
                                  urllib.parse.urljoin(g_url, l_link), l_count)
                    l_count += 1
                    CommonFunctions.randomWait(p_minDelay, p_maxDelay)

                if not l_nextLink:
                    # no pagination link (or an anchor without href) --> last page
                    l_finished = True
                else:
                    l_nextUrl = urllib.parse.urljoin(g_url, l_nextLink)
                    if l_nextUrl in l_visited:
                        # the last pagination item points to a page already scraped:
                        # following it again would loop forever
                        print('Next link already visited, stopping:', l_nextUrl)
                        l_finished = True
                    else:
                        l_urlSearch = l_nextUrl

            print('Number of Items retrieved', l_count)
        finally:
            # always release the browser, including on error
            l_driver.quit()

    return l_count
def doSearch(p_search, p_location, p_pathA, p_pathB, p_minDelay, p_maxDelay, p_distance):
    """
    Run a keyword/location search, collect all result links, then scrape each company.

    Works in two phases: first every result page is walked (following the link whose
    'data-page' attribute is the next page number) and all '//h2/a' links are stored;
    then each stored link is passed to doOneCompany(). Scraping stops at the first
    company for which doOneCompany() returns a false value.

    When the page displays the 'Nos systèmes ont détecté un trafic important' message, the
    function sleeps (60 s, then 120 s, ... up to 300 s) and reloads the last known result
    URL; it gives up on pagination once the wait exceeds 300 s or the page number exceeds 20.

    :param p_search: search keyword(s); URL-quoted before being placed in the query string
    :param p_location: location string; URL-quoted before being placed in the query string
    :param p_pathA: path of the main output CSV file (overwritten)
    :param p_pathB: path of the secondary output CSV file (overwritten)
    :param p_minDelay: lower bound passed to CommonFunctions.randomWait() and doOneCompany()
    :param p_maxDelay: upper bound passed to CommonFunctions.randomWait() and doOneCompany()
    :param p_distance: value of the 'distance' query parameter; left empty when not > 0
    :return: number of companies for which doOneCompany() returned a true value
    """
    # http://www.118218.fr/recherche?category_id=&geo_id=&distance=&category=&what=plombier&where=75013
    # the distance parameter stays empty unless a strictly positive distance is given
    l_distance = p_distance if p_distance > 0 else ''
    l_urlSearch = '{0}recherche?category_id=&geo_id=&distance={3}&category=&what={1}&where={2}'.format(
        g_url,
        urllib.parse.quote(p_search, safe=''),
        urllib.parse.quote(p_location, safe=''),
        l_distance
    )

    l_count = 0

    # the context manager closes both CSV files even if the scraping raises
    with open(p_pathA, 'w') as l_fOutMain, open(p_pathB, 'w') as l_fOutSecondary:
        # header of the main output csv file
        l_fOutMain.write('ID;NAME;ADDRESS;CP;CITY;CREATION;SIRET;TYPE;COUNT;OWNER;' +
                         'TEL1;TEL2;TEL3;TEL4;MAIL;WEB1;WEB2;WEB3;WEB4;HOURS;BUSINESS;ADDITIONAL\n')

        # header of the secondary output csv file
        l_fOutSecondary.write('ID;TYPE;RAW;CLEAN;FLAG\n')

        # Create a new driver instance
        l_driver = CommonFunctions.getDriver()
        try:
            # go to the base Url
            l_driver.get(l_urlSearch)

            l_finished = False
            l_linksList = []
            l_currentPage = 1
            l_wait = 60

            # get all links in the result set
            while not l_finished:
                print('Result page:', l_currentPage)

                # Wait for the footer to appear
                if not waitFoFooter(l_driver):
                    l_finished = True
                    continue

                # abuse detection: back off and reload, or give up
                try:
                    l_messageDisplay = l_driver.find_element_by_xpath(
                        '//article/section[@class="staticContent ieWrapperFix"]')
                    l_message = l_messageDisplay.text

                    if re.match('Nos systèmes ont détecté un trafic important', l_message):
                        print('Abuse message:', l_message)
                        if l_currentPage <= 20 and l_wait <= 300:
                            print('Waiting for {0} seconds ...'.format(l_wait))
                            time.sleep(l_wait)
                            # wait one more minute on the next consecutive abuse message
                            l_wait += 60
                            l_driver.get(l_urlSearch)
                            continue

                        l_finished = True
                        continue
                except EX.NoSuchElementException:
                    print('Ok apparently ...')

                # a normal page was obtained --> reset the back-off delay
                l_wait = 60

                # no result counter on the page means no results at all
                try:
                    l_resultCountLocation = l_driver.find_element_by_xpath('//p[@class="resultCount"]')
                    l_resultCount = l_resultCountLocation.text
                    print('l_resultCount:', l_resultCount)
                except EX.NoSuchElementException:
                    print('No Results')
                    l_finished = True
                    continue

                # store the company links of this result page
                for l_link in l_driver.find_elements_by_xpath('//h2/a'):
                    l_linkUrl = l_link.get_attribute('href')
                    l_linksList.append(l_linkUrl)
                    print('l_linkUrl:', l_linkUrl)

                try:
                    l_found = False
                    for l_link in l_driver.find_elements_by_xpath('//a'):
                        # find the link to the next page
                        if l_link.get_attribute('data-page') == str(l_currentPage + 1):
                            l_found = True
                            l_currentPage += 1
                            # remembered so that the page can be reloaded after an abuse message
                            l_urlSearch = l_link.get_attribute('href')
                            print('Link to next page:', l_urlSearch)

                            # scroll to it, to make it visible, and then click it
                            l_actions = ActionChains(l_driver)
                            l_actions.move_to_element(l_link)
                            l_actions.click()
                            l_actions.perform()

                            CommonFunctions.randomWait(p_minDelay, p_maxDelay)
                            break

                    if not l_found:
                        # if the link was not found --> Finished
                        print('No More Results')
                        l_finished = True
                except EX.NoSuchElementException:
                    print('No More Results')
                    l_finished = True
                    continue

            for l_url in l_linksList:
                # Scrape one company and stop in case of failure
                if not doOneCompany(l_driver, l_url, l_fOutMain, l_fOutSecondary,
                                    p_minDelay, p_maxDelay, l_count):
                    break
                l_count += 1
                CommonFunctions.randomWait(p_minDelay, p_maxDelay)
        finally:
            # always release the browser, including on error
            l_driver.quit()

        print('Number of items retrieved:', l_count)

    return l_count
l_totalCount = 0 # one tmp file per commune for l_communeId, l_communeName in l_communes: l_tmpA = os.path.join(g_misterWhatDir, '__tmpA_{0}.csv'.format(l_communeId)) l_tmpB = os.path.join(g_misterWhatDir, '__tmpB_{0}.csv'.format(l_communeId)) if not os.path.isfile(l_tmpA) and not os.path.isfile(l_tmpA): print('Search for "{0}" in "{1}" ...'.format(l_search, l_communeName)) l_count = doSearch(l_search, l_communeName, l_tmpA, l_tmpB, l_minDelay, l_maxDelay) l_totalCount += l_count print('Search for "{0}" in "{1}" Complete'.format(l_search, l_communeName)) if l_count == 0: CommonFunctions.randomWait(l_minDelay, l_maxDelay) # if l_totalCount > 300: # break print('Total number of items retrieved:', l_totalCount) # merge the tmp files CommonFunctions.concatTmp(g_misterWhatDir, [i for i, c in l_communes], l_pathA, l_pathB) # sort the result CommonFunctions.csvSort(l_pathA, p_départements=True) else: # otherwise, do an ordinary search doSearch(l_search, l_location, l_pathA, l_pathB, l_minDelay, l_maxDelay) # and sort the results as well
def doSearch(p_search, p_location, p_pathA, p_pathB, p_minDelay, p_maxDelay):
    """
    Run a search through the site's search form and scrape every company of the results.

    The home page (g_url) is loaded, the keyword and location boxes are filled in and the
    form is submitted. For each result page, every company link is opened through the
    context menu, handed to doOneCompany(), and closed again; then the 'pagination-next'
    element is clicked until it can no longer be found.

    Both CSV files are closed and the driver is quit on every exit path, including the
    early error returns and unexpected exceptions.

    :param p_search: search keyword(s) typed into the keyword input box
    :param p_location: location string typed into the location input box
    :param p_pathA: path of the main output CSV file (overwritten)
    :param p_pathB: path of the secondary output CSV file (overwritten)
    :param p_minDelay: lower bound passed to CommonFunctions.randomWait() after each company
    :param p_maxDelay: upper bound passed to CommonFunctions.randomWait() after each company
    :return: number of companies for which doOneCompany() returned a true value, or 0 when
        one of the numbered '[01]'..'[05]' failures occurs (even if some companies were
        already written to the output files)
    """
    l_count = 0

    # the context manager closes both CSV files on every exit path
    with open(p_pathA, 'w') as l_fOutMain, open(p_pathB, 'w') as l_fOutSecondary:
        # header of the main output csv file
        l_fOutMain.write('ID;NAME;ADDRESS;CP;CITY;CREATION;SIRET;TYPE;COUNT;OWNER;' +
                         'TEL1;TEL2;TEL3;TEL4;MAIL;WEB1;WEB2;WEB3;WEB4;HOURS;BUSINESS;ADDITIONAL\n')

        # header of the secondary output csv file
        l_fOutSecondary.write('ID;TYPE;RAW;CLEAN;FLAG\n')

        # Create a new driver instance
        l_driver = CommonFunctions.getDriver()
        try:
            # go to the base Url
            l_driver.get(g_url)

            try:
                # locate the keyword search input text box and enter the search string
                l_quoiQui = WebDriverWait(l_driver, 10).until(EC.presence_of_element_located(
                    (By.XPATH, '//input[@id="pj_search_quoiqui"]')))
                print('l_quoiQui placeholder:', l_quoiQui.get_attribute('placeholder'))
                l_quoiQui.send_keys(p_search)

                # locate the location input text box and enter the location string
                l_ou = l_driver.find_element_by_id('pj_search_ou')
                print('l_ou placeholder:', l_ou.get_attribute('placeholder'))
                l_ou.send_keys(p_location)

                # submit the form
                l_driver.find_element_by_xpath(
                    '//button[@class="button primary icon large-button"]').click()
            except EX.NoSuchElementException:
                print('[01] Something is badly wrong (Element not found) ...')
                return 0
            except EX.TimeoutException:
                print('[02] Something is badly wrong (Timeout) ...')
                return 0

            l_finished = False
            while not l_finished:
                # wait until either results or the 'no response' block is present
                try:
                    WebDriverWait(l_driver, 10).until(
                        lambda p_driver:
                        p_driver.find_elements(By.XPATH, '//h2[@class="company-name"]')
                        or p_driver.find_elements(By.XPATH, '//div[@class="no-response"]'))
                except EX.TimeoutException:
                    print('[03] Something is badly wrong (Timeout) ...')
                    return 0

                # a popup was dismissed --> start over with the wait
                if killPopup(l_driver):
                    continue

                try:
                    l_driver.find_element_by_xpath('//div[@class="no-response"]')
                    print('No results')
                    l_finished = True
                    continue
                except EX.NoSuchElementException:
                    print('There should be results')

                # display the result count (informational only)
                try:
                    l_reformulation = l_driver.find_element_by_xpath(
                        '//span[@class="denombrement"]/strong[@id="SEL-nbresultat"]')
                    l_resultCount = l_reformulation.text
                    print('l_resultCount:', l_resultCount)
                except EX.NoSuchElementException:
                    print('No reformulation ?! ...')

                # collect the ids of the elements 4 levels above each company name heading;
                # ids (not elements) are kept and each company is looked up again later
                l_articleList = []
                try:
                    for l_company in l_driver.find_elements_by_xpath(
                            '//h2[@class="company-name"]/../../../..'):
                        l_articleId = l_company.get_attribute('id')
                        print('l_articleId:', l_articleId)
                        l_articleList.append(l_articleId)
                except EX.NoSuchElementException:
                    print('[04] Something is badly wrong (Element not found) ...')
                    return 0

                try:
                    for l_articleId in l_articleList:
                        if killPopup(l_driver):
                            print('Popup Killed, waiting for 10 s.')
                            time.sleep(10)

                        print('+ l_articleId:', l_articleId)
                        l_company = l_driver.find_element_by_xpath(
                            '//article[@id="{0}"]//h2[@class="company-name"]/a[2]'.format(l_articleId))
                        l_name = l_company.text
                        print('Fetching:', l_name)

                        # bring the link into view, then scroll back up by 300 pixels
                        l_driver.execute_script("return arguments[0].scrollIntoView();", l_company)
                        l_driver.execute_script("window.scrollBy(0, -300);")

                        # Save the window opener (the current window, not to be confused
                        # with a tab ... they are not the same thing)
                        l_mainWindow = l_driver.current_window_handle

                        # right-click the link, then pick the first context menu entry
                        l_actions = ActionChains(l_driver)
                        l_actions.move_to_element(l_company)
                        l_actions.context_click()
                        l_actions.send_keys(Keys.ARROW_DOWN)
                        l_actions.send_keys(Keys.ENTER)
                        l_actions.perform()

                        # Switch tab to the new tab, which we will assume is the next one on the right
                        l_driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.TAB)

                        # Put focus on current window which will, in fact, put focus on the
                        # current visible tab
                        l_driver.switch_to_window(l_mainWindow)

                        if doOneCompany(l_driver, l_fOutMain, l_fOutSecondary, l_count):
                            l_count += 1

                        CommonFunctions.randomWait(p_minDelay, p_maxDelay)

                        # Close current tab
                        l_driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + 'w')

                        # Put focus on current window which will be the window opener
                        l_driver.switch_to_window(l_mainWindow)
                except EX.NoSuchElementException:
                    print('[05] Something is badly wrong (Element not found) ...')
                    return 0

                # locate the next button and click it
                try:
                    l_next = l_driver.find_element_by_id('pagination-next')

                    # scroll to it, to make it visible, and then click it
                    l_actions = ActionChains(l_driver)
                    l_actions.move_to_element(l_next)
                    l_actions.click()
                    l_actions.perform()
                except EX.NoSuchElementException:
                    print('No more results')
                    l_finished = True

            print('Number of items retrieved:', l_count)
        finally:
            # always release the browser, including on the early error returns
            l_driver.quit()

    return l_count