def doOneCompany(p_driver, p_url, p_fOutMain, p_fOutSecondary, p_minDelay, p_maxDelay, p_id):
    """
    Scrape one company page and append what was found to the two CSV outputs.

    The page at `p_url` is reloaded until `//h1[@itemprop="name"]` yields a non-empty
    name. Between attempts the function sleeps, starting at 60 seconds and doubling
    after every failed attempt. There is no upper bound on the number of attempts.

    :param p_driver: Selenium WebDriver used to load the page
    :param p_url: URL of the company page
    :param p_fOutMain: writable text file receiving one `;`-separated row per company
    :param p_fOutSecondary: writable text file receiving one
        `ID;"TYPE";"RAW";"CLEAN";"FLAG"` row per web site, phone number and
        business category
    :param p_minDelay: not used by this function
    :param p_maxDelay: not used by this function
    :param p_id: identifier written in the first column of every row
    :return: `True` once the rows have been written, `False` if the page footer did
        not appear within 10 seconds
    """
    def l_quote(p_value):
        # CSV convention: a double quote inside a quoted field is written twice
        return str(p_value).replace('"', '""')

    def l_writeSecondary(p_type, p_raw, p_clean):
        # one ID;"TYPE";"RAW";"CLEAN";"FLAG" row; FLAG is always left empty here
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_quote(p_raw), l_quote(p_clean), ''))

    print('---[{0}]---'.format(p_id))

    l_wait = 60
    while True:
        # go to the base Url
        p_driver.get(p_url)
        try:
            WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//footer')))
        except EX.TimeoutException:
            print('Footer not found ... Something is not right')
            return False

        # get page HTML (find_element(By...) works on both old and current Selenium)
        l_pageHtml = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')

        # extract a full xml/html tree from the page
        l_tree = html.fromstring(l_pageHtml)
        print(p_url, '-->', len(l_pageHtml))

        l_name = CommonFunctions.cleanField(
            CommonFunctions.getUnique(l_tree, '//h1[@itemprop="name"]'))
        print(' l_name :', l_name)
        if l_name != '':
            break

        # no name on the page (presumably throttled or incomplete): back off and retry
        print('Waiting for {0} seconds ...'.format(l_wait))
        time.sleep(l_wait)
        l_wait *= 2

    l_address = CommonFunctions.cleanField(
        CommonFunctions.getUnique(l_tree, '//span[@itemprop="streetAddress"]'))
    print(' l_address :', l_address)

    # keep only the digits of the postal code
    l_zip = CommonFunctions.getUnique(l_tree, '//span[@itemprop="postalCode"]')
    l_zip = CommonFunctions.cleanField(re.sub(r'[^\d]', '', l_zip))
    print(' l_zip :', l_zip)

    l_city = CommonFunctions.cleanField(
        CommonFunctions.getUnique(l_tree, '//span[@itemprop="addressLocality"]'))
    print(' l_city :', l_city)

    l_web = CommonFunctions.cleanField(
        CommonFunctions.getUnique(l_tree, '//p[@class="websiteAndShare"]/a'))
    print(' l_web :', l_web)
    # only record a web site when one was actually found
    if l_web != '':
        l_writeSecondary('WebSite', l_web, l_web)

    l_telList = []
    for l_telItem in l_tree.xpath('//p[@itemprop="telephone"]'):
        l_oneTel = l_telItem.text_content()
        print(' Tel :', l_oneTel)
        l_writeSecondary('UnspecifiedPhone', l_oneTel, l_oneTel)
        l_telList.append(l_oneTel)

    l_businessList = []
    for l_businessRow in l_tree.xpath('//li[@class="label-child"]'):
        l_businessCategory = CommonFunctions.cleanField(l_businessRow.text_content())
        print(' Business :', l_businessCategory)
        l_businessList.append(l_businessCategory)
        l_writeSecondary('BusinessCategory', l_businessCategory, l_businessCategory)

    # the main file has exactly four phone columns: pad with blanks or truncate
    l_telList = (l_telList + ['', '', '', ''])[0:4]

    # output to CSV file (main): 21 quoted columns after the ID
    l_columns = [
        l_quote(CommonFunctions.cleanField(l_name)),     # NAME
        l_quote(CommonFunctions.cleanField(l_address)),  # ADDRESS
        l_quote(CommonFunctions.cleanField(l_zip)),      # CP
        l_quote(CommonFunctions.cleanField(l_city)),     # CITY
        '',                                              # CREATION
        '',                                              # SIRET
        '',                                              # TYPE
        '',                                              # COUNT
        '',                                              # OWNER
    ]
    # TEL1 - TEL4
    l_columns += [l_quote(CommonFunctions.cleanField(t)) for t in l_telList]
    l_columns += [
        '',                                              # MAIL
        l_quote(CommonFunctions.cleanField(l_web)),      # WEB1
        '',                                              # WEB2
        '',                                              # WEB3
        '',                                              # WEB4
        '',                                              # HOURS
        # BUSINESS
        l_quote(CommonFunctions.cleanField('|'.join(l_businessList))),
        '',                                              # ADDITIONAL
    ]
    p_fOutMain.write('{0};"{1}"\n'.format(p_id, '";"'.join(l_columns)))

    p_fOutMain.flush()
    p_fOutSecondary.flush()

    return True
def getOneCompany(p_driver, p_fOutMain, p_fOutSecondary, p_url, p_id):
    """
    Scrape one company page, OCR its e-mail image if any, and append the result to
    the two CSV outputs.

    When the page shows an "afficher l'email" link, the e-mail image is downloaded
    into `g_emailDir`, enlarged and thresholded with the `convert` command, then
    read with the `tesseract` command.

    :param p_driver: Selenium WebDriver used to load the page
    :param p_fOutMain: writable text file receiving one `;`-separated row per company
    :param p_fOutSecondary: writable text file receiving one
        `ID;"TYPE";"RAW";"CLEAN";"FLAG"` row per business category, phone, fax,
        mobile, web site and e-mail found
    :param p_url: URL of the company page
    :param p_id: identifier written in the first column of every row
    :raises selenium TimeoutException: if `//div[@class="shareButtons"]` does not
        appear within 10 seconds (the wait is not guarded here)
    """
    def l_quote(p_value):
        # CSV convention: a double quote inside a quoted field is written twice
        return str(p_value).replace('"', '""')

    def l_writeSecondary(p_type, p_raw, p_clean):
        # one ID;"TYPE";"RAW";"CLEAN";"FLAG" row; FLAG is always left empty here
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_quote(p_raw), l_quote(p_clean), ''))

    def l_labelled(p_label):
        # value cell that follows the cell holding the bold label `p_label`
        return CommonFunctions.getUnique(
            l_tree, '//div/b[.="{0}"]/../following-sibling::div[1]'.format(p_label))

    def l_lastText(p_xpath):
        # cleaned text of the last node matching `p_xpath`, '' when nothing matches
        l_nodes = l_tree.xpath(p_xpath)
        if not l_nodes:
            return ''
        return CommonFunctions.cleanField(l_nodes[-1].text_content())

    print('---[{0}]---'.format(p_id))

    p_driver.get(p_url)
    WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//div[@class="shareButtons"]')))

    # get page HTML (find_element(By...) works on both old and current Selenium)
    l_pageHtml = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')

    # extract a full xml/html tree from the page
    l_tree = html.fromstring(l_pageHtml)
    print(p_url, '-->', len(l_pageHtml))

    # raw values are printed first, cleaning happens afterwards
    l_name = CommonFunctions.getUnique(l_tree, '//h2/span[@itemprop="name"]')
    print(' l_name :', l_name)
    l_address = CommonFunctions.getUnique(l_tree, '//div/span[@itemprop="streetAddress"]')
    print(' l_address :', l_address)
    l_zip = CommonFunctions.getUnique(l_tree, '//div/span[@itemprop="postalCode"]')
    print(' l_zip :', l_zip)
    l_city = CommonFunctions.getUnique(l_tree, '//div/a[@itemprop="addressLocality"]/..')
    print(' l_city :', l_city)
    l_creation = l_labelled('Créé:')
    print(' l_creation :', l_creation)
    l_tvaSiret = l_labelled('TVA / SIRET:')
    print(' l_tvaSiret :', l_tvaSiret)
    l_type = l_labelled('Type d\'entreprise:')
    print(' l_type :', l_type)
    l_headCount = l_labelled('Nombre d\'employés:')
    print(' l_headCount :', l_headCount)
    l_owner = l_labelled('Propriétaire / PDG:')
    print(' l_owner :', l_owner)

    l_name = CommonFunctions.cleanField(l_name)
    l_address = CommonFunctions.cleanField(l_address)
    l_zip = CommonFunctions.cleanField(l_zip)
    l_city = CommonFunctions.cleanField(l_city)
    l_creation = CommonFunctions.cleanField(l_creation)
    l_tvaSiret = CommonFunctions.cleanField(l_tvaSiret)
    l_type = CommonFunctions.cleanField(l_type)
    l_headCount = CommonFunctions.cleanField(l_headCount)
    l_owner = CommonFunctions.cleanField(l_owner)

    l_telNumber = l_lastText('//div/span[@itemprop="telephone"]')
    print(' l_telNumber :', l_telNumber)
    l_faxNumber = l_lastText('//div/span[@itemprop="faxNumber"]')
    print(' l_faxNumber :', l_faxNumber)
    l_mobileNumber = l_lastText('//span/i[@class="icon-mobile-phone"]/../../span')
    print(' l_mobileNumber:', l_mobileNumber)
    l_webSite = l_lastText('//span/i[@class="icon-globe"]/../../a')
    print(' l_website :', l_webSite)

    l_mailAddressRaw = ''
    l_mailAddress = ''
    for l_mail in l_tree.xpath('//span/i[@class="icon-envelope-alt"]/../../a'):
        if l_mail.text_content() != "afficher l'email":
            continue

        l_mailImgUrl = getEmail(p_driver, p_url)
        print(' mail img :', l_mailImgUrl)

        # the name comes from the page: besides whitespace, neutralise path
        # separators so it cannot leave g_emailDir or name a missing directory
        l_fileStem = 'mail_{0}_{1}'.format(p_id, re.sub(r'[\s/\\]+', '_', l_name))
        l_mailImgPathRaw = os.path.join(g_emailDir, l_fileStem + '.png')
        l_mailImgPath = os.path.join(g_emailDir, l_fileStem + '-X.png')

        urllib.request.urlretrieve(l_mailImgUrl, l_mailImgPathRaw)

        # flatten, enlarge and binarise the image before OCR
        subprocess.call([
            'convert', l_mailImgPathRaw,
            '-flatten',
            '-resize', '10000x',
            '-threshold', '30%',
            l_mailImgPath
        ])

        l_mailAddressRaw = subprocess.check_output([
            'tesseract', l_mailImgPath, 'stdout'
        ])
        print(' Mail raw :', l_mailAddressRaw)

        # the OCR output contains 'Q' + U+FB01 (fi ligature, UTF-8 EF AC 81) where
        # the image shows '@'. This was previously done through a
        # repr()/re.sub()/eval() round trip; a direct bytes replacement gives the
        # same result without evaluating text derived from downloaded content.
        l_mailAddressRaw = l_mailAddressRaw.replace(b'Q\xef\xac\x81', b'@')

        l_mailAddress = l_mailAddressRaw.decode('utf-8')
        l_mailAddress = re.sub(r'\s+', '', l_mailAddress).strip()
        # normalise the various Unicode dashes to a plain hyphen
        l_mailAddress = re.sub('[‒–—―]', '-', l_mailAddress).strip()
        # restore the dot when the OCR dropped it in front of a known suffix
        for l_end in ['com', 'fr', 'net']:
            l_mailAddress = re.sub(
                r'([^\.]){0}$'.format(l_end), r'\1.' + l_end, l_mailAddress).strip()

    l_mailAddress = CommonFunctions.cleanField(l_mailAddress)
    print(' l_mailAddress :', l_mailAddress)

    l_businessList = []
    for l_businessRow in l_tree.xpath('//dl/div[@class="col-sm-9"]/a'):
        l_businessCategoryRaw = l_businessRow.text_content()
        # keep the part in front of the first ' -'
        l_match = re.match(r'^([^-]+)\s-', l_businessCategoryRaw)
        if not l_match:
            # nothing to record: without a match there is no category for this row
            continue

        l_businessCategory = l_match.group(1)
        print(' Business :', l_businessCategory)
        l_businessList.append(l_businessCategory)
        l_writeSecondary('BusinessCategory', l_businessCategoryRaw, l_businessCategory)

    if l_telNumber != '':
        l_writeSecondary('FixedPhone', l_telNumber, l_telNumber)
    if l_faxNumber != '':
        l_writeSecondary('Fax', l_faxNumber, l_faxNumber)
    if l_mobileNumber != '':
        l_writeSecondary('MobilePhone', l_mobileNumber, l_mobileNumber)
    if l_webSite != '':
        l_writeSecondary('WebSite', l_webSite, l_webSite)
    if l_mailAddress != '':
        # NOTE: the RAW column holds the bytes object as returned by the OCR step,
        # so it is written in its b'...' form
        l_writeSecondary('Email', l_mailAddressRaw, l_mailAddress)

    # output to CSV file (main): 21 quoted columns after the ID
    l_columns = [
        l_quote(CommonFunctions.cleanField(l_name)),          # NAME
        l_quote(CommonFunctions.cleanField(l_address)),       # ADDRESS
        l_quote(CommonFunctions.cleanField(l_zip)),           # CP
        l_quote(CommonFunctions.cleanField(l_city)),          # CITY
        l_quote(CommonFunctions.cleanField(l_creation)),      # CREATION
        l_quote(CommonFunctions.cleanField(l_tvaSiret)),      # SIRET
        l_quote(CommonFunctions.cleanField(l_type)),          # TYPE
        l_quote(CommonFunctions.cleanField(l_headCount)),     # COUNT
        l_quote(CommonFunctions.cleanField(l_owner)),         # OWNER
        l_quote(CommonFunctions.cleanField(l_telNumber)),     # TEL1
        l_quote(CommonFunctions.cleanField(l_faxNumber)),     # TEL2
        l_quote(CommonFunctions.cleanField(l_mobileNumber)),  # TEL3
        '',                                                   # TEL4
        l_quote(CommonFunctions.cleanField(l_mailAddress)),   # MAIL
        l_quote(CommonFunctions.cleanField(l_webSite)),       # WEB1
        '',                                                   # WEB2
        '',                                                   # WEB3
        '',                                                   # WEB4
        '',                                                   # HOURS
        # BUSINESS
        l_quote(CommonFunctions.cleanField('|'.join(l_businessList))),
        '',                                                   # ADDITIONAL
    ]
    p_fOutMain.write('{0};"{1}"\n'.format(p_id, '";"'.join(l_columns)))

    p_fOutMain.flush()
    p_fOutSecondary.flush()
def doOneCompany(p_driver, p_fOutMain, p_fOutSecondary, p_id):
    """
    Scrape the company page currently displayed by `p_driver` and append the result
    to the two CSV outputs.

    The function waits for `//h1[@itemprop="name"]`, then calls `killPopup`; as long
    as `killPopup` returns a true value the wait is repeated.

    :param p_driver: Selenium WebDriver already positioned on the company page
    :param p_fOutMain: writable text file receiving one `;`-separated row per company
    :param p_fOutSecondary: writable text file receiving one
        `ID;"TYPE";"RAW";"CLEAN";"FLAG"` row per phone number, web site, opening
        hours entry and business category
    :param p_id: identifier written in the first column of every row
    :return: `True` when the rows have been written, `False` when the name did not
        appear within 10 seconds or the page body could not be found
    """
    def l_quote(p_value):
        # CSV convention: a double quote inside a quoted field is written twice
        return str(p_value).replace('"', '""')

    def l_writeSecondary(p_type, p_raw, p_clean):
        # one ID;"TYPE";"RAW";"CLEAN";"FLAG" row; FLAG is always left empty here
        p_fOutSecondary.write('{0};"{1}";"{2}";"{3}";"{4}"\n'.format(
            p_id, p_type, l_quote(p_raw), l_quote(p_clean), ''))

    # label shown on the page --> record type; anything else is 'UnspecifiedPhone'
    l_phoneTypes = {'tél': 'FixedPhone', 'Mobile': 'MobilePhone', 'Fax': 'Fax'}

    print('---[{0}]---'.format(p_id))

    l_name = ''
    while True:
        try:
            l_nameItem = WebDriverWait(p_driver, 10).until(EC.presence_of_element_located(
                (By.XPATH, '//h1[@itemprop="name"]')))
            l_name = l_nameItem.text
            # drop the trailing 'Afficher le numéro' text from the name
            l_name = re.sub(r'\s+Afficher le numéro$', '', l_name).strip()
            print(' l_name :', l_name)
        except EX.TimeoutException:
            print('[06] Something is badly wrong (Timeout) ...')
            return False

        # a true value from killPopup means the name must be waited for again
        if not killPopup(p_driver):
            break

    # find_element(By...) works on both old and current Selenium. It is the only
    # call below that can raise NoSuchElementException.
    try:
        l_html = p_driver.find_element(By.XPATH, '//body').get_attribute('innerHTML')
    except EX.NoSuchElementException:
        print('[07] Something is badly wrong (Element not found) ...')
        return False

    # extract a full xml/html tree from the page
    l_tree = html.fromstring(l_html)

    l_address = CommonFunctions.getUnique(l_tree, '//span[@itemprop="streetAddress"]')
    print(' l_address :', l_address)

    l_zip = CommonFunctions.getUnique(l_tree, '//span[@itemprop="postalCode"]')
    # drop a leading punctuation mark followed by whitespace
    l_zip = re.sub(r'^[,;:\.]\s+', '', l_zip).strip()
    print(' l_zip :', l_zip)

    l_city = CommonFunctions.getUnique(l_tree, '//span[@itemprop="addressLocality"]')
    print(' l_city :', l_city)

    l_telList = []
    for l_telRow in l_tree.xpath('//div[@id="coord-list-container-1"]//ul/li'):
        l_telLabel = CommonFunctions.getUnique(l_telRow, './span[@class="num-tel-label"]')
        l_telLabel = re.sub(r'\s+:$', '', l_telLabel).strip()
        l_telType = l_phoneTypes.get(l_telLabel, 'UnspecifiedPhone')
        print(' l_telType :', l_telType)

        l_tel = CommonFunctions.getUnique(l_telRow, './span[@class="coord-numero"]')
        # drop a leading dot, then surrounding whitespace
        l_tel = re.sub(r'^\.', '', l_tel).strip()
        print(' l_tel :', l_tel)

        l_tel = CommonFunctions.cleanField(l_tel)
        l_telList.append(l_tel)
        l_writeSecondary(l_telType, l_tel, l_tel)

    l_webList = []
    for l_webRow in l_tree.xpath(
            '//article/div/div/h3[text()="Sites et réseaux sociaux"]/..//ul/li/a/span[@class="value"]'):
        l_webSite = l_webRow.text_content().strip()
        print(' l_webSite :', l_webSite)

        l_webSite = CommonFunctions.cleanField(l_webSite)
        l_webList.append(l_webSite)
        l_writeSecondary('WebSite', l_webSite, l_webSite)

    # bloc-info-horaires
    l_hoursList = []
    for l_hoursRow in l_tree.xpath(
            '//ul[@class="liste-horaires-principaux"]//ul/li[@itemprop="openingHours"]'):
        l_hours = l_hoursRow.get('content')
        if l_hours is None:
            # no 'content' attribute on this entry: nothing to record
            continue
        l_hours = l_hours.strip()
        print(' l_hours :', l_hours)

        l_hours = CommonFunctions.cleanField(l_hours)
        l_hoursList.append(l_hours)
        l_writeSecondary('OpeningHours', l_hours, l_hours)

    l_businessList = []
    for l_businessRow in l_tree.xpath('//span[@class="activite-premiere-visibilite activite"]'):
        l_businessCategory = l_businessRow.text_content().strip()
        # drop a trailing / leading punctuation mark separated by whitespace
        l_businessCategory = re.sub(r'\s+[\.;,:]$', '', l_businessCategory).strip()
        l_businessCategory = re.sub(r'^[\.;,:]\s+', '', l_businessCategory).strip()
        l_businessCategory = CommonFunctions.cleanField(l_businessCategory)
        print(' l_businessCat. :', l_businessCategory)

        l_businessList.append(l_businessCategory)
        l_writeSecondary('BusinessCategory', l_businessCategory, l_businessCategory)

    # description: the last matching block wins, with whitespace runs collapsed
    l_additional = ''
    for l_description in l_tree.xpath('//div[@itemprop="description"]'):
        l_additional = l_description.text_content().strip()
        l_additional = re.sub(r'\s+', ' ', l_additional).strip()
    print(' l_additional :', l_additional)

    # the main file has exactly four phone and four web columns: pad or truncate
    l_telList = (l_telList + ['', '', '', ''])[0:4]
    l_webList = (l_webList + ['', '', '', ''])[0:4]

    # output to CSV file (main): 21 quoted columns after the ID
    l_columns = [
        l_quote(CommonFunctions.cleanField(l_name)),     # NAME
        l_quote(CommonFunctions.cleanField(l_address)),  # ADDRESS
        l_quote(CommonFunctions.cleanField(l_zip)),      # CP
        l_quote(CommonFunctions.cleanField(l_city)),     # CITY
        '',                                              # CREATION
        '',                                              # SIRET
        '',                                              # TYPE
        '',                                              # COUNT
        '',                                              # OWNER
    ]
    # TEL1 - TEL4
    l_columns += [l_quote(CommonFunctions.cleanField(t)) for t in l_telList]
    # MAIL
    l_columns.append('')
    # WEB1 - WEB4
    l_columns += [l_quote(CommonFunctions.cleanField(w)) for w in l_webList]
    l_columns += [
        # HOURS
        l_quote(CommonFunctions.cleanField('|'.join(l_hoursList))),
        # BUSINESS
        l_quote(CommonFunctions.cleanField('|'.join(l_businessList))),
        # ADDITIONAL
        l_quote(CommonFunctions.cleanField(l_additional)),
    ]
    p_fOutMain.write('{0};"{1}"\n'.format(p_id, '";"'.join(l_columns)))

    p_fOutMain.flush()
    p_fOutSecondary.flush()

    return True