def find_contact_info(self):
    """Scrape the page at self.link for contact details.

    Looks for a "Contact" link, follows it when it resolves to a working
    URL, then fills self.resource_contact_phone and
    self.resource_contact_email — first from elements tagged with
    class="phone"/"email", falling back to regex matches over page text.
    """
    page = BeautifulSoup(urlopen(self.link).read())
    contact = page.find('a', text=re.compile('(C|c)ontact.*'))
    if contact is not None:
        if contact.has_attr('href'):
            # BUG FIX: `is "working"` compared object identity, which is
            # unreliable for strings; use == for value equality.
            if check_link(contact['href']) == "working":
                # Consistency fix: .read() the response like the other
                # two urlopen() calls in this method.
                page = BeautifulSoup(urlopen(contact['href']).read())
            else:
                # Relative href: resolve it against the site base first.
                base = self.get_base()
                link = urljoin(base, contact['href'])
                if check_link(link) == "working":
                    page = BeautifulSoup(urlopen(link).read())
    # First look for an element carrying class="phone".
    # BUG FIX: page.find({'class': 'phone'}) passed the dict as the tag
    # *name* argument, so it never matched; attrs= matches by attribute.
    phone = page.find(attrs={'class': 'phone'})
    if phone is not None:
        self.resource_contact_phone = phone.text
    else:
        # No tagged element: scan the text for an international number.
        phone = page.find(text=re.compile(r'\+(9[976]\d|8[987530]\d|6[987]\d'
                                          r'|5[90]\d|42\d|3[875]\d|2[98654321]'
                                          r'\d|9[8543210]|8[6421]|6[6543210]|5'
                                          r'[87654321]|4[987654310]|3[9643210]'
                                          r'|2[70]|7|1)\s*(\(\d+\)|\d+)(\s|-)[0-9]+(-*)[0-9]+'))
        if phone is not None:
            self.resource_contact_phone = phone.strip()
    # Same two-step strategy for the e-mail address (BUG FIX: attrs=,
    # as above).
    email = page.find(attrs={'class': 'email'})
    if email is not None:
        self.resource_contact_email = email.text
    else:
        email = page.find(text=re.compile(r'[A-Za-z0-9-._]+(@|\(at\)| at )+[A-Za-z0-9-._]+\.[A-Za-z0-9-._]+'))
        if email is not None:
            # NOTE(review): unlike the phone branch this match is not
            # .strip()ped — confirm whether surrounding whitespace matters.
            self.resource_contact_email = email
def ym_check_link(url, islogined): from seleniumDEMO import webdriver import time, urllib2, sys import codecs sys.path.append("/root/home/projects/emao/public") import login, logout, ff_configure, check_link, wait_time if islogined == True: check_link.check_link(url,'/root/home/projects/emao/test_report/emao_log_ok.txt',\ '/root/home/projects/emao/test_report/emao_log_error.txt') print u'验证完毕!' else: #ff_driver=ff_configure.ff_configure() ff_driver = webdriver.PhantomJS() ff_driver.get(url) wait_time.wait_time(ff_driver, 1, 60) ff_driver.maximize_window() wait_time.wait_time(ff_driver, 1, 60) check_link.check_link(url,'/root/home/projects/emao/test_report/emao_log_ok.txt',\ '/root/home/projects/emao/test_report/emao_log_error.txt') print u'验证完毕!' ff_driver.close()
def ym_check_link(url,islogined): from seleniumDEMO import webdriver import time,urllib2,sys import codecs sys.path.append("/root/home/projects/emao/public") import login,logout,ff_configure,check_link,wait_time if islogined==True: check_link.check_link(url,'/root/home/projects/emao/test_report/emao_log_ok.txt',\ '/root/home/projects/emao/test_report/emao_log_error.txt') print u'验证完毕!' else: #ff_driver=ff_configure.ff_configure() ff_driver = webdriver.PhantomJS() ff_driver.get(url) wait_time.wait_time(ff_driver,1,60) ff_driver.maximize_window() wait_time.wait_time(ff_driver,1,60) check_link.check_link(url,'/root/home/projects/emao/test_report/emao_log_ok.txt',\ '/root/home/projects/emao/test_report/emao_log_error.txt') print u'验证完毕!' ff_driver.close()
def find_links(self):
    """Collect unique, working links from the page at self.link.

    Relative hrefs are resolved against self.link; every link that
    checks out as "working" and is not self.link itself is appended to
    self.links_found. On a URLError the error is recorded in
    self.status.
    """
    # BUG FIX: `is "working"` compared identity, not value.
    if self.status == "working":
        try:
            soup = BeautifulSoup(urlopen(self.link, timeout=7).read())
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                if check_link(href) != "working":
                    # Likely a relative URL — make it absolute and retry.
                    new_url = urljoin(self.link, href)
                    if check_link(new_url) == "working" and new_url != self.link:
                        if new_url not in self.links_found:
                            self.links_found.append(new_url)
                else:
                    if href != self.link:
                        if href not in self.links_found:
                            self.links_found.append(href)
        except URLError as e:
            # BUG FIX: the format string had three placeholders but only
            # two arguments, raising IndexError inside the handler and
            # masking the original network error.
            self.status = "{} {}".format(self.link, e.reason)
def __init__(self, url):
    """Validate *url* with check_link and initialise link state.

    Sets self.status to the checker's verdict; on "working" also stores
    self.link. Otherwise prints a diagnostic and exits the process.
    """
    link_status = check_link(url)
    self.status = link_status
    # BUG FIX: `is "working"` relied on string interning; use ==.
    if link_status == "working":
        self.link = url
    else:
        print("Error with url.")
        print(self.status)
        print("Please check your link (perhaps use http://www...) and try again")
        # NOTE(review): exit() inside __init__ kills the whole process;
        # raising an exception would be friendlier to callers.
        exit()
# Deploy bot for Python # TODO # [X] check all links on page for 404 # check all images to see if they have an alt text # compress html # compress javascript # compress css import check_link from bs4 import BeautifulSoup import urllib.request from multiprocessing import Process # creates a global check_link object check_link_obj = check_link.check_link() def get_all_links(address): # get all links on a website, return a set resp = urllib.request.urlopen(address) soup = BeautifulSoup(resp, 'html.parser') links = soup.find_all('a') return {link.get('href') for link in links if link.get('href') and link.get('href')[0:4]=='http'} def threader(website): # this function is used to create new threads response = check_link_obj.check(website) if response != True: print("HTTP " + str(response) + " " + website) def main():