Example #1
0
 def find_contact_info(self):
     """Scrape a contact phone number and e-mail address for this resource.

     The search starts on the page at ``self.link``. If that page contains
     an ``<a>`` whose text matches "Contact"/"contact" and whose ``href``
     (as given, or joined onto ``self.get_base()``) is reported as
     ``"working"`` by ``check_link``, that page is searched instead.

     Sets ``self.resource_contact_phone`` and ``self.resource_contact_email``
     when a match is found; each attribute is left untouched otherwise.
     """
     page = BeautifulSoup(urlopen(self.link).read())

     contact = page.find('a', text=re.compile(r'(C|c)ontact.*'))
     if contact is not None and contact.has_attr('href'):
         target = contact['href']
         # Status strings are compared by value: "is" only succeeds when the
         # interpreter happens to share one string object.
         if check_link(target) != "working":
             # The href may be relative; retry it against the site base.
             target = urljoin(self.get_base(), target)
             if check_link(target) != "working":
                 target = None
         if target is not None:
             page = BeautifulSoup(urlopen(target).read())

     # First look for a tag with class="phone". The filter must be passed
     # as attrs: a dict in the first position is treated as tag names.
     phone_tag = page.find(attrs={'class': 'phone'})
     if phone_tag is not None:
         self.resource_contact_phone = phone_tag.text
     else:
         # No such tag: fall back to text that looks like an international
         # phone number ("+<country code> ...").
         phone_text = page.find(text=re.compile(
             r'\+(9[976]\d|8[987530]\d|6[987]\d'
             r'|5[90]\d|42\d|3[875]\d|2[98654321]'
             r'\d|9[8543210]|8[6421]|6[6543210]|5'
             r'[87654321]|4[987654310]|3[9643210]'
             r'|2[70]|7|1)\s*(\(\d+\)|\d+)(\s|-)[0-9]+(-*)[0-9]+'))
         if phone_text is not None:
             self.resource_contact_phone = phone_text.strip()

     # Same two-step strategy for the e-mail address.
     email_tag = page.find(attrs={'class': 'email'})
     if email_tag is not None:
         self.resource_contact_email = email_tag.text
     else:
         # Accepts "user@host.tld" as well as the obfuscated forms
         # "user(at)host.tld" and "user at host.tld".
         email_text = page.find(text=re.compile(
             r'[A-Za-z0-9-._]+(@|\(at\)| at )+[A-Za-z0-9-._]+\.[A-Za-z0-9-._]+'))
         if email_text is not None:
             # Strip like the phone fallback does, so a plain string is stored.
             self.resource_contact_email = email_text.strip()
Example #2
0
def ym_check_link(url, islogined):
    """Run the project's ``check_link.check_link`` on *url* and print a notice.

    When *islogined* equals ``True`` the check runs directly. Otherwise the
    URL is first opened in a PhantomJS driver (load, wait, maximize, wait),
    the check runs, and the driver is closed afterwards, even if one of the
    steps raises.

    Results go to the ok/error log files under
    ``/root/home/projects/emao/test_report``.
    """
    import sys
    from seleniumDEMO import webdriver

    # Make the project's helper modules importable, without growing
    # sys.path by one duplicate entry per call.
    public_dir = "/root/home/projects/emao/public"
    if public_dir not in sys.path:
        sys.path.append(public_dir)
    # NOTE(review): login, logout and ff_configure are not used below; they
    # are kept in case importing them has side effects. Confirm and drop.
    import login, logout, ff_configure
    import check_link
    import wait_time

    ok_log = '/root/home/projects/emao/test_report/emao_log_ok.txt'
    error_log = '/root/home/projects/emao/test_report/emao_log_error.txt'

    # Exact equality is kept on purpose so that truthy non-bool values
    # (e.g. a non-empty string) still take the driver branch as before.
    if islogined == True:  # noqa: E712
        check_link.check_link(url, ok_log, error_log)
        print(u'验证完毕!')
        return

    # Alternative driver factory previously used here:
    # ff_configure.ff_configure()
    ff_driver = webdriver.PhantomJS()
    try:
        ff_driver.get(url)
        wait_time.wait_time(ff_driver, 1, 60)

        ff_driver.maximize_window()
        wait_time.wait_time(ff_driver, 1, 60)

        check_link.check_link(url, ok_log, error_log)
        print(u'验证完毕!')
    finally:
        # Always release the driver, including when a step above fails.
        ff_driver.close()
def ym_check_link(url, islogined):
    """Check *url* with the project's ``check_link`` helper and print a notice.

    If *islogined* equals ``True`` only the check is run. In every other
    case the URL is loaded in a PhantomJS driver beforehand (load, wait,
    maximize, wait) and the driver is closed once the check has finished
    or failed.

    The check writes to the ok/error log files under
    ``/root/home/projects/emao/test_report``.
    """
    import sys
    from seleniumDEMO import webdriver

    # Register the helper-module directory once; repeated calls used to
    # append the same entry to sys.path every time.
    helper_dir = "/root/home/projects/emao/public"
    if helper_dir not in sys.path:
        sys.path.append(helper_dir)
    # NOTE(review): login, logout and ff_configure are unused in this
    # function; retained in case their import has side effects. Confirm.
    import login, logout, ff_configure
    import check_link
    import wait_time

    report_ok = '/root/home/projects/emao/test_report/emao_log_ok.txt'
    report_error = '/root/home/projects/emao/test_report/emao_log_error.txt'

    # Only a value equal to True skips the browser step; "== True" is kept
    # deliberately so other truthy values behave as they did before.
    needs_browser = not (islogined == True)  # noqa: E712

    ff_driver = None
    if needs_browser:
        # Alternative driver factory previously used here:
        # ff_configure.ff_configure()
        ff_driver = webdriver.PhantomJS()
    try:
        if ff_driver is not None:
            ff_driver.get(url)
            wait_time.wait_time(ff_driver, 1, 60)

            ff_driver.maximize_window()
            wait_time.wait_time(ff_driver, 1, 60)

        check_link.check_link(url, report_ok, report_error)
        print(u'验证完毕!')
    finally:
        # Close the driver even when loading or checking raised.
        if ff_driver is not None:
            ff_driver.close()
Example #4
0
 def find_links(self):
     """Collect working links from the page at ``self.link``.

     Does nothing unless ``self.status`` equals ``"working"``. Otherwise the
     page is fetched (7 second timeout) and every ``<a href=...>`` is tested
     with ``check_link``: first as written, then joined onto ``self.link``
     if the raw value is not working. Working URLs other than ``self.link``
     itself are appended to ``self.links_found`` without duplicates.

     If a ``URLError`` is raised, ``self.status`` is replaced by
     ``"<link> <reason>"``.
     """
     # Compare by value; "is" on strings depends on interning.
     if self.status != "working":
         return
     try:
         # Fetch before parsing so a failing download reaches the handler.
         html = urlopen(self.link, timeout=7).read()
         soup = BeautifulSoup(html)
         for link_tag in soup.find_all('a', href=True):
             candidate = link_tag['href']
             if check_link(candidate) != "working":
                 # Possibly a relative href: resolve against this page.
                 candidate = urljoin(self.link, candidate)
                 if check_link(candidate) != "working":
                     continue
             if candidate != self.link and candidate not in self.links_found:
                 self.links_found.append(candidate)
     except URLError as e:
         # Two placeholders for two values; the former three-placeholder
         # template raised IndexError inside this handler.
         self.status = "{} {}".format(self.link, e.reason)
Example #5
0
 def __init__(self, url):
     """Validate *url* with ``check_link`` and remember it.

     ``self.status`` always receives the value returned by ``check_link``.
     When that value equals ``"working"``, ``self.link`` is set to *url*.
     Otherwise an error report is printed and the program is terminated.

     Raises:
         SystemExit: if the link is not working (exit status 1).
     """
     link_status = check_link(url)
     self.status = link_status
     # Compare by value; "is" on strings depends on interning.
     if link_status == "working":
         self.link = url
         return

     print("Error with url.")
     print(self.status)
     print("Please check your link (perhaps use http://www...) and try again")
     # exit() is an interactive helper injected by the site module and may
     # be absent; raise SystemExit directly, with a non-zero status because
     # this is a failure path.
     raise SystemExit(1)
Example #6
0
# Deploy bot for Python
# TODO
# [X] check all links on page for 404
# check all images to see if they have an alt text
# compress html
# compress javascript
# compress css

import check_link
from bs4 import BeautifulSoup
import urllib.request
from multiprocessing import Process

# Module-level check_link instance, created once at import time and used by
# threader() for every URL it checks.
check_link_obj = check_link.check_link()

def get_all_links(address):
    """Return the set of absolute link targets found on the page at *address*.

    Fetches the page, parses it with the ``html.parser`` backend and collects
    the ``href`` of every ``<a>`` tag whose value starts with ``http`` (so
    both ``http://`` and ``https://`` URLs). Tags without an ``href``, with
    an empty one, or with a relative one are skipped.
    """
    # BeautifulSoup reads the response during construction, so the
    # connection can be closed as soon as the soup exists.
    with urllib.request.urlopen(address) as resp:
        soup = BeautifulSoup(resp, 'html.parser')

    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    return {href for href in hrefs if href and href.startswith('http')}

def threader(website):
    """Check one URL with the shared checker and report a non-True result.

    Calls ``check_link_obj.check(website)`` and, when the result does not
    compare equal to ``True``, prints ``HTTP <result> <website>``.

    NOTE(review): described originally as the worker for new threads, while
    this module imports ``multiprocessing.Process``; confirm which is meant.
    """
    outcome = check_link_obj.check(website)
    if outcome != True:
        report = "HTTP " + str(outcome) + " " + website
        print(report)

def main():