class UrlComparator:
    """Compare two URLs via their URL wrappers: base url plus every
    query parameter present in either one."""

    def __init__(self, url1, url2):
        """Wrap both url strings and precompute the union of param names."""
        self.ut1 = URL(url1)
        self.ut2 = URL(url2)
        self.params = self._allparams()

    def _allparams(self):
        """Return the union of both URLs' parameter names, in first-seen order."""
        # Renamed from `all`, which shadowed the builtin.
        names = []
        for ut in (self.ut1, self.ut2):
            for p in ut.getParamNames():
                if p not in names:
                    names.append(p)
        return names

    def diff(self):
        """Return a human-readable diff of the two URLs, or 'no diff'."""
        msg = []
        if self.ut1.getBaseUrl() != self.ut2.getBaseUrl():
            msg.append("baseUrls are different")
        p1 = self.ut1.getParamMap()
        p2 = self.ut2.getParamMap()
        for p in self._allparams():
            # `in` replaces Py2-only dict.has_key so this also runs on Python 3.
            if p not in p1:
                msg.append("'%s' is not defined in 1" % p)
            elif p not in p2:
                msg.append("'%s' is not defined in 2" % p)
            elif p1[p] != p2[p]:
                msg.append("different values for '%s'" % p)
                msg.append("\t1 - %s\n\t2 - %s" % (p1[p], p2[p]))
        if not msg:
            return "no diff"
        return '\n'.join(msg)
def cannonicalize(self):
    """Rebuild self.url in canonical form and return it.

    Returns None when the rebuilt url is the empty string; otherwise the
    url is downgraded to http and stripped of any fragment.
    """
    rebuilt = URL(urlsplit(self.url)[:])
    self.url = rebuilt.make()
    if self.url == "":
        return None
    # Force http scheme, then drop everything from the first '#' on.
    self.url = self.url.replace("https://", "http://")
    self.url = re.sub('#.*', "", self.url)
    return self.url
def setUpClass(self): self.utility = Utility() # CHANGE THE LOG FILE NAME IN THE NEXT LINE****************************************************************************************** self.log = open(self.utility.logpath + "/WV-00.txt", "a+") self.suite_start_time = time.time() self.log.write("Suite started at {}\n".format( str(time.ctime(int(self.suite_start_time))))) self.url = URL() self.loginPageStaticTexts = LoginPageStaticText() self.loginPageTestData = LoginPageTestData() self.configTestCase = configparser.RawConfigParser() # CHANGE THE CONFIG PROPERTY FILE NAME IN THE NEXT LINE****************************************************************************************** self.configTestCase.read( os.path.dirname(os.getcwd()) + '/TestCases/WV_00_Config.properties') self.configECG = configparser.RawConfigParser() self.configECG.read( os.path.dirname(os.getcwd()) + '/Scripts/ECGRelatedData.properties') self.configDevice = configparser.RawConfigParser() self.configDevice.read( os.path.dirname(os.getcwd()) + '/Scripts/DeviceRelatedData.properties') self.sendECG = SendECG() yield self.suite_end_time = time.time() self.total_time_taken_suite = self.suite_end_time - self.suite_start_time self.log.write("Suite ended at {}\n".format( str(time.ctime(int(self.suite_end_time))))) self.log.write( "Total time taken by Test Suite to finish: {} seconds\n".format( self.total_time_taken_suite)) self.log.close()
def get_resp():
    """Read the 'url' query parameter and return its metadata as a JSON response.

    Responds with an 'invalid_url' error payload when the parameter is absent
    or empty.
    """
    raw = request.args.get("url")
    if not raw:
        return json_resp({"error": "invalid_url"})
    target = URL(raw)
    return json_resp(get_meta_data_json(str(target), target))
def processURL(s, raw_url):
    """Record an external (non-Twitter) expanded url from a tweet entity.

    :param s: holder object providing `tweet_id` and a `urls` list to append to.
    :param raw_url: tweet url entity dict with an 'expanded_url' key.
    """
    tweet_id = s.tweet_id
    url = raw_url['expanded_url']
    my_url = URL(tweet_id, url)
    # only grab external URLs -- the original check missed plain-http
    # twitter links; startswith accepts a tuple of prefixes.
    if not url.startswith(('https://twitter.com/', 'http://twitter.com/')):
        s.urls.append(my_url)
def __init__(self, url, method='GET', headers=None, cookies=None,
             referer=None, data=None, user_agent=DEFAULT_USER_AGENT,
             **kwargs):
    """Initialise the request: wrap the url, assign a unique id, and
    assemble the header dict from the optional cookie/referer/user-agent
    values."""
    self._url = url if isinstance(url, URL) else URL(url)
    self._method = method
    self.id = uuid.uuid1()
    self._headers = dict(headers) if headers else {}
    self._cookies = cookies
    self._referer = referer
    self._user_agent = user_agent
    # Only truthy values become headers, exactly as the per-field ifs did.
    optional_headers = (("Cookie", self._cookies),
                        ("Referer", self._referer),
                        ("User-Agent", self._user_agent))
    for header_name, header_value in optional_headers:
        if header_value:
            self._headers[header_name] = header_value
    self._get_data = self._url.get_querystring()
    self._post_data = data if data else ""
def audit(self, origin, response):
    """
    :origin: original url. all url need match with original url
    :return: list url obj
    """
    # Crawl loop: pop a url from the back of QUEUES, record it in RESULTS,
    # fetch the page, and push any unseen links onto the front of the queue.
    while len(self.QUEUES) > 0:
        url_ = self.QUEUES.pop()
        self.debug(" [*] Crawling URL: " + url_.get_url())  # print debug
        self.RESULTS.append(url_)
        # NOTE(review): this rebinds the `response` parameter; the value the
        # caller passed in is never used after this point.
        header, response = self.connect_getdata(url_.domain, url_.port,
                                                url_.get_module())
        links = self.get_links(response, self.domain, self.port, url_.folder)
        for link in links:
            url = URL(link)
            # Enqueue only links that are neither visited nor already queued.
            if not self.is_in_results(url):
                if not self.is_in_queues(url):
                    self.QUEUES.insert(0, url)
                    self.debug(url.get_url())
                    self.debug_socket(url.get_url())
    # Drop falsy entries (Python 2: filter returns a list here).
    self.RESULTS = filter(None, self.RESULTS)
def setup(request, setUpClass):
    """Class-scoped fixture: launch Chrome, open the UAT web viewer, attach
    driver/utility/page-object helpers to the test class, and close the
    browser on teardown."""
    print("initiating chrome driverd")
    browser = Browser().getbrowser("chrome")
    target = URL()
    browser.get(target.webViewerUAT)
    helper = Utility()
    # utility.createLogFolder()
    log_file = open(helper.logpath + "/WV-00.txt", "a+")
    drv_util = DriverUtility(browser, log_file)
    login_page = LoginPageObject(drv_util, log_file)
    request.cls.driver = browser
    request.cls.url1 = target
    request.cls.utility = helper
    request.cls.driverUtility = drv_util
    request.cls.loginPageObject = login_page
    print("setup ended")
    yield browser
    browser.close()
import collections
import csv  # was used below (csv.reader) but never imported -- fixed
import httplib2
from URL import URL
import urllib.request
from SrcubOrigUrls import scrub_orig_urls

# Map each original url -> set of associated values read from the csv.
url_data = collections.defaultdict(set)

# URL objects built from the grouped data.
urls = []

# Put data from the csv file into the mapping; `with` guarantees the file
# is closed (the original left it open).
print("Getting data from file...")
with open('data.csv', 'r') as url_data_file:
    my_reader = csv.reader(url_data_file)
    for row in my_reader:
        url_data[row[0]].add(row[1])
print("Done getting data from file!")

# Form objects from the grouped data and put them into the list.
print("Putting objects in list...")
for url in url_data:
    urls.append(URL(url, url_data[url]))
print("Done putting objects in list!")

# Drop the first object -- presumably built from the csv header row; TODO confirm.
del urls[0]

scrub_orig_urls(urls)
import sys from DB import DB from URL import URL db = DB('citeseerx.db') db.create_tables() # db.del_all() # http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057 if len(sys.argv) == 2: url = URL(sys.argv[1]) url.open() db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()}) else: print 'Please supply proper URL.'
from URL import URL from BSOUP import BSOUP from time import sleep import re # change hosts path according to your OS hosts_path = r"C:\Windows\System32\drivers\etc\hosts" # localhost's IP redirect = "127.0.0.1" u=URL() b=BSOUP() urlis=[] class CWBP: def cwblocker(self,lis): global urlis self.ur=u.giveurl() self.a=re.search('/',self.ur) self.kw=b.keyword("https://www."+self.ur) for i in lis: if i in self.kw: urlis.append("www."+self.ur[:self.a.start()]) with open(hosts_path, 'r+') as file: self.content = file.read() if self.ur in self.content: pass else: # mapping hostnames to your localhost IP address file.write(redirect + " " +"www."+self.ur[:self.a.start()]+ "\n") def unblocker(self): global urlis with open(hosts_path, 'r+') as file:
def setup(request, setUpClass):
    """Class-scoped pytest fixture: start Chrome, open the UAT web viewer,
    attach driver/utility/page-object helpers onto the test class, and close
    the browser at teardown.
    """
    print("initiating chrome driverd")
    driver = Browser().getbrowser("chrome")
    url = URL()
    driver.get(url.webViewerUAT)
    utility = Utility()
    # utility.createLogFolder()
    # NOTE(review): the log file is opened here but never closed by this fixture.
    log = open(utility.logpath + "/WV-00.txt", "a+")
    driverUtility = DriverUtility(driver, log)
    loginPageObject = LoginPageObject(driverUtility, log)
    request.cls.driver = driver
    request.cls.url1 = url
    request.cls.utility = utility
    request.cls.driverUtility = driverUtility
    request.cls.loginPageObject = loginPageObject
    print("setup ended")
    yield driver
    driver.close()

# (Removed several screens of commented-out legacy fixture variants and an
# unfinished commented `config()` helper -- dead code; recover from version
# control if ever needed.)
from URL import URL from DB import DB from bs4 import BeautifulSoup db = DB('citeseerx.db') count = 0 while db.count_unpr(): # url = URL('http://citeseerx.ist.psu.edu/viewdoc/summary?cid=4320') count = count + 1 url = db.get_unpr() print url url = URL(url) url.open() db.update_link(url.get_doi(), 2) if (not db.exists('link', url.get_doi()) and url.redirect_occured()): db.insert('link', { 'doi': url.get_doi(), 'url': url.get_redirect_url() }) if (not db.exists('metadata', url.get_doi())): html = url.fetch() # extract abstract soup = BeautifulSoup(html, "html.parser") title = soup.find('h2').findAll(text=True)[0] abstract_div = soup.find("div", {"id": "abstract"}) for tag in abstract_div: if tag.name == 'p': abstract = tag.findAll(text=True)
def __init__(self, url1, url2):
    """Wrap both url strings in URL helpers and precompute the union of
    their parameter names."""
    self.ut1, self.ut2 = URL(url1), URL(url2)
    self.params = self._allparams()
if self.ut1.getBaseUrl() != self.ut2.getBaseUrl(): add("baseUrls are different") p1 = self.ut1.getParamMap() p2 = self.ut2.getParamMap() for p in self._allparams(): if not p1.has_key(p): add("'%s' is not defined in 1" % p) elif not p2.has_key(p): add("'%s' is not defined in 2" % p) elif p1[p] != p2[p]: add("different values for '%s'" % p) add("\t1 - %s\n\t2 - %s" % (p1[p], p2[p])) if not msg: return "no diff" else: return '\n'.join(msg) def cmpUrls(url1, url2): ct = UrlComparator(url1, url2) ct.ut1.report("URL 1") ct.ut2.report("URL 2") print "\nDiff:" print ct.diff() if __name__ == "__main__": ut = URL(ssDoc) print ut.getTuple()
class ParseURL(object):
    """A python module to parse url emulates urllib.urlparse()."""

    def __init__(self, url):
        """Store the raw url string and immediately parse it into self.parsed_url."""
        self.url = url
        self.url_string = url
        self.parsed_url = URL()
        self.parse()

    def parse_scheme(self, url):
        """A method to parse the url scheme; consumes 'scheme://' from self.url."""
        dot_ind = url.find('.')
        col_ind = url.find(':')
        scheme = ''
        if col_ind < dot_ind and col_ind != -1:
            scheme = url[:col_ind]
            self.url = url[col_ind+3:]
        return scheme.lower()

    def parse_netloc(self, url):
        """A method to parse the url netloc (whatever remains after the other parts)."""
        self.url = ''
        return url

    def parse_path(self, url):
        """A method to parse the url path."""
        ind = url.find('/')
        if ind != -1:
            self.url = url.replace(url[ind:], '')
            return url[ind:]
        return ''

    def parse_params(self, url):
        # Was `raise NotImplemented` -- NotImplemented is a value, not an
        # exception; raising it is a TypeError on Python 3.
        raise NotImplementedError

    def parse_query(self, url):
        """A method to parse the url query into a dict."""
        start_ind = url.find('?')
        if start_ind == -1:
            return dict()
        end_ind = url.find('#')
        if end_ind == -1:
            query_string = url[start_ind+1:]
        else:
            query_string = url[start_ind+1:end_ind]
        self.url = url.replace('?' + query_string, '')
        return self.query_parse(query_string)

    def parse_fragment(self, url):
        """A method to parse the url fragment."""
        ind = url.find('#')
        if ind != -1:
            self.url = url[:ind]
            return url[ind+1:]
        return ''

    def parse_username(self, url):
        """A method to parse the url username (None when no userinfo present)."""
        if '@' in url:
            at_char = '@'
        elif '%40' in url:
            at_char = '%40'
        else:
            at_char = None
        if at_char is None:
            return
        url = url.split(at_char)
        url = url[0]
        if not ':' in url:
            return url
        else:
            url = url.split(':')
            return url[0]

    def parse_password(self, url):
        """A method to parse the url password; also strips userinfo from self.url."""
        if '@' in url:
            at_char = '@'
        elif '%40' in url:
            at_char = '%40'
        else:
            at_char = None
        if at_char is None:
            return
        url = url.split(at_char)
        url = url[0]
        self.url = self.url.replace(url + at_char, '')
        if not ':' in url:
            return url
        else:
            url = url.split(':')
            return url[1]

    def parse_hostname(self, url):
        """A method to parse the url hostname."""
        if 'www.' in url:
            # self.url = url.replace('www.', '')
            return self.url.replace('www.', '')

    def parse_port(self, url):
        """A method to parse the url port. Returns '' when absent, else an int."""
        port = ""
        if self.parsed_url.get_attr('scheme').lower() != 'urn':
            if ':' in url:
                col_id = url.find(':')
                if url[-1].isdigit():
                    port = url[col_id+1:]
                    self.url = url.replace(':' + port, '')
        if port == '':
            return port
        return int(port)

    def parse(self):
        """A method to parse the url. It is called from the class constructor."""
        url = self.url
        self.parsed_url.set_attr('scheme', self.parse_scheme(self.url))
        # self.parsed_url.set_attr('params', self.parse_params(self.url))
        self.parsed_url.set_attr('fragment', self.parse_fragment(self.url))
        self.parsed_url.set_attr('query', self.parse_query(self.url))
        self.parsed_url.set_attr('path', self.parse_path(self.url))
        self.parsed_url.set_attr('username', self.parse_username(self.url))
        self.parsed_url.set_attr('password', self.parse_password(self.url))
        self.parsed_url.set_attr('port', self.parse_port(self.url))
        # self.parsed_url.set_attr('hostname', self.parse_hostname(self.url))
        self.parsed_url.set_attr('netloc', self.parse_netloc(self.url))
        # print(self.parsed_url)
        # print self.url
        return

    def get_url(self):
        """Getter method to retrieve url string."""
        return self.url_string

    def url_join(self, url, str):
        raise NotImplementedError

    def url_defrag(self):
        raise NotImplementedError

    def url_unparse(self, parsed_url=None):
        """A method to reconstruct the url string from the parsed url."""
        if parsed_url is None:
            parsed_url = self.parsed_url
        url = list()
        if type(parsed_url) is URL:
            scheme = parsed_url.get_attr('scheme')
            if scheme != '':
                url.extend([scheme, '://'])
            username = parsed_url.get_attr('username')
            if username is not None:
                url.append(username)
            password = parsed_url.get_attr('password')
            if password is not None:
                url.extend([':', password, '@'])
            netloc = parsed_url.get_attr('netloc')
            url.append(netloc)
            port = parsed_url.get_attr('port')
            if port != '':
                # str() -- parse_port returns an int; ''.join would TypeError.
                url.extend([':', str(port)])
            path = parsed_url.get_attr('path')
            if path != '':
                url.append(path)
            query = self.query_unparse(parsed_url.get_attr('query'))
            if len(query) != 0:
                url.extend(['?', query])
            fragment = parsed_url.get_attr('fragment')
            if fragment != '':
                url.extend(['#', fragment])
            return ''.join(url)
        elif type(parsed_url) is dict:
            url = ''
            if parsed_url['scheme'] != '':
                url = url + parsed_url['scheme'] + '://'
            if parsed_url['username'] is not None:
                url = url + parsed_url['username']
            if parsed_url['password'] is not None:
                url = url + ':' + parsed_url['password'] + '@'
            url = url + parsed_url['netloc']
            if parsed_url['port'] != '':
                # str() for the same int-port reason as above.
                url = url + ':' + str(parsed_url['port'])
            if parsed_url['path'] != '':
                url = url + parsed_url['path']
            query = self.query_unparse(parsed_url['query'])
            if len(query) != 0:
                url = url + '?' + query
            if parsed_url['fragment'] != '':
                url = url + '#' + parsed_url['fragment']
            return url

    def query_unparse(self, query):
        """Serialize a query dict back into 'k=v&k2=v2' form ('' for empty)."""
        return '&'.join(key + '=' + query[key] for key in query)

    def query_parse(self, query_string):
        """Parse 'k=v&k2=v2' into a dict; values may themselves contain '='."""
        query = dict()
        for raw_query in query_string.split('&'):
            # maxsplit=1 so a value containing '=' is kept intact
            # (the original split on every '=' and dropped the tail).
            raw_query = raw_query.split('=', 1)
            query[raw_query[0]] = raw_query[1]
        return query

    def update_query(self, query):
        """Method to update the query in the parsed url."""
        if isinstance(query, dict):
            self.parsed_url.set_attr('query', query)
        else:
            self.parsed_url.set_attr('query', self.query_parse(query))

# help(ParseURL)
def __init__(self, url):
    """Keep the raw url string and immediately parse it into a URL object."""
    self.url_string = url
    self.url = url
    self.parsed_url = URL()
    self.parse()
class Request(object):
    """HTTP request model: wraps a URL plus method, headers, cookies and body."""

    # Default user agent string; falls back when no scan signature is
    # configured. `in` replaces the Py2-only dict.has_key.
    DEFAULT_USER_AGENT = cfg["scan_signature"] if "scan_signature" in cfg else "TScanner/1.0"

    def __init__(self, url, method='GET', headers=None, cookies=None,
                 referer=None, data=None, user_agent=DEFAULT_USER_AGENT,
                 **kwargs):
        """Build a request.

        :param url: a URL instance, or a url string (wrapped in URL).
        :param method: HTTP verb, e.g. 'GET' or 'POST'.
        :param headers: optional dict of extra headers.
        :param cookies: optional Cookie header value.
        :param referer: optional Referer header value.
        :param data: optional POST body.
        :param user_agent: User-Agent header value.
        """
        if isinstance(url, URL):
            self._url = url
        else:
            self._url = URL(url)
        self._method = method
        self.id = uuid.uuid1()
        self._headers = {}
        if headers:
            self._headers.update(headers)
        self._cookies = cookies
        self._referer = referer
        self._user_agent = user_agent
        if self._cookies:
            self._headers.update({"Cookie": self._cookies})
        if self._referer:
            self._headers.update({"Referer": self._referer})
        if self._user_agent:
            self._headers.update({"User-Agent": self._user_agent})
        self._get_data = self._url.get_querystring()
        self._post_data = data if data else ""

    def get_get_param(self):
        """Return the querystring (GET) data."""
        return self._get_data

    def get_post_param(self):
        """Return the POST body."""
        return self._post_data

    def get_url(self):
        """Return the wrapped URL object."""
        return self._url

    def get_method(self):
        """Return the HTTP verb."""
        return self._method

    def get_id(self):
        """Return this request's unique id."""
        return self.id

    def get_headers(self):
        """Return the assembled header dict."""
        return self._headers

    def get_cookies(self):
        """Return the raw Cookie value."""
        return self._cookies

    def set_method(self, method):
        """Set the HTTP verb (normalised to upper case)."""
        self._method = method.upper()

    def set_post_data(self, postdata):
        """Replace the POST body."""
        self._post_data = postdata

    def set_get_data(self, getdata):
        """Replace the querystring data."""
        self._get_data = getdata

    def set_referer(self, referer):
        """Replace the stored referer (header dict is NOT rebuilt)."""
        self._referer = referer

    def set_cookies(self, cookies):
        """Replace the stored cookies (header dict is NOT rebuilt)."""
        self._cookies = cookies

    def __eq__(self, other):
        """Requests are equal when url and method match."""
        return self._url == other._url and self._method == other._method

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; keep them consistent.
        return not self.__eq__(other)

    def __str__(self):
        """Render the raw HTTP request, utf-8 encoded."""
        result_string = self._method
        result_string += " " + self._url.url_string + " HTTP/1.1\r\n"
        headers = copy.deepcopy(self._headers)
        headers.update({"Host": self._url.get_host()})
        for key, value in headers.iteritems():
            result_string += key + ": " + value
            result_string += "\r\n"
        result_string += "\r\n"
        if self._method == "POST":
            result_string += str(self._post_data)
        result_string = result_string.encode("utf-8")
        return result_string

    def __repr__(self):
        """Short debug representation: verb, url and id."""
        vals = {
            'method': self.get_method(),
            'url': self.get_url().url_string,
            'id': self.get_id()
        }
        return '<Request | %(method)s | %(url)s | %(id)s>' % vals