Beispiel #1
0
class UrlComparator:
    """Compare two URLs by base url and by their query parameters."""

    def __init__(self, url1, url2):
        """Wrap both url strings in URL objects and precompute the parameter-name union."""
        self.ut1 = URL(url1)
        self.ut2 = URL(url2)
        self.params = self._allparams()

    def _allparams(self):
        """Return the parameter names of both URLs, first-seen order, no duplicates."""
        # 'names' instead of 'all' — avoid shadowing the builtin.
        names = []
        for ut in [self.ut1, self.ut2]:
            for p in ut.getParamNames():
                if p not in names:
                    names.append(p)
        return names

    def diff(self):
        """Return a human-readable description of the differences, or "no diff"."""
        msg = []
        add = msg.append
        if self.ut1.getBaseUrl() != self.ut2.getBaseUrl():
            add("baseUrls are different")
        p1 = self.ut1.getParamMap()
        p2 = self.ut2.getParamMap()

        for p in self._allparams():
            # 'p in dict' replaces dict.has_key(), which was removed in Python 3.
            if p not in p1:
                add("'%s' is not defined in 1" % p)
            elif p not in p2:
                add("'%s' is not defined in 2" % p)
            elif p1[p] != p2[p]:
                add("different values for '%s'" % p)
                add("\t1 - %s\n\t2 - %s" % (p1[p], p2[p]))
        if not msg:
            return "no diff"
        else:
            return '\n'.join(msg)
Beispiel #2
0
    def cannonicalize(self):
        """Rewrite self.url in canonical form: http scheme, no fragment.

        Returns the canonical url string, or None when it collapses to "".
        """
        # Re-assemble the url from its split components via the URL helper.
        pieces = urlsplit(self.url)
        self.url = URL(pieces[:]).make()

        if self.url == "":
            return None

        # Force the http scheme and strip any '#fragment' suffix.
        self.url = re.sub('#.*', "", self.url.replace("https://", "http://"))
        return self.url
Beispiel #3
0
 def setUpClass(self):
     """Suite fixture: open the log, load config files, yield, then record timing."""
     self.utility = Utility()
     # NOTE(review): log file name is per-suite — change "WV-00.txt" for other suites.
     self.log = open(self.utility.logpath + "/WV-00.txt", "a+")
     self.suite_start_time = time.time()
     self.log.write("Suite started at {}\n".format(
         str(time.ctime(int(self.suite_start_time)))))
     self.url = URL()
     self.loginPageStaticTexts = LoginPageStaticText()
     self.loginPageTestData = LoginPageTestData()
     # All property files live relative to the parent of the working directory.
     parent_dir = os.path.dirname(os.getcwd())
     # NOTE(review): config property file name is also per-suite.
     self.configTestCase = configparser.RawConfigParser()
     self.configTestCase.read(parent_dir + '/TestCases/WV_00_Config.properties')
     self.configECG = configparser.RawConfigParser()
     self.configECG.read(parent_dir + '/Scripts/ECGRelatedData.properties')
     self.configDevice = configparser.RawConfigParser()
     self.configDevice.read(parent_dir + '/Scripts/DeviceRelatedData.properties')
     self.sendECG = SendECG()
     yield
     # Teardown: log the suite duration and close the log file.
     self.suite_end_time = time.time()
     self.total_time_taken_suite = self.suite_end_time - self.suite_start_time
     self.log.write("Suite ended at {}\n".format(
         str(time.ctime(int(self.suite_end_time)))))
     self.log.write(
         "Total time taken by Test Suite to finish: {} seconds\n".format(
             self.total_time_taken_suite))
     self.log.close()
Beispiel #4
0
def get_resp():
    """Resolve the ?url= query arg to a URL object and return its metadata as JSON."""
    raw = request.args.get("url")
    if not raw:
        # Missing or empty parameter -> structured error payload.
        return json_resp({"error": "invalid_url"})
    parsed = URL(raw)
    return json_resp(get_meta_data_json(str(parsed), parsed))
Beispiel #5
0
 def processURL(s, raw_url):
     """Wrap an expanded tweet link in a URL object and collect it unless it points back to Twitter."""
     expanded = raw_url['expanded_url']
     wrapped = URL(s.tweet_id, expanded)
     # only grab external URLs
     if not expanded.startswith('https://twitter.com/'):
         s.urls.append(wrapped)
Beispiel #6
0
    def __init__(self,
                 url,
                 method='GET',
                 headers=None,
                 cookies=None,
                 referer=None,
                 data=None,
                 user_agent=DEFAULT_USER_AGENT,
                 **kwargs):
        '''
        Build an HTTP request wrapper.

        :param url: target — a URL instance, or a string wrapped in URL.
        :param method: HTTP verb, e.g. 'GET' or 'POST'.
        :param headers: optional dict merged into the request headers.
        :param cookies: optional cookie string; sent as the Cookie header.
        :param referer: optional referer value; sent as the Referer header.
        :param data: optional POST body; defaults to "".
        :param user_agent: User-Agent header value.
        :param kwargs: accepted but unused here.
        '''
        if isinstance(url, URL):
            self._url = url
        else:
            self._url = URL(url)

        self._method = method

        # Unique id for this request instance.
        self.id = uuid.uuid1()

        self._headers = {}
        if headers:
            self._headers.update(headers)

        self._cookies = cookies

        self._referer = referer

        self._user_agent = user_agent

        # Promote cookie/referer/user-agent values into actual headers.
        if self._cookies:
            self._headers.update({"Cookie": self._cookies})

        if self._referer:
            self._headers.update({"Referer": self._referer})

        if self._user_agent:
            self._headers.update({"User-Agent": self._user_agent})

        # Query-string parameters extracted from the URL.
        self._get_data = self._url.get_querystring()

        self._post_data = data if data else ""
Beispiel #7
0
	def audit(self, origin, response):
		"""
			Crawl loop: drain self.QUEUES, fetch each URL, record it in
			self.RESULTS, and queue newly discovered same-site links.

			:origin: original url.
			all url need match with original url
			:return: list url obj
		"""
		# Process queued URL objects until the queue is empty.
		while len(self.QUEUES) > 0:
			url_ = self.QUEUES.pop()
			self.debug("       [*] Crawling URL: " + url_.get_url())  # print debug
			self.RESULTS.append(url_)
			# Fetch the page for this URL (host, port, path).
			header, response = self.connect_getdata(url_.domain, url_.port, url_.get_module())
			# Extract links restricted to this domain/port/folder.
			links = self.get_links(response, self.domain, self.port, url_.folder)
			for link in links:
				url = URL(link)
				# Enqueue only links not already crawled or queued.
				if not self.is_in_results(url):
					if not self.is_in_queues(url):
						self.QUEUES.insert(0, url)
						self.debug(url.get_url())
						self.debug_socket(url.get_url())
			# Drop any falsy entries accumulated in RESULTS.
			self.RESULTS = filter(None, self.RESULTS)
Beispiel #8
0
def setup(request, setUpClass):
    """Pytest fixture: launch Chrome, open the UAT web viewer, and attach helpers to the test class."""
    print("initiating chrome driver")  # fixed typo: was "driverd"
    driver = Browser().getbrowser("chrome")
    url = URL()
    driver.get(url.webViewerUAT)
    utility = Utility()
    # utility.createLogFolder()
    log = open(utility.logpath + "/WV-00.txt", "a+")
    driverUtility = DriverUtility(driver, log)
    loginPageObject = LoginPageObject(driverUtility, log)

    # Expose collaborators to the test class via the pytest request object.
    request.cls.driver = driver
    request.cls.url1 = url
    request.cls.utility = utility
    request.cls.driverUtility = driverUtility
    request.cls.loginPageObject = loginPageObject

    print("setup ended")
    yield driver
    # NOTE(review): driver.quit() would end the whole session; close() only closes the window.
    driver.close()
Beispiel #9
0
import collections
import csv  # fix: csv.reader was used below without this import (NameError at runtime)
import httplib2
from URL import URL
import urllib.request

from SrcubOrigUrls import scrub_orig_urls

# Map each source url (column 0) to the set of values (column 1) seen for it.
url_data = collections.defaultdict(set)

# URL objects built from the grouped rows.
urls = []

# Read and group the two-column CSV; 'with' ensures the file is closed.
print("Getting data from file...")
with open('data.csv', 'r') as url_data_file:
    my_reader = csv.reader(url_data_file)
    for row in my_reader:
        url_data[row[0]].add(row[1])

print("Done getting data from file!")
# Form objects from data and put into list
print("Putting objects in list...")
for url in url_data:
    urls.append(URL(url, url_data[url]))
print("Done putting objects in list!")
# NOTE(review): drops the first object — presumably the CSV header row; confirm.
del urls[0]
scrub_orig_urls(urls)
Beispiel #10
0
import sys
from DB import DB
from URL import URL

# Open (or create) the local citeseerx database and ensure tables exist.
db = DB('citeseerx.db')
db.create_tables()
# db.del_all()

# Expected usage: one citeseerx summary URL argument, e.g.
# http://citeseerx.ist.psu.edu/viewdoc/summary?cid=16057
if len(sys.argv) == 2:
    url = URL(sys.argv[1])
    url.open()
    # Store the resolved DOI/URL pair in the 'link' table.
    db.insert('link', {'doi': url.get_doi(), 'url': url.get_url()})
else:
    print 'Please supply proper URL.'
from URL import URL
from BSOUP import BSOUP
from time import sleep
import re

# change hosts path according to your OS
hosts_path = r"C:\Windows\System32\drivers\etc\hosts"
# localhost's IP
redirect = "127.0.0.1"
# Shared helpers and the running list of blocked hostnames.
u=URL()
b=BSOUP()
urlis=[]
class CWBP:
    def cwblocker(self,lis):
        """Block the current URL via the hosts file when its page keywords match any entry in lis."""
        global urlis
        self.ur=u.giveurl()
        # First '/' separates the hostname part from the path.
        self.a=re.search('/',self.ur)
        # Keywords scraped from the page at this URL.
        self.kw=b.keyword("https://www."+self.ur)
        for i in lis:
            if i in self.kw:
                urlis.append("www."+self.ur[:self.a.start()])
                with open(hosts_path, 'r+') as file: 
                    self.content = file.read() 
                    if self.ur in self.content: 
                        pass    
                    else: 
                        # mapping hostnames to your localhost IP address
                        file.write(redirect + " " +"www."+self.ur[:self.a.start()]+ "\n")
    def unblocker(self):
        global urlis
        with open(hosts_path, 'r+') as file: 
Beispiel #12
0
def setup(request, setUpClass):
    """Pytest fixture: start Chrome, open the UAT web viewer, attach page helpers to the class."""
    print("initiating chrome driverd")
    browser = Browser().getbrowser("chrome")
    target = URL()
    browser.get(target.webViewerUAT)
    helper = Utility()
    # utility.createLogFolder()
    log_file = open(helper.logpath + "/WV-00.txt", "a+")
    drv_util = DriverUtility(browser, log_file)
    login_page = LoginPageObject(drv_util, log_file)

    # Hand collaborators to the test class through the request object.
    cls = request.cls
    cls.driver = browser
    cls.url1 = target
    cls.utility = helper
    cls.driverUtility = drv_util
    cls.loginPageObject = login_page

    print("setup ended")
    yield browser
    browser.close()


# from datetime import datetime

# def pytest_logger_config(logger_config):

#     logger_config.add_loggers(['foo', 'bar', 'baz'], stdout_level='debug')
#     logger_config.set_log_option_default('foo,bar')

# def pytest_logger_logdirlink(config):
# 	print("1")
# 	path = os.path.dirname(os.getcwd()) + '/Logs/'
# 	foldername = datetime.now().strftime("%Y%m%d-%H%M%S")
# 	logpath = path+foldername
# 	try:
# 		# return os.mkdir(logpath)
# 		return os.path.join(path, foldername)
# 		# return logpath
# 	except OSError as e:
# 		print("Creation of the directory failed")
# 		print(traceback.format_exc())
# 	else:
# 		print("Successfully created the directory")

# return os.path.join(os.path.dirname(__file__), 'mylogs')

# @pytest.yield_fixture(scope='session')
# def session_thing():
#     foo.debug('constructing session thing')
#     yield
#     foo.debug('destroying session thing')

# @pytest.yield_fixture
# def testcase_thing():
#     foo.debug('constructing testcase thing')
#     yield
#     foo.debug('destroying testcase thing')

# @pytest.fixture(scope="class")
# def setup(request):
#     print("initiating chrome driver")
#     driver = Browser().getbrowser("chrome") #if not added in PATH
#     url = URL()
#     utility = Utility()

#     # driver.maximize_window()
#     request.cls.d = driver
#     request.cls.u = utility
#     request.cls.url1 = url
#     yield
#     driver.close()

# import pytest
# from selenium import webdriver

# @pytest.fixture(scope="class")
# def setup(request):
#     print("initiating chrome driver")
#     driver = Browser().getbrowser("chrome") #if not added in PATH
#     url = URL()
#     utility = Utility()
#     # driver.maximize_window()
#     request.cls.d = driver
#     request.cls.u = utility
#     request.cls.url1 = url

#     yield driver
#     driver.close()

# @pytest.fixture(scope='session')
# def config():
# 	with open('WV_00_Config.json') as config_file:
# 		data = json.load(config_file)
# 		for r in data['Enabled']:
# 			print (r[b])
# 	return data
Beispiel #13
0
from URL import URL
from DB import DB
from bs4 import BeautifulSoup

db = DB('citeseerx.db')

# Process unprocessed links from the DB until none remain.
count = 0
while db.count_unpr():
    # url = URL('http://citeseerx.ist.psu.edu/viewdoc/summary?cid=4320')
    count = count + 1
    url = db.get_unpr()
    print url
    url = URL(url)
    url.open()
    # Mark this link as processed (status 2).
    db.update_link(url.get_doi(), 2)

    # Record the redirect target when the DOI is new and a redirect happened.
    if (not db.exists('link', url.get_doi()) and url.redirect_occured()):
        db.insert('link', {
            'doi': url.get_doi(),
            'url': url.get_redirect_url()
        })

    if (not db.exists('metadata', url.get_doi())):
        html = url.fetch()
        # extract abstract
        soup = BeautifulSoup(html, "html.parser")
        title = soup.find('h2').findAll(text=True)[0]
        abstract_div = soup.find("div", {"id": "abstract"})
        # NOTE(review): 'abstract' stays unbound if no <p> child exists — confirm downstream use.
        for tag in abstract_div:
            if tag.name == 'p':
                abstract = tag.findAll(text=True)
Beispiel #14
0
 def __init__(self, url1, url2):
     """Wrap both url strings in URL objects and precompute the union of their parameter names."""
     self.ut1 = URL(url1)
     self.ut2 = URL(url2)
     self.params = self._allparams()
Beispiel #15
0
        # NOTE(review): fragment — 'msg' and 'add' are defined earlier in the enclosing method.
        if self.ut1.getBaseUrl() != self.ut2.getBaseUrl():
            add("baseUrls are different")
        p1 = self.ut1.getParamMap()
        p2 = self.ut2.getParamMap()

        # Compare every parameter present in either URL (Py2: dict.has_key).
        for p in self._allparams():
            if not p1.has_key(p):
                add("'%s' is not defined in 1" % p)
            elif not p2.has_key(p):
                add("'%s' is not defined in 2" % p)
            elif p1[p] != p2[p]:
                add("different values for '%s'" % p)
                add("\t1 - %s\n\t2 - %s" % (p1[p], p2[p]))
        if not msg:
            return "no diff"
        else:
            return '\n'.join(msg)


def cmpUrls(url1, url2):
    # Print a report for each URL, then their base-url/parameter diff (Python 2 prints).
    ct = UrlComparator(url1, url2)
    ct.ut1.report("URL 1")
    ct.ut2.report("URL 2")
    print "\nDiff:"
    print ct.diff()


if __name__ == "__main__":
    # NOTE(review): 'ssDoc' must be defined elsewhere in this module — confirm.
    ut = URL(ssDoc)
    print ut.getTuple()
Beispiel #16
0
class ParseURL(object):
    """A python module to parse url emulates urllib.urlparse().

    Parsing mutates self.url step by step: each parse_* method consumes the
    part it extracts, so parse() must call them in the order it defines.
    """
    def __init__(self, url):
        self.url = url              # working copy, consumed during parsing
        self.url_string = url       # untouched original
        self.parsed_url = URL()     # result container (project URL type)
        self.parse()

    def parse_scheme(self, url):
        """Return the lowercased scheme; strips 'scheme://' from self.url."""
        dot_ind = url.find('.')
        col_ind = url.find(':')
        scheme = ''
        # A ':' before the first '.' marks a scheme (host:port colons come later).
        if col_ind < dot_ind and col_ind != -1:
            scheme = url[:col_ind]
        self.url = url[col_ind+3:]  # assumes '://' follows the scheme — TODO confirm for 'urn:' urls
        return scheme.lower()

    def parse_netloc(self, url):
        """Return whatever is left of the url as the netloc; empties self.url."""
        self.url = ''
        return url

    def parse_path(self, url):
        """Return the path ('/...') and remove it from self.url, or '' if absent."""
        ind = url.find('/')
        if ind != -1:
            self.url = url.replace(url[ind:], '')
            return url[ind:]
        return ''

    def parse_params(self, url):
        # fix: 'raise NotImplemented' raised a TypeError (NotImplemented is not an exception)
        raise NotImplementedError

    def parse_query(self, url):
        """Return the query string parsed into a dict; removes '?...' from self.url."""
        start_ind = url.find('?')
        if start_ind == -1:
            return dict()
        end_ind = url.find('#')
        if end_ind == -1:
            query_string = url[start_ind+1:]
        else:
            query_string = url[start_ind+1:end_ind]
        self.url = url.replace('?' + query_string, '')
        return self.query_parse(query_string)

    def parse_fragment(self, url):
        """Return the fragment (after '#') and strip it from self.url, or ''."""
        ind = url.find('#')
        if ind != -1:
            self.url = url[:ind]
            return url[ind+1:]
        return ''

    def parse_username(self, url):
        """Return the username from a 'user[:pass]@host' userinfo block, or None."""
        if '@' in url: at_char = '@'
        elif '%40' in url: at_char = '%40'
        else: at_char = None
        if at_char is None:
            return
        url = url.split(at_char)
        url = url[0]
        if ':' not in url:
            return url
        else:
            url = url.split(':')
            return url[0]

    def parse_password(self, url):
        """Return the password from the userinfo block (or the username when
        no ':' is present) and strip 'userinfo@' from self.url; None if no '@'."""
        if '@' in url: at_char = '@'
        elif '%40' in url: at_char = '%40'
        else: at_char = None
        if at_char is None:
            return
        url = url.split(at_char)
        url = url[0]
        self.url = self.url.replace(url+at_char, '')
        if ':' not in url:
            return url
        else:
            url = url.split(':')
            return url[1]

    def parse_hostname(self, url):
        """Return self.url with 'www.' removed when present (does not consume self.url)."""
        if 'www.' in url:
            # self.url = url.replace('www.', '')
            return self.url.replace('www.', '')

    def parse_port(self, url):
        """Return the numeric port (int) and strip ':port' from self.url; '' when absent."""
        port = ""
        if self.parsed_url.get_attr('scheme').lower() != 'urn':
            if ':' in url:
                col_id = url.find(':')
                # Only treat the suffix as a port when it is numeric.
                if url[-1].isdigit():
                    port = url[col_id+1:]
                self.url = url.replace(':'+port, '')
        if port == '':
            return port
        return int(port)

    def parse(self):
        """Parse self.url into self.parsed_url. Called from the constructor.

        Order matters: each step consumes its part of self.url, and netloc
        takes whatever remains at the end.
        """
        url = self.url
        self.parsed_url.set_attr('scheme', self.parse_scheme(self.url))
        # self.parsed_url.set_attr('params', self.parse_params(self.url))
        self.parsed_url.set_attr('fragment', self.parse_fragment(self.url))
        self.parsed_url.set_attr('query', self.parse_query(self.url))
        self.parsed_url.set_attr('path', self.parse_path(self.url))
        self.parsed_url.set_attr('username', self.parse_username(self.url))
        self.parsed_url.set_attr('password', self.parse_password(self.url))
        self.parsed_url.set_attr('port', self.parse_port(self.url))
        # self.parsed_url.set_attr('hostname', self.parse_hostname(self.url))
        self.parsed_url.set_attr('netloc', self.parse_netloc(self.url))
        return

    def get_url(self):
        """Getter method to retrieve the original url string."""
        return self.url_string

    def url_join(self, url, str):
        # fix: 'raise NotImplemented' raised a TypeError (NotImplemented is not an exception)
        raise NotImplementedError

    def url_defrag(self):
        # fix: 'raise NotImplemented' raised a TypeError (NotImplemented is not an exception)
        raise NotImplementedError

    def url_unparse(self, parsed_url=None):
        """Reconstruct a url string from a parsed URL object or an equivalent dict."""
        if parsed_url is None: parsed_url = self.parsed_url
        url = list()
        if type(parsed_url) is URL:
            scheme = parsed_url.get_attr('scheme')
            if scheme != '':
                url.extend([scheme,'://'])
            username = parsed_url.get_attr('username')
            if username is not None:
                url.append(username)
            password = parsed_url.get_attr('password')
            if password is not None:
                url.extend([':', password, '@'])
            netloc = parsed_url.get_attr('netloc')
            url.append(netloc)
            port = parsed_url.get_attr('port')
            if port != '':
                url.extend([':', port])
            path = parsed_url.get_attr('path')
            if path != '':
                url.append(path)
            query = self.query_unparse(parsed_url.get_attr('query'))
            if len(query) != 0:
                url.extend(['?', query])
            fragment = parsed_url.get_attr('fragment')
            if fragment != '':
                url.extend(['#', fragment])
            return ''.join(url)
        elif type(parsed_url) is dict:
            # Dict variant builds by string concatenation; same output shape.
            if parsed_url['scheme'] != '':
                url = url + parsed_url['scheme'] + '://'
            if parsed_url['username'] is not None:
                url = url + parsed_url['username']
            if parsed_url['password'] is not None:
                url = url + ':' + parsed_url['password'] + '@'
            url = url + parsed_url['netloc']
            if parsed_url['port'] != '':
                url = url + ':' + parsed_url['port']
            if parsed_url['path'] != '':
                url = url + parsed_url['path']
            query = self.query_unparse(parsed_url['query'])
            if len(query) != 0:
                url = url + '?' + query
            if parsed_url['fragment'] != '':
                url = url + '#' + parsed_url['fragment']
            return url

    def query_unparse(self, query):
        """Serialize a query dict back to 'k=v&k2=v2' form ('' for an empty dict)."""
        output = ''
        for key in query:
            output = output + key + '=' + query[key] + '&'
        output = output[:-1]
        return output

    def query_parse(self, query_string):
        """Parse 'k=v&k2=v2' into a dict. Assumes every pair contains '=' — TODO confirm."""
        query = dict()
        for raw_query in query_string.split('&'):
            raw_query = raw_query.split('=')
            query[raw_query[0]] = raw_query[1]
        return query

    def update_query(self, query):
        """Method to update the query in the parsed url (accepts dict or query string)."""
        if type(query) == dict:
            self.parsed_url.set_attr('query', query)
        else:
            self.parsed_url.set_attr('query', self.query_parse(query))

# help(ParseURL)
Beispiel #17
0
 def __init__(self, url):
     """Keep the raw url string and immediately parse it into self.parsed_url (a URL object)."""
     self.url = url
     self.url_string = url
     self.parsed_url = URL()
     self.parse()
Beispiel #18
0
class Request(object):
    '''
    HTTP request container: wraps a URL plus method, headers, cookies and body.
    '''
    # Default user agent string; falls back when no scan signature is configured.
    # fix: 'in' replaces dict.has_key(), which was removed in Python 3.
    DEFAULT_USER_AGENT = cfg["scan_signature"] if "scan_signature" in cfg \
        else "TScanner/1.0"

    def __init__(self,
                 url,
                 method='GET',
                 headers=None,
                 cookies=None,
                 referer=None,
                 data=None,
                 user_agent=DEFAULT_USER_AGENT,
                 **kwargs):
        '''
        Build the request from a URL (instance or string) and optional
        headers/cookies/referer/user-agent/POST data.
        '''
        if isinstance(url, URL):
            self._url = url
        else:
            self._url = URL(url)

        self._method = method

        # Unique id for this request instance.
        self.id = uuid.uuid1()

        self._headers = {}
        if headers:
            self._headers.update(headers)

        self._cookies = cookies

        self._referer = referer

        self._user_agent = user_agent

        # Promote cookie/referer/user-agent values into actual headers.
        if self._cookies:
            self._headers.update({"Cookie": self._cookies})

        if self._referer:
            self._headers.update({"Referer": self._referer})

        if self._user_agent:
            self._headers.update({"User-Agent": self._user_agent})

        # Query-string parameters extracted from the URL.
        self._get_data = self._url.get_querystring()

        self._post_data = data if data else ""

    def get_get_param(self):
        '''Return the GET (query-string) parameters.'''
        return self._get_data

    def get_post_param(self):
        '''Return the POST body data.'''
        return self._post_data

    def get_url(self):
        '''Return the wrapped URL object.'''
        return self._url

    def get_method(self):
        '''Return the HTTP method string.'''
        return self._method

    def get_id(self):
        '''Return this request's unique id.'''
        return self.id

    def get_headers(self):
        '''Return the header dict.'''
        return self._headers

    def get_cookies(self):
        '''Return the cookie string (or None).'''
        return self._cookies

    def set_method(self, method):
        '''Set the HTTP method, normalized to upper case.'''
        self._method = method.upper()

    def set_post_data(self, postdata):
        '''Replace the POST body data.'''
        self._post_data = postdata

    def set_get_data(self, getdata):
        '''Replace the GET (query-string) parameters.'''
        self._get_data = getdata

    def set_referer(self, referer):
        '''Set the referer value (header is not re-synced — TODO confirm intended).'''
        self._referer = referer

    def set_cookies(self, cookies):
        '''Set the cookie value (header is not re-synced — TODO confirm intended).'''
        self._cookies = cookies

    def __eq__(self, other):
        '''Two requests are equal when URL and method match.'''
        return self._url == other._url and self._method == other._method

    def __str__(self):
        '''Render the request as a raw HTTP/1.1 message.'''
        result_string = self._method

        result_string += " " + self._url.url_string + " HTTP/1.1\r\n"

        headers = copy.deepcopy(self._headers)
        headers.update({"Host": self._url.get_host()})

        # fix: items() replaces the Py2-only iteritems().
        for key, value in headers.items():
            result_string += key + ": " + value
            result_string += "\r\n"

        result_string += "\r\n"

        if self._method == "POST":
            result_string += str(self._post_data)

        result_string = result_string.encode("utf-8")

        # NOTE(review): returns UTF-8 bytes (Py2 str); under Py3 a __str__
        # returning bytes breaks str() — confirm before porting.
        return result_string

    def __repr__(self):
        '''Short debugging representation: method, url and id.'''
        vals = {
            'method': self.get_method(),
            'url': self.get_url().url_string,
            'id': self.get_id()
        }

        return '<Request | %(method)s | %(url)s | %(id)s>' % vals