def store_date(org, dst, date_str):
    """Build the URL for a journey from *org* to *dst* on *date_str* and print it.

    Side effect only: the URL is printed, nothing is returned.
    """
    url = Url()
    url1 = url.getUrl(org, dst, date_str)
    print(url1)
    # Removed a dead trailing `pass` statement from the original.
def __init__(self, base, artist, song, separator='-', extension='.html'):
    """Initialise the URL parts for an artist/song page.

    Artist and song are lower-cased and cleaned of URL-unfriendly
    characters before being stored.
    """
    Url.__init__(self)
    cleaned_artist = UrlHelper.remove_url_nonesense(artist.lower())
    cleaned_song = UrlHelper.remove_url_nonesense(song.lower())
    self.base = base
    self.separator = separator
    self.extension = extension
    self.artist = cleaned_artist
    self.song = cleaned_song
class Source:
    """A named content source identified by a URL and a lookup id."""

    def __init__(self, url, name, lookupId):
        self.url = Url(url)
        self.name = name
        self.lookupId = lookupId
        self.categories = []

    def getHumanReadableName(self):
        """Prefix the name with a friendly domain label when one is known."""
        domain = self.url.getDomain()
        if domain in DOMAIN_HUMAN_READABLE:
            return DOMAIN_HUMAN_READABLE[domain] + ' - ' + self.name
        return self.name

    def isAggregator(self):
        """True when this source's domain belongs to a known aggregator."""
        return self.url.getDomain() in AGGREGATOR_DOMAIN

    def __repr__(self):
        return "<Source(%s, %s)>" % (self.name.encode('ascii', 'ignore'), self.url)
def add_url(self, url, rank=0, method="url"):
    """Insert *url* as a seed document; duplicate seeds are silently kept.

    Returns self so calls can be chained.
    """
    seed = Url(url)
    document = {
        "date": [self.date],
        "url": seed.url,
        "url_id": seed.url_id,
        "url_info": seed.export(),
        "title": None,
        "description": None,
        "rank": rank,
        "source_url": None,
        "depth": 0,
        "method": method,
    }
    try:
        self.db.seeds.insert_one(document)
    except pymongo.errors.DuplicateKeyError:
        # The seed is already stored; leave the existing record untouched.
        pass
    return self
def service_obj_creator(self):
    """Build Url objects from the lines of 'URLs.txt'.

    Each line holds two whitespace-separated fields (name/id then
    address); reading stops at a line containing the sentinel "end".

    Fixes over the original: the file handle is closed via a context
    manager, and reading also stops at end-of-file (the original
    `while var == 1` loop spun forever if the "end" sentinel was
    missing, because readline() returns '' at EOF).
    """
    url_obj_list = []
    with open("URLs.txt") as f:
        for line in f:
            if "end" in line:
                break
            urllist = line.split()
            print(urllist)
            url_obj_list.append(
                Url(urllist[1], urllist[0], self.usr, self.password))
    return url_obj_list
def add(self, url):
    # Add a new url; skip it if it has already been crawled.
    if not self._filter(url):
        return
    # Classify the url. Patterns are tried in order and a later match
    # overrides an earlier one, so the most specific type wins —
    # exactly the behaviour of the original if-chain.
    classification_rules = (
        (r'explore', 'explore'),
        (r'topics', 'topics'),
        (r'/question/\d+/*$', 'question'),
        (r'/question/\d+/answer/\d+/?$', 'answer'),
        (r'/people/\w+/?/$', 'people'),
    )
    url_type = 'explore'
    for pattern, kind in classification_rules:
        if re.search(pattern, url):
            url_type = kind
    u_obj = Url(url=url, url_type=url_type)
    self.urls.append(u_obj)
def post(self):
    """Post short URL.

    Returns a short URL for a long URL.

    :param object request_body: the long URL with its meta data
    :rtype: json: contains the long URL, the short URL and the metadata
    """
    body = request.get_json()
    long_url = body["long_url"]
    # Optional request fields — presumably get_key returns None when a key
    # is absent; confirm against get_key's definition.
    custom_salt = get_key(body, 'custom_salt')
    custom_url = get_key(body, 'custom_url')
    tag = get_key(body, 'tag')
    metadata = get_key(body, 'metadata')
    # Only the first entry of the "type" list is honoured.
    url_type = body["type"][0]
    if url_type == 'iota':
        url = IotaUrl(address=long_url, tag=tag, metadata=metadata,
                      custom_salt=custom_salt)
    elif url_type == 'document':
        url = DocumentUrl(document_hash=long_url, tag=tag, metadata=metadata,
                          custom_salt=custom_salt)
    else:
        # Any other (or missing) type falls back to a plain Url.
        url = Url(long_url=long_url, tag=tag, metadata=metadata,
                  custom_salt=custom_salt)
    if custom_url:
        # A caller-supplied short id overrides the generated one.
        url.random_id = custom_url
    url_manager = UrlManager()
    message = url_manager.publish_url(url=url)
    message = message.json
    status = 200
    response = app.response_class(response=json.dumps(message), status=status,
                                  mimetype='application/json')
    return response
def urls_from_content(self, content):
    """Retrieves the URLs from a request content.

    Keyword arguments:
    content -- page content, as retrieved via urllib urlopen or compatible
               with lxml.  Falsy content yields an empty list.
    """
    if content:
        bs = BeautifulSoup(content, 'lxml')
        urls = []
        for u in bs.findAll('a'):
            new_url = Url(u.get('href'), domain = self.crawl_url.netloc(), protocol=self.crawl_url.proto())
            # NOTE(review): new_url is a freshly constructed Url instance, so
            # the None/'' comparison below only triggers if Url defines __eq__
            # against those values; an anchor without an href still reaches the
            # Url() constructor first — confirm this is the intended behaviour.
            if new_url == None or new_url == '':
                error("Failed to process {}".format(u.get('href')))
            else:
                urls.append(new_url)
        return urls
    else:
        return []
def validate_url(self, short_url: str, long_url: str, custom_salt: str = None):
    """Check that *short_url* maps to *long_url* through a valid transaction."""
    url = Url()
    url.random_id = short_url.split("/")[-1]
    transactions = self.node_manager.retrieve_transactions(address=url.address)
    if not transactions:
        return False
    for transaction in transactions:
        candidate = Url(custom_salt=custom_salt)
        candidate.from_message(transaction.message)
        # A single matching, valid transaction is enough.
        if candidate.long_url == long_url and candidate.is_valid:
            return True
    return False
def base(self):
    """
    Returns the base URI for this response

    :rtype: class:`Url` or None
    """
    url = None
    # Content-Location wins over Content-Base when both are present.
    for header_name in ('Content-Base', 'Content-Location'):
        value = self.header(header_name)
        if value:
            url = value
    if url is None and self.request:
        url = self.request.url
    if not url:
        return None
    return url if isinstance(url, Url) else Url(url)
def test_url_protocol_parsing(self):
    """proto() should report each URL's scheme."""
    cases = [
        ("https://www.google.com", "https"),
        ("http://www.google.com", "http"),
        ("https://google.com", "https"),
    ]
    for address, expected in cases:
        self.assertEqual(Url(address).proto(), expected)
def doc(self) -> Url:
    """Location of the Galacticraft wiki."""
    wiki_address = 'https://wiki.micdoodle8.com/wiki/Galacticraft'
    return Url(wiki_address)
def __init__(self, urls):
    """Wrap every raw address in a Url object."""
    self.urls = [Url(address) for address in urls]
def test_get_path_from_url_root_directory():
    """The root directory is prepended and the host is lower-cased."""
    result = get_path_from_url('/root/path',
                               Url('http://my.DOMAIN.com/this/file.html'))
    assert result == '/root/path/my.domain.com/this/file.html%$%'
def url(self) -> Url:
    """Base address of Minecraft CurseForge."""
    site_address = 'https://minecraft.curseforge.com'
    return Url(site_address)
from url import Url
from document import Document

if __name__ == "__main__":
    # Load the url and document indexes into parallel lists.
    # NOTE(review): DATA_PATH must be defined elsewhere (not visible in this
    # fragment); none of these file handles are closed, and raw_file is opened
    # but not used here — presumably consumed further down in the full script.
    raw_file = open(DATA_PATH+"rawlist.data")
    url_file = open(DATA_PATH+"url.index")
    doc_file = open(DATA_PATH+"doc.index")
    url_list = []
    doc_list = []
    # url.index: one record per line, "<summary>\t<docId>"
    for line in url_file.readlines():
        line = line.split('\t')
        summary = line[0]
        docId = int(line[1])
        url = Url()
        url.m_summary=summary
        url.m_docId=docId
        url_list.append(url)
    # doc.index: one record per line, "<docId>\t<pos>"
    for line in doc_file.readlines():
        line = line.split('\t')
        docId = int(line[0])
        pos = int(line[1])
        document = Document()
        document.m_docId = docId
        document.m_pos = pos
        doc_list.append(document)
    # Accumulators for the (not shown) indexing phase.
    index_dict = {}
    title_index_dict = {}
def url_item_insert(self, html, purl, table_name='url_item'):
    """Insert every pending url item extracted from *html* into *table_name*."""
    for todo_item in Url.url_todo(html, purl):
        statement = self.db.sql_insert(table_name, todo_item)
        self.db.execsql(statement)
def get_url(self, url_tuple):
    """Load the Url stored as JSON for *url_tuple*."""
    path = self.url_file(url_tuple)
    with open(path, 'r') as handle:
        payload = json.load(handle)
    return Url(json=payload)
def __init__(self):
    # Shared module logger and the Url transfer object this wrapper drives.
    self._logger = logger
    self._url = Url()
class PycURLLibrary():
    """PycURLLibrary is a library for functional testing with URL syntax,
    supporting DICT, FILE, FTP, FTPS, Gopher, HTTP, HTTPS, IMAP, IMAPS, LDAP,
    LDAPS, POP3, POP3S, RTMP, RTSP, SCP, SFTP, SMTP, SMTPS, Telnet and TFTP.
    PycURLLibrary supports SSL certificates and more.

    PycURLLibrary is based on PycURL [http://pycurl.sourceforge.net/],
    PycURL is a Python interface to libcurl [http://curl.haxx.se/libcurl/].

    xml.etree.ElementTree [http://docs.python.org/2/library/xml.etree.elementtree.html]
    is used for XML operations.

    Supported XPath syntax (from Python v2.7.5 documentation):
    | Syntax | Meaning |
    | tag | Selects all child elements with the given tag. For example, spam selects all child elements named spam, spam/egg selects all grandchildren named egg in all children named spam. |
    | * | Selects all child elements. For example, */egg selects all grandchildren named egg. |
    | . | Selects the current node. This is mostly useful at the beginning of the path, to indicate that it's a relative path. |
    | // | Selects all subelements, on all levels beneath the current element. For example, .//egg selects all egg elements in the entire tree. |
    | .. | Selects the parent element. |
    | [@attrib] | Selects all elements that have the given attribute. |
    | [@attrib='value'] | Selects all elements for which the given attribute has the given value. The value cannot contain quotes. |
    | [tag] | Selects all elements that have a child named tag. Only immediate children are supported. |
    | [position] | Selects all elements that are located at the given position. The position can be either an integer (1 is the first position), the expression last() (for the last position), or a position relative to the last position (e.g. last()-1). |
    """

    ROBOT_LIBRARY_VERSION = VERSION
    ROBOT_LIBRARY_SCOPE = "TEST CASE"
    ROBOT_LIBRARY_DOC_FORMAT = "ROBOT"

    def __init__(self):
        # Module-level logger and the Url object that holds all transfer state.
        self._logger = logger
        self._url = Url()

    def verbose(self):
        """Makes the fetching more verbose/talkative.

        Mostly useful for debugging. A line starting with '>' means "header data"
        sent by curl, '<' means "header data" received by curl that is hidden in
        normal cases, and a line starting with '*' means additional info provided
        by curl.

        Note that if you only want HTTP headers in the output, -i, --include
        might be the option you're looking for.

        If you think this option still doesn't give you enough details, consider
        using --trace or --trace-ascii instead.

        This option overrides previous uses of --trace-ascii or --trace.
        """
        self._url.set_verbose(True)

    # def no_buffer(self):
    #     """Disables the buffering of the output stream.
    #
    #     In normal work situations, curl will use a standard buffered output
    #     stream that will have the effect that it will output the data in
    #     chunks, not necessarily exactly when the data arrives. Using this
    #     option will disable that buffering.
    #     Note that this is the negated option name documented. You can thus
    #     use --buffer to enforce the buffering.
    #     """

    def server_connection_establishment_timeout(self, timeout):
        """The maximum time in seconds that you allow the connection to the
        server to take (long value).

        This only limits the connection phase, once it has connected, this
        option is of no more use. Set to zero to switch to the default built-in
        connection timeout - 300 seconds.
        """
        # NOTE(review): long() exists only on Python 2 — this library targets
        # Python 2 (see the v2.7.5 docstring references above).
        self._url.get_context().set_server_connection_establishment_timeout(long(str(timeout)))

    def insecure_ssl(self):
        """(SSL) This option explicitly allows curl to perform "insecure" SSL
        connections and transfers.

        All SSL connections are attempted to be made secure by using the CA
        certificate bundle installed by default. This makes all connections
        considered "insecure" fail unless -k, --insecure is used.
        """
        self._url.set_insecure(True)

    def request_method(self, requestMethod):
        """Sets the request method.

        Defaults to GET; when the Post Fields keyword is used, POST is used.

        | Method |
        | GET |
        | POST |
        | PUT |
        | DELETE |
        """
        self._url.get_context().set_request_method(requestMethod)

    def add_header(self, header):
        """(HTTP) Extra header to use when getting a web page.

        Each *Add Header* keyword is equivalent for one <-H, --header> argument
        with curl

        Examples:
        | Add Header | Content-Type: text/xml; charset=UTF-8 |
        | Add Header | Frame.Version:3.0 |
        """
        self._logger.info('Header %s' % header)
        self._url.get_context().add_header(str(header))

    def headers_file(self, headerFile):
        """(HTTP) Extra headers to use when getting a web page. *headerFile*
        contains all headers.

        One line is one header. Note do not make line feed after last header.

        Example:
        | Headers File | /data/headers.txt |

        Example of content of *headerFile*:
        | Version: 2 |
        | Content-Type: text/xml; charset=UTF-8 |
        """
        # NOTE(review): the file handle opened here is never closed.
        headers = [line.rstrip() for line in open(headerFile, 'r')]
        self._logger.info('Headers %s' % headers)
        self._url.get_context().set_headers(headers)

    def post_fields(self, postFields):
        """(HTTP) Sends the specified data in a POST request to the HTTP
        server, in the same way that a browser does when a user has filled in
        an HTML form and presses the submit button.

        This will cause curl to pass the data to the server using the
        content-type application/x-www-form-urlencoded.

        Equivalent for <--data> argument

        Example:
        | Post Fields | pizza=Quattro+Stagioni&extra=cheese |
        """
        self._url.set_post_fields(postFields)
        if postFields is not None:
            # Supplying data implies a POST request, as with curl's --data.
            self._url.get_context().set_request_method('POST')

    def post_fields_file(self, postFieldsFile):
        """(HTTP) Sends the specified data in a POST request to the HTTP
        server, in the same way that a browser does when a user has filled in
        an HTML form and presses the submit button.

        This will cause curl to pass the data to the server using the
        content-type application/x-www-form-urlencoded.

        Equivalent for <--data> @argument

        Example:
        | Post Fields File | /data/message.txt |
        """
        f = open(postFieldsFile, 'r')
        postFields = f.read()
        f.close()
        self._url.set_post_fields(postFields)
        self._url.get_context().set_request_method('POST')

    def set_url(self, url):
        """Specify a URL to fetch.
        """
        self._url.get_context().set_url(str(url))

    def ca_path(self, cacertDirectory):
        """(SSL) Tells curl to use the specified certificate directory to
        verify the peer.

        Multiple paths can be provided by separating them with ":" (e.g.
        "path1:path2:path3"). The certificates must be in PEM format.

        Equivalent for <--capath> argument with curl
        """
        self._url.get_context().set_capath(str(cacertDirectory))

    def client_certificate_file(self, cert):
        """(SSL) Tells curl to use the specified client certificate file when
        getting a file with HTTPS, FTPS or another SSL-based protocol.

        The certificate must be in PEM format

        Equivalent for <--cert> argument with curl
        """
        self._url.get_context().set_client_certificate_file(str(cert))

    def private_key_file(self, key):
        """(SSL/SSH) Private key file name.

        Allows you to provide your private key in this separate file.

        Equivalent for <--key> argument with curl
        """
        self._url.get_context().set_private_key_file(str(key))

    def perform(self):
        """Perform curl perform.
        """
        self._url.perform()

    def response(self):
        """Get response from latest perform result
        """
        return self._url.get_context().get_response()

    def response_headers(self):
        """Get response headers from latest perform result for protocols
        having headers preceding the data (like HTTP)
        """
        return self._url.get_context().get_response_headers()

    def parse_xml(self):
        """Parses an XML section of the response.

        Returns an root Element instance.
        """
        return self._url.get_context().parse_response_xml()

    def xml_root_element(self):
        """Returns the result root Element instance of `Parse Xml` keyword.
        """
        return self._url.get_context().get_xml_root_element()

    def find_elements(self, element, xpath):
        """Returns a list containing all matching elements in document order

        Examples:
        | Find Elements | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Find Elements | ${root} | .//name |
        """
        assert element is not None, \
            'Element is Null.'
        xp = str(xpath)
        return element.findall(xp)

    def find_first_element(self, element, xpath):
        """Finds the first subelement matching *xpath*.

        Match may be a _tag name_ or _path_. Returns an element instance or
        None.

        Examples:
        | Find First Element | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Find First Element | ${root} | .//name |
        """
        assert element is not None, \
            'Element is Null.'
        xp = str(xpath)
        return element.find(xp)

    def should_contain_element(self, element, xpath):
        """Fails if the *element* does not contain *xpath* element

        Examples:
        | Should Contain Element | ${root} | .//{http://ws.poc.jivalo/hello/v1}customer |
        | Should Contain Element | ${root} | .//name |
        """
        elements = self.find_elements(element, xpath)
        assert elements, \
            'Element "%s" contains not XPaht element "%s".' % (
                element.tag, xpath)

    def element_should_contain(self, element, text):
        """Fails if the *element* text value does not contain *text*

        Examples:
        | Element Should Contain | ${elem} | Hello, world! |
        """
        assert text in element.text, \
            'Element "%s" does not contains text "%s".' % (
                element.tag, text)

    def element_should_match(self, element, text):
        """Fails if the *element* text value does not match *text*

        Examples:
        | Element Should Match | ${elem} | Hello, world! |
        """
        assert text == element.text, \
            'Element "%s" does not match text "%s".' % (
                element.tag, text)

    def http_response_status(self):
        """Get response status from latest HTTP response status line
        """
        return self._url.get_context().get_response_status()

    def response_status_should_contain(self, text):
        """Fails if the _Response Status_ does not contain *text*

        Examples:
        | Response Status Should Contain | 200 |
        """
        assert str(text) in str(self.http_response_status()), \
            'Response Status "%s" does not contains text "%s".' % (
                self.http_response_status(), text)

    def log_response(self, log_level='INFO'):
        """Logs the response of the URL transfer.

        Specify *log_level* (default: "INFO") to set the log level.
        """
        if self.response():
            self._logger.write("Response body:", log_level)
            self._logger.write(self.response(), log_level)
        else:
            self._logger.debug("No response received", log_level)

    def log_response_headers(self, log_level='INFO'):
        """Logs the response headers for protocols having headers preceding
        the data (like HTTP), line by line.

        Specify *log_level* (default: "INFO") to set the log level.
        """
        if self.response_headers():
            self._logger.write("HTTP Response headers:", log_level)
            for header in self.response_headers():
                self._logger.write(header, log_level)
        else:
            self._logger.debug("No HTTP response headers received", log_level)

    def log_http_response_status(self, log_level='INFO'):
        """Logs the HTTP response header status line.

        Specify *log_level* (default: "INFO") to set the log level.
        """
        if self.http_response_status():
            self._logger.write("HTTP Response status:", log_level)
            self._logger.write(self.http_response_status(), log_level)
        else:
            self._logger.debug("No HTTP response status received", log_level)

    def log_version(self, log_level='INFO'):
        """Logs the PycURLLibrary Version.

        Specify *log_level* (default: "INFO") to set the log level.
        """
        self._logger.write("PycURLLibrary version %s" % (self.ROBOT_LIBRARY_VERSION), log_level)
def __init__(self, url, name, lookupId):
    # Wrap the raw address in a Url; categories are filled in later by callers.
    self.url = Url(url)
    self.name = name
    self.lookupId = lookupId
    self.categories = []
def __init__(self):
    # Delegate all initialisation to the Url base class.
    Url.__init__(self)
def setUp(self) -> None:
    # Fixtures: two distributor-site configurations and a Url search for
    # 'python' against each.  The DistSite argument meanings (query-parameter
    # name, result page size, paging-parameter name) are inferred from the
    # values — confirm against DistSite's definition.
    self.kidega_site = DistSite('Kidega', 'query', 'https://kidega.com/arama',
                                'searcHeadArea.txt', 20, 'page')
    self.idefix_site = DistSite('Idefix', 'Q', 'https://idefix.com/search',
                                'searcHeadArea.txt', 36, 'Page')
    self.url_kidega = Url(self.kidega_site, 'python')
    self.url_idefix = Url(self.idefix_site, 'python')
def test_url_subdomain_parsing(self):
    """subdomain() should return everything left of the registered domain."""
    expectations = [
        ("http://www.fake.google.com", "www.fake"),
        ("http://fake.google.com", "fake"),
    ]
    for address, expected in expectations:
        self.assertEqual(Url(address).subdomain(), expected)
def test_url_suffix_parsing(self):
    """suffix() should handle both single and compound public suffixes."""
    for address, expected in [("http://www.google.com", "com"),
                              ("http://www.google.co.uk", "co.uk")]:
        self.assertEqual(Url(address).suffix(), expected)
def crawl(url, workers=None, limit_to_domain=True, robot=False, single=False):
    """Crawls a given url to determine its link tree.

    Keyword arguments:
    url -- the url to crawl
    workers -- the number of processes to spawn (default cpu_count() * 2)
    limit_to_domain -- if the crawler should be limited to the url domain (default True)
    robot -- honour robots.txt via a Robot helper; forces a single consumer
    single -- if True, process only the start page and schedule no children
    """
    u = Url(url)
    domain = u.domain()
    # Establish communication queues
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    # Start consumers
    if robot:
        rob = Robot(u.url())
        num_consumers = 1
    elif workers:
        num_consumers = workers
    else:
        num_consumers = multiprocessing.cpu_count() * 2
    debug('Creating {} consumers'.format(num_consumers))
    consumers = [Consumer(tasks, results) for i in range(num_consumers)]
    for w in consumers:
        w.start()
    # num_jobs counts outstanding tasks: +1 per scheduled URL, -1 per result
    # consumed.  The loop ends when every scheduled task has produced a result.
    num_jobs = 1
    tasks.put(CrawlerTask(url))
    # Keeps executing while there are URLs to process
    while num_jobs > 0:
        debug("Number of jobs: {}".format(num_jobs))
        debug("Fetching results")
        result = results.get()
        debug("Got results")
        if limit_to_domain:
            # Filter urls based on domain (this could be merged to previous filter)
            domain_urls = list(
                filter(lambda x: x.domain() == domain, result['urls']))
        else:
            domain_urls = result['urls']
        # Filter urls based on freshness (i.e., do not parse repeated urls)
        new_urls = list(filter(lambda x: is_new_url(x.url()), domain_urls))
        # Print
        [(lambda x: info("{} -> {}".format(result['parent'], x.url())))(r) for r in new_urls]
        debug("URL stats: Total {} Domain {} New {}".format(
            len(result['urls']), len(domain_urls), len(new_urls)))
        for r in new_urls:
            if robot and rob.should_block(r):
                info("Blocked access to {}".format(r.url()))
                continue
            print_url(r.url())
            if not single:
                debug('Scheduling: {}'.format(r.url()))
                tasks.put(CrawlerTask(r.url()))
                if robot and rob.throttle_time():
                    info('Sleeping {} seconds'.format(rob.throttle_time()))
                    sleep(rob.throttle_time())
                num_jobs += 1
        num_jobs -= 1
    debug("Done scheduling")
    # This stops all the processes
    for i in range(num_consumers):
        tasks.put(None)
    # Waits for the correct killing of the processes
    tasks.join()
    debug("Done")
def resolve_download_url(self, url: Url) -> Url:
    """Resolve an adf.ly shortlink into the final download Url."""
    resolved_address = AdfLy().unshorten(url)
    return Url(resolved_address)
from __future__ import unicode_literals
from url import Url
import os

# Download a match-highlights video via the Url helper's youtube_dl wrapper
# and open the resulting file with the system's default player.
# NOTE(review): os.startfile exists only on Windows.
url = Url(
    'http://www.timesoccer.com/video/burnley-vs-bournemouth-live-streaming-highlights.html'
)
url.youtube_dl()
os.startfile(url.video_path)
def setUp(self) -> None:
    # Fixture chain: site -> url -> page -> pager, all for the query 'python'.
    self.dist1 = DistSite('Kidega', 'query', 'https://kidega.com/arama',
                          'searcHeadArea.txt', 20, 'page')
    self.url1 = Url(self.dist1, 'python')
    self.page1 = Page(self.url1)
    self.pager1 = Pager(self.page1)
def extract(self, response, depth=0, filters=True):
    """Build an article record from a fetched page.

    Status codes written into the record on failure:
      800  -- invalid url
      900  -- search expression not found
      1000 -- language check failed
    """
    article = {}
    html = response.text
    url = response.url
    url = Url(url)
    article["url"] = url.url
    article["url_info"] = url.export()
    article["url_id"] = url.url_id
    article["depth"] = depth
    article["type"] = response.headers['content-type']
    article["date"] = self.date
    article["encoding"] = response.encoding.lower()
    article["status"] = True
    if url.status:
        article_txt = lxml_extractor(html, url)
        article["title"] = self.extract_title(html)
        article["meta"] = self.extract_meta(html)
        article["keywords"] = self.extract_keywords(article["meta"])
        if filters:
            if self.check_lang(article_txt):
                if self.check_query(article_txt):
                    # Query matched the extracted text: store and link out.
                    article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                    article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                    outlinks = self.extract_outlinks(html, url, depth)
                    # NOTE(review): this branch writes "citeds_url" while the
                    # sibling branches write "cited_urls" — likely a typo;
                    # confirm which key downstream consumers read.
                    article["citeds_url"] = [n["url"] for n in outlinks]
                    article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                    article["outlinks"] = outlinks
                    article["lang"] = self.page_lang
                    return article
                else:
                    if self.check_query(article["title"]):
                        # Query matched only the title: store, then run the
                        # fuller page extraction.
                        article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                        article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                        outlinks = self.extract_outlinks(html, url, depth)
                        article["cited_urls"] = [n["url"] for n in outlinks]
                        article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                        article["outlinks"] = outlinks
                        article["lang"] = self.page_lang
                        article = self.extract_page(article, article_txt, html)
                        article["lang"] = self.page_lang
                        return article
                    else:
                        article["status"] = False
                        article["status_code"] = 900
                        article["msg"] = "Search expression not found"
                        return article
            else:
                if self.check_lang(article["title"]):
                    # Body failed the language check but the title passed.
                    article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
                    article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
                    outlinks = self.extract_outlinks(html, url, depth)
                    article["cited_urls"] = [n["url"] for n in outlinks]
                    article["cited_url_ids"] = [n["url_id"] for n in outlinks]
                    article["outlinks"] = outlinks
                    article["lang"] = self.page_lang
                    return article
                else:
                    article["status"] = False
                    article["status_code"] = 1000
                    article["msg"] = "Lang is invalid"
                    article["lang"] = self.page_lang
                    return article
        else:
            # Filters disabled: store everything.  check_lang is called here
            # with its result discarded — presumably for the side effect of
            # setting self.page_lang; confirm.
            self.check_lang(article_txt)
            article["html_file"] = self.store_file(article["url_id"], html, fmt="html")
            article["txt_file"] = self.store_file(article["url_id"], article_txt, fmt="txt")
            outlinks = self.extract_outlinks(html, url, depth)
            article["cited_urls"] = [n["url"] for n in outlinks]
            article["cited_url_ids"] = [n["url_id"] for n in outlinks]
            article["outlinks"] = outlinks
            article["lang"] = self.page_lang
            return article
    else:
        article["status"] = False
        article["error"] = "Invalid url"
        article["status_code"] = 800
        return article
def test_get_path_from_url_empty_path():
    """With an empty root path the host alone anchors the generated path."""
    result = get_path_from_url('', Url('http://www.github.com'))
    assert result == 'github.com/index.html%$%'
def doc(self) -> Url:
    """Location of the Pixelmon wiki."""
    wiki_address = 'https://pixelmonmod.com/wiki'
    return Url(wiki_address)
def test_get_path_from_url_with_querystring():
    """The query string is preserved in the generated path."""
    source = Url('http://youtube.com/watch?v=ghsu3u43')
    assert get_path_from_url('', source) == 'youtube.com/watch?v=ghsu3u43%$%'
def __init__(self, url):
    # Location of the site's robots.txt, derived from any page URL of the site.
    self.url = Url(urljoin(url, '/robots.txt'))
    self.rerp = RobotExclusionRulesParser()
    # Fetch and parse the rules, identifying as a generic Mozilla user agent.
    self.rerp.user_agent = 'Mozilla/5.0'
    self.rerp.fetch(self.url.url())
import json
import requests
from data import *
from url import Url

# Walk every configured page, check each of its child links, and post a
# Slack-style webhook notification to the page's channel for broken links.
for url_to_check in urls_to_check:
    for i, url in enumerate(url_to_check['urls']):
        print(url)
        for j, child in enumerate(Url(url).get_children()):
            print(f'{i}.{j} - Checking {child}')
            # NOTE(review): "or j == 3" forces a report for the fourth child
            # regardless of its status — looks like leftover test code; confirm.
            if child.response_status() != 200 or j == 3:
                body = \
                    f'You have a broken link at {url}\n' + \
                    f'- Link: {child}\n' + \
                    '\n\n' + \
                    'This message was automatically created by <https://github.com/matheusvanzan/broken-links-monitor|broken-links-monitor>'
                response = requests.post(url_to_check['channel'], json.dumps({'text': body}))
                print(response)
def url(self) -> Url:
    """Home page of the Micdoodle8 site."""
    home_address = 'https://micdoodle8.com'
    return Url(home_address)
def __init__(self, url, domain=None, protocol=None):
    # The page this crawler starts from; domain/protocol are passed through
    # to Url — presumably used when the address itself lacks them; confirm
    # against Url's constructor.
    self.crawl_url = Url(url, domain=domain, protocol=protocol)
def url(self) -> Url:
    """Home page of the Reforged site."""
    home_address = 'https://reforged.gg'
    return Url(home_address)
# Extract every anchor href from the fetched page and record it in MySQL.
node = html.fromstring(index)
start = time.time()
mydb = mysql.connector.connect(host="localhost", user="******",
                               passwd="root", database="crawel")
cur = mydb.cursor()
#threads = []
for i in node.xpath("//body//a"):
    url = i.get("href")
    ourl = Url(url)
    # The href comes from crawled (untrusted) HTML: bind it as a query
    # parameter instead of interpolating it into the SQL string, which was
    # an injection / quoting defect in the original.
    sql = "insert into t_url (url, created_date) values (%s, %s)"
    print(sql)
    cur.execute(sql, (ourl.url, ourl.created_date))
    #t = threading.Thread(target=downloader.download,args=(root, url, basedir))
    #t.start()
    #threads.append(t)
mydb.commit()
cur.close()
mydb.close()
#main thread waiting for sub thread
#for t in threads:
#    t.join()
def getUrl(date_str, detal_days=30):
    """Return every URL generated for *date_str* across *detal_days* days."""
    return Url().getAllUrl(date_str, detal_days)