def split_dburl(dburl, default_dburl=None):
    """
    We split the url into the base mongodb URL and the path element, whose
    first element is the database name; the remainder is interpreted as the
    collection id.
    """

    # if the given URL contains neither schema nor host, the default URL is
    # used as base, and the given URL string is appended to the path element.
    url = ruu.Url(dburl)

    if not url.schema and not url.host:
        url = ruu.Url(default_dburl)
        url.path = dburl

    # NOTE: add other database schemes here...
    if 'mongodb' not in url.schema.split('+'):
        raise ValueError("url must be a 'mongodb://' or 'mongodb+ssl://' url, "
                         "not '%s'" % dburl)

    host = url.host
    port = url.port
    path = url.path
    user = url.username
    pwd  = url.password

    ssl = False
    if 'ssl' in url.schema.split('+'):
        ssl = True
        url.schema = 'mongodb'

    if not host:
        host = 'localhost'

    if path.startswith('/'):
        path = path[1:]
    path_elems = path.split('/')

    dbname = None
    cname  = None
    pname  = None

    if len(path_elems) > 0:
        dbname = path_elems[0]

    if len(path_elems) > 1:
        cname = path_elems[1]

    if len(path_elems) > 2:
        pname = '.'.join(path_elems[2:])

    if dbname == '.':
        dbname = None

    return [host, port, dbname, cname, pname, user, pwd, ssl]
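# Illustrative sketch only: tracing split_dburl above with a made-up URL
# (host, credentials, and path are assumptions; parsing itself is delegated
# to ruu.Url, i.e. radical.utils.Url).
if __name__ == '__main__':
    parts = split_dburl('mongodb://user:pw@db.example.com:27017/mydb/mycoll/a/b')
    # Expected, per the logic above:
    # ['db.example.com', 27017, 'mydb', 'mycoll', 'a.b', 'user', 'pw', False]
    print(parts)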
def home():
    url, local_filename, remote_basename, error = None, None, None, None

    if request.method == 'POST':
        session['_url'] = request.form['url']
        return redirect(url_for('signin'))

    if session.get('_url'):
        # Fetch the OAuth credentials that will be used to obtain upload
        # access to the Google Drive.
        credentials = google.oauth2.credentials.\
            Credentials(**session['credentials'])
        try:
            url = urlm.Url(session['_url'], credentials.token)
            local_filename, remote_basename = url.drive_it()
        except RuntimeError as e:
            flash(str(e), 'notification')
        else:
            msg = 'Success! File "{}" is in your Drive now.'
            flash(msg.format(remote_basename), 'notification')
        finally:
            session['_url'] = None

    return render_template('index.html')
def test_uses_expected_method(self):
    """Uses ‘urllib.parse.urlparse’, with the correct URL.

    It’s assumed that ‘urllib.parse.urlparse’ will be used."""
    with patch('urllib.parse.urlparse') as urlparse_mock:
        # Get a test Url instance ready.
        url_obj = urlm.Url(random_string(), str())
        # Using another random string for the attribute ‘_responseurl’ to
        # have an independent test that the program sets and gets it
        # properly, and that it does not change it before using it to
        # obtain ‘_urlpath’.
        responseurl = random_string()
        url_obj._responseurl = responseurl
        # Get the mocked method ready to be called.
        # Assigns the property ‘path’ to whatever is returned after calling
        # ‘urlparse_mock()’.
        # This is necessary because the parsed URL itself corresponds to the
        # property ‘path’ from the object that is returned from the parsing
        # method.
        urlparse_mock().path = random_string()
        # Force test object to parse its URL, giving it a reason to call
        # ‘urllib.parse.urlparse’.
        urlpath = url_obj._urlpath
        # Check if ‘urllib.parse.urlparse’ was called, and with the correct
        # URL.
        urlparse_mock.assert_called_with(responseurl)
        # Check if it assigned its attribute to the correct property from
        # the return value of ‘urllib.parse.urlparse’.
        self.assertEqual(url_obj._urlpath, urlparse_mock().path)
def test_no_exception_if_url_string(self):
    """No error when instantiated with string."""
    try:
        urlm.Url(str(), str())
    except TypeError:
        self.fail('Raised TypeError when instantiated with string')
def setUp(self):
    """Create test file to be downloaded and test object to do it."""
    self.f_remote = random_temp_file()
    self.url_obj = urlm.Url(random_string(), str())
    self.url_obj._responseurl = random_string()
def extractlink(self, soup, purl):
    hrefs = []
    links1 = soup.findAll("a")
    for link in links1:
        href = link.get('href')
        if href:
            href = href.strip("/")
            # if 'javascript:location.href' in href:
            #     href = href[len("javascript:location.href") + 2:-2]
            hrefs.append(href)
    links2 = soup.find_all("img")
    for link in links2:
        href = link.get('src')
        if href:
            hrefs.append(href)
    for href in hrefs:
        # u = url.Url(purl.seed, purl.relpath + "/" + href)
        u = url.Url(purl.link, href)
        if u.depth > int(self.conf_dic['spider']['max_depth']):
            continue
        if u.link not in self.urlpool:
            self.urlpool[u.link] = 1
            req = threadpool.WorkRequest(self.crawl, [u],
                                         callback=self.print_result,
                                         exc_callback=self.handle_exception)
            # print(newurl)
            self.pool.putRequest(req)
            print "new url %s added." % u.link
def test_raises_errors(self):
    """Raises errors as promised in the docstring."""
    with patch('urllib.parse.urlparse') as urlparse_mock:
        # Create a test object.
        url_obj = urlm.Url(random_string(), str())
        url_obj._responseurl = random_string()
        # Make the mocked method raise the proper error when called.
        for exception in builtin_exceptions():
            # Skip trickier Unicode exceptions.
            if exception.__name__.startswith('Unicode'):
                continue
            with self.subTest(exception=exception):
                urlparse_mock.side_effect = exception
                # Check if the expected error was raised as a result. For
                # this, it suffices to get the attribute, which will then
                # call ‘urllib.parse.urlparse’.
                # ValueError should raise a RuntimeError; all other
                # exceptions should raise themselves.
                if exception is ValueError:
                    should_raise = RuntimeError
                else:
                    should_raise = exception
                with self.assertRaises(should_raise):
                    url_obj._urlpath
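# Hypothetical sketch (an assumption, not the project's real url.Url code) of
# the ‘_urlpath’ behaviour that test_uses_expected_method and
# test_raises_errors exercise: the property parses ‘_responseurl’ with
# ‘urllib.parse.urlparse’, returns its ‘.path’, re-raises ValueError as
# RuntimeError, and lets every other exception propagate.
import urllib.parse

class _UrlPathSketch:
    def __init__(self, responseurl):
        self._responseurl = responseurl

    @property
    def _urlpath(self):
        try:
            return urllib.parse.urlparse(self._responseurl).path
        except ValueError as error:
            raise RuntimeError(str(error)) from error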
def push_and_pull(start_page, pages, debug, sql_table, creds, proxy_pass):
    pnp_start = url.Url(start_page)
    pnp_start.add_key('sort', 'priceup')
    parse_pages(pnp_start.get_url(), 100, debug, sql_table, creds, proxy_pass)
    pnp_start.change_key('sort', 'pricedown')
    parse_pages(pnp_start.get_url(), pages - 100, debug, sql_table, creds,
                proxy_pass)
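# Hypothetical sketch (an assumption, not the project's actual url.Url): the
# add_key/change_key/check_key/get_url calls used by push_and_pull and
# parse_pages behave like query-string helpers, which urllib.parse can
# provide roughly as follows.
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

class QueryUrlSketch:
    def __init__(self, link):
        self._parts = urlparse(link)
        self._query = parse_qs(self._parts.query)

    def check_key(self, key):
        # True if the query string already carries this key.
        return key in self._query

    def add_key(self, key, value):
        self._query.setdefault(key, []).append(value)

    def change_key(self, key, value):
        self._query[key] = [value]

    def get_url(self):
        # Rebuild the full URL with the (possibly modified) query string.
        return urlunparse(
            self._parts._replace(query=urlencode(self._query, doseq=True)))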
def test__responseurl_get(self):
    """‘get’ works properly for ‘_responseurl’."""
    url_obj = urlm.Url('', str())
    for url in urls_for_test():
        with self.subTest(url=url):
            url_obj._responseurl = url
            self.assertEqual(url_obj._responseurl, url)
def test_exception_if_not_string(self):
    """Error when instantiated with something other than string."""
    for typer in (int, float, tuple, list, set, dict):
        for args in ((typer(), str()), (str(), typer())):
            with self.subTest(args=args):
                # Create a dummy object for the test by calling each type.
                # Called without any arguments, ‘typer’ returns an object of
                # its type.
                with self.assertRaises(TypeError):
                    urlm.Url(*args)
def parse_pages(start_page, pages, debug, sql_table, creds, proxy_pass):
    import url
    parse_page = url.Url(start_page)
    first_page = html_page.HtmlPage(parse_page.get_url())
    html = first_page.get_html(creds, proxy_pass)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # 1st page
        arts_dict = {}
        for i in soup.findAll('div', class_="j-card-item"):
            art_num = re.search(r'\d+', i.get('data-popup-nm-id'))
            arts_dict[art_num[0]] = i.find('a')['href']
        for art, link in arts_dict.items():
            if not sql_table.table_check_presence(art, creds[6]):
                handbag = bag.Bag()
                if not link.startswith('https'):
                    link = "https://www.wildberries.ru" + link
                handbag.get_bag_page(art, link, debug, creds, proxy_pass)
                sql_table.table_append(handbag)
        sql_table.cnx.commit()
        # after 1st page
        if parse_page.check_key('page'):
            return 0
        parse_page.add_key('page', '1')
        # 2nd page and further
        for i in range(2, pages + 1):
            parse_page.change_key('page', str(i))
            print(parse_page.get_url())
            have_a_try = 3
            if have_a_try:
                further_page = html_page.HtmlPage(parse_page.get_url())
                arts_dict = further_page.get_wb_page(creds, proxy_pass)
                if arts_dict:
                    for art, link in arts_dict.items():
                        if not sql_table.table_check_presence(art, creds[6]):
                            handbag = bag.Bag()
                            handbag.get_bag_page(art, link, debug, creds,
                                                 proxy_pass)
                            sql_table.table_append(handbag)
                    sql_table.cnx.commit()
                    continue
                else:
                    sql_table.cnx.commit()
                    print(f"Page {i} parse error. Trying again.")
                    have_a_try -= 1
            else:
                sql_table.cnx.commit()
                print("No luck. Next page.")
def __init__(self):
    self.template = template.Template(template)
    arg_parser = ArgumentParser()
    arg_parser.add_argument("-u", "--url", dest="url")
    args = arg_parser.parse_args()
    if args.url is None:
        print("You forgot to enter a URL")
    else:
        page = url.Url(args.url)
        data = page.get_content()
        path = page.parse_url()
        current_template = self.template.get_template()
        html_parser = MyHTMLParser(current_template)
        html_parser.feed(data)
        self.content = content.Content(current_template, html_parser.content)
        write_data = self.content.format_data()
        self.save_data(path, write_data)
def test__upload(self):
    """Upload method properly receives file chunks."""
    # Create a test object.
    url_obj = urlm.Url(random_string(), random_string())
    for upload_chunk_size in [1, 2, 3, 5, 7, 11, 256, 2 * 256 * 1024]:
        with self.subTest(upload_chunk_size=upload_chunk_size):
            # Patch a mock to intercept the uploaded chunks.
            with patch('requests.put') as put_mock,\
                    patch('url.Url._get_upload_url')\
                    as _get_upload_url_mock,\
                    patch('url.get_last_uploaded_byte')\
                    as get_last_uploaded_byte_mock:
                # Prepare the mocked method. We won't need a meaningful
                # return value for it because the method that would use
                # it, requests.put, is also being patched.
                _get_upload_url_mock.return_value = random_string()
                # Create a test file to be uploaded.
                with open(random_temp_file(), mode='rb') as original,\
                        NamedTemporaryFile(mode='wb', delete=False)\
                        as uploaded:
                    # Prepare the test object.
                    url_obj._filename = original.name
                    # Prepare the mocked method that will return the last
                    # successfully uploaded byte for each iteration. It
                    # consists of a list of byte positions from 0 to
                    # file_size - 1.
                    file_size = os.path.getsize(original.name)
                    get_last_uploaded_byte_mock.side_effect =\
                        self.get_lubmse(file_size, upload_chunk_size)
                    # Call the upload method.
                    url_obj._upload(upload_chunk_size=upload_chunk_size)
                    for call in put_mock.call_args_list:
                        chunk = call.kwargs['data']
                        uploaded.write(chunk)
                # Check if the constructed file has the same contents as
                # the original file.
                self.assertTrue(filecmp.cmp(original.name, uploaded.name))
                os.remove(original.name)
                os.remove(uploaded.name)
def __init__(self, config, seed):
    log.init_log('./log/MiniSpider')
    cp = ConfigParser.ConfigParser()
    cp.readfp(open(config))
    self.conf_dic = dict(cp._sections)
    for key in self.conf_dic:
        self.conf_dic[key] = dict(cp._defaults, **self.conf_dic[key])
        self.conf_dic[key].pop('__name__', None)
    urllib2.socket.setdefaulttimeout(
        float(self.conf_dic['spider']['crawl_timeout']))
    self.seeds = ["http://pycm.baidu.com:8081/page3.html"]
    # self.seeds = ["http://www.sina.com.cn/"]
    self.urls = []
    self.urlpool = {}
    for i in range(len(self.seeds)):
        u = url.Url(self.seeds[i], "")
        self.urls.append(u)
        self.urlpool[u.link] = 1
    logging.info("init")
def test__responseurl_must_be_set_first(self):
    """Raises error if trying to get _responseurl before setting it."""
    with self.assertRaises(RuntimeError):
        urlm.Url('', str())._responseurl
def setUp(self):
    """Prepare an object for test."""
    self.url_obj = urlm.Url(random_string(), str())
    self.url_obj._responseurl = random_string()
import os

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow

import url as urlm

CREDENTIALS_FILE = 'credentials-desktop.json'
CLIENT_SECRETS_FILE = 'client_secrets-desktop.json'
SCOPES = ['https://www.googleapis.com/auth/drive.file']

credentials = None
# The credentials file stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
# Note: this is *not* the client secrets file.
if os.path.exists(CREDENTIALS_FILE):
    credentials = Credentials.\
        from_authorized_user_file(CREDENTIALS_FILE, SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not credentials or not credentials.valid:
    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            CLIENT_SECRETS_FILE, SCOPES)
        credentials = flow.run_local_server(port=0)
    # Save the credentials for the next run.
    with open(CREDENTIALS_FILE, 'w') as f:
        f.write(credentials.to_json())

url = urlm.Url('file:///home/rafa/re/eu/profile-picture/avatar.jpg',
               credentials.token)
filename, basename = url.drive_it()
def test_preserves_url(self):
    """Instantiation with a URL honors the received value."""
    for url in urls_for_test():
        with self.subTest(url=url):
            self.assertEqual(url, urlm.Url(url, str()).url)
else:
    mysql_table.table_make()

if args.update or args.https:
    clear_table = True
    h = proxy.Proxy('http', http_url)
    s = proxy.Proxy('https', https_url)
    len_table = h.form_table(clear_table)
    if args.https:
        print(f"There are {len_table} proxies in the database.")
        clear_table = False
        time.sleep(60)
        len_table += s.form_table(clear_table)
    print(f"There are {len_table} proxies in the database.")

link = url.Url(args.source)
main_page = html_page.HtmlPage(link.get_url())
main_html = main_page.get_html(cred_tuple, args.noproxy)
if main_html and not args.material:
    if link.check_key('page'):
        parse_pages(link.get_url(), 1, args.debug, mysql_table, cred_tuple,
                    args.noproxy)
    else:
        main_soup = BeautifulSoup(main_html, 'html.parser')
        try:
            items = main_soup.find('span', class_="total many")\
                .find('span').text
        except AttributeError:
            print("Bad first page. Try to run again.")
            sys.exit(0)