Code example #1
def split_dburl(dburl, default_dburl=None):
    """
    we split the url into the base mongodb URL, and the path element, whose
    first element is the database name, and the remainder is interpreted as
    collection id.
    """

    # If the given URL contains neither a schema nor a host, the default URL
    # is used as the base, and the given URL string is appended to its path.

    url = ruu.Url(dburl)

    if not url.schema and not url.host:
        url = ruu.Url(default_dburl)
        url.path = dburl

    # NOTE: add other database schemes here...
    if 'mongodb' not in url.schema.split('+'):
        raise ValueError(
            "url must be a 'mongodb://' or 'mongodb+ssl://' url, not '%s'" %
            dburl)

    host = url.host
    port = url.port
    path = url.path
    user = url.username
    pwd = url.password
    ssl = False

    if 'ssl' in url.schema.split('+'):
        ssl = True
        url.schema = 'mongodb'

    if not host:
        host = 'localhost'

    if path.startswith('/'):
        path = path[1:]
    path_elems = path.split('/')

    dbname = None
    cname = None
    pname = None

    if len(path_elems) > 0:
        dbname = path_elems[0]

    if len(path_elems) > 1:
        cname = path_elems[1]

    if len(path_elems) > 2:
        pname = '.'.join(path_elems[2:])

    if dbname == '.':
        dbname = None

    return [host, port, dbname, cname, pname, user, pwd, ssl]
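
A minimal usage sketch based on the parsing logic above; the URL below is
made up for illustration, and 'ruu' is assumed to be the URL utility module
the snippet imports:

# Hypothetical call, tracing the function's own logic:
fields = split_dburl('mongodb://user:secret@db.example.org:27017/mydb/mycoll/a/b')
host, port, dbname, cname, pname, user, pwd, ssl = fields
# host   == 'db.example.org'    port  == 27017
# dbname == 'mydb'              cname == 'mycoll'
# pname  == 'a.b'               ssl   == False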
Code example #2
def home():
    url, local_filename, remote_basename, error = None, None, None, None
    if request.method == 'POST':
        session['_url'] = request.form['url']

        return redirect(url_for('signin'))

    if session.get('_url'):
        # Fetch the OAuth credentials that will be used to obtain upload
        # access to Google Drive.
        credentials = google.oauth2.credentials.Credentials(
            **session['credentials'])

        try:
            url = urlm.Url(session['_url'], credentials.token)
            local_filename, remote_basename = url.drive_it()
        except RuntimeError as e:
            flash(str(e), 'notification')
        else:
            msg = 'Success! File "{}" is in your Drive now.'
            flash(msg.format(remote_basename), 'notification')
        finally:
            session['_url'] = None

    return render_template('index.html')
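
For context, a minimal sketch of the Flask wiring this view assumes; the
route path, secret key, and app setup are assumptions, not part of the
original snippet:

from flask import Flask, flash, redirect, render_template, request, \
    session, url_for
import google.oauth2.credentials
import url as urlm

app = Flask(__name__)
app.secret_key = 'change-me'  # required for session and flash to work

@app.route('/', methods=['GET', 'POST'])
def home():
    ...  # body as shown above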
Code example #3
    def test_uses_expected_method(self):
        """Uses ‘urllib.parse.urlparse’, with the correct URL.

        It’s assumed that ‘urllib.parse.urlparse’ will be used."""

        with patch('urllib.parse.urlparse') as urlparse_mock:
            # Get a test Url instance ready.
            url_obj = urlm.Url(random_string(), str())

            # Using another random string for the attribute ‘_responseurl’ to
            # have an independent test that the program sets and gets it
            # properly, and that it does not change it before using it to
            # obtain ‘_urlpath’.
            responseurl = random_string()
            url_obj._responseurl = responseurl

            # Get the mocked method ready to be called: give its return value
            # a random ‘path’ attribute.  This is necessary because the parsed
            # URL itself corresponds to the property ‘path’ of the object
            # returned by the parsing method.
            urlparse_mock().path = random_string()

            # Force test object to parse its URL, giving it a reason to call
            # ‘urllib.parse.urlparse’.
            urlpath = url_obj._urlpath

            # Check if ‘urllib.parse.urlparse’ was called, and with the correct
            # URL.
            urlparse_mock.assert_called_with(responseurl)

            # Check if it assigned its attribute to the correct property from
            # the return value of ‘urllib.parse.urlparse’.
            self.assertEqual(url_obj._urlpath, urlparse_mock().path)
Code example #4
    def test_no_exception_if_url_string(self):
        """No error when instantiated with string."""

        try:
            urlm.Url(str(), str())
        except TypeError:
            self.fail('Raised TypeError when instantiated with string')
Code example #5
    def setUp(self):
        """Create test file to be downloaded and test object to do it."""

        self.f_remote = random_temp_file()

        self.url_obj = urlm.Url(random_string(), str())
        self.url_obj._responseurl = random_string()
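
The test helpers ‘random_string’ and ‘random_temp_file’ are used throughout
these excerpts but never shown; a plausible sketch of both, offered as an
assumption:

import os
import random
import string
from tempfile import NamedTemporaryFile

def random_string(length=16):
    """Hypothetical helper: return a random ASCII string."""
    return ''.join(random.choices(string.ascii_letters, k=length))

def random_temp_file(size=1024):
    """Hypothetical helper: create a temp file with random bytes and
    return its path."""
    with NamedTemporaryFile(delete=False) as f:
        f.write(os.urandom(size))
        return f.name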
Code example #6
File: mini_spider.py  Project: zgbdsg/MiniSpider
    def extractlink(self, soup, purl):
        hrefs = []
        links1 = soup.findAll("a")
        for link in links1:
            href = link.get('href')
            # Skip anchors without an 'href' attribute, as the 'img'
            # branch below does for 'src'.
            if href:
                href = href.strip("/")
                # if 'javascript:location.href' in href:
                #     href = href[len("javascript:location.href")+2:-2]
                hrefs.append(href)

        links2 = soup.find_all("img")
        for link in links2:
            href = link.get('src')
            if href:
                hrefs.append(href)

        for href in hrefs:
            # u = url.Url(purl.seed,purl.relpath+"/"+href)
            u = url.Url(purl.link, href)
            if u.depth > int(self.conf_dic['spider']['max_depth']):
                continue
            if u.link not in self.urlpool:
                self.urlpool[u.link] = 1
                req = threadpool.WorkRequest(self.crawl, [u], callback=self.print_result,
                                             exc_callback=self.handle_exception)
                # print(newurl)
                self.pool.putRequest(req)
                print "new url  %s added." % u.link
Code example #7
    def test_raises_errors(self):
        """Raises errors as promised in the docstring."""

        with patch('urllib.parse.urlparse') as urlparse_mock:
            # Create a test object.
            url_obj = urlm.Url(random_string(), str())
            url_obj._responseurl = random_string()

            # Make the mocked method raise the proper error when called.
            for exception in builtin_exceptions():
                # Skip trickier Unicode exceptions.
                if exception.__name__.startswith('Unicode'):
                    continue

                with self.subTest(exception=exception):
                    urlparse_mock.side_effect = exception

                    # Check if the expected error was raised as a result.
                    # For this, it suffices to get the attribute, which will
                    # then call ‘urllib.parse.urlparse’.
                    # ValueError should raise a RuntimeError.
                    # All other exceptions should raise themselves.
                    if exception is ValueError:
                        should_raise = RuntimeError
                    else:
                        should_raise = exception

                    with self.assertRaises(should_raise):
                        url_obj._urlpath
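
The ‘builtin_exceptions’ helper this test iterates over is not shown; a
plausible sketch (an assumption, not the original helper):

import builtins

def builtin_exceptions():
    """Hypothetical helper: yield every built-in exception class."""
    for name in dir(builtins):
        obj = getattr(builtins, name)
        if isinstance(obj, type) and issubclass(obj, BaseException):
            yield obj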
Code example #8
File: crawler.py  Project: raxers/wildberries
def push_and_pull(start_page, pages, debug, sql_table, creds, proxy_pass):
    pnp_start = url.Url(start_page)
    pnp_start.add_key('sort', 'priceup')
    parse_pages(pnp_start.get_url(), 100, debug, sql_table, creds, proxy_pass)
    pnp_start.change_key('sort', 'pricedown')
    parse_pages(pnp_start.get_url(), pages - 100, debug, sql_table, creds,
                proxy_pass)
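
Reading the sort keys, the function appears to crawl the first 100 pages
sorted by ascending price and the remaining pages sorted by descending
price, presumably to reach more items than the site exposes under a single
sort order; this is an inference from the code, not stated in the source.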
Code example #9
    def test__responseurl_get(self):
        """‘get’ works properly for ‘_responseurl’."""

        url_obj = urlm.Url('', str())
        for url in urls_for_test():
            with self.subTest(url=url):
                url_obj._responseurl = url
                self.assertEqual(url_obj._responseurl, url)
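
‘urls_for_test’ is another helper the tests rely on but do not show; a
minimal sketch (hypothetical):

def urls_for_test():
    """Hypothetical helper: return a few representative URLs."""
    return (
        'http://example.com',
        'https://example.com/some/path?query=1',
        'ftp://example.com/file.txt',
        'file:///tmp/file.txt',
    )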
Code example #10
    def test_exception_if_not_string(self):
        """Error when instantiated with something other than string."""

        for typer in (int, float, tuple, list, set, dict):
            for args in ((typer(), str()), (str(), typer())):
                with self.subTest(args=args):
                    # Create a dummy object for the test by calling each type.
                    # Called without any arguments, ‘typer’ returns an object
                    # of its type.
                    with self.assertRaises(TypeError):
                        urlm.Url(*args)
Code example #11
def parse_pages(start_page, pages, debug, sql_table, creds, proxy_pass):

    import url
    parse_page = url.Url(start_page)
    first_page = html_page.HtmlPage(parse_page.get_url())
    html = first_page.get_html(creds, proxy_pass)

    if html:
        soup = BeautifulSoup(html, 'html.parser')

        # 1st page
        arts_dict = {}
        for i in soup.findAll('div', class_="j-card-item"):
            art_num = re.search(r'\d+', i.get('data-popup-nm-id'))
            arts_dict[art_num[0]] = i.find('a')['href']
        for art, link in arts_dict.items():
            if not sql_table.table_check_presence(art, creds[6]):
                handbag = bag.Bag()
                if not link.startswith('https'):
                    link = "https://www.wildberries.ru" + link
                handbag.get_bag_page(art, link, debug, creds, proxy_pass)
                sql_table.table_append(handbag)
        sql_table.cnx.commit()

        # after 1st page
        if parse_page.check_key('page'):
            return 0
        parse_page.add_key('page', '1')

        # 2nd page and further: retry each page up to three times.
        for i in range(2, pages + 1):
            parse_page.change_key('page', str(i))
            print(parse_page.get_url())
            have_a_try = 3
            while have_a_try:
                further_page = html_page.HtmlPage(parse_page.get_url())
                arts_dict = further_page.get_wb_page(creds, proxy_pass)
                if arts_dict:
                    for art, link in arts_dict.items():
                        if not sql_table.table_check_presence(art, creds[6]):
                            handbag = bag.Bag()
                            handbag.get_bag_page(art, link, debug, creds,
                                                 proxy_pass)
                            sql_table.table_append(handbag)
                    sql_table.cnx.commit()
                    break
                sql_table.cnx.commit()
                print(f"Page {i} parse error. Trying again.")
                have_a_try -= 1
            else:
                sql_table.cnx.commit()
                print("No luck. Next page.")
Code example #12
File: webscraper.py  Project: Cycloone/WebScraper
    def __init__(self):
        self.template = template.Template(template)
        arg_parser = ArgumentParser()
        arg_parser.add_argument("-u", "--url", dest="url")
        args = arg_parser.parse_args()
        if args.url is None:
            print("You forgot to enter a URL")
        else:
            page = url.Url(args.url)
            data = page.get_content()
            path = page.parse_url()
            current_template = self.template.get_template()
            html_parser = MyHTMLParser(current_template)
            html_parser.feed(data)
            self.content = content.Content(current_template,
                                           html_parser.content)
            write_data = self.content.format_data()
            self.save_data(path, write_data)
Code example #13
    def test__upload(self):
        """Upload method properly receives file chunks."""

        # Create a test object.
        url_obj = urlm.Url(random_string(), random_string())

        for upload_chunk_size in [1, 2, 3, 5, 7, 11, 256, 2 * 256 * 1024]:
            with self.subTest(upload_chunk_size=upload_chunk_size):
                # Patch a mock to intercept the uploaded chunks.
                with patch('requests.put') as put_mock,\
                     patch('url.Url._get_upload_url')\
                         as _get_upload_url_mock,\
                     patch('url.get_last_uploaded_byte')\
                         as get_last_uploaded_byte_mock:
                    # Prepare the mocked method.  We won't need a meaningful
                    # return value for it because the method that would use
                    # it, requests.put, is also being patched.
                    _get_upload_url_mock.return_value = random_string()
                    # Create a test file to be uploaded.
                    with open(random_temp_file(), mode='rb') as original,\
                         NamedTemporaryFile(mode='wb', delete=False)\
                         as uploaded:
                        # Prepare the test object.
                        url_obj._filename = original.name

                        # Prepare the mocked method that will return the last
                        # successfully uploaded byte for each iteration.  It
                        # consists of a list of byte positions from 0 to
                        # file_size - 1.
                        file_size = os.path.getsize(original.name)
                        get_last_uploaded_byte_mock.side_effect =\
                            self.get_lubmse(file_size, upload_chunk_size)

                        # Call the upload method.
                        url_obj._upload(upload_chunk_size=upload_chunk_size)

                        for call in put_mock.call_args_list:
                            chunk = call.kwargs['data']
                            uploaded.write(chunk)

                    # Check if the constructed file has the same contents as
                    # the original file.
                    self.assertTrue(filecmp.cmp(original.name, uploaded.name))

                    os.remove(original.name)
                    os.remove(uploaded.name)
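
The helper ‘self.get_lubmse’ (read: get-last-uploaded-byte mock side
effect) is not shown; a plausible reconstruction from the comment above,
offered purely as an assumption:

def get_lubmse(self, file_size, upload_chunk_size):
    """Hypothetical helper: the last successfully uploaded byte reported
    after each chunk, ending at file_size - 1."""
    positions = list(range(upload_chunk_size - 1, file_size,
                           upload_chunk_size))
    if not positions or positions[-1] != file_size - 1:
        positions.append(file_size - 1)
    return positions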
Code example #14
File: mini_spider.py  Project: zgbdsg/MiniSpider
    def __init__(self, config, seed):
        log.init_log('./log/MiniSpider')

        cp = ConfigParser.ConfigParser()
        with open(config) as config_file:
            cp.readfp(config_file)

        self.conf_dic = dict(cp._sections)
        for key in self.conf_dic:
            self.conf_dic[key] = dict(cp._defaults, **self.conf_dic[key])
            self.conf_dic[key].pop('__name__', None)

        urllib2.socket.setdefaulttimeout(float(self.conf_dic['spider']['crawl_timeout']))
        self.seeds = ["http://pycm.baidu.com:8081/page3.html"]
        # self.seeds = ["http://www.sina.com.cn/"]

        self.urls = []
        self.urlpool = {}
        for i in range(len(self.seeds)):
            u = url.Url(self.seeds[i], "")
            self.urls.append(u)
            self.urlpool[u.link] = 1

        logging.info("init")
Code example #15
    def test__responseurl_must_be_set_first(self):
        """Raises error if trying to get _responseurl before setting it."""

        with self.assertRaises(RuntimeError):
            urlm.Url('', str())._responseurl
Code example #16
    def setUp(self):
        """Prepare an object for test"""

        self.url_obj = urlm.Url(random_string(), str())
        self.url_obj._responseurl = random_string()
Code example #17
import os

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow

import url as urlm

CREDENTIALS_FILE = 'credentials-desktop.json'
CLIENT_SECRETS_FILE = 'client_secrets-desktop.json'
SCOPES = ['https://www.googleapis.com/auth/drive.file']

credentials = None
# The credentials file stores the user's access and refresh tokens, and is
# created automatically when the authorization flow completes for the first
# time.
# Note: this is *not* the client secrets file.
if os.path.exists(CREDENTIALS_FILE):
    credentials = Credentials.\
        from_authorized_user_file(CREDENTIALS_FILE, SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not credentials or not credentials.valid:
    if credentials and credentials.expired and credentials.refresh_token:
        credentials.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            CLIENT_SECRETS_FILE, SCOPES)
        credentials = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open(CREDENTIALS_FILE, 'w') as f:
        f.write(credentials.to_json())

url = urlm.Url('file:///home/rafa/re/eu/profile-picture/avatar.jpg',
               credentials.token)
filename, basename = url.drive_it()
Code example #18
    def test_preserves_url(self):
        """Instantiation with URL honor received value."""

        for url in urls_for_test():
            with self.subTest(url=url):
                self.assertEqual(url, urlm.Url(url, str()).url)
Code example #19
File: crawler.py  Project: raxers/wildberries
    else:
        mysql_table.table_make()

    if args.update or args.https:
        clear_table = True
        h = proxy.Proxy('http', http_url)
        s = proxy.Proxy('https', https_url)
        len_table = h.form_table(clear_table)
        if args.https:
            print(f"В базе {len_table} прокси.")
            clear_table = False
            time.sleep(60)
            len_table += s.form_table(clear_table)
        print(f"В базе {len_table} прокси.")

    link = url.Url(args.source)
    main_page = html_page.HtmlPage(link.get_url())
    main_html = main_page.get_html(cred_tuple, args.noproxy)

    if main_html and not args.material:
        if link.check_key('page'):
            parse_pages(link.get_url(), 1, args.debug, mysql_table, cred_tuple,
                        args.noproxy)
        else:
            main_soup = BeautifulSoup(main_html, 'html.parser')
            try:
                items = main_soup.find('span',
                                       class_="total many").find('span').text
            except AttributeError:
                print("Bad first page. Try to run again.")
                sys.exit(0)