def make_request_from_data(self, data):
    """Returns a Request instance from data coming from Redis.

    By default, ``data`` is an encoded URL. You can override this method to
    provide your own message decoding.

    Parameters
    ----------
    data : bytes
        Message from redis.
    """
    url = bytes_to_str(data, self.redis_encoding)
    opt = URL(url)
    # Only crawl eastmoney domains.
    if opt.domain().find('eastmoney') == -1:
        print('eastmoney false')
        return
    been_flag = self.rlink.get(self.redis_name + 'been_url:' + url)
    if opt.domain() == 'iguba.eastmoney.com':
        # Re-queue guba URLs; if the URL has already been seen, back off
        # quadratically before giving up on it.
        self.rlink.rpush(self.redis_key, url)
        if been_flag:
            print(self.redis_name + 'been_url:' + url)
            RedisMixin.site_no_add_content_count += 1
            time.sleep(RedisMixin.site_no_add_content_count
                       * RedisMixin.site_no_add_content_count)
            return
        else:
            RedisMixin.site_no_add_content_count = 0
    else:
        # Skip URLs on other eastmoney hosts that were already crawled.
        if been_flag:
            return
    print(url)
    return self.make_requests_from_url(url)
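# The docstring above notes that make_request_from_data can be overridden to
# provide custom message decoding. Below is a minimal sketch of such an
# override, assuming each Redis message is a JSON object such as
# {"url": "...", "meta": {...}}; the payload layout and field names are
# illustrative only, not the format used by the spider above.
import json

from scrapy import Request


def make_request_from_data(self, data):
    payload = json.loads(data.decode(self.redis_encoding))
    # Build the Request directly so extra fields can travel in meta,
    # instead of going through make_requests_from_url.
    return Request(url=payload['url'],
                   meta=payload.get('meta', {}),
                   dont_filter=True)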
def deal_domain(response):
    opt = URL(response.url)
    page_domain = opt.domain()
    scheme = opt.scheme()
    response.page_domain = page_domain
    response.scheme = scheme
    response.page_prefix = response.scheme + '://' + response.page_domain + '/'
def get_data(q_link):
    url = URL(q_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'
    url = URL(scheme='https', host='www.quora.com',
              path=url.path(), query='share=1').as_string()
    soup = BeautifulSoup(requests.get(url).text)
    question = {}
    question['url'] = url
    question['title'] = soup.find("div", {"class": "question_text_edit"}).text
    question['topics'] = [topic.text for topic in
                          soup.find_all("div", {"class": "topic_list_item"})]
    question['details'] = soup.find("div", {"class": "question_details_text"}).text
    answers = []
    divs = soup.find_all("div", {"class": "pagedlist_item"})
    try:
        ans_count = soup.find("div", {"class": "answer_header_text"}).text.strip()
        count = int(re.match(r'(\d+) Answers', ans_count).groups()[0])
    except:
        return jsonify(question=question, answers=answers)
    question['answer_count'] = count
    count = len(divs) - 1 if count < 6 else 6
    for i in range(count):
        one_answer = {'votes': '-1', 'rank': 0, 'answer': ''}
        author = {}
        try:
            author['name'] = divs[i].find("div", {"class": "answer_user"}) \
                .find("span", {"class": "answer_user_wrapper"}) \
                .find("a", {"class": "user"}).string
            author['bio'] = divs[i].find("div", {"class": "answer_user"}) \
                .find("span", {"class": "answer_user_wrapper"}) \
                .find_all("span", {"class": "rep"})[1] \
                .find("span", {"class": "hidden"}).text
        except:
            author['name'] = 'Anonymous'
            author['bio'] = ''
        one_answer['author'] = author
        one_answer['votes'] = divs[i].find("span", {"class": "numbers"}).text
        html_block = divs[i].find("div", {"id": re.compile("(.*)_container")}).contents
        answer_html = ''
        for p in range(len(html_block) - 1):
            answer_html += str(html_block[p])
        one_answer['answer_html'] = answer_html
        one_answer['answer'] = divs[i].find("div", {"class": "answer_content"}).text
        one_answer['rank'] = i + 1
        answers.append(one_answer)
    return jsonify(question=question, answers=answers)
def canonical_url(url, domain_check=True):
    """
    Ensure that the url contains the `http://mysite.com` part,
    particularly for requests made on the local dev server
    """
    current_site = Site.objects.get(id=settings.SITE_ID)
    if not url.startswith('http'):
        url = "http://%s" % os.path.join(current_site.domain, url.lstrip('/'))
    if domain_check:
        url_parts = URL(url)
        current_site_parts = URL(URL().domain(current_site.domain).as_string())
        if url_parts.subdomains()[-2:] != current_site_parts.subdomains()[-2:]:
            raise ValueError("Suspicious domain '%s' that differs from the "
                             "current Site one '%s'" % (url_parts.domain(),
                                                        current_site_parts.domain()))
    return url
def extract_url(self, response):
    if len(response.all_url) > 0:
        get_domain_list = []
        get_url_list = []
        for url in response.all_url:
            if not url:
                continue
            # Skip links to static assets and archives.
            end_fix = url[-4:len(url)]
            if '.jpg.png.gif.rar.zip.doc.pdf.css'.find(end_fix) != -1:
                continue
            opt = URL(url)
            url_domain = opt.domain()
            if not url_domain:
                # Relative link: resolve against the current page.
                url = response.page_prefix + '/' + url
                url_domain = response.page_domain
            elif not opt.scheme():
                url = 'http://' + url
            if url_domain.find('eastmoney') == -1:
                continue
            # Queue the seen-URL lookup on the Redis pipeline.
            response.pipe.get(response.spider_name + 'been_url:' + url)
            get_domain_list.append(url_domain)
            get_url_list.append(url)
        # Queue the banned-host lookups, then run the whole pipeline at once.
        for url_domain in get_domain_list:
            response.pipe.get(response.spider_name + 'ban_host:' + url_domain)
        get_urlex_dmexp_list = response.pipe.execute()
        adv_len = len(get_url_list)
        if len(get_urlex_dmexp_list) == 0 or len(
                get_urlex_dmexp_list) != adv_len + len(get_domain_list):
            return
        for index in range(len(get_url_list)):
            url = get_url_list[index]
            exist_flag = get_urlex_dmexp_list[index]
            if exist_flag:
                continue
            is_ban_host = get_urlex_dmexp_list[index + adv_len]
            if is_ban_host:
                continue
            # Push unseen, non-banned URLs onto the crawl queue.
            response.pipe.lpush(self.redis_key, url)
        response.pipe.execute()
        return True
def get_questions(s_link):
    url = URL(s_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'
    quora_url = URL(scheme='https', host='www.quora.com',
                    path=url.path(), query='share=1').as_string()
    soup = BeautifulSoup(requests.get(quora_url).text)
    topic = {}
    topic['url'] = quora_url
    topic['title'] = url.path().split('/')[-1]
    questions = []
    divs = soup.find_all("div", {"class": "pagedlist_item"})
    count = len(divs) - 1
    for i in range(count):
        one_question = {'url': '', 'title': ''}
        try:
            one_question['url'] = divs[i].find("a", {"class": "question_link"})['href']
            one_question['title'] = divs[i].find("a", {"class": "question_link"}) \
                .find("span", {"class": "link_text"}).text
        except:
            return jsonify(topic=topic, questions=questions,
                           parse_failure=one_question)
        one_question['url'] = URL(scheme='https', host='www.quora.com',
                                  path=one_question['url']).as_string()
        if one_question['title'] != "":
            questions.append(one_question)
    print(f'{type(topic)}, {type(questions)}')
    return jsonify(topic=topic, questions=questions)
def get_questions(s_link):
    url = URL(s_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'
    url = URL(scheme='https', host='www.quora.com',
              path=url.path(), query='share=1').as_string()
    soup = BeautifulSoup(requests.get(url).text)
    topic = {}
    topic['url'] = url
    topic['title'] = soup.find("span", {"class": "TopicName"}).text
    questions = []
    divs = soup.find_all("div", {"class": "pagedlist_item"})
    count = len(divs) - 1
    for i in range(count):
        one_question = {'url': '', 'title': ''}
        try:
            one_question['url'] = divs[i].find("a", {"class": "question_link"})['href']
            one_question['title'] = divs[i].find("a", {"class": "question_link"}) \
                .find("span", {"class": "link_text"}).text
        except:
            return jsonify(topic=topic, questions=questions,
                           parse_failure=one_question)
        one_question['url'] = URL(scheme='https', host='www.quora.com',
                                  path=one_question['url']).as_string()
        if one_question['title'] != "":
            questions.append(one_question)
    return jsonify(topic=topic, questions=questions)
def test_remove_domain(self):
    url = URL('https://example.com/hello?x=100')
    new = url.domain('')
    self.assertEqual('/hello?x=100', str(new))
from purl import URL

# str_url is assumed here to be a URL built from the plain string form of the
# same address; its original definition is not shown.
str_url = URL('https://www.google.com/search?q=google')
print(str_url.as_string())

# Build the same URL from keyword arguments.
argument_url = URL(scheme='https', host='www.google.com',
                   path='/search', query='q=google')
print(argument_url)
print(argument_url.as_string())

# Build it again by chaining the fluent setter methods.
inline_url = URL().scheme('https').domain('www.google.com') \
    .path('search').query_param('q', 'google')
print(inline_url)
print(inline_url.as_string())

# Inspect the components of a parsed URL.
u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
print(u.scheme())
print(u.host())
print(u.domain())
print(u.username())
print(u.password())
print(u.netloc())
print(u.port())
print(u.path())
print(u.query())
print(u.path_segments())
print(u.query_param('ssl'))
print(u.query_param('ssl', as_list=True))
print(u.query_params())
print(u.has_query_param('ssl'))
print(u.subdomains())

u = URL.from_string('https://github.com/minwook-shin')
print(u.path_segment(0))