Example #1
    def make_request_from_data(self, data):
        """Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        """
        url = bytes_to_str(data, self.redis_encoding)
        opt = URL(url)

        # ignore anything outside the eastmoney family of domains
        if opt.domain().find('eastmoney') == -1:
            print('eastmoney false')
            return
        been_flag = self.rlink.get(self.redis_name + 'been_url:' + url)
        if opt.domain() == 'iguba.eastmoney.com':
            self.rlink.rpush(self.redis_key, url)
            if been_flag:
                print(self.redis_name + 'been_url:' + url)
                RedisMixin.site_no_add_content_count += 1
                # back off quadratically while the queue keeps serving seen URLs
                time.sleep(RedisMixin.site_no_add_content_count *
                           RedisMixin.site_no_add_content_count)
                return
            else:
                RedisMixin.site_no_add_content_count = 0
        else:
            if been_flag:
                return
        print(url)
        return self.make_requests_from_url(url)
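
A note on context: this method looks like an override of scrapy-redis's RedisMixin.make_request_from_data (the docstring matches it), so the snippet assumes roughly the following imports; self.rlink and self.redis_name are attributes of the surrounding spider:

import time

from purl import URL
from scrapy_redis.spiders import RedisMixin
from scrapy_redis.utils import bytes_to_str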
Example #2
def deal_domain(response):
    # annotate the response with its parsed domain, scheme and site prefix
    opt = URL(response.url)
    page_domain = opt.domain()
    scheme = opt.scheme()
    response.page_domain = page_domain
    response.scheme = scheme
    response.page_prefix = response.scheme + '://' + response.page_domain + '/'
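
A quick sanity check with a stand-in response object (SimpleNamespace here is purely illustrative; it assumes deal_domain and purl are importable):

from types import SimpleNamespace

resp = SimpleNamespace(url='http://iguba.eastmoney.com/news/1')
deal_domain(resp)
print(resp.page_domain)  # iguba.eastmoney.com
print(resp.page_prefix)  # http://iguba.eastmoney.com/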
Example #3
def get_data(q_link):
    url = URL(q_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'

    url = URL(
        scheme='https',
        host='www.quora.com',
        path=url.path(),
        query='share=1').as_string()

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    question = {}
    question['url'] = url
    question['title'] = soup.find("div", {"class": "question_text_edit"}).text
    question['topics'] = [topic.text for topic in soup.find_all("div", {"class": "topic_list_item"})]
    question['details'] = soup.find("div", {"class": "question_details_text"}).text

    answers = []

    divs = soup.find_all("div", {"class": "pagedlist_item"})
    
    try:
        ans_count = soup.find("div", {"class": "answer_header_text"}).text.strip()
        count = int(re.match(r'(\d+) Answers', ans_count).groups()[0])
    except Exception:
        # no answer count on the page: return the question with no answers
        return jsonify(question=question, answers=answers)

    question['answer_count'] = count

    # parse at most six answers (fewer if the page shows fewer)
    count = len(divs) - 1 if count < 6 else 6
    for i in range(count):
        one_answer = {
            'votes': '-1',
            'rank': 0,
            'answer': ''
        }
        try:
            author = {}
            author['name'] = divs[i].find("div", {"class": "answer_user"}).find("span", {"class": "answer_user_wrapper"}).find("a", {"class": "user"}).string
            author['bio'] = divs[i].find("div", {"class": "answer_user"}).find("span", {"class": "answer_user_wrapper"}).find_all("span", {"class": "rep"})[1].find("span", {"class": "hidden"}).text
        except Exception:
            author['name'] = 'Anonymous'
            author['bio'] = ''
        one_answer['author'] = author

        one_answer['votes'] = divs[i].find("span", {"class":"numbers"}).text

        html_block = divs[i].find("div", {"id": re.compile("(.*)_container")}).contents
        answer_html = ''
        for p in range(len(html_block) - 1):
            answer_html += str(html_block[p])
        one_answer['answer_html'] = answer_html
        one_answer['answer'] = divs[i].find("div", {"class": "answer_content"}).text
        one_answer['rank'] = i + 1
        answers.append(one_answer)

    return jsonify(question=question, answers=answers)
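
The purl-specific step above is rebuilding the URL from parts to force https, the www host, and a share=1 query string. In isolation, with a made-up path:

from purl import URL

u = URL('http://quora.com/What-is-purl?ref=abc')
clean = URL(scheme='https', host='www.quora.com',
            path=u.path(), query='share=1').as_string()
print(clean)  # https://www.quora.com/What-is-purl?share=1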
Example #4
def canonical_url(url, domain_check=True):
    """
    Ensure that the url contains the `http://mysite.com` part,
    particularly for requests made on the local dev server
    """
    current_site = Site.objects.get(id=settings.SITE_ID)
    if not url.startswith('http'):
        url = "http://%s" % os.path.join(current_site.domain, url.lstrip('/'))
    
    if domain_check:
        url_parts = URL(url)
        current_site_parts = URL(URL().domain(current_site.domain).as_string())
        # compare the last two host labels (registered domain + TLD)
        if url_parts.subdomains()[-2:] != current_site_parts.subdomains()[-2:]:
            raise ValueError("Suspicious domain '%s' that differs from the "
                             "current Site one '%s'" % (url_parts.domain(),
                                                        current_site_parts.domain()))

    return url
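
The domain check works because purl's subdomains() simply splits the host on dots, so the last two labels are the registered domain and the TLD:

from purl import URL

print(URL('http://blog.mysite.com/page').subdomains())   # ['blog', 'mysite', 'com']
print(URL('http://mysite.com/page').subdomains()[-2:])   # ['mysite', 'com']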
Example #5
    def extract_url(self, response):
        if len(response.all_url) > 0:
            get_domain_list = []
            get_url_list = []
            for url in response.all_url:
                if not url:
                    continue
                # skip obvious static assets by extension
                end_fix = url[-4:]
                if '.jpg.png.gif.rar.zip.doc.pdf.css'.find(end_fix) != -1:
                    continue
                opt = URL(url)
                url_domain = opt.domain()
                if not url_domain:
                    # relative link: resolve it against the page prefix
                    url = response.page_prefix + url.lstrip('/')
                    url_domain = response.page_domain
                elif not opt.scheme():
                    url = 'http://' + url
                if url_domain.find('eastmoney') == -1:
                    continue
                # queue a GET on the redis pipeline; results arrive via execute()
                response.pipe.get(response.spider_name + 'been_url:' + url)
                get_domain_list.append(url_domain)
                get_url_list.append(url)

            for url_domain in get_domain_list:
                response.pipe.get(response.spider_name + 'ban_host:' +
                                  url_domain)

            # execute() returns the been_url flags first, then the ban_host flags
            get_urlex_dmexp_list = response.pipe.execute()
            adv_len = len(get_url_list)
            if len(get_urlex_dmexp_list) == 0 or len(
                    get_urlex_dmexp_list) != adv_len + len(get_domain_list):
                return
            for index in range(len(get_url_list)):
                url = get_url_list[index]
                exist_flag = get_urlex_dmexp_list[index]
                if exist_flag:
                    continue
                is_ban_host = get_urlex_dmexp_list[index + adv_len]
                if is_ban_host:
                    continue

                response.pipe.lpush(self.redis_key, url)
            response.pipe.execute()
        return True
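
The snippet leans on redis-py pipelines: commands are queued and execute() returns their results in order, which is why the been_url and ban_host flags are read back by index. A minimal sketch (the keys are illustrative):

import redis

r = redis.Redis()  # assumes a local Redis instance
pipe = r.pipeline()
pipe.get('spider:been_url:http://example.com/a')
pipe.get('spider:ban_host:example.com')
print(pipe.execute())  # e.g. [None, None] when neither key is set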
Example #6
def get_questions(s_link):
    url = URL(s_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'

    quora_url = URL(scheme='https',
                    host='www.quora.com',
                    path=url.path(),
                    query='share=1').as_string()

    soup = BeautifulSoup(requests.get(quora_url).text, 'html.parser')

    topic = {}
    topic['url'] = quora_url
    topic['title'] = url.path().split('/')[-1]

    questions = []
    divs = soup.find_all("div", {"class": "pagedlist_item"})
    count = len(divs) - 1
    for i in range(count):
        one_question = {'url': '', 'title': ''}
        try:
            one_question['url'] = divs[i].find(
                "a", {"class": "question_link"})['href']
            one_question['title'] = divs[i].find("a", {
                "class": "question_link"
            }).find("span", {
                "class": "link_text"
            }).text
        except Exception:
            return jsonify(topic=topic,
                           questions=questions,
                           parse_failure=one_question)
        one_question['url'] = URL(scheme='https',
                                  host='www.quora.com',
                                  path=one_question['url']).as_string()

        if one_question['title'] != "":
            questions.append(one_question)

    print(f'{type(topic)}, {type(questions)}')
    return jsonify(topic=topic, questions=questions)
Example #7
def get_questions(s_link):
    url = URL(s_link)
    if url.domain() not in ['quora.com', 'www.quora.com']:
        return 'error, not quora'

    url = URL(
        scheme='https',
        host='www.quora.com',
        path=url.path(),
        query='share=1').as_string()

    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    topic = {}
    topic['url'] = url
    topic['title'] = soup.find("span", {"class": "TopicName"}).text

    questions = []
    divs = soup.find_all("div", {"class": "pagedlist_item"})
    count = len(divs) - 1
    for i in range(count):
        one_question = {
            'url': '',
            'title': ''
        }
        try:
            one_question['url'] = divs[i].find("a", {"class": "question_link"})['href']
            one_question['title'] = divs[i].find("a", {"class": "question_link"}).find("span", {"class": "link_text"}).text
        except Exception:
            return jsonify(topic=topic, questions=questions, parse_failure=one_question)
        one_question['url'] = URL(
            scheme='https',
            host='www.quora.com',
            path=one_question['url']).as_string()

        if one_question['title'] != "":
            questions.append(one_question)

    return jsonify(topic=topic, questions=questions)
Example #8
    def test_remove_domain(self):
        url = URL('https://example.com/hello?x=100')
        new = url.domain('')
        self.assertEqual('/hello?x=100', str(new))
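
The assertion holds because purl URLs are immutable: mutators such as domain('') return a new URL and leave the original untouched:

from purl import URL

url = URL('https://example.com/hello?x=100')
new = url.domain('')
print(str(url))  # https://example.com/hello?x=100
print(str(new))  # /hello?x=100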
Example #9
# the original snippet starts mid-stream; str_url was presumably built like this:
str_url = URL('https://www.google.com/search?q=google')
print(str_url.as_string())
argument_url = URL(scheme='https',
                   host='www.google.com',
                   path='/search',
                   query='q=google')
print(argument_url)
print(argument_url.as_string())
inline_url = URL().scheme('https').domain('www.google.com').path(
    'search').query_param('q', 'google')
print(inline_url)
print(inline_url.as_string())

u = URL('postgres://*****:*****@localhost:1234/test?ssl=true')
print(u.scheme())         # postgres
print(u.host())           # localhost
print(u.domain())         # localhost (domain() is an alias of host())
print(u.username())
print(u.password())
print(u.netloc())
print(u.port())           # 1234
print(u.path())           # /test
print(u.query())          # ssl=true
print(u.path_segments())  # ('test',)
print(u.query_param('ssl'))                # true
print(u.query_param('ssl', as_list=True))  # ['true']
print(u.query_params())   # {'ssl': ['true']}
print(u.has_query_param('ssl'))            # True
print(u.subdomains())     # ['localhost']

u = URL.from_string('https://github.com/minwook-shin')
print(u.path_segment(0))  # minwook-shin
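
For reference, from_string is just an alternate constructor; passing the string straight to URL() gives the same result:

from purl import URL

a = URL.from_string('https://github.com/minwook-shin')
b = URL('https://github.com/minwook-shin')
assert a.as_string() == b.as_string()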