Example #1
 def pq_html(self, bsObj):
     """Wrap the decoded HTML string in a PyQuery document; return None if parsing fails."""
     try:
         init_content = pq(bsObj)
         return init_content
     except ValueError:
         LOGGER.exception(ValueError)
         return None
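A minimal usage sketch of the PyQuery wrapper above, assuming pyquery is installed and imported as pq; the HTML string is an illustrative assumption, not output from the crawler.

from pyquery import PyQuery as pq

html = '<div class="g"><h3 class="r"><a href="https://example.com">Title</a></h3></div>'
doc = pq(html)                           # parse the markup into a PyQuery document
print(doc('h3.r > a').text())            # -> Title
print(doc('h3.r > a').attr('href'))      # -> https://example.com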
Example #2
 def Cold_boot(self, url, pause=3):
     """
     retry if errors are met
     """
     headers = {'user-agent': self.get_random_user_agent()}
     try:
         requests.packages.urllib3.disable_warnings(
             requests.packages.urllib3.exceptions.InsecureRequestWarning)
         r = requests.get(url=url,
                          proxies=self.proxies,
                          headers=headers,
                          allow_redirects=False,
                          verify=False,
                          timeout=30)
         time.sleep(pause)
         LOGGER.info(url)
         content = r.content
         charset = cchardet.detect(content)
         bsObj = content.decode(charset['encoding'])
         return bsObj
     except Exception as e:
         print(e)
         print("Sleeping for %i" % self.error_delay)
         time.sleep(self.error_delay)
         return self.Cold_boot(url, pause)
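The fetch-and-decode pattern inside Cold_boot can be exercised on its own. A minimal standalone sketch follows, assuming requests and cchardet are installed; the URL is illustrative and not taken from the source.

import requests
import cchardet

r = requests.get('https://example.com/', timeout=30)
charset = cchardet.detect(r.content)                        # guess the encoding from the raw bytes
html = r.content.decode(charset['encoding'] or 'utf-8')     # fall back to UTF-8 if detection fails
print(html[:200])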
Example #3
    def search_relation(self, bsObj, pause=2):
        """Extract Google's related-search keywords from the embedded "rfs" list in the page."""
        # RelatedKw = []
        pq_content = self.pq_html(bsObj)
        # ylog.info(pq_content)
        related_str = str(pq_content)
        related_str_re = re.compile(r"\"rfs\":\[[^!]+\]")
        try:
            related_str_rfs = related_str_re.search(related_str).group()
        except AttributeError:
            LOGGER.debug(related_str)
            return None
        # ylog.debug(related_str_rfs)
        related_ls_re = re.compile(r"(:\[|,)(\"[A-Za-z\s\u4e00-\u9fa5]*\")")
        ls_related = related_ls_re.findall(related_str_rfs)
        RelatedKw = [x[1][1:-1] for x in ls_related]
        ylog.debug("related keywords: %s" % RelatedKw)
        # if pq_content is not None:
        #     for item in pq_content('p._Bmc').items():
        #         href = item('a').attr('href')
        #         if href:
        #             o = urlparse(href, 'http')
        #             if o.netloc:
        #                 kw = href
        #             if href.startswith('/search?'):
        #                 href = parse_qs(o.query)['q'][0]
        #                 o = urlparse(href, 'http')
        #                 if o.path:
        #                     kw = href
        #             RelatedKw.append(kw)
        return RelatedKw
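A small standalone sketch of the "rfs" extraction this method performs; the sample string only imitates the JSON fragment Google embeds for related searches and is an assumption.

import re

sample = 'window.data = {"rfs":["python pyquery","python requests proxy"]};'
rfs = re.search(r"\"rfs\":\[[^!]+\]", sample).group()
pairs = re.findall(r"(:\[|,)(\"[A-Za-z\s\u4e00-\u9fa5]*\")", rfs)
keywords = [m[1][1:-1] for m in pairs]    # strip the surrounding quotes
print(keywords)                           # -> ['python pyquery', 'python requests proxy']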
Example #4
def filter_link(link):
    """Return an absolute result URL, following Google's '/url?q=...' redirect links; otherwise None."""
    try:
        o = urlparse(link, 'http')
        if o.netloc:
            return link
        if link.startswith('/url?'):
            link = parse_qs(o.query)['q'][0]
            o = urlparse(link, 'http')
            if o.netloc:
                return link
    except Exception as e:
        LOGGER.exception(e)
        return None
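A brief usage sketch for filter_link; the redirect-style URLs below are illustrative assumptions of the '/url?q=...' links Google wraps around organic results.

print(filter_link('https://example.com/page'))               # absolute link is returned as-is
print(filter_link('/url?q=https://example.com/page&sa=U'))   # redirect link -> https://example.com/page
print(filter_link('/search?q=related+terms'))                 # anything else -> None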
Example #5
        requests.packages.urllib3.disable_warnings(
            requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(url=url,
                         proxies=proxies,
                         headers=headers,
                         allow_redirects=False,
                         verify=False,
                         timeout=30)
        time.sleep(5)
    except requests.exceptions.SSLError as e:
        print(e)
        # LOGGER.info(url)
        ylog.debug(domain)
        time.sleep(5)
        continue
LOGGER.info(url)
content = r.content
charset = cchardet.detect(content)
text = content.decode(charset['encoding'])
bsObj = BeautifulSoup(text, "lxml")

# result counts
brief_counts = bsObj.find_all('div', id='gs_ab_md')[0].text
print(brief_counts)
text1 = brief_counts.replace(',', '')
pattern = re.compile(r'\d+')
result_count = re.findall(pattern, text1)[0]
print(result_count)

# content
global download_link
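The result-count parsing above can be checked in isolation; the sample string imitates the text of Google Scholar's gs_ab_md bar and is an assumption, not captured output.

import re

brief_counts = 'About 1,230 results (0.05 sec)'
text1 = brief_counts.replace(',', '')
result_count = re.findall(r'\d+', text1)[0]   # first number left after stripping thousands separators
print(result_count)                           # -> 1230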
Example #6
        headers = {'user-agent': get_random_user_agent()}
        requests.packages.urllib3.disable_warnings(
            requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(
            url=url,
            proxies=proxies,
            headers=headers,
            allow_redirects=False,
            verify=False,
            timeout=30)
        time.sleep(3)
    except Exception as e:
        print(e)
        time.sleep(pause)
        continue
LOGGER.info(url)
content = r.content
charset = cchardet.detect(content)
bsObj = content.decode(charset['encoding'])
# pq html
pq_content = None
try:
    pq_content = pq(bsObj)
except ValueError:
    LOGGER.exception(ValueError)
# content
result = []
if pq_content is not None:
    for item in pq_content('div.g').items():
        information = {'Title': None, 'PageURL': None, 'Abstract': None}
        Title = item('h3.r>a').eq(0).text()