Example #1
0
    def my_parse(self, response):
        """Collect directory/profile links from ``response`` and store them.

        Five ``SgmlLinkExtractor`` instances are built from ``self.base_url``
        and ``self.deny_re``.  Every link they yield becomes a ``URL`` record
        whose ``main_url`` is ``response.url``; links matched by the ``/pub/``
        and ``/in/`` patterns are passed through ``clean_url`` first.  When
        ``'http://' + CountryCode.code`` occurs in ``response.url`` an extra
        record with ``found_urls='$'`` is appended.

        If ``response.url`` itself matches one of the profile patterns,
        ``self.extract(response)`` is called (once).  Finally all collected
        records are handed to ``self.db.insert_urls``.

        Link collection is best-effort: a failure while extracting is logged
        and whatever was collected up to that point is still stored.

        :param response: object exposing ``url`` and ``body`` attributes.
        """
        log.msg('Parsing urls from %s' % response.url, level=log.INFO)

        # The site prefix is optional in every allow pattern.
        prefix = '(' + self.base_url + ')?'

        # (path pattern, whether matched urls go through clean_url)
        link_rules = (
            # http://my.linkedin.com/directory/people/a.html
            (r'/directory/people/([a-z]|\@)\.html', False),
            # http://my.linkedin.com/directory/people/my/A1.html
            (r'/directory/people/my/[A-Z]\d+\.html', False),
            # http://my.linkedin.com/directory/people/my/ahamid-3.html
            # http://my.linkedin.com/directory/people/my/aan.html
            (r'/directory/people/my/[a-z]+(\-\d+)?\.html', False),
            # http://my.linkedin.com/pub/zarita-a-baharum/23/9a2/756
            (r'/pub/[a-z\-]+/[a-z0-9]+/[a-z0-9]+/[a-z0-9]+', True),
            # http://www.linkedin.com/in/levananh
            (r'/in/[a-z0-9]+$', True),
        )

        # Extractors are built outside the try block so that a construction
        # error (e.g. an invalid pattern) still propagates to the caller.
        extractors = [
            (SgmlLinkExtractor(allow=prefix + path_re, deny=(self.deny_re)),
             needs_cleaning)
            for path_re, needs_cleaning in link_rules
        ]

        # Bound before the try block so insert_urls() below always receives
        # a list, even when extraction fails on the very first extractor.
        links = []
        try:
            for extractor, needs_cleaning in extractors:
                found = extractor._extract_links(
                    response.body, response.url, 'utf-8')
                found = extractor._process_links(found)
                for link in found:
                    target = clean_url(link.url) if needs_cleaning else link.url
                    links.append(URL(main_url=response.url, found_urls=target))

            if 'http://' + CountryCode.code in response.url:
                links.append(URL(main_url=response.url, found_urls='$'))
        except Exception as exc:
            # Keep the best-effort behaviour but leave a trace of the failure
            # instead of silently discarding it.
            log.msg('Failed to collect urls from %s: %r' % (response.url, exc),
                    level=log.ERROR)

        profile_res = (
            r'/pub/[a-z\-]+/[a-z0-9]+/[a-z0-9]+/[a-z0-9]+',
            r'/in/[a-z0-9]+',
        )
        # any() ensures the profile is extracted once even if the url happens
        # to match both patterns.
        if any(re.search(pattern, response.url) for pattern in profile_res):
            self.extract(response)  # extract profiles

        self.db.insert_urls(links)
Example #2
0
 def _process_links(self, links):
   """Return ``links`` as processed by ``SgmlLinkExtractor._process_links``.

   The call is forwarded unchanged; no extra filtering is applied here.
   """
   return SgmlLinkExtractor._process_links(self, links)