Example 1
def __init__(self, out=sys.stdout):
    self.out = out
    self.decoy = BrowserDecoy()
    self.visited = set([])
Example 3
# Imports needed to run this example; BrowserDecoy and remove_JS are
# project-specific helpers assumed to be defined or imported elsewhere.
import sys
import re
import json
from datetime import datetime as dt
# BeautifulSoup 3 style import; with bs4 use `from bs4 import BeautifulSoup`.
from BeautifulSoup import BeautifulSoup


class CellCrawler(object):
    """A crawler to retrieve the links to the articles of Cell."""

    # Regular expressions used for crawling.
    BASE_URL = 'http://www.cell.com/'
    PREV_NEXT = r'"/(issue\?pii[^"]+)'
    FULL_TEXT = r'"/(fulltext/[^"]+)'
    PDF = r'href="(http://download.cell.com/pdf/[^"]+.pdf)"'
    DATE = '<title>.*, (.*)</title>'
    SWITCH_TIME = dt(2005, 5, 6, 0, 0)

    def __init__(self, out=sys.stdout):
        self.out = out
        self.decoy = BrowserDecoy()
        self.visited = set([])

    def start(self,
              url="current",
              headers=None,
              verbose=True):
        """Start crawling by getting the content of the current issue.
        Take a cookie as we go and initiate recursive crawling.

        NB: All URLs are assumed to be relative.
        """

        # Use a fresh headers dict per call (avoids the mutable-default pitfall).
        if headers is None:
            headers = {'Host': 'www.cell.com'}

        # Instantiate a browser decoy.
        full_url = self.BASE_URL + url
        self.decoy.connect(full_url, headers)

        if verbose:
            sys.stderr.write('started at %s\n' % full_url)

        # Get the url to previous issue.
        to_previous_issue = re.findall(self.PREV_NEXT, self.decoy.read())

        # Update headers with referer and cookie.
        headers.update({'Referer': full_url})
        headers.update(self.decoy.get_cookie_items())

        self.visited.add(full_url)

        # Crawl!
        self.crawl(url_list=to_previous_issue,
                   headers=headers,
                   verbose=verbose)

    def crawl(self, url_list, headers, verbose=True):

        # Skip visited urls.
        urls_to_visit = set(url_list).difference(self.visited)

        for url in urls_to_visit:

            if verbose:
                sys.stderr.write(self.BASE_URL + url + '\n')

            # Connect. Give the connection 3 tries and then give up.
            retries = 0
            while retries < 3:
                try:
                    self.decoy.connect(self.BASE_URL + url, headers)
                    content = remove_JS(self.decoy.read())
                    date_match = re.search(self.DATE, content).groups()
                    date = dt.strptime(date_match[0], '%d %B %Y')
                except Exception:
                    retries += 1
                    continue
                else:
                    break
            else:
                # All three attempts failed: skip this url.
                continue
            if date > self.SWITCH_TIME:
                # After SWITCH_TIME (6 May 2005), research articles
                # have the 'article' class.
                BS = BeautifulSoup(content)
                article_tags = BS.findAll(attrs={'class': 'article'})
                articles = '\n'.join([str(tag) for tag in article_tags])
            else:
                # Before SWITCH_TIME (6 May 2005) there is no separation
                # between the different types of articles.
                articles = content

            # Now grep a couple of links.
            to_prev_next = re.findall(self.PREV_NEXT, content)
            to_pdf = re.findall(self.PDF, articles)
            to_full = re.findall(self.FULL_TEXT, articles)

            # Dump.
            self.out.write('"%s": ' % url)
            json.dump([to_pdf, to_full], self.out)
            self.out.write(',\n')

            # Add url to visited and update the referer with the full URL...
            self.visited.add(url)
            headers['Referer'] = self.BASE_URL + url

            # ... and crawl onwards!
            self.crawl(url_list=to_prev_next,
                       headers=headers,
                       verbose=verbose)
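
A minimal usage sketch (an illustration only, assuming BrowserDecoy and remove_JS are available from the surrounding project); the output file name is hypothetical:

if __name__ == '__main__':
    # Hypothetical output file; the crawler writes one '"<url>": [...]' entry
    # per issue to whatever stream is passed as `out` (sys.stdout by default).
    with open('cell_links.txt', 'w') as out:
        crawler = CellCrawler(out=out)
        # Start from the current issue and follow the previous-issue links.
        crawler.start(verbose=True)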