Beispiel #1
0
    def get_new_loan_pages(self, wait, N):
        """Crawl every new loan page in batches of N and upsert each into the DB.

        Parameters:
            wait: politeness delay passed through to PageCrawler.
            N: maximum number of loan pages to fetch per crawler batch.

        Side effects: upserts one document per crawled page into self.loans,
        keyed on 'loanID', and prints progress after each batch.
        """
        parser = LoanPageParser()
        loanids = self.new_loans_set()

        # pull N loan pages at a time and insert into DB
        counter = 0
        num_loanids = len(loanids)
        while loanids:
            crawler = PageCrawler(self.loan_page_url, self.loan_page_login_str, wait)

            # take up to N ids for this batch (bounded loop instead of
            # catching IndexError on an over-long range)
            loans_to_grab = []
            for _ in range(min(N, len(loanids))):
                loans_to_grab.append(loanids.pop())

            crawler.crawl(loans_to_grab)
            loans = crawler.get_data()

            # parse each fetched page and upsert by loanID
            for loanID in loans:
                db_doc = parser.parse_html(loans[loanID])
                # NOTE(review): safe=True is a legacy pymongo write-concern
                # kwarg (removed in pymongo 3.x) -- confirm driver version.
                self.loans.update({'loanID': db_doc['loanID']},
                                  {'$set': db_doc}, upsert=True, safe=True)
                counter += 1
            # single-argument parenthesized print works on both Python 2 and 3
            print('inserted loan %s of %s' % (counter, num_loanids))
Beispiel #2
0
 def get_note_pages(self, note_tups, wait):
     """Fetch the note pages for the given note tuples and return the crawled data.

     Parameters:
         note_tups: identifiers handed straight to PageCrawler.crawl.
         wait: politeness delay passed through to PageCrawler.
     """
     crawler = PageCrawler(self.note_page_url, self.login_str, wait)
     crawler.crawl(note_tups)
     data = crawler.get_data()
     return data