Example 1
0
 def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
     '''
     :param url:
     :param destination_directory:
     :param output_file:
     '''
     try:
         # get parameters
         self.url, self.destination_directory, output_file = triple
         self.output_file = self.destination_directory + output_file
         self.print = printing
         # crawl the pages
         # to be fixed
         if crawl:
             BibleCrawler.run_crawler(self, '//a[@class = "chapter-nav-right"]/@href', self.url, self.destination_directory)
         if parse:
             self.lang_directory = self.url.split('/')[3]
             # crawl the pages
             books=self.destination_directory + self.lang_directory
             self.run_parser(books, self.output_file)
             if remove_after_parse:
                 # parse the output file
                 # remove the directory
                 FileUtility.remove_dir(self.destination_directory + self.lang_directory)
     except:
         try:
             print(triple)
         except:
             return None
     return None
Example 2
0
 def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
     '''
     Crawl and/or parse a bible site given a (url, destination_directory, output_file) triple.

     :param triple: (url, destination_directory, output_file) tuple
     :param crawl: crawl pages starting from the url when True
     :param parse: parse the crawled pages into the output file when True
     :param remove_after_parse: delete the crawled language directory after parsing
     :param printing: stored on self.print for downstream verbosity control
     '''
     # get parameters
     self.url, self.destination_directory, output_file = triple
     # NOTE(review): no timeout on this request — a hung server blocks forever;
     # confirm and consider requests.get(self.url, timeout=...)
     response = requests.get(self.url)
     if response.status_code == 200:
         try:
             self.output_file = self.destination_directory + output_file
             self.print = printing
             # crawl the pages
             if crawl:
                 BibleCrawler.run_crawler(self, '//a[@class = "next"]/@href', self.url, self.destination_directory)
             if parse:
                 # find the lang ID in the website: path components 3..6 of the url
                 self.lang_directory = '/'.join(self.url.split('/')[3:7]) + '/'
                 # drop everything up to and including '.com/' from the url
                 self.url = self.url[self.url.find('.com') + 5::]
                 if '.' in self.url.split('/')[-1]:
                     # last component looks like a file name: use its parent directories instead
                     self.lang_directory = '/'.join(self.url.split('/')[3:-1]) + '/'
                 books = self.destination_directory + self.lang_directory
                 self.run_parser(books, self.output_file)
                 if remove_after_parse:
                     # remove the crawled directory once it has been parsed
                     FileUtility.remove_dir(self.destination_directory + self.lang_directory)
         except Exception as err:
             # best-effort: report the failure instead of silently swallowing
             # every error (old code used a bare `except: return None`);
             # never raise out of __init__
             try:
                 print(triple, err)
             except Exception:
                 pass
 def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
     '''
     :param url:
     :param destination_directory:
     :param output_file:
     '''
     # get parameters
     self.url, self.destination_directory, output_file=triple
     self.output_file=self.destination_directory + output_file
     self.print=printing
     # find the lang ID in the website
     self.lang_directory = self.url.split('/')[0:-1][-1]
     if crawl:
         # crawl the pages
         BibleCrawler.run_crawler(self,'//a[text() = ">"]/@href',self.url, self.destination_directory, website='PNG')
     if parse:
         # parse the output file
         books=self.destination_directory
         self.run_parser(books, self.output_file)
         if remove_after_parse:
             FileUtility.remove_dir(self.destination_directory + self.lang_directory)
     return None