def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
    """Crawl and/or parse one bible site described by ``triple``.

    :param triple: ``(url, destination_directory, output_file)``
    :param crawl: follow the ``chapter-nav-right`` navigation links and download pages
    :param parse: run the parser over the downloaded pages into ``output_file``
    :param remove_after_parse: delete the downloaded page directory after parsing
    :param printing: stored on ``self.print`` as a verbosity flag for later use
    """
    try:
        # get parameters
        self.url, self.destination_directory, output_file = triple
        self.output_file = self.destination_directory + output_file
        self.print = printing
        if crawl:
            # crawl the pages by following the "next chapter" navigation anchor
            BibleCrawler.run_crawler(self, '//a[@class = "chapter-nav-right"]/@href', self.url, self.destination_directory)
        if parse:
            # language ID is the path segment right after the host in the URL
            # (assumes url shaped like scheme://host/<lang>/... — TODO confirm)
            self.lang_directory = self.url.split('/')[3]
            books = self.destination_directory + self.lang_directory
            self.run_parser(books, self.output_file)
            if remove_after_parse:
                # remove the crawled page directory once the output file is written
                FileUtility.remove_dir(self.destination_directory + self.lang_directory)
    except Exception:
        # Best-effort constructor: report which triple failed instead of raising.
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.
        try:
            print(triple)
        except Exception:
            pass
def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
    """Crawl and/or parse one bible site described by ``triple``, after an HTTP liveness check.

    :param triple: ``(url, destination_directory, output_file)``
    :param crawl: follow the ``next`` navigation links and download pages
    :param parse: run the parser over the downloaded pages into ``output_file``
    :param remove_after_parse: delete the downloaded page directory after parsing
    :param printing: stored on ``self.print`` as a verbosity flag for later use
    """
    # get parameters
    self.url, self.destination_directory, output_file = triple
    # NOTE(review): no timeout on this request — it can hang indefinitely;
    # consider requests.get(self.url, timeout=...) once callers can handle Timeout.
    response = requests.get(self.url)
    if response.status_code != 200:
        # site unreachable: leave the object partially initialized, as before
        return
    try:
        self.output_file = self.destination_directory + output_file
        self.print = printing
        if crawl:
            # crawl the pages by following the "next" navigation anchor
            BibleCrawler.run_crawler(self, '//a[@class = "next"]/@href', self.url, self.destination_directory)
        if parse:
            # find the lang ID in the website: path segments 3..6 of the full URL
            self.lang_directory = '/'.join(self.url.split('/')[3:7]) + '/'
            # strip everything up to and including ".com/" from the URL
            self.url = self.url[self.url.find('.com') + 5:]
            if '.' in self.url.split('/')[-1]:
                # last component looks like a file name — drop it from the lang path
                self.lang_directory = '/'.join(self.url.split('/')[3:-1]) + '/'
            books = self.destination_directory + self.lang_directory
            self.run_parser(books, self.output_file)
            if remove_after_parse:
                # remove the crawled page directory once the output file is written
                FileUtility.remove_dir(self.destination_directory + self.lang_directory)
    except Exception:
        # Best-effort constructor: swallow parse/crawl failures silently, as the
        # original did, but no longer trap SystemExit/KeyboardInterrupt.
        pass
def __init__(self, triple, crawl=True, parse=True, remove_after_parse=False, printing=False):
    """Crawl and/or parse one PNG-layout bible site described by ``triple``.

    :param triple: ``(url, destination_directory, output_file)``
    :param crawl: follow the ``>`` navigation anchors and download pages
    :param parse: run the parser over the downloaded pages into ``output_file``
    :param remove_after_parse: delete the language sub-directory after parsing
    :param printing: stored on ``self.print`` as a verbosity flag for later use
    """
    # unpack the (url, destination_directory, output_file) triple
    self.url, self.destination_directory, output_file = triple
    self.output_file = self.destination_directory + output_file
    self.print = printing
    # find the lang ID in the website: second-to-last URL path segment
    url_parts = self.url.split('/')
    self.lang_directory = url_parts[0:-1][-1]
    if crawl:
        # crawl the pages via the ">" navigation anchor (PNG site layout)
        BibleCrawler.run_crawler(self, '//a[text() = ">"]/@href', self.url, self.destination_directory, website='PNG')
    if parse:
        # parse the crawled pages into the output file
        self.run_parser(self.destination_directory, self.output_file)
        if remove_after_parse:
            FileUtility.remove_dir(self.destination_directory + self.lang_directory)