def wrapper_timable(*args, **kwargs):
    """Call the wrapped function, timing it and logging the elapsed seconds.

    Relies on closure variables ``func`` (the wrapped callable) and
    ``process_name`` (optional display name; falls back to ``func.__name__``).
    Returns whatever ``func`` returns.
    """
    started = time.time()
    result = func(*args, **kwargs)
    elapsed = time.time() - started
    # Same message text as before, built with an f-string instead of concatenation.
    log_info(f'Function: {process_name or func.__name__} Time: {elapsed}s')
    return result
def run():
    """Scrape boxofficemojo.com according to the loaded configuration.

    Reads the ``execution`` section of the config to decide whether to purge
    existing tables first and whether to filter tasks by execution mode,
    then runs the (possibly filtered) tasks.
    """
    logging.log_info('Starting scraping data from boxofficemojo.com...\n\n')
    tasks = create_tasks_from_config()
    config = configuration.get_config()
    execution = config['execution']
    if execution is not None:
        # Purge flag is the literal string 'True' (config values are strings);
        # comparing directly also handles the None case, so the extra
        # None-check from the original is folded away.
        if execution['purgeExistingData'] == 'True':
            clear_all_tables()
        mode_name = execution['executionMode']
        if mode_name is not None:
            wanted_mode = ExecutionMode[mode_name]
            tasks = [t for t in tasks if t.executionMode == wanted_mode]
    # Fix: the original only ran tasks inside the execution-section branch,
    # so a config with 'execution': None silently ran nothing. Tasks now
    # always run; the config only controls purging and filtering.
    run_tasks(tasks)
    logging.log_info('Finished scraping data from boxofficemojo.com.')
def scrape_studio_movies(self, studio_id, url):
    """Scrape all movies listed for one studio into a per-studio TSV file.

    Pages through the studio's movie table until an empty page is returned,
    writing one row per movie via ``scrapeutil.scrape_movie`` and recording
    each movie id in ``self.movies``. A blank/None ``studio_id`` is a no-op,
    and a file already marked complete is skipped.

    :param studio_id: studio identifier; also names the output data file.
    :param url: studio path fragment appended to the boxofficemojo URL.
    """
    if studio_id is None or studio_id == '':
        return
    file_name = datafile.create_data_file_path(studio_id + '_Movies.tsv')
    self.files.append(file_name)
    if datafile.is_data_file_complete(file_name):
        logging.log_info("\t\tSkipped scraping studio %s\n", studio_id)
        return
    # Fix: the original never closed ``outfile``; the with-block guarantees
    # the handle is flushed and closed even if scraping raises.
    with open(file_name, "w", newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        logging.log_info("\t\tScraping movies for studio %s...\n", studio_id)
        page = 1
        while True:
            full_url = ("https://www.boxofficemojo.com/studio/" + url + "&page=%d") % page
            rows = scrapeutil.scrape_table_rows(full_url, attributes={
                'border': '0',
                'cellspacing': '1',
                'cellpadding': '5'
            })
            if not rows:
                break
            for row in rows:
                cell = row.find('a')
                if hasattr(cell, 'text'):
                    # NOTE(review): this strips ALL spaces from the name —
                    # possibly a mangled non-breaking-space replace; confirm
                    # against the site's markup before changing.
                    movie_name = cell.text.replace(' ', '')
                    if movie_name and movie_name != 'Rank':
                        href = cell.get('href')
                        row_data = scrapeutil.scrape_movie(href, movie_name, studio_id)
                        if row_data is not None:
                            writer.writerows([row_data])
                            self.movies.add(row_data[0])
            page += 1
        # NOTE(review): sibling datafile calls take ``file_name``; passing the
        # csv writer here looks suspicious — confirm the helper's contract.
        datafile.mark_data_file_complete(writer)
def load_data(payload):
    """Log *payload* under the 'entity-get' logger name."""
    log_info(payload, name='entity-get')
def run_as_lambda(event, context):
    """AWS Lambda entry point: log invocation metadata, then run the scraper.

    :param event: the Lambda trigger event, logged verbatim.
    :param context: Lambda context; only its log stream/group names are used.
    """
    logging.log_info("Event received: %s", event)
    logging.log_info("Log stream name: %s", context.log_stream_name)
    logging.log_info("Log group name: %s", context.log_group_name)
    run()
def run_tasks(tasks):
    """Execute *tasks* in ascending ``order``; sorts the given list in place."""
    tasks.sort(key=lambda task: task.order)
    for task in tasks:
        task_name = type(task).__name__
        # Same message text as before, assembled with an f-string.
        logging.log_info(f'Executing {task_name} for table {task.tableName}...\n')
        task.execute()
        logging.log_info('DONE!\n')
def execute(self):
    """Run this task end to end: scrape, optionally clear the table, write, clean up.

    Each stage only proceeds if the previous stage's success flag is set.
    In ``completeRewrite`` mode the table is cleared before writing.
    """
    # Guard clauses replace the original nested pyramid; every log message
    # and the order of operations are unchanged.
    if not self.enabled:
        logging.log_info("\tTask was disabled. Skipping!\n")
        return
    logging.log_info("\tScraping web pages now...\n")
    self.scrape()
    if not self.scrapeSuccess:
        return
    logging.log_info("\tScrape successful.\n")
    if self.executionMode is ExecutionMode.completeRewrite:
        logging.log_info("\tClearing table...\n")
        self.clear_table()
        if not self.clearTableSuccess:
            return
        logging.log_info(
            "\tCleared table. Started writing to table...\n")
        self.write_to_db()
        if self.writeToDbSuccess:
            logging.log_info(
                "Finished writing to table. Cleaning up data files...\n"
            )
            self.cleanup()
    else:
        logging.log_info("\tStarted writing to table...\n")
        self.write_to_db()
        if self.writeToDbSuccess:
            logging.log_info("\tFinished writing to table.\n")
            logging.log_info("\tCleaning up data files...\n")
            self.cleanup()
            logging.log_info("\tClean up complete.\n")