def do_upload():
    """Handle a CSV upload: validate it, persist the file under the web
    root, register it in the CSV_DB table, and start a background scrape.

    Returns the rendered upload page with a success or error message.
    """
    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        if csvfile.file is None:
            raise Exception('The file is None')
        company_list = s.read_csv(csvfile.file)
        if not company_list:
            raise Exception('The file is not format as company list')
        # Persist the raw upload; 'with' guarantees the handle is closed
        # (the original leaked it and did a pointless seek(0) after write).
        with open('%s/data/%s' % (WEB_ROOT, csvfile.filename), 'wb') as f:
            f.write(csvfile.value)
        # Save to CSV_FILE db so scheduled re-scrapes can find this pair.
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # str(e) instead of the deprecated e.message.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message='The file updated success and will do scrape in '
                        'background. Please refresh page later to view '
                        'the new data.')
def re_scrape_schedule():
    """Restart the scraping schedule: spawn one scrape thread per
    registered CSV/DB pair, then reschedule the cron job using the
    interval stored in SETTINGS.

    Returns the rendered settings page with a success message.
    """
    # One connection suffices for both reads (the original opened the
    # same database twice); try/finally closes it even on query errors.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        schedule_interval = c.execute(
            'SELECT SCHEDULE_INTERVAL FROM SETTINGS').fetchone()[0]
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()
    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        ScrapeThread(s, csv_path, db_path).start()
    # Re schedule with new interval seconds
    cron.reSchedule(seconds=schedule_interval)
    return settings(
        success_message='The cron job has been started in background and '
                        'rescheduled.')
def do_upload():
    """Handle a CSV upload: validate it, persist the file under the web
    root, register it in the CSV_DB table, and start a background scrape.

    Returns the rendered upload page with a success or error message.
    """
    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        if csvfile.file is None:
            raise Exception('The file is None')
        company_list = s.read_csv(csvfile.file)
        if not company_list:
            raise Exception('The file is not format as company list')
        # Persist the raw upload; 'with' guarantees the handle is closed
        # (the original leaked it and did a pointless seek(0) after write).
        with open('%s/data/%s' % (WEB_ROOT, csvfile.filename), 'wb') as f:
            f.write(csvfile.value)
        # Save to CSV_FILE db so scheduled re-scrapes can find this pair.
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # str(e) instead of the deprecated e.message.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message='The file updated success and will do scrape in '
                        'background. Please refresh page later to view '
                        'the new data.')
def doJob():
    """Do the scrape every interval time.

    Invoked by the cron scheduler.  Each registered CSV/DB pair is
    scraped sequentially: ScrapeThread.run() is called directly (not
    start()), so the work executes in this thread, one pair at a time.
    """
    # Get all csv and db paths; try/finally closes the connection even
    # if the query fails (the original leaked it on error).
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()
    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        # run() executes synchronously — deliberate, so scrapes do not
        # overlap.  (The dead 'threads' list / commented join removed.)
        ScrapeThread(s, csv_path, db_path).run()
    log.info('all scraper threads finished in doJob()')
    return
def doJob():
    """Do the scrape every interval time.

    Invoked by the cron scheduler.  Each registered CSV/DB pair is
    scraped sequentially: ScrapeThread.run() is called directly (not
    start()), so the work executes in this thread, one pair at a time.
    """
    # Get all csv and db paths; try/finally closes the connection even
    # if the query fails (the original leaked it on error).
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()
    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        # run() executes synchronously — deliberate, so scrapes do not
        # overlap.  (The dead 'threads' list / commented join removed.)
        ScrapeThread(s, csv_path, db_path).run()
    log.info('all scraper threads finished in doJob()')
    return
def testMain(self):
    """End-to-end test of Scraper: CSV parsing, social-media lookup and
    DB persistence, covering the None-file, good-format and bad-format
    cases.
    """
    fblogin()
    s = main.Scraper()

    # A None file must raise with the documented message.
    try:
        s.read_csv(None)
    except Exception as e:
        # str(e) instead of the deprecated e.message.
        self.assertEqual('The file is none.', str(e))

    # Well-formed CSV: parse, scrape the first company, check the name.
    # 'with' closes the handles the original leaked; 'companies' avoids
    # shadowing the builtins 'file' and 'list'.
    with open('testdata/good_format.csv', 'rb') as f:
        company_list = s.read_csv(f)
    self.assertTrue(len(company_list) > 0)
    companies = s.get_social_media(company_list[0:1], 'testdata/data.db')
    self.assertEqual(1, len(companies))
    # The original two-arg assertTrue always passed (second arg is the
    # failure message); assertEqual actually compares the values.
    self.assertEqual('Wal-Mart Stores', companies[0].company_name)

    # A second real-world CSV should parse and scrape the same way.
    with open('data/NRN_RestaurantList.csv', 'rb') as f:
        company_list = s.read_csv(f)
    self.assertTrue(len(company_list) > 0)
    companies = s.get_social_media(company_list[0:1], 'testdata/data.db')
    self.assertEqual(1, len(companies))

    # A malformed CSV must raise.
    try:
        with open('testdata/error_format.csv', 'rb') as f:
            s.read_csv(f)
    except Exception as e:
        self.assertTrue(e)

    # Persist, then clean the test table back up.
    s.write_db(companies, 'testdata/data.db')
    conn = sqlite3.connect('testdata/data.db')
    try:
        c = conn.cursor()
        c.execute('DELETE FROM COMPANY')
        conn.commit()
        c.close()
    finally:
        conn.close()
def testMain(self):
    """End-to-end test of Scraper: CSV parsing, social-media lookup and
    DB persistence, covering the None-file, good-format and bad-format
    cases.
    """
    fblogin()
    s = main.Scraper()

    # A None file must raise with the documented message.
    try:
        s.read_csv(None)
    except Exception as e:
        # str(e) instead of the deprecated e.message.
        self.assertEqual('The file is none.', str(e))

    # Well-formed CSV: parse, scrape the first company, check the name.
    # 'with' closes the handles the original leaked; 'companies' avoids
    # shadowing the builtins 'file' and 'list'.
    with open('testdata/good_format.csv', 'rb') as f:
        company_list = s.read_csv(f)
    self.assertTrue(len(company_list) > 0)
    companies = s.get_social_media(company_list[0:1], 'testdata/data.db')
    self.assertEqual(1, len(companies))
    # The original two-arg assertTrue always passed (second arg is the
    # failure message); assertEqual actually compares the values.
    self.assertEqual('Wal-Mart Stores', companies[0].company_name)

    # A second real-world CSV should parse and scrape the same way.
    with open('data/NRN_RestaurantList.csv', 'rb') as f:
        company_list = s.read_csv(f)
    self.assertTrue(len(company_list) > 0)
    companies = s.get_social_media(company_list[0:1], 'testdata/data.db')
    self.assertEqual(1, len(companies))

    # A malformed CSV must raise.
    try:
        with open('testdata/error_format.csv', 'rb') as f:
            s.read_csv(f)
    except Exception as e:
        self.assertTrue(e)

    # Persist, then clean the test table back up.
    s.write_db(companies, 'testdata/data.db')
    conn = sqlite3.connect('testdata/data.db')
    try:
        c = conn.cursor()
        c.execute('DELETE FROM COMPANY')
        conn.commit()
        c.close()
    finally:
        conn.close()
def re_scrape_schedule():
    """Restart the scraping schedule: spawn one scrape thread per
    registered CSV/DB pair, then reschedule the cron job using the
    interval stored in SETTINGS.

    Returns the rendered settings page with a success message.
    """
    # One connection suffices for both reads (the original opened the
    # same database twice); try/finally closes it even on query errors.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        schedule_interval = c.execute(
            'SELECT SCHEDULE_INTERVAL FROM SETTINGS').fetchone()[0]
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()
    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        ScrapeThread(s, csv_path, db_path).start()
    # Re schedule with new interval seconds
    cron.reSchedule(seconds=schedule_interval)
    return settings(
        success_message='The cron job has been started in background and '
                        'rescheduled.')
company.micro_metrics['tw_percent'], company.micro_metrics['yt_percent'], company.micro_metrics['fb_abs'], company.micro_metrics['tw_abs'], company.micro_metrics['yt_abs'], company.time_taken )) count += 1 except Exception as e: log.error(e) pass conn.commit() c.close() conn.close() return count if __name__ == '__main__': log.info('begin') args = sys.argv if len(args) >= 2: file = open(args[1], 'r') fblogin() s = Scraper() count = s.write_db(s.get_social_media(s.read_csv(file), 'data/data.db'), 'data/data.db') print '\n' print '%d records has been saved to database %s' % (count, 'data/data.db') else: print 'Please input the file name as the first parameter.' log.info('end')