def do_upload():
    """Handle a CSV upload: validate it, store it and start a background scrape.

    Reads the ``csvfile`` field from the request, parses it with
    ``Scraper.read_csv``, writes the raw upload under ``WEB_ROOT/data``,
    registers it through ``save_csv_db`` and hands it to ``do_scrape_async``.

    Returns:
        The result of ``csv_upload`` carrying either an error message or a
        success message.
    """
    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        # ``.get(..., None)`` yields None when the form field is absent, so
        # guard the upload object itself before touching ``.file``.
        if csvfile is None or csvfile.file is None:
            raise Exception('The file is None')
        company_list = s.read_csv(csvfile.file)
        if not len(company_list) > 0:
            raise Exception('The file is not format as company list')
        # Save the file; the context manager guarantees the handle is
        # flushed and closed before the upload is registered.
        # NOTE(review): ``csvfile.filename`` comes from the client and is
        # used unsanitised in the path -- confirm it is validated upstream.
        with open('%s/data/%s' % (WEB_ROOT, csvfile.filename), 'wb') as f:
            f.write(csvfile.value)
        # Save to CSV_FILE db
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # Format the exception itself: ``e.message`` is absent on Python 3.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message='The file updated success and will do scrape in background. Please refrush page later to view the new data.'
    )
def test_download_photos(self, mock_get):
    """download_photos() test

    This test creates a test photos CSV which contains different urls to
    photos. The API is mocked and it returns every time the same image that
    was previously downloaded.

    ``mock_get`` is not used directly; presumably it is injected by a patch
    decorator outside this view.
    """
    config = {
        'app': {
            'threads': 1,
            'logger': 'DEBUG'
        },
        'api': {
            'url': 'https://jsonplaceholder.typicode.com/',
            'endpoints': [
                'test',
            ]
        }
    }
    scraper = Scraper(config)
    csv_path = 'testphotos.csv'
    # Five distinct URLs: the shared prefix followed by the index 0..4.
    urls = ['https://via.placeholder.com/photos/testphoto' + str(i)
            for i in range(5)]
    pd.DataFrame(urls, columns=['url']).to_csv(csv_path, index=False)
    try:
        scraper.download_photos(csv_path)
        testphotos = pd.read_csv(csv_path)
        with open('scraper_test/testphoto.jpg', 'rb') as gold_file:
            gold_image = gold_file.read()
        for file_path in testphotos['file_path']:
            with open(file_path, 'rb') as image_file:
                image = image_file.read()
            # Remove the artefact before asserting so a failed comparison
            # does not leave this file behind.
            os.remove(file_path)
            self.assertEqual(image, gold_image)
    finally:
        # Always remove the generated CSV, even when the test fails.
        os.remove(csv_path)
def test_download_photo(self, mock_get):
    """download_photo() test

    This test creates a scraper object with a test endpoint and gives it a
    test url to download an image from. The API is mocked and the
    'requests.get()' method returns a previously downloaded image.

    ``mock_get`` is not used directly; presumably it is injected by a patch
    decorator outside this view.
    """
    config = {
        'app': {
            'threads': 1,
            'logger': 'DEBUG'
        },
        'api': {
            'url': 'https://jsonplaceholder.typicode.com/',
            'endpoints': [
                'test',
            ]
        }
    }
    scraper = Scraper(config)
    test_urltuple = ('https://via.placeholder.com/photos/testphot',
                     'testphoto.jpg')
    scraper.download_photo(test_urltuple)
    with open('scraper_test/testphoto.jpg', 'rb') as gold_file:
        gold_image = gold_file.read()
    with open('testphoto.jpg', 'rb') as image_file:
        image = image_file.read()
    # Remove the downloaded file before asserting so a failed comparison
    # does not leave it behind.
    os.remove('testphoto.jpg')
    self.assertEqual(image, gold_image)
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for filmlinks4u."""
    Scraper.__init__(self)
    self.bu = 'https://www.filmlinks4u.is/category/'
    self.icon = self.ipath + 'flinks.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'tamil',
        '02Telugu Movies': category_url + 'telugu',
        '03Malayalam Movies': category_url + 'malayalam',
        '04Kannada Movies': category_url + 'kannada',
        '05Hindi Movies': category_url + 'hindi',
        '06English Movies': category_url + 'hollywood',
        '07Animation Movies': category_url + 'animation',
        '08Biography Movies': category_url + 'biography',
        '09Documentary Movies': category_url + 'documentary',
        '10Bengali Movies': category_url + 'bengali',
        '11Bhojpuri Movies': category_url + 'bhojpuri',
        '12Gujarati Movies': category_url + 'gujarati',
        '13Marathi Movies': category_url + 'marathi',
        '14Oriya Movies': category_url + 'oriya',
        '15Punjabi Movies': category_url + 'punjabi',
        '16Rajasthani Movies': category_url + 'rajasthani',
        '17Urdu Movies': category_url + 'urdu',
        '18Nepali Movies': category_url + 'nepali',
        '19[COLOR cyan]Hindi Adult Softcore[/COLOR]': category_url + 'adult-hindi-short-films',
        '20[COLOR cyan]Adult Movies[/COLOR]': category_url + 'adult',
        '21[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def do_upload():
    """Handle a CSV upload: validate it, store it and start a background scrape.

    Reads the ``csvfile`` field from the request, parses it with
    ``Scraper.read_csv``, writes the raw upload under ``WEB_ROOT/data``,
    registers it through ``save_csv_db`` and hands it to ``do_scrape_async``.

    Returns:
        The result of ``csv_upload`` carrying either an error message or a
        success message.
    """
    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        # ``.get(..., None)`` yields None when the form field is absent, so
        # guard the upload object itself before touching ``.file``.
        if csvfile is None or csvfile.file is None:
            raise Exception('The file is None')
        company_list = s.read_csv(csvfile.file)
        if not len(company_list) > 0:
            raise Exception('The file is not format as company list')
        # Save the file; the context manager guarantees the handle is
        # flushed and closed before the upload is registered.
        # NOTE(review): ``csvfile.filename`` comes from the client and is
        # used unsanitised in the path -- confirm it is validated upstream.
        with open('%s/data/%s' % (WEB_ROOT, csvfile.filename), 'wb') as f:
            f.write(csvfile.value)
        # Save to CSV_FILE db
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # Format the exception itself: ``e.message`` is absent on Python 3.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message='The file updated success and will do scrape in background. Please refrush page later to view the new data.'
    )
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for desitashan."""
    Scraper.__init__(self)
    self.bu = 'http://www.desitashan.me/'
    self.icon = self.ipath + 'desit.png'
    home_url = self.bu
    self.list = {
        '01Indian': home_url,
        '02Pakistani': home_url + 'pakistan-tv/',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for abcmalayalam."""
    Scraper.__init__(self)
    self.bu = 'http://abcmalayalam.com'
    self.icon = self.ipath + 'abcm.png'
    # The base URL has no trailing slash, so each path starts with one.
    site_root = self.bu
    self.list = {
        '01Movies': site_root + '/movies',
        '02Short Films': site_root + '/short-film',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for olangal."""
    Scraper.__init__(self)
    self.bu = 'http://olangal.pro/'
    self.icon = self.ipath + 'olangal.png'
    home_url = self.bu
    self.list = {
        '01Recent Movies': home_url,
        '02[COLOR yellow]** Search **[/COLOR]': home_url + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for andhrawatch."""
    Scraper.__init__(self)
    self.bu = 'http://www.andhrawatch.com/'
    self.icon = self.ipath + 'awatch.png'
    home_url = self.bu
    self.list = {
        '01Movies': home_url + 'telugu-movies/',
        '02Trailers': home_url + 'movie-trailers/',
        '03Short Films': home_url + 'short-films/',
    }
def __init__(self):
    """Set the base URL, icon path and channel-group map for mhdtvlive."""
    Scraper.__init__(self)
    self.bu = 'http://mhdtvlive.com/'
    self.icon = self.ipath + 'lmtv.png'
    home_url = self.bu
    self.list = {
        '01Entertainment Channels': home_url + 'entertainment_channels',
        '02News Channels': home_url + 'news_channels',
        '03Regional Channels': home_url + 'regional_web_channels',
        '04Devotional Channels': home_url + 'devotional_channels',
        '07[COLOR yellow]** Search **[/COLOR]': home_url + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for ozee."""
    Scraper.__init__(self)
    self.bu = 'http://www.ozee.com/'
    self.icon = self.ipath + 'ozee.png'
    home_url = self.bu
    # NOTE(review): the base URL ends with '/' and every path below starts
    # with '/', so the resulting URLs contain '//' -- confirm this is
    # intended or tolerated by the site.
    self.list = {
        '01Shows': home_url + '/shows/all',
        '02Movies': home_url + '/movies/all',
        '03Videos': home_url + '/videos/all',
        '04Music': home_url + '/music/all',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for hindilinks4u."""
    Scraper.__init__(self)
    self.bu = 'https://www.hindilinks4u.to/category/'
    self.icon = self.ipath + 'hlinks.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Hindi Movies': category_url + 'hindi-movies',
        '02Dubbed Movies': category_url + 'dubbed-movies',
        '03Documentary Movies': category_url + 'documentaries',
        '04[COLOR cyan]Adult Movies[/COLOR]': category_url + 'adult',
        '05[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for tamilrasigan."""
    Scraper.__init__(self)
    self.bu = 'http://tamilrasigan.net/'
    self.icon = self.ipath + 'rasigan.png'
    home_url = self.bu
    # The 'Featured' entry is the home URL without its trailing slash.
    home_without_slash = home_url[:-1]
    self.list = {
        '01Featured': home_without_slash,
        '02New Releases': home_url + 'tamil-movies-online/',
        '03Super Hit Movies': home_url + 'category/super-hit-movies/',
        '04Trailers & Songs': home_url + 'category/tamil-videos/',
        '05[COLOR yellow]** Search **[/COLOR]': home_url + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for thiruttuvcds."""
    Scraper.__init__(self)
    self.bu = 'http://thiruttuvcds.com/category/'
    self.icon = self.ipath + 'tvcds.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root, which is used for
    # the 'private/' section and for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'new-tamil-movies/',
        '02Telugu Movies': category_url + 'telugu/',
        '03Hindi Movies': category_url + 'hindi/',
        '04Tamil Dubbed Movies': category_url + 'tamil-dubbed/',
        '05HD Movies': category_url + 'hd-movies/',
        '06[COLOR cyan]Adult Movies[/COLOR]': site_root + 'private/',
        '07[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for runtamil."""
    Scraper.__init__(self)
    self.bu = 'http://runtamil.tv/category/'
    self.icon = self.ipath + 'runt.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil New Movies': category_url + 'runtamil-new-tamil-movies2o1/',
        '02Tamil HD Movies': category_url + 'tamil-hd-movies-online/',
        '03Tamil DVD Movies': category_url + 'tamil-dvd-movies1/',
        '04Tamil Classic Movies': category_url + 'mid-movies/',
        '05Tamil Old Movies': category_url + 'old-tamil-movies/',
        '06Tamil Dubbed Movies': category_url + 'runtamil-tamil-dubbed-movies/',
        '99[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and language map for bharat-movies."""
    Scraper.__init__(self)
    self.bu = 'http://bharat-movies.com/'
    self.icon = self.ipath + 'bmov.png'
    # Values here are bare language slugs, not full URLs.
    languages = {
        '01Hindi': 'hindi',
        '02Telugu': 'telugu',
        '03Tamil': 'tamil',
        '04Malayalam': 'malayalam',
        '05Kannada': 'kannada',
        '06Bengali': 'bengali',
    }
    self.list = languages
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for tamilgun."""
    Scraper.__init__(self)
    self.bu = 'http://tamilgun.pro/categories/'
    self.icon = self.ipath + 'tgun.png'
    categories_url = self.bu
    # Drop the trailing 'categories/' to get the site root used for search.
    site_root = categories_url[:-len('categories/')]
    self.list = {
        '01New Movies': categories_url + 'new-movies-2017/',
        '02HD Movies': categories_url + 'hd-movies/',
        '03Dubbed Movies': categories_url + 'dubbed-movies/',
        '04Trailers': categories_url + 'trailers/',
        '05Special Shows': categories_url + 'special-tv-shows/',
        '06[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and per-language TV map for mhdtvlive."""
    Scraper.__init__(self)
    self.bu = 'http://mhdtvlive.com/'
    self.icon = self.ipath + 'mhdtv.png'
    home_url = self.bu
    self.list = {
        '01Tamil TV': home_url + 'tamil_channels',
        '02Telugu TV': home_url + 'telugu_channels',
        '03Malayalam TV': home_url + 'malayalam_channels',
        '04Kannada TV': home_url + 'kannada_channels',
        '05Hindi TV': home_url + 'hindi_channels',
        '06English TV': home_url + 'english_channels',
    }
def __init__(self):
    """Set the base URL, icon path, HD setting and URL map for mersalaayitten."""
    Scraper.__init__(self)
    self.bu = 'http://mersalaayitten.biz/videos?c='
    self.icon = self.ipath + 'mersal.png'
    self.hdstr = self.settings('mersalhd')
    listing_url = self.bu
    # Drop the trailing 'videos?c=' to get the site root used for search.
    site_root = listing_url[:-len('videos?c=')]
    self.list = {
        '01Tamil Movies': listing_url + '1&o=mr',
        '02Telugu Movies': listing_url + '3&o=mr',
        '03Hindi Movies': listing_url + '2&o=mr',
        '04Malayalam Movies': listing_url + '4&o=mr',
        '05Dubbed Movies': listing_url + '6&o=mr',
        '06Animation Movies': listing_url + '5&o=mr',
        '07[COLOR yellow]** Search **[/COLOR]': site_root + 'search?search_type=videos&search_query=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for thiruttuvcd.me."""
    Scraper.__init__(self)
    self.bu = 'http://www.thiruttuvcd.me/category/'
    self.icon = self.ipath + 'tvcd.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root, which is used for
    # the 'tv/' section and for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'tamil-movies-online/',
        '02Malayalam Movies': category_url + 'malayalam/',
        '03Telugu Movies': category_url + 'watch-telugu-movie/',
        '04Hindi Movies': category_url + 'hindi-movies-online/',
        '05Tamil TV Shows': site_root + 'tv/',
        '06[COLOR cyan]Adult Movies[/COLOR]': category_url + 'hot-movies/',
        '07[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for apnaview.

    The search entry is built from the site root, i.e. the base URL with
    its trailing 'browse/' removed.
    """
    Scraper.__init__(self)
    self.bu = 'http://apnaview.com/browse/'
    self.icon = self.ipath + 'apnav.png'
    browse_url = self.bu
    # 'browse/' is 7 characters long. The previous fixed slice of 9 also cut
    # into the host name and produced 'http://apnaview.co?q='.
    site_root = browse_url[:-len('browse/')]
    self.list = {
        '01Tamil Movies': browse_url + 'tamil',
        '02Telugu Movies': browse_url + 'telugu',
        '03Malayalam Movies': browse_url + 'Malayalam',
        '04Hindi Movies': browse_url + 'hindi',
        '05Marathi Movies': browse_url + 'marathi',
        '06Punjabi Movies': browse_url + 'punjabi',
        '07[COLOR yellow]** Search **[/COLOR]': site_root + '?q=',
    }
def __init__(self):
    """Set the icon path, base URL and label-to-URL map for rajtamil."""
    Scraper.__init__(self)
    self.icon = self.ipath + 'rajt.png'
    self.bu = 'http://www.rajtamil.com/category/'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Recent Movies': category_url + 'movies/',
        '02Dubbed Movies': category_url + 'tamil-dubbed/',
        '03Comedy Scenes': category_url + 'comedy/',
        '04Movie Songs': category_url + 'download-songs/',
        '05Sun TV Shows': category_url + 'sun-tv-show/',
        '06Vijay TV Shows': category_url + 'vijay-tv-shows/',
        '07[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for filmgur."""
    Scraper.__init__(self)
    self.bu = 'http://www.filmgur.com/category/'
    self.icon = self.ipath + 'moviefk.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'tamil-movies/',
        '02Telugu Movies': category_url + 'telugu-movies/',
        '03Hindi Movies': category_url + 'bollywood-movies/',
        '04English Movies': category_url + 'hollywood-movies/',
        '05Hindi Dubbed Movies': category_url + 'hindi-dubbed-movies/',
        '06Punjabi Movies': category_url + 'punjabi-movies/',
        '07Urdu Movies': category_url + 'pakistani-movies/',
        '09[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for moviefisher."""
    Scraper.__init__(self)
    self.bu = 'http://moviefisher.org/category/'
    self.icon = self.ipath + 'mfish.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'tamil',
        '02Telugu Movies': category_url + 'telugu-movie',
        '04Hindi Movies': category_url + 'bollywood',
        '05English Movies': category_url + 'hollywood',
        '06Hindi Dubbed Movies': category_url + 'dubbed',
        '07Hindi Dubbed South Movies': category_url + 'south-in-hindi',
        '08Punjabi Movies': category_url + 'punjabi',
        '09[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for redmovies."""
    Scraper.__init__(self)
    self.bu = 'http://redmovies.co/category/'
    self.icon = self.ipath + 'redm.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil Movies': category_url + 'tamil/',
        '02Telugu Movies': category_url + 'telugu/',
        '03Malayalam Movies': category_url + 'malayalam/',
        '04Kannada Movies': category_url + 'kannada-movies/',
        '05Hindi Movies': category_url + 'bollywood-movies/',
        '06English Movies': category_url + 'hollywood-movies/',
        '07Animation Movies': category_url + 'animation/',
        '08Hindi Dubbed Movies': category_url + 'hindi-dubbed/',
        '09Punjabi Movies': category_url + 'punjabi-movies/',
        '10Urdu Movies': category_url + 'pakistan/',
        '12[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for malayalamserials."""
    Scraper.__init__(self)
    self.bu = 'http://malayalamserials.in/category/'
    self.icon = self.ipath + 'mserial.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Asianet': category_url + 'asianet/',
        '02Mazhavil Manorama': category_url + 'mazhavil/',
        '03Surya': category_url + 'surya/',
        '04Kairali': category_url + 'kairali/',
        '05Flowers': category_url + 'flowers/',
        '06Media One': category_url + 'media/',
        '07Amrita': category_url + 'amrita/',
        '08Movies': category_url + 'movies/',
        '09News': category_url + 'news/',
        '10Gossip': category_url + 'malayalam_movie/',
        '11[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def __init__(self):
    """Set the base URL, icon path and label-to-URL map for tamiltwists."""
    Scraper.__init__(self)
    self.bu = 'http://www.tamiltwists.com/category/'
    self.icon = self.ipath + 'ttwist.png'
    category_url = self.bu
    # Drop the trailing 'category/' to get the site root used for search.
    site_root = category_url[:-len('category/')]
    self.list = {
        '01Tamil New Movies': category_url + 'new-movies/',
        '02Tamil HD Movies': category_url + 'hd-movies/',
        '03Tamil Dubbed Movies': category_url + 'dubbed-movies/',
        '04Tamil Movie Trailers': category_url + 'trailers/',
        '05Malayalam Movies': category_url + 'malayalam-movies/',
        '06Telugu Movies': category_url + 'telugu-movies/',
        '07Hindi Movies': category_url + 'hindi-movies/',
        '08[COLOR cyan]Adult Movies[/COLOR]': category_url + 'hot-masala/',
        '09[COLOR yellow]** Search **[/COLOR]': site_root + '?s=',
    }
def re_scrape_schedule():
    """Start a scrape thread per registered CSV and reschedule the cron job.

    Reads the schedule interval from ``SETTINGS`` and every
    ``(CSV_FILE_PATH, DB_FILE_PATH)`` pair from ``CSV_DB`` in
    ``data/setting.db``, starts one ``ScrapeThread`` per pair, then
    reschedules ``cron`` with the stored interval.

    Returns:
        The result of ``settings`` carrying a success message.

    Raises:
        TypeError: if ``SETTINGS`` has no row (``fetchone()`` returns None).
    """
    # Both queries hit the same database, so one connection serves them;
    # ``finally`` guarantees it is closed even when a query raises.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        schedule_interval = c.execute(
            'SELECT SCHEDULE_INTERVAL FROM SETTINGS').fetchone()[0]
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()

    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        thread.start()

    # Re schedule with new interval seconds
    cron.reSchedule(seconds=schedule_interval)
    return settings(
        success_message='The cron job has been started in background and rescheduled.')
def __init__(self):
    """Set the base URL, icon path and per-category TV map for mhdtvlive."""
    Scraper.__init__(self)
    self.bu = 'http://mhdtvlive.com/'
    self.icon = self.ipath + 'mhdtv.png'
    home_url = self.bu
    self.list = {
        '01Tamil TV': home_url + 'tamil-tvs',
        '02Telugu TV': home_url + 'telugu_channels',
        '03Malayalam TV': home_url + 'malayalam_channels',
        '04Kannada TV': home_url + 'kannada_channels',
        '05Hindi TV': home_url + 'hindi_tvs',
        '06English TV': home_url + 'english_channels',
        '07Sports TV': home_url + 'sport',
        '08Marathi TV': home_url + 'marathi-channels',
        '09Punjabi TV': home_url + 'punjabi-channels',
        '10Bangla TV': home_url + 'bangla-channels',
    }
def doJob():
    """Do the scrape every interval time.

    Loads every ``(CSV_FILE_PATH, DB_FILE_PATH)`` pair from ``CSV_DB`` in
    ``data/setting.db`` and runs one ``ScrapeThread`` per pair, one after
    another in the calling thread.
    """
    # Get all csv and db paths; ``finally`` guarantees the connection is
    # closed even when the query raises.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()

    fblogin()
    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        # ``run()`` (not ``start()``) executes the work synchronously, so the
        # scrapes happen one by one and there is nothing to join afterwards.
        thread.run()

    log.info('all scraper threads finished in doJob()')
    return
def __init__(self):
    """Set the base URL, icon path and channel-group map for abroadindia."""
    Scraper.__init__(self)
    self.bu = 'http://abroadindia.com/k_loadlist.php?lan='
    self.icon = self.ipath + 'aindia.png'
    language_url = self.bu
    # Drop the trailing 'lan=' so the search entry uses the 's=' parameter.
    query_prefix = language_url[:-len('lan=')]
    self.list = {
        '01Tamil Channels': language_url + 'tamil',
        '02Telugu Channels': language_url + 'telugu',
        '03Malayalam Channels': language_url + 'malayalam',
        '04Kannada Channels': language_url + 'kannada',
        '05Hindi Channels': language_url + 'hindi',
        '06English Channels': language_url + 'english',
        '07Sports Channels': language_url + 'sports',
        '08News Channels': language_url + 'news',
        '09Informative Channels': language_url + 'informative',
        '10Spiritual Channels': language_url + 'spiritual',
        '12[COLOR yellow]** Search **[/COLOR]': query_prefix + 's=',
    }