Esempio n. 1
0
def do_upload():
    """Handle an uploaded company-list CSV and start a background scrape.

    Reads the ``csvfile`` upload from the current request, checks that the
    scraper can parse at least one company from it, stores a copy under
    ``WEB_ROOT/data``, records it via ``save_csv_db`` and hands it to
    ``do_scrape_async``.

    Returns:
        The result of ``csv_upload`` carrying either a success message or an
        error message describing what went wrong.
    """
    import os

    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        # A missing form field yields None; report it the same way as an
        # empty upload instead of failing on the attribute access.
        if csvfile is None or csvfile.file is None:
            raise Exception('The file is None')

        # The file name is supplied by the client, so refuse anything that
        # could resolve outside the data directory.
        filename = csvfile.filename
        if (not filename or filename in ('.', '..')
                or os.path.basename(filename) != filename):
            raise Exception('The file name is not valid')

        company_list = s.read_csv(csvfile.file)
        if len(company_list) == 0:
            raise Exception('The file is not format as company list')

        # Save the file; the context manager closes the handle even when
        # the write fails.
        with open('%s/data/%s' % (WEB_ROOT, filename), 'wb') as f:
            f.write(csvfile.value)

        # Save to CSV_FILE db
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # Format the exception itself: ``e.message`` is not available on
        # Python 3 exceptions.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message=
        'The file updated success and will do scrape in background. Please refrush page later to view the new data.'
    )
Esempio n. 2
0
    def test_download_photos(self, mock_get):
        """download_photos() test

        Writes a ``testphotos.csv`` listing five distinct photo URLs, runs
        ``Scraper.download_photos`` on it and checks that every file named
        in the resulting ``file_path`` column equals the reference image
        ``scraper_test/testphoto.jpg``.

        ``mock_get`` is presumably the patched HTTP getter injected by a
        decorator outside this view, returning the reference image for
        every request -- confirm against the decorator.
        """
        config = {
            'app': {
                'threads': 1,
                'logger': 'DEBUG'
            },
            'api': {
                'url': 'https://jsonplaceholder.typicode.com/',
                'endpoints': [
                    'test',
                ]
            }
        }
        scraper = Scraper(config)
        urls = ['https://via.placeholder.com/photos/testphoto'] * 5
        urls = [y + str(x) for x, y in enumerate(urls)]
        testphotos = pd.DataFrame(urls, columns=['url'])
        testphotos.to_csv('testphotos.csv', index=False)
        try:
            scraper.download_photos('testphotos.csv')

            testphotos = pd.read_csv('testphotos.csv')
            with open('scraper_test/testphoto.jpg', 'rb') as gold_file:
                gold_image = gold_file.read()
            for file_path in testphotos['file_path']:
                try:
                    with open(file_path, 'rb') as image_file:
                        image = image_file.read()
                    self.assertEqual(image, gold_image)
                finally:
                    # Remove the download even when the comparison fails so
                    # a failing run does not leave files behind.
                    if os.path.exists(file_path):
                        os.remove(file_path)
        finally:
            os.remove('testphotos.csv')
Esempio n. 3
0
    def test_download_photo(self, mock_get):
        """download_photo() test

        Creates a scraper with a test endpoint, asks it to download one
        ``(url, file name)`` tuple and checks that the written file equals
        the reference image ``scraper_test/testphoto.jpg``.

        ``mock_get`` is presumably the patched ``requests.get`` injected by
        a decorator outside this view -- confirm against the decorator.
        """
        config = {
            'app': {
                'threads': 1,
                'logger': 'DEBUG'
            },
            'api': {
                'url': 'https://jsonplaceholder.typicode.com/',
                'endpoints': [
                    'test',
                ]
            }
        }
        scraper = Scraper(config)
        test_urltuple = ('https://via.placeholder.com/photos/testphot',
                         'testphoto.jpg')
        try:
            scraper.download_photo(test_urltuple)
            with open('scraper_test/testphoto.jpg', 'rb') as gold_file:
                gold_image = gold_file.read()
            with open('testphoto.jpg', 'rb') as image_file:
                image = image_file.read()
            self.assertEqual(image, gold_image)
        finally:
            # Clean up the download even when the assertion fails.
            if os.path.exists('testphoto.jpg'):
                os.remove('testphoto.jpg')
Esempio n. 4
0
 def __init__(self):
     """Configure the filmlinks4u scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.  Every
     label starts with a two-digit number (presumably a sort key for the
     menu -- confirm against the code that consumes ``self.list``).
     """
     Scraper.__init__(self)
     self.bu = 'https://www.filmlinks4u.is/category/'
     self.icon = self.ipath + 'flinks.png'
     category_base = self.bu
     slugs_by_label = [
         ('01Tamil Movies', 'tamil'),
         ('02Telugu Movies', 'telugu'),
         ('03Malayalam Movies', 'malayalam'),
         ('04Kannada Movies', 'kannada'),
         ('05Hindi Movies', 'hindi'),
         ('06English Movies', 'hollywood'),
         ('07Animation Movies', 'animation'),
         ('08Biography Movies', 'biography'),
         ('09Documentary Movies', 'documentary'),
         ('10Bengali Movies', 'bengali'),
         ('11Bhojpuri Movies', 'bhojpuri'),
         ('12Gujarati Movies', 'gujarati'),
         ('13Marathi Movies', 'marathi'),
         ('14Oriya Movies', 'oriya'),
         ('15Punjabi Movies', 'punjabi'),
         ('16Rajasthani Movies', 'rajasthani'),
         ('17Urdu Movies', 'urdu'),
         ('18Nepali Movies', 'nepali'),
         ('19[COLOR cyan]Hindi Adult Softcore[/COLOR]', 'adult-hindi-short-films'),
         ('20[COLOR cyan]Adult Movies[/COLOR]', 'adult'),
     ]
     entries = {label: category_base + slug for label, slug in slugs_by_label}
     # The search URL hangs off the site root, i.e. the base URL without
     # its trailing 'category/' (9 characters).
     entries['21[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 5
0
def do_upload():
    """Handle an uploaded company-list CSV and start a background scrape.

    Reads the ``csvfile`` upload from the current request, checks that the
    scraper can parse at least one company from it, stores a copy under
    ``WEB_ROOT/data``, records it via ``save_csv_db`` and hands it to
    ``do_scrape_async``.

    Returns:
        The result of ``csv_upload`` carrying either a success message or an
        error message describing what went wrong.
    """
    import os

    csvfile = request.files.get('csvfile', None)
    fblogin()
    s = Scraper()
    try:
        # A missing form field yields None; report it the same way as an
        # empty upload instead of failing on the attribute access.
        if csvfile is None or csvfile.file is None:
            raise Exception('The file is None')

        # The file name is supplied by the client, so refuse anything that
        # could resolve outside the data directory.
        filename = csvfile.filename
        if (not filename or filename in ('.', '..')
                or os.path.basename(filename) != filename):
            raise Exception('The file name is not valid')

        company_list = s.read_csv(csvfile.file)
        if len(company_list) == 0:
            raise Exception('The file is not format as company list')

        # Save the file; the context manager closes the handle even when
        # the write fails.
        with open('%s/data/%s' % (WEB_ROOT, filename), 'wb') as f:
            f.write(csvfile.value)

        # Save to CSV_FILE db
        csv_file_path, db_file_path = save_csv_db(csvfile)
        # Run the scrape process in background
        # TODO just upload, not scrape
        do_scrape_async(s, csv_file_path, db_file_path)
    except Exception as e:
        log.error(e)
        # Format the exception itself: ``e.message`` is not available on
        # Python 3 exceptions.
        return csv_upload(error_message='Error: %s' % e)
    return csv_upload(
        success_message='The file updated success and will do scrape in background. Please refrush page later to view the new data.')
Esempio n. 6
0
 def __init__(self):
     """Configure the desitashan scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.desitashan.me/'
     self.icon = self.ipath + 'desit.png'
     site = self.bu
     entries = {}
     # The first entry is the site root itself.
     entries['01Indian'] = site
     entries['02Pakistani'] = site + 'pakistan-tv/'
     self.list = entries
Esempio n. 7
0
 def __init__(self):
     """Configure the abcmalayalam scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://abcmalayalam.com'
     self.icon = self.ipath + 'abcm.png'
     # The base URL has no trailing slash, so each path supplies its own.
     paths_by_label = (
         ('01Movies', '/movies'),
         ('02Short Films', '/short-film'),
     )
     self.list = {label: self.bu + path for label, path in paths_by_label}
Esempio n. 8
0
 def __init__(self):
     """Configure the olangal scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://olangal.pro/'
     self.icon = self.ipath + 'olangal.png'
     home = self.bu
     search = home + '?s='
     self.list = dict([
         ('01Recent Movies', home),
         ('02[COLOR yellow]** Search **[/COLOR]', search),
     ])
Esempio n. 9
0
 def __init__(self):
     """Configure the andhrawatch scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.andhrawatch.com/'
     self.icon = self.ipath + 'awatch.png'
     sections = [
         ('01Movies', 'telugu-movies/'),
         ('02Trailers', 'movie-trailers/'),
         ('03Short Films', 'short-films/'),
     ]
     entries = {}
     for label, path in sections:
         entries[label] = self.bu + path
     self.list = entries
Esempio n. 10
0
 def __init__(self):
     """Configure the mhdtvlive channel-group scraper: base URL, icon, entries.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://mhdtvlive.com/'
     self.icon = self.ipath + 'lmtv.png'
     site = self.bu
     self.list = dict([
         ('01Entertainment Channels', site + 'entertainment_channels'),
         ('02News Channels', site + 'news_channels'),
         ('03Regional Channels', site + 'regional_web_channels'),
         ('04Devotional Channels', site + 'devotional_channels'),
         ('07[COLOR yellow]** Search **[/COLOR]', site + '?s='),
     ])
Esempio n. 11
0
 def __init__(self):
     """Configure the ozee scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.ozee.com/'
     self.icon = self.ipath + 'ozee.png'
     # self.bu already ends with '/', so the paths are appended without a
     # leading slash; otherwise the URLs contain a doubled '//' in the path.
     self.list = {
         '01Shows': self.bu + 'shows/all',
         '02Movies': self.bu + 'movies/all',
         '03Videos': self.bu + 'videos/all',
         '04Music': self.bu + 'music/all'
     }
Esempio n. 12
0
 def __init__(self):
     """Configure the hindilinks4u scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'https://www.hindilinks4u.to/category/'
     self.icon = self.ipath + 'hlinks.png'
     category_base = self.bu
     categories = (
         ('01Hindi Movies', 'hindi-movies'),
         ('02Dubbed Movies', 'dubbed-movies'),
         ('03Documentary Movies', 'documentaries'),
         ('04[COLOR cyan]Adult Movies[/COLOR]', 'adult'),
     )
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['05[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 13
0
 def __init__(self):
     """Configure the tamilrasigan scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://tamilrasigan.net/'
     self.icon = self.ipath + 'rasigan.png'
     site = self.bu
     entries = {}
     # The featured entry uses the base URL without its trailing slash.
     entries['01Featured'] = site[:-1]
     entries['02New Releases'] = site + 'tamil-movies-online/'
     entries['03Super Hit Movies'] = site + 'category/super-hit-movies/'
     entries['04Trailers & Songs'] = site + 'category/tamil-videos/'
     entries['05[COLOR yellow]** Search **[/COLOR]'] = site + '?s='
     self.list = entries
Esempio n. 14
0
 def __init__(self):
     """Configure the thiruttuvcds scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://thiruttuvcds.com/category/'
     self.icon = self.ipath + 'tvcds.png'
     category_base = self.bu
     # Site root: the base URL minus its trailing 'category/' (9 characters).
     site_root = category_base[:-9]
     self.list = dict([
         ('01Tamil Movies', category_base + 'new-tamil-movies/'),
         ('02Telugu Movies', category_base + 'telugu/'),
         ('03Hindi Movies', category_base + 'hindi/'),
         ('04Tamil Dubbed Movies', category_base + 'tamil-dubbed/'),
         ('05HD Movies', category_base + 'hd-movies/'),
         ('06[COLOR cyan]Adult Movies[/COLOR]', site_root + 'private/'),
         ('07[COLOR yellow]** Search **[/COLOR]', site_root + '?s='),
     ])
Esempio n. 15
0
 def __init__(self):
     """Configure the runtamil scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://runtamil.tv/category/'
     self.icon = self.ipath + 'runt.png'
     category_base = self.bu
     categories = [
         ('01Tamil New Movies', 'runtamil-new-tamil-movies2o1/'),
         ('02Tamil HD Movies', 'tamil-hd-movies-online/'),
         ('03Tamil DVD Movies', 'tamil-dvd-movies1/'),
         ('04Tamil Classic Movies', 'mid-movies/'),
         ('05Tamil Old Movies', 'old-tamil-movies/'),
         ('06Tamil Dubbed Movies', 'runtamil-tamil-dubbed-movies/'),
     ]
     entries = {}
     for label, slug in categories:
         entries[label] = category_base + slug
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['99[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 16
0
 def __init__(self):
     """Configure the bharat-movies scraper: base URL, icon and entry map.

     Here ``self.list`` maps each labelled entry to a bare language name,
     not to a full URL; how the consumer combines it with ``self.bu`` is
     not visible in this block.
     """
     Scraper.__init__(self)
     self.bu = 'http://bharat-movies.com/'
     self.icon = self.ipath + 'bmov.png'
     labels = ('01Hindi', '02Telugu', '03Tamil',
               '04Malayalam', '05Kannada', '06Bengali')
     languages = ('hindi', 'telugu', 'tamil',
                  'malayalam', 'kannada', 'bengali')
     self.list = dict(zip(labels, languages))
Esempio n. 17
0
 def __init__(self):
     """Configure the tamilgun scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://tamilgun.pro/categories/'
     self.icon = self.ipath + 'tgun.png'
     category_base = self.bu
     categories = (
         ('01New Movies', 'new-movies-2017/'),
         ('02HD Movies', 'hd-movies/'),
         ('03Dubbed Movies', 'dubbed-movies/'),
         ('04Trailers', 'trailers/'),
         ('05Special Shows', 'special-tv-shows/'),
     )
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'categories/'
     # (11 characters) from the base URL.
     entries['06[COLOR yellow]** Search **[/COLOR]'] = category_base[:-11] + '?s='
     self.list = entries
Esempio n. 18
0
 def __init__(self):
     """Configure the mhdtvlive language-channel scraper: base URL, icon, entries.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://mhdtvlive.com/'
     self.icon = self.ipath + 'mhdtv.png'
     channel_pages = [
         ('01Tamil TV', 'tamil_channels'),
         ('02Telugu TV', 'telugu_channels'),
         ('03Malayalam TV', 'malayalam_channels'),
         ('04Kannada TV', 'kannada_channels'),
         ('05Hindi TV', 'hindi_channels'),
         ('06English TV', 'english_channels'),
     ]
     self.list = {label: self.bu + page for label, page in channel_pages}
Esempio n. 19
0
 def __init__(self):
     """Configure the mersalaayitten scraper: base URL, icon, HD setting, entries.

     ``self.hdstr`` is read from the ``'mersalhd'`` setting.  ``self.list``
     maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://mersalaayitten.biz/videos?c='
     self.icon = self.ipath + 'mersal.png'
     self.hdstr = self.settings('mersalhd')
     listing_base = self.bu
     # Each category is addressed by the value appended after 'c='.
     category_ids = [
         ('01Tamil Movies', '1'),
         ('02Telugu Movies', '3'),
         ('03Hindi Movies', '2'),
         ('04Malayalam Movies', '4'),
         ('05Dubbed Movies', '6'),
         ('06Animation Movies', '5'),
     ]
     entries = {}
     for label, category_id in category_ids:
         entries[label] = listing_base + category_id + '&o=mr'
     # Search lives at the site root: drop the trailing 'videos?c='
     # (9 characters) from the base URL.
     entries['07[COLOR yellow]** Search **[/COLOR]'] = (
         listing_base[:-9] + 'search?search_type=videos&search_query=')
     self.list = entries
Esempio n. 20
0
 def __init__(self):
     """Configure the thiruttuvcd.me scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.thiruttuvcd.me/category/'
     self.icon = self.ipath + 'tvcd.png'
     category_base = self.bu
     # Site root: the base URL minus its trailing 'category/' (9 characters).
     site_root = category_base[:-9]
     self.list = dict([
         ('01Tamil Movies', category_base + 'tamil-movies-online/'),
         ('02Malayalam Movies', category_base + 'malayalam/'),
         ('03Telugu Movies', category_base + 'watch-telugu-movie/'),
         ('04Hindi Movies', category_base + 'hindi-movies-online/'),
         ('05Tamil TV Shows', site_root + 'tv/'),
         ('06[COLOR cyan]Adult Movies[/COLOR]', category_base + 'hot-movies/'),
         ('07[COLOR yellow]** Search **[/COLOR]', site_root + '?s='),
     ])
Esempio n. 21
0
 def __init__(self):
     """Configure the apnaview scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://apnaview.com/browse/'
     self.icon = self.ipath + 'apnav.png'
     # Site root: strip exactly the trailing 'browse/' segment.  A fixed
     # 9-character slice (the length of 'category/') would cut into the
     # host name and produce 'http://apnaview.co'.
     site_root = self.bu[:-len('browse/')]
     self.list = {
         '01Tamil Movies': self.bu + 'tamil',
         '02Telugu Movies': self.bu + 'telugu',
         '03Malayalam Movies': self.bu + 'Malayalam',
         '04Hindi Movies': self.bu + 'hindi',
         '05Marathi Movies': self.bu + 'marathi',
         '06Punjabi Movies': self.bu + 'punjabi',
         '07[COLOR yellow]** Search **[/COLOR]': site_root + '?q='
     }
Esempio n. 22
0
 def __init__(self):
     """Configure the rajtamil scraper: icon, base URL and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.icon = self.ipath + 'rajt.png'
     self.bu = 'http://www.rajtamil.com/category/'
     category_base = self.bu
     categories = (
         ('01Recent Movies', 'movies/'),
         ('02Dubbed Movies', 'tamil-dubbed/'),
         ('03Comedy Scenes', 'comedy/'),
         ('04Movie Songs', 'download-songs/'),
         ('05Sun TV Shows', 'sun-tv-show/'),
         ('06Vijay TV Shows', 'vijay-tv-shows/'),
     )
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['07[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 23
0
 def __init__(self):
     """Configure the filmgur scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.filmgur.com/category/'
     self.icon = self.ipath + 'moviefk.png'
     category_base = self.bu
     categories = [
         ('01Tamil Movies', 'tamil-movies/'),
         ('02Telugu Movies', 'telugu-movies/'),
         ('03Hindi Movies', 'bollywood-movies/'),
         ('04English Movies', 'hollywood-movies/'),
         ('05Hindi Dubbed Movies', 'hindi-dubbed-movies/'),
         ('06Punjabi Movies', 'punjabi-movies/'),
         ('07Urdu Movies', 'pakistani-movies/'),
     ]
     entries = {}
     for label, slug in categories:
         entries[label] = category_base + slug
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['09[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 24
0
 def __init__(self):
     """Configure the moviefisher scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://moviefisher.org/category/'
     self.icon = self.ipath + 'mfish.png'
     category_base = self.bu
     categories = (
         ('01Tamil Movies', 'tamil'),
         ('02Telugu Movies', 'telugu-movie'),
         ('04Hindi Movies', 'bollywood'),
         ('05English Movies', 'hollywood'),
         ('06Hindi Dubbed Movies', 'dubbed'),
         ('07Hindi Dubbed South Movies', 'south-in-hindi'),
         ('08Punjabi Movies', 'punjabi'),
     )
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['09[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 25
0
 def __init__(self):
     """Configure the redmovies scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://redmovies.co/category/'
     self.icon = self.ipath + 'redm.png'
     category_base = self.bu
     categories = [
         ('01Tamil Movies', 'tamil/'),
         ('02Telugu Movies', 'telugu/'),
         ('03Malayalam Movies', 'malayalam/'),
         ('04Kannada Movies', 'kannada-movies/'),
         ('05Hindi Movies', 'bollywood-movies/'),
         ('06English Movies', 'hollywood-movies/'),
         ('07Animation Movies', 'animation/'),
         ('08Hindi Dubbed Movies', 'hindi-dubbed/'),
         ('09Punjabi Movies', 'punjabi-movies/'),
         ('10Urdu Movies', 'pakistan/'),
     ]
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['12[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 26
0
 def __init__(self):
     """Configure the malayalamserials scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://malayalamserials.in/category/'
     self.icon = self.ipath + 'mserial.png'
     category_base = self.bu
     categories = (
         ('01Asianet', 'asianet/'),
         ('02Mazhavil Manorama', 'mazhavil/'),
         ('03Surya', 'surya/'),
         ('04Kairali', 'kairali/'),
         ('05Flowers', 'flowers/'),
         ('06Media One', 'media/'),
         ('07Amrita', 'amrita/'),
         ('08Movies', 'movies/'),
         ('09News', 'news/'),
         ('10Gossip', 'malayalam_movie/'),
     )
     entries = {}
     for label, slug in categories:
         entries[label] = category_base + slug
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['11[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 27
0
 def __init__(self):
     """Configure the tamiltwists scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://www.tamiltwists.com/category/'
     self.icon = self.ipath + 'ttwist.png'
     category_base = self.bu
     categories = [
         ('01Tamil New Movies', 'new-movies/'),
         ('02Tamil HD Movies', 'hd-movies/'),
         ('03Tamil Dubbed Movies', 'dubbed-movies/'),
         ('04Tamil Movie Trailers', 'trailers/'),
         ('05Malayalam Movies', 'malayalam-movies/'),
         ('06Telugu Movies', 'telugu-movies/'),
         ('07Hindi Movies', 'hindi-movies/'),
         ('08[COLOR cyan]Adult Movies[/COLOR]', 'hot-masala/'),
     ]
     entries = {label: category_base + slug for label, slug in categories}
     # Search lives at the site root: drop the trailing 'category/'
     # (9 characters) from the base URL.
     entries['09[COLOR yellow]** Search **[/COLOR]'] = category_base[:-9] + '?s='
     self.list = entries
Esempio n. 28
0
def re_scrape_schedule():
    """Start a scrape for every registered CSV/DB pair and reschedule cron.

    Reads ``SCHEDULE_INTERVAL`` from the ``SETTINGS`` table and every
    ``(CSV_FILE_PATH, DB_FILE_PATH)`` row from the ``CSV_DB`` table in
    ``data/setting.db``, starts one ``ScrapeThread`` per row, then
    reschedules ``cron`` with the stored interval.

    Returns:
        The result of ``settings`` carrying a success message.
    """
    # Both queries hit the same database, so one connection is enough; the
    # ``finally`` guarantees it is closed even when a query raises.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        schedule_interval = c.execute(
            'SELECT SCHEDULE_INTERVAL FROM SETTINGS').fetchone()[0]
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()

    fblogin()

    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        thread.start()

    # Re schedule with new interval seconds
    cron.reSchedule(seconds=schedule_interval)
    return settings(
        success_message=
        'The cron job has been started in background and rescheduled.')
Esempio n. 29
0
 def __init__(self):
     """Configure the mhdtvlive TV scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://mhdtvlive.com/'
     self.icon = self.ipath + 'mhdtv.png'
     channel_pages = (
         ('01Tamil TV', 'tamil-tvs'),
         ('02Telugu TV', 'telugu_channels'),
         ('03Malayalam TV', 'malayalam_channels'),
         ('04Kannada TV', 'kannada_channels'),
         ('05Hindi TV', 'hindi_tvs'),
         ('06English TV', 'english_channels'),
         ('07Sports TV', 'sport'),
         ('08Marathi TV', 'marathi-channels'),
         ('09Punjabi TV', 'punjabi-channels'),
         ('10Bangla TV', 'bangla-channels'),
     )
     entries = {}
     for label, page in channel_pages:
         entries[label] = self.bu + page
     self.list = entries
Esempio n. 30
0
def doJob():
    """Run a scrape for every registered CSV/DB pair, one after another.

    Reads every ``(CSV_FILE_PATH, DB_FILE_PATH)`` row from the ``CSV_DB``
    table in ``data/setting.db`` and executes a ``ScrapeThread`` for each
    row.  Presumably invoked by the scheduler at every interval -- confirm
    against the cron setup.
    """
    # Get all csv and db paths; the ``finally`` guarantees the connection
    # is closed even when the query raises.
    conn = sqlite3.connect('data/setting.db')
    try:
        c = conn.cursor()
        csv_db_file_list = c.execute(
            'SELECT CSV_FILE_PATH, DB_FILE_PATH FROM CSV_DB').fetchall()
        c.close()
    finally:
        conn.close()

    fblogin()

    for csv_path, db_path in csv_db_file_list:
        s = Scraper()
        thread = ScrapeThread(s, csv_path, db_path)
        # run() is called directly instead of start(), so each scrape
        # executes in the calling thread and they run one by one.
        thread.run()

    log.info('all scraper threads finished in doJob()')
Esempio n. 31
0
 def __init__(self):
     """Configure the abroadindia scraper: base URL, icon and entry map.

     ``self.list`` maps each labelled entry to the URL it points at.
     """
     Scraper.__init__(self)
     self.bu = 'http://abroadindia.com/k_loadlist.php?lan='
     self.icon = self.ipath + 'aindia.png'
     listing_base = self.bu
     # Each channel group is selected by the value appended after 'lan='.
     groups = [
         ('01Tamil Channels', 'tamil'),
         ('02Telugu Channels', 'telugu'),
         ('03Malayalam Channels', 'malayalam'),
         ('04Kannada Channels', 'kannada'),
         ('05Hindi Channels', 'hindi'),
         ('06English Channels', 'english'),
         ('07Sports Channels', 'sports'),
         ('08News Channels', 'news'),
         ('09Informative Channels', 'informative'),
         ('10Spiritual Channels', 'spiritual'),
     ]
     entries = {label: listing_base + group for label, group in groups}
     # Search swaps the trailing 'lan=' (4 characters) for 's='.
     entries['12[COLOR yellow]** Search **[/COLOR]'] = listing_base[:-4] + 's='
     self.list = entries