def test_places(self, all_places):
    test_foreign_title = all_places[0].find('span', {'class': 'text-grey'}).get_text()
    test_a = all_places[0].find('a', {'class': 'all'})
    logger.info(TAG, test_foreign_title)
    logger.info(TAG, test_a.text)
def get_page_source(self, url):
    logger.info(TAG, 'browser load url: {0}'.format(url))
    self.init_session()
    self.driver.get(url)
    html = self.driver.page_source
    self.close()
    return html
def save_img(self, url, film_id):
    logger.info(TAG, 'save img url: {0}'.format(url))
    img_data = requests.get(url).content
    with open('cache/images/{0}_800x600.{1}'.format(film_id, IMG_FILE_FORMAT),
              'wb') as handler:
        handler.write(img_data)
def clear_db(self):
    os.chdir(DB_FOLDER)
    db_names = glob.glob("*.{}".format(SQLITE_DB_EXT))
    logger.info(TAG, db_names)
    for db in db_names:
        os.remove(db)
    os.chdir(self.work_dir)
def get_page_source(self, url, is_wall=False, film_id=None):
    if film_id is not None and self.check_cached_film(film_id):
        return self.get_cached_film(film_id)
    html = self.browser.get_page_source(url)
    if film_id is not None:
        logger.info(TAG, 'caching page: {0}'.format(film_id))
        self.cache_page(html, film_id)
    return html
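# A minimal sketch of the cache helpers called above, assuming a
# file-per-film layout under cache/pages/; the repo's real
# check_cached_film/get_cached_film/cache_page implementations may differ.
def check_cached_film(self, film_id):
    # a page counts as cached if its file exists
    return os.path.exists('cache/pages/{0}.html'.format(film_id))

def get_cached_film(self, film_id):
    with open('cache/pages/{0}.html'.format(film_id), 'r') as f:
        return f.read()

def cache_page(self, html, film_id):
    with open('cache/pages/{0}.html'.format(film_id), 'w') as f:
        f.write(html)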
def download_images(self):
    logger.info(TAG, 'self.top_250_films len: {0}'.format(len(self.top_250_films)))
    for film in self.top_250_films:
        text = self.load_film_wall(film.film_id)
        if text is None:
            logger.info(TAG, 'wall page is null')
            continue
        self.process_img(text, film)
def get_stylized_images_names(self):
    os.chdir(FULL_STYLIZED_IMAGES_PATH)
    images_names = [
        m.replace('.{}'.format(IMG_FILE_FORMAT), '')
        for m in glob.glob("*.{}".format(IMG_FILE_FORMAT))
    ]
    shuffle(images_names)
    logger.info(TAG, images_names)
    logger.info(TAG, len(images_names))
    os.chdir(self.work_dir)
    return images_names
def resize(self, basewidth):
    os.chdir(FULL_STYLIZED_IMAGES_PATH)
    images = glob.glob('*.{}'.format(IMG_FILE_FORMAT))
    os.chdir(self.work_dir)
    for image in images:
        image_in = '{0}/{1}'.format(FULL_STYLIZED_IMAGES_PATH, image)
        image_out = '{0}/{1}'.format(RESIZED_IMAGES_FOLDER, image)
        # calls the module-level resize() helper, not this method
        resize(image_in, image_out, basewidth)
        logger.info(TAG, image_out)
        if self.test_mode:
            break
    logger.info(TAG, 'Successfully resized {} images'.format(len(images)))
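# A minimal sketch of the module-level resize() helper called above,
# assuming Pillow and proportional scaling to a target width; the repo's
# real helper may differ.
from PIL import Image

def resize(path_in, path_out, basewidth):
    img = Image.open(path_in)
    # keep the aspect ratio: scale height by the same factor as width
    ratio = basewidth / float(img.size[0])
    height = int(float(img.size[1]) * ratio)
    img.resize((basewidth, height), Image.LANCZOS).save(path_out)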
def process_films(self, films):
    for i, film in enumerate(films, start=1):
        json_data = self.createJson(film)
        row = self.cur.fetchone()
        if row is None:
            self.insert_puzzle(i, self.get_film_title(film), json_data, 0)
            logger.info(TAG, 'insert {0} film with lang {1}'.format(i, self.lang))
        else:
            self.update_puzzle(self.get_film_title(film), json_data, 0, row[0])
            logger.info(TAG, 'update {0} film with lang {1}'.format(i, self.lang))
            self.update_levels(row, i)
def update_country(self, film_id, country):
    c = self.con.cursor()
    c.execute("SELECT * FROM puzzles")
    found = False
    for row in c:
        parsed_string = json.loads(row[2])
        if parsed_string['film_id'] == film_id:
            parsed_string['country'] = country
            result = json.dumps(parsed_string, ensure_ascii=False).encode('utf8')
            self.update_puzzle(row[1], str(result), 0, row[0])
            found = True
            break
    if not found:
        logger.info(TAG, 'film not found')
    c.close()
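# Hedged usage sketch for update_country; the class name and film id below
# are illustrative, not the repo's actual names:
#
#     db = PuzzlesDb(lang='ru')
#     db.update_country('326', 'usa')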
def __init__(self, lang='ru'):
    self.base_dir = os.getcwd()
    self.lang = lang
    logger.info(TAG, self.base_dir)
    check_folder_and_create(DB_FOLDER)
    check_folder_and_create(CSV_OUTPUT_FOLDER)
    db_path = os.path.join(
        self.base_dir,
        "{0}/{1}_{2}.{3}".format(DB_FOLDER, DB_FILE_NAME, lang, SQLITE_DB_EXT))
    self.con = lite.connect(db_path)
    with self.con:
        self.cur = self.con.cursor()
        self.create_db(self.con)
        self.cur.execute("SELECT * FROM puzzles")
    self.mLevel = 1
    self.level_pack_count = 40
def process_img(self, text, film):
    soup = BeautifulSoup(text)
    film_list = soup.find('table', {'class': 'fotos fotos2'})
    if film_list is None:
        film_list = soup.find('table', {'class': 'fotos fotos1'})
    if film_list is None:
        # page has no wallpapers
        logger.info(TAG, 'film {0} has no wallpapers'.format(film.film_id))
        return
    alla = film_list.find_all('a', href=True)
    for a in alla:
        # take only the first 800x600 wallpaper
        url = a.get('href')
        img_url = self.getImageUrl(url)
        img_url = '{0}{1}'.format('http://', img_url)
        film.img_url = img_url
        self.save_img(img_url, film.film_id)
        self.save_img_url(film)
        break
def setup_top_250_films_ids(self, film_ids_list=None):
    text = self.get_page_content(TOP_250)
    self.cache_page(text, 'top250')
    soup = BeautifulSoup(text)
    all_places = soup.find_all('tr', {'id': re.compile("top250_place_[0-9]")})
    logger.info(TAG, 'Films count: {}'.format(len(all_places)))
    i = 0
    for place in all_places:
        foreign_title = place.find('span', {'class': 'text-grey'})
        if foreign_title is not None:
            foreign_title = foreign_title.get_text()
        a = place.find('a', {'class': 'all'})
        film_id = a.get('href').replace('/', ' ').split(' ')[2]
        if film_ids_list is not None and film_id not in film_ids_list:
            continue
        country = self.retrive_country(film_id)
        if country is None:
            continue
        film = Film(film_id, "", self.clear_film_title(a.text), country.lower())
        if foreign_title is not None:
            film.is_eng = True
            film.foreign_title = foreign_title
        logger.info(TAG, 'film_id: {0}'.format(film.film_id))
        self.top_250_films.append(film)
        logger.info(TAG, 'films processed: {0}'.format(i))
        i += 1
        if self.test_mode and i == 2:
            break
    shuffle(self.top_250_films)
    arrange_films_to_db(self.top_250_films)
def init_workspace(self):
    models_dir = os.getcwd() + '/torch/models'
    check_folder_and_create(models_dir)
    self.act_dir = os.getcwd() + '/torch/'
    self.work_dir = os.getcwd()
    os.chdir(self.act_dir + 'models')
    self.models = glob.glob("*.t7")
    logger.info(TAG, self.models)
    logger.info(TAG, len(self.models))
    self.lua_script = 'fast_neural_style.lua'
    if not os.path.exists('{0}{1}'.format(self.act_dir, self.lua_script)):
        logger.info(
            TAG,
            'Please put {0} into folder {1}'.format(self.lua_script, self.act_dir))
def get_page_source(self, url):
    logger.info(TAG, '{0} load url: {1}'.format(self.PRESIX, url))
def get_json_film_info_with_id(self, film_id):
    html = self.load_film(film_id)
    if html is None:
        logger.info(TAG, 'film with id {0} not found'.format(film_id))
        return None
    soup = BeautifulSoup(html)
    film_header = soup.find('div', {'class': 'feature_film_background country_num1'})
    title_h1 = film_header.find('h1', {'class': 'moviename-big'})
    foreign_title_h1 = film_header.find('span')
    title = title_h1.get_text()
    foreign_title = foreign_title_h1.get_text()
    film_table = soup.find('table', {'class': 'info'})
    all_tr = film_table.find_all('tr')
    keys = ['year', 'country', 'tagline_ru', 'director', 'genre', 'dollar', 'time']
    # row index in the info table and whether the value sits inside an <a> tag
    indxs = [
        {'num': 0, 'is_a': True},
        {'num': 1, 'is_a': True},
        {'num': 2, 'is_a': False},
        {'num': 3, 'is_a': True},
        {'num': 10, 'is_a': True},
        {'num': 11, 'is_a': True},
        {'num': 19, 'is_a': False},
    ]
    film_json = {}
    for i, tr in enumerate(all_tr):
        if indxs and indxs[0]['num'] == i:
            td = tr.find_all('td')[1]
            text = td.find_all('a')[0].get_text() if indxs[0]['is_a'] else td.get_text()
            film_json[keys[0]] = text
            indxs.pop(0)
            keys.pop(0)
    film_json['title'] = title
    film_json['foreign_title'] = foreign_title
    film_json['film_id'] = str(film_id)
    return json.dumps(film_json, ensure_ascii=False).encode('utf8')
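# Hedged usage example for get_json_film_info_with_id: the return value is
# UTF-8 encoded bytes, so decode before parsing ('326' and `parser` are
# illustrative names, not from this repo):
#
#     raw = parser.get_json_film_info_with_id('326')
#     if raw is not None:
#         info = json.loads(raw.decode('utf8'))
#         logger.info(TAG, info['title'])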
def load_film(self, film_id, add=''):
    logger.info(TAG, 'load film id: {0}'.format(film_id))
    url = '%s/film/%s%s' % (BASE_URL, film_id, add)
    return self.get_page_source(url, is_wall=True, film_id=film_id)
def save_img_url(self, film):
    url = '{0} {1}'.format(film.film_id, film.img_url)
    logger.info(TAG, 'save img url: {0}'.format(url))
    with open('cache/img_urls.txt', 'a') as the_file:
        the_file.write('{0}\n'.format(url))
def test_setup_all(self):
    for film in self.top_250_films:
        logger.info(TAG, film.film_id)