def retrieve_url(self, url):
    if self.sesh is None:
        self.login()
    try:
        response = self.sesh.open(url)
    except error.URLError as errno:
        print(" ".join(("Connection error:", str(errno.reason))))
        return ""
    dat = response.read()
    # Check if it is gzipped (gzip magic bytes) and decode it if so
    if dat[:2] == b'\x1f\x8b':
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        dat = gzipper.read()
    # Work out the character set from the Content-Type header, defaulting to UTF-8
    info = response.info()
    charset = 'utf-8'
    try:
        ignore, charset = info['Content-Type'].split('charset=')
    except Exception:
        pass
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    return dat
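# All of the functions in this section call an htmlentitydecode() helper that is
# not shown here. A minimal sketch of such a helper, assuming Python 3 where
# html.unescape() handles named, decimal and hexadecimal entities (hypothetical
# implementation, not the original):
import html

def htmlentitydecode(s):
    # e.g. "&#39;Twas &amp; so on" -> "'Twas & so on"
    return html.unescape(s)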
def import_imdb_ratings(imdb_ratings_file_path, jinni_ratings_file_path):
    logging.info("Starting to import titles from IMDB export...")

    # The IMDB CSV export starts with an empty line which isn't good for using with
    # csv.DictReader, so fix it up first by stripping all empty lines.
    # TODO: this certainly could be done more elegantly...
    ratings = [line for line in open(imdb_ratings_file_path, "rb").readlines()
               if line.strip() != ""]
    open(imdb_ratings_file_path, "wb").writelines(ratings)
    imdb_ratings = unicode_csv.UnicodeDictReader(open(imdb_ratings_file_path, "rb"))

    # Read in the exported Jinni ratings
    jinni_ratings_file = os.path.join(data_directory, jinni_ratings_file_path)
    jinni_ratings = unicode_csv.UnicodeDictReader(open(jinni_ratings_file, "rb"))
    jinni_ratings = [jinni_rating for jinni_rating in jinni_ratings]
    jinni_titles = set([jinni_rating["title"].lower() for jinni_rating in jinni_ratings])
    jinni_ids = set([jinni_rating["jinni_id"] for jinni_rating in jinni_ratings])

    # A CSV for saving any mismatches
    mismatches_file = open(os.path.join(data_directory, "mismatches.csv"), "wb")
    mismatches_csv = unicode_csv.UnicodeDictWriter(
        mismatches_file,
        fieldnames=["position", "const", "created", "modified", "description",
                    "Title", "Title type", "Directors", "You rated", "IMDb Rating",
                    "Runtime (mins)", "Year", "Genres", "Num. Votes",
                    "Release Date (month/day/year)", "URL"])
    mismatches = []

    for imdb_rating in imdb_ratings:
        imdb_title = htmlentitydecode(imdb_rating["Title"])

        if imdb_title.lower() in jinni_titles:
            logging.info('Skipping title "{0}" because rating already exists in Jinni...'.format(imdb_title))
            continue

        search_results = jinni_search(u"{0} {1}".format(imdb_title, imdb_rating["Year"]))
        if len(search_results) == 0:
            logging.error(u'No search results for "{0}"...'.format(imdb_title))
            mismatches.append(imdb_rating)
        else:
            # Look for a search result whose IMDB affiliate id matches this title's id
            match = None
            for search_result in search_results:
                try:
                    if imdb_rating["const"] == search_result["affiliates"]["IMDB"]["affiliateContentIds"]["key"]:
                        match = search_result
                        break
                except KeyError:
                    continue

            if match:
                if str(match["DBID"]) in jinni_ids:
                    logging.info('Skipping title "{0}" because rating already exists in Jinni...'.format(imdb_title))
                    continue
                logging.info(u'Submitting rating for "{0}" (Jinni id: {1})...'.format(imdb_title, match["DBID"]))
                jinni_submit_rating(imdb_rating["You rated"], match["DBID"])
            else:
                # TODO: try a suggestion search before giving up?
                logging.error(u'Could not find a match for "{0}"...'.format(imdb_title))
                mismatches.append(imdb_rating)
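# A minimal usage sketch for the importer above. The file names and the
# data_directory value are assumptions for illustration, not part of the
# original script; data_directory is assumed to be a module-level global
# visible to import_imdb_ratings():
import logging

logging.basicConfig(level=logging.INFO)
data_directory = "data"  # assumed location of the exported CSV files
import_imdb_ratings("imdb_ratings.csv", "jinni_ratings.csv")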
def retrieve_url(self, url):
    # Load any previously saved cookies for this site
    cj = http.cookiejar.MozillaCookieJar(self.cookie_filename)
    if os.access(self.cookie_filename, os.F_OK):
        cj.load()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cj))
    req = urllib.request.Request(url)
    response = opener.open(req)
    dat = response.read()

    # Check if it is gzipped (gzip magic bytes) and decode it if so
    if dat[:2] == b'\x1f\x8b':
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        dat = gzipper.read()

    # The page body may contain a JavaScript snippet that sets a cookie, e.g.
    # document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')
    # If the pattern matches, set that cookie ourselves and retry the request.
    m = re.search(self.cookie_pattern, dat.decode('utf-8'))
    if m:
        #opener.addheaders.append(('Cookie', m.group('cookie')))
        ck = http.cookiejar.Cookie(
            version=0, name=m.group('name'), value=m.group('value'),
            port=None, port_specified=False,
            domain=self.name, domain_specified=False, domain_initial_dot=False,
            path='/', path_specified=True, secure=False, expires=None,
            discard=True, comment=None, comment_url=None,
            rest={'HttpOnly': None}, rfc2109=False)
        cj.set_cookie(ck)
        cj.save(self.cookie_filename, ignore_discard=True, ignore_expires=True)
        response = opener.open(req)
        dat = response.read()

    # Work out the character set from the Content-Type header, defaulting to UTF-8
    info = response.info()
    charset = 'utf-8'
    try:
        ignore, charset = info['Content-Type'].split('charset=')
    except Exception:
        pass
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    return dat.encode('utf-8', 'replace')
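# The cookie_pattern used above must expose 'name' and 'value' named groups,
# since the code reads m.group('name') and m.group('value'). A hedged example
# of what such a pattern could look like for the document.cookie snippet shown
# in the comment (an assumption, not the actual pattern used by this plugin):
import re

cookie_pattern = re.compile(
    r"document\.cookie\.indexOf\('(?P<name>[^=]+)=(?P<value>[^']+)'\)")

m = cookie_pattern.search(
    "document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')")
if m:
    print(m.group('name'), m.group('value'))  # _ddn_intercept_2_ ebdc8119...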
def _retreive_url(self, url):
    """Return the HTML content of url page as a string """
    try:
        res = self.sesh.open(url)
    except request.URLError as errorno:
        print("Connection Error: {}".format(errorno.reason))
        return ""
    charset = 'utf-8'
    info = res.info()
    try:
        _, charset = info['Content-Type'].split('charset=')
    except:
        pass
    dat = res.read()
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    return dat
def _get_link(self, link):
    """Return the HTML content of url page as a string """
    try:
        logging.debug("Trying to open " + link)
        res = self.session.open(link)
    except request.URLError as errno:
        print("Connection Error: {}".format(errno.reason))
        return ""
    charset = 'utf-8'
    info = res.info()
    try:
        _, charset = info['Content-Type'].split('charset=')
    except:
        pass
    data = res.read()
    data = data.decode(charset, 'replace')
    data = htmlentitydecode(data)
    return data
def retrieve_url(self, url):
    # Load any previously saved cookies for this site
    cj = http.cookiejar.MozillaCookieJar(self.cookie_filename)
    if os.access(self.cookie_filename, os.F_OK):
        cj.load()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    req = urllib.request.Request(url)
    response = opener.open(req)
    dat = response.read()

    # Check if it is gzipped (gzip magic bytes) and decode it if so
    if dat[:2] == b'\x1f\x8b':
        compressedstream = io.BytesIO(dat)
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        dat = gzipper.read()

    # The page body may contain a JavaScript snippet that sets a cookie, e.g.
    # document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')
    # If the pattern matches, set that cookie ourselves and retry the request.
    m = re.search(self.cookie_pattern, dat.decode('utf-8'))
    if m:
        #opener.addheaders.append(('Cookie', m.group('cookie')))
        ck = http.cookiejar.Cookie(
            version=0, name=m.group('name'), value=m.group('value'),
            port=None, port_specified=False,
            domain=self.name, domain_specified=False, domain_initial_dot=False,
            path='/', path_specified=True, secure=False, expires=None,
            discard=True, comment=None, comment_url=None,
            rest={'HttpOnly': None}, rfc2109=False)
        cj.set_cookie(ck)
        cj.save(self.cookie_filename, ignore_discard=True, ignore_expires=True)
        response = opener.open(req)
        dat = response.read()

    # Work out the character set from the Content-Type header, defaulting to UTF-8
    info = response.info()
    charset = 'utf-8'
    try:
        ignore, charset = info['Content-Type'].split('charset=')
    except Exception:
        pass
    dat = dat.decode(charset, 'replace')
    dat = htmlentitydecode(dat)
    return dat.encode('utf-8', 'replace')