    def retrieve_url(self, url):
        if self.sesh is None:
            self.login()

        try:
            response = self.sesh.open(url)
        except error.URLError as err:
            print("Connection error: {}".format(err.reason))
            return ""

        dat = response.read()

        # Gzip streams start with the magic bytes 0x1f 0x8b; decompress if so
        if dat[:2] == b'\x1f\x8b':
            compressedstream = io.BytesIO(dat)
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            dat = gzipper.read()
        # Pull the charset out of the Content-Type header, defaulting to UTF-8
        info = response.info()
        charset = 'utf-8'
        try:
            _, charset = info['Content-Type'].split('charset=')
        except Exception:
            pass
        dat = dat.decode(charset, 'replace')
        dat = htmlentitydecode(dat)

        return dat
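Every example on this page calls an htmlentitydecode() helper that the listing never shows (the snippets also assume module-level imports such as io, gzip and the urllib modules). On Python 3 it is presumably a thin wrapper over the standard library; a minimal sketch, assuming the helper only needs to expand standard HTML entities:

import html

def htmlentitydecode(s):
    # Expand entities such as &amp;, &#39; and &eacute; into characters
    return html.unescape(s)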
Example #2
def import_imdb_ratings(imdb_ratings_file_path, jinni_ratings_file_path):
    logging.info("Starting to import titles from IMDB export...")
    
    # The IMDB CSV export starts with an empty line, which trips up
    # csv.DictReader, so fix the file up first by stripping all empty lines.
    # TODO: this certainly could be done more elegantly...
    with open(imdb_ratings_file_path, "rb") as ratings_file:
        ratings = [line for line in ratings_file if line.strip() != ""]
    with open(imdb_ratings_file_path, "wb") as ratings_file:
        ratings_file.writelines(ratings)
    imdb_ratings = unicode_csv.UnicodeDictReader(open(imdb_ratings_file_path, "rb"))
    
    # Read in the exported Jinni ratings
    jinni_ratings_file = os.path.join(data_directory, jinni_ratings_file_path)
    jinni_ratings = unicode_csv.UnicodeDictReader(open(jinni_ratings_file, "rb"))
    jinni_ratings = list(jinni_ratings)
    jinni_titles = set(jinni_rating["title"].lower() for jinni_rating in jinni_ratings)
    jinni_ids = set(jinni_rating["jinni_id"] for jinni_rating in jinni_ratings)
    
    # A CSV for saving any mismatches
    mismatches_file = open(os.path.join(data_directory, "mismatches.csv"), "wb")
    mismatches_csv = unicode_csv.UnicodeDictWriter(
        mismatches_file,
        fieldnames=["position", "const", "created", "modified", "description",
                    "Title", "Title type", "Directors", "You rated", "IMDb Rating",
                    "Runtime (mins)", "Year", "Genres", "Num. Votes",
                    "Release Date (month/day/year)", "URL"])
    mismatches = []

    for imdb_rating in imdb_ratings:
        imdb_title = htmlentitydecode(imdb_rating["Title"])
        
        if imdb_title.lower() in jinni_titles:
            logging.info('Skipping title "{0}" because rating already exists in Jinni...'.format(imdb_title))
            continue
        
        search_results = jinni_search(u"{0} {1}".format(imdb_title, imdb_rating["Year"]))
        
        if len(search_results) == 0:
            logging.error(u'No search results for "{0}"...'.format(imdb_title))
            mismatches.append(imdb_rating)
        else:
            match = None
            for search_result in search_results:
                try:
                    if imdb_rating["const"] == search_result["affiliates"]["IMDB"]["affiliateContentIds"]["key"]:
                        match = search_result
                        break
                except KeyError:
                    continue
                    
            if match:
                if str(match["DBID"]) in jinni_ids:
                    logging.info('Skipping title "{0}" because rating already exists in Jinni...'.format(imdb_title))
                    continue

                logging.info(u'Submitting rating for "{0}" (Jinni id: {1})...'.format(imdb_title, match["DBID"]))
                jinni_submit_rating(imdb_rating["You rated"], match["DBID"])
            else:
                # TODO: try a suggestion search before giving up?
                logging.error(u'Could not find a match for "{0}"...'.format(imdb_title))
                mismatches.append(imdb_rating)
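unicode_csv is another helper that is not shown. Since this example is Python 2 code (note the u'' literals and the original except KeyError, ex syntax), it is presumably a wrapper in the spirit of the UnicodeReader/UnicodeWriter recipes from the Python 2 csv docs. A minimal sketch of the reader side, with the function form and encoding default being assumptions:

import csv

def UnicodeDictReader(f, encoding="utf-8", **kwargs):
    # Python 2's csv module yields byte strings; decode each field
    # so callers can compare titles as unicode
    for row in csv.DictReader(f, **kwargs):
        yield dict((key, value.decode(encoding)) for key, value in row.iteritems())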
Example #3
    def retrieve_url(self, url):
        cj = http.cookiejar.MozillaCookieJar(self.cookie_filename)
        if os.access(self.cookie_filename, os.F_OK):
            cj.load()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cj))
        req = urllib.request.Request(url)
        response = opener.open(req)
        dat = response.read()
        # Gzip streams start with the magic bytes 0x1f 0x8b; decompress if so
        # (comparing bytes against the str '\037\213' never matches on Python 3,
        # and StringIO.StringIO is Python 2 only; io.BytesIO is needed here)
        if dat[:2] == b'\x1f\x8b':
            compressedstream = io.BytesIO(dat)
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            dat = gzipper.read()

        # The page embeds a JavaScript cookie check such as:
        # document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')

        m = re.search(self.cookie_pattern, dat.decode('utf-8'))
        if m:
            #opener.addheaders.append(('Cookie', m.group('cookie')))
            ck = http.cookiejar.Cookie(version=0,
                                       name=m.group('name'),
                                       value=m.group('value'),
                                       port=None,
                                       port_specified=False,
                                       domain=self.name,
                                       domain_specified=False,
                                       domain_initial_dot=False,
                                       path='/',
                                       path_specified=True,
                                       secure=False,
                                       expires=None,
                                       discard=True,
                                       comment=None,
                                       comment_url=None,
                                       rest={'HttpOnly': None},
                                       rfc2109=False)
            cj.set_cookie(ck)
            cj.save(self.cookie_filename,
                    ignore_discard=True,
                    ignore_expires=True)
            response = opener.open(req)
            dat = response.read()

        info = response.info()
        charset = 'utf-8'
        try:
            _, charset = info['Content-Type'].split('charset=')
        except Exception:
            pass
        dat = dat.decode(charset, 'replace')
        dat = htmlentitydecode(dat)
        return dat.encode('utf-8', 'replace')
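self.cookie_pattern is defined elsewhere in the plugin. Given the m.group('name') / m.group('value') lookups and the document.cookie line quoted in the comment above, it is presumably a regex with named groups along these lines (the exact pattern is an assumption):

import re

# Hypothetical pattern for a page snippet like:
#   document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')
cookie_pattern = re.compile(
    r"document\.cookie\.indexOf\('(?P<name>[^=']+)=(?P<value>[^']+)'\)")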
Example #4
    def _retreive_url(self, url):
        """Return the HTML content of url page as a string """
        try:
            res = self.sesh.open(url)
        except request.URLError as err:
            print("Connection Error: {}".format(err.reason))
            return ""

        charset = 'utf-8'
        info = res.info()
        try:
            _, charset = info['Content-Type'].split('charset=')
        except Exception:
            pass
        dat = res.read()
        dat = dat.decode(charset, 'replace')

        dat = htmlentitydecode(dat)
        return dat

    def _get_link(self, link):
        """Return the HTML content of url page as a string """
        try:
            logging.debug("Trying to open " + link)
            res = self.session.open(link)
        except request.URLError as errno:
            print("Connection Error: {}".format(errno.reason))
            return ""

        charset = 'utf-8'
        info = res.info()
        try:
            _, charset = info['Content-Type'].split('charset=')
        except Exception:
            pass
        data = res.read()
        data = data.decode(charset, 'replace')

        data = htmlentitydecode(data)
        return data
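Splitting the Content-Type header on 'charset=' is fragile: it breaks when more parameters follow the charset or when the value is quoted. On Python 3 the Message object returned by res.info() can parse the header itself; a minimal standalone sketch using urllib directly (the function name is an assumption):

from urllib import request

def read_decoded(url):
    # get_content_charset() parses the Content-Type header properly,
    # falling back to UTF-8 when no charset is declared
    res = request.urlopen(url)
    charset = res.info().get_content_charset() or 'utf-8'
    return res.read().decode(charset, 'replace')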
Example #6
    def retrieve_url(self, url):
        cj = http.cookiejar.MozillaCookieJar(self.cookie_filename)
        if os.access(self.cookie_filename, os.F_OK):
            cj.load()
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
        req = urllib.request.Request(url)
        response = opener.open(req)
        dat = response.read()
        # Gzip streams start with the magic bytes 0x1f 0x8b; decompress if so
        # (the str literal '\037\213' never matches bytes on Python 3, and
        # StringIO.StringIO is Python 2 only; io.BytesIO is needed here)
        if dat[:2] == b'\x1f\x8b':
            compressedstream = io.BytesIO(dat)
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            dat = gzipper.read()

        # The page embeds a JavaScript cookie check such as:
        # document.cookie.indexOf('_ddn_intercept_2_=ebdc811923afde6a39f0d7bc77dfe97d')
        m = re.search(self.cookie_pattern, dat.decode('utf-8'))
        if m:
            #opener.addheaders.append(('Cookie', m.group('cookie')))
            ck = http.cookiejar.Cookie(
                version=0, name=m.group('name'), value=m.group('value'),
                port=None, port_specified=False, domain=self.name,
                domain_specified=False, domain_initial_dot=False, path='/',
                path_specified=True, secure=False, expires=None, discard=True,
                comment=None, comment_url=None, rest={'HttpOnly': None},
                rfc2109=False)
            cj.set_cookie(ck)
            cj.save(self.cookie_filename, ignore_discard=True, ignore_expires=True)
            response = opener.open(req)
            dat = response.read()


        info = response.info()
        charset = 'utf-8'
        try:
            _, charset = info['Content-Type'].split('charset=')
        except Exception:
            pass
        dat = dat.decode(charset, 'replace')
        dat = htmlentitydecode(dat)
        return dat.encode('utf-8', 'replace')
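The same magic-byte check and GzipFile round-trip appears in four of the examples above; on Python 3.2+ it collapses into a single standard-library call. A consolidated helper (gzip.decompress is standard library; the helper name is an assumption):

import gzip

def maybe_gunzip(dat):
    # 0x1f 0x8b is the gzip magic number; pass other payloads through untouched
    if dat[:2] == b'\x1f\x8b':
        return gzip.decompress(dat)
    return dat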