コード例 #1
0
ファイル: Parsing.py プロジェクト: eliekaram/IceFilms.Bundle
def GetMediaInfo(url, mediainfo, query_external=False):

	"""
	Resolve the IMDb id for an item and optionally fetch its meta data.

	The id already stored on ``mediainfo`` is used when it starts with ``tt``
	followed by digits. Otherwise the page at ``ICEFILMS_URL + url`` is
	fetched and the id is taken from the ``href`` of the element returned by
	``soup.find('a', 'iframe')``.

	Args:
		url: Item path, appended to ``ICEFILMS_URL`` when the page is needed.
		mediainfo: Object carrying at least ``id``. ``season`` and ``type``
			are read only when ``query_external`` is true; ``show_name`` and
			``ep_num`` are forwarded only when the object has them.
		query_external: When true, return whatever
			``DBProvider().GetProvider(mediainfo.type).RetrieveItemFromProvider``
			returns. When false, store the resolved id on ``mediainfo`` and
			return that same object.

	Returns:
		The media info object, or None when any step raised (the failure is
		logged through ``Log.Exception``).
	"""

	try:
		# Only hit the network when the caller did not already supply an id.
		if mediainfo.id and re.match(r"tt\d+", mediainfo.id):
			imdb_id = mediainfo.id
		else:
			soup = BeautifulSoup(HTTP.Request(ICEFILMS_URL + url).content)
			imdb_link = soup.find('a', 'iframe')['href']
			imdb_id = re.search(r"(tt\d+)", str(imdb_link)).group()

		if not query_external:
			mediainfo.id = imdb_id
			return mediainfo

		# Arguments for the provider lookup; optional fields are forwarded
		# only when the incoming object actually defines them.
		kwargs = {
			'imdb_id': imdb_id,
			'season': mediainfo.season,
		}
		for optional in ('show_name', 'ep_num'):
			if hasattr(mediainfo, optional):
				kwargs[optional] = getattr(mediainfo, optional)

		provider = DBProvider().GetProvider(mediainfo.type)
		return provider.RetrieveItemFromProvider(**kwargs)

	except Exception:
		# Deliberate best-effort boundary: any failure above (missing link,
		# no id in the href, provider error, ...) is logged and reported to
		# the caller as None.
		Log.Exception("Error whilst retrieveing media info")
		return None
コード例 #2
0
def GetMediaInfo(url, mediainfo, query_external=False):
    """
    Determine an item's IMDb id and, on request, look up its meta data.

    If ``mediainfo.id`` already begins with ``tt`` plus digits it is used
    as-is. Otherwise the page at ``ICEFILMS_URL + url`` is downloaded and the
    id is extracted from the ``href`` of the element returned by
    ``soup.find('a', 'NOiframe')``.

    Args:
        url: Item path, appended to ``ICEFILMS_URL`` when a download is needed.
        mediainfo: Object with an ``id`` attribute. ``season`` and ``type``
            are read only when ``query_external`` is true; ``show_name`` and
            ``ep_num`` are passed on only if present.
        query_external: True to return the item produced by
            ``DBProvider().GetProvider(mediainfo.type)``; False to write the
            id back onto ``mediainfo`` and return it.

    Returns:
        The media info object, or None if anything raised along the way (the
        error is logged through ``Log.Exception``).
    """

    try:
        has_imdb_id = mediainfo.id and re.match(r"tt\d+", mediainfo.id)
        if has_imdb_id:
            imdb_id = mediainfo.id
        else:
            # No usable id supplied: scrape it from the item page.
            page = HTTP.Request(ICEFILMS_URL + url).content
            imdb_link = BeautifulSoup(page).find('a', 'NOiframe')['href']
            imdb_id = re.search(r"(tt\d+)", str(imdb_link)).group()

        if query_external:
            # Construct kwargs; optional attributes are forwarded only when
            # the incoming object defines them.
            kwargs = {'imdb_id': imdb_id, 'season': mediainfo.season}
            if hasattr(mediainfo, 'show_name'):
                kwargs['show_name'] = mediainfo.show_name
            if hasattr(mediainfo, 'ep_num'):
                kwargs['ep_num'] = mediainfo.ep_num

            mediainfo = DBProvider().GetProvider(
                mediainfo.type).RetrieveItemFromProvider(**kwargs)
        else:
            mediainfo.id = imdb_id

        return mediainfo

    except Exception:
        # Best-effort boundary: log whatever went wrong and signal failure
        # to the caller with None instead of propagating.
        Log.Exception("Error whilst retrieveing media info")
        return None
コード例 #3
0
def GetMediaInfo(url, mediainfo, query_external=False):
    """
    Build a media info object for an LMWT item, filling gaps from its page.

    The page at ``Dict['LMWT_URL'] + url`` is fetched and parsed. When
    ``query_external`` is true the result comes from
    ``DBProvider().GetProvider(mediainfo.type).RetrieveItemFromProvider``;
    otherwise a fresh ``MediaInfo`` carrying only the IMDb id is created.
    Summary, release date, duration, rating and title are then copied from
    the page's 'movie_info' block for every one of those attributes that is
    still unset (falsy) on the result.

    Args:
        url: Item path, appended to ``Dict['LMWT_URL']``.
        mediainfo: Source object. ``show_name`` and ``season`` are always
            read, ``ep_num`` only when present, ``type`` only when
            ``query_external`` is true.
        query_external: Whether to query the external meta provider.

    Returns:
        The populated media info object.

    NOTE(review): the outer ``try`` below has no ``except``/``finally``
    clause within this excerpt, so its error handling cannot be documented
    from here.
    """

    # The description meta header for some shows inserts random double quotes in the
    # content which breaks the parsing of the page. Work around that by simply
    # removing the head section in which the meta elements are contained.
    headMassage = [(re.compile('<head>(.*)</head>', re.S), lambda match: '')]
    # Copy the default massage list so the extra rule is not added to the
    # shared BeautifulSoup.MARKUP_MASSAGE object itself.
    soupMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    soupMassage.extend(headMassage)

    # Note: the fetch and parse happen before the try block opens.
    soup = BeautifulSoup(HTTP.Request(Dict['LMWT_URL'] + url).content,
                         markupMassage=soupMassage)

    try:

        # The IMDb id is optional: any failure while locating or parsing the
        # link is swallowed and imdb_id simply stays None.
        imdb_id = None
        try:
            imdb_link = soup.find('div', 'mlink_imdb').a['href']
            imdb_id = re.search("(tt\d+)", str(imdb_link)).group()
        except:
            pass

        # Construct kwargs. They are built unconditionally, but only consumed
        # by the external provider branch below.
        kwargs = {}

        kwargs['imdb_id'] = imdb_id
        kwargs['show_name'] = mediainfo.show_name
        kwargs['season'] = mediainfo.season

        if hasattr(mediainfo, 'ep_num'):
            kwargs['ep_num'] = mediainfo.ep_num

        if (query_external):
            # Ask the provider registered for this media type.
            mediainfo_ret = DBProvider().GetProvider(
                mediainfo.type).RetrieveItemFromProvider(**kwargs)
        else:
            # No external lookup: start from an empty object with just the id.
            mediainfo_ret = MediaInfo()
            mediainfo_ret.id = imdb_id

        # Also parse the LMWT page and extract out any info not set by the meta provider.
        info_div = soup.find('div', 'movie_info')

        # First, extract out description (the cell with colspan="2")...
        info = {}
        info['Description:'] = info_div.find('td', {'colspan': '2'}).text

        # Then, ratings....
        info['Rating:'] = info_div.find('li', 'current-rating').text

        # Extract out any other info: every row with exactly two cells is
        # treated as a label / value pair, keyed by the label text.
        for row in info_div.findAll('tr'):
            row_items = row.findAll('td')
            if len(row_items) <> 2 or "colspan" in str(row_items[0]):
                continue
            info[row_items[0].text] = row_items[1].text

        # Map available extracted info back to the media info object.
        # First, define the mapping between LMWT items and media info and an additional function
        # to extract out sane info out of the LMWT data.
        # Each value is [attribute name on the media info object, converter].
        item_map = {
            'Description:':
            ['summary', lambda x: Utils.decode_htmlentities(x)],
            # Parses dates written like "January 5, 2012".
            'Air Date:':
            ['releasedate', lambda x: datetime.strptime(x, '%B %d, %Y')],
            # Leading digits times 60 * 1000 (presumably minutes converted to
            # milliseconds - confirm); results not below sys.maxint become 0.
            # "\d*" can match an empty string, in which case int() raises and
            # the per-field handler further down skips the field.
            'Runtime:': [
                'duration',
                lambda x: int(re.search("(\d*)", x).group(0)) * 60 * 1000
                if int(re.search("(\d*)", x).group(0)
                       ) * 60 * 1000 < sys.maxint else 0
            ],
            # First number in the rating text, doubled (presumably rescaling
            # a 5-point score to 10 - confirm).
            'Rating:': [
                'rating',
                lambda x: float(re.search("([\d\.]+)", x).group(0)) * 2
            ],
            # NOTE(review): this calls a bare decode_htmlentities while
            # 'Description:' goes through Utils.decode_htmlentities. If the
            # bare name is not in scope the NameError is swallowed below and
            # the title is never set - confirm which spelling is intended.
            'Title:': ['title', lambda x: decode_htmlentities(x)],
        }

        # For each extracted item from LMWT...
        for lmwt_item in info.keys():

            # Look for matching entry in map; labels without one are ignored.
            if lmwt_item not in item_map.keys():
                continue

            mi_item = item_map[lmwt_item]

            # Defensive check; no entry of item_map as defined above is None.
            if (mi_item is None):
                continue

            try:
                # And see if it's already set in the mediaInfo object.
                mi_val = getattr(mediainfo_ret, mi_item[0], None)

                # And set it if it's not already. Any falsy value counts as
                # "not set", so values from the provider are never replaced
                # unless they are falsy.
                if (not mi_val):
                    setattr(mediainfo_ret, mi_item[0],
                            mi_item[1](info[lmwt_item]))

            except Exception, ex:
                # Best effort per field: a converter that fails leaves that
                # attribute untouched and processing moves on.
                pass

        return mediainfo_ret
コード例 #4
0
def GetMediaInfo(url, mediainfo, query_external=False):

	"""
	Return media info for an LMWT item, topped up from the item page.

	The page at ``Dict['LMWT_URL'] + url`` is downloaded and parsed. With
	``query_external`` true the base object is obtained from
	``DBProvider().GetProvider(mediainfo.type).RetrieveItemFromProvider``;
	otherwise it is a new ``MediaInfo`` holding only the IMDb id. The
	attributes summary, releasedate, duration, rating and title are then
	taken from the page's 'movie_info' block wherever the base object has no
	truthy value for them.

	Args:
		url: Item path, appended to ``Dict['LMWT_URL']``.
		mediainfo: Source object. ``show_name`` and ``season`` are always
			read, ``ep_num`` only when present, ``type`` only when
			``query_external`` is true.
		query_external: Whether to query the external meta provider.

	Returns:
		The populated media info object.

	NOTE(review): the outer ``try`` below is still open at the end of this
	excerpt; its ``except``/``finally`` clause is not visible here, so the
	error behaviour is left undocumented.
	"""

	# The description meta header for some shows inserts random double quotes in the
	# content which breaks the parsing of the page. Work around that by simply
	# removing the head section in which the meta elements are contained.
	headMassage = [(re.compile('<head>(.*)</head>', re.S), lambda match: '')]
	# Work on a copy so BeautifulSoup.MARKUP_MASSAGE itself is not extended.
	soupMassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
	soupMassage.extend(headMassage)
	
	# Note: the download and parse run before the try block opens.
	soup = BeautifulSoup(HTTP.Request(Dict['LMWT_URL'] + url).content, markupMassage=soupMassage)

	try:
	
		# Optional IMDb id: a missing link or an href without an id is
		# swallowed and leaves imdb_id as None.
		imdb_id = None
		try:
			imdb_link = soup.find('div','mlink_imdb').a['href']
			imdb_id = re.search("(tt\d+)", str(imdb_link)).group()
		except:
			pass
		
		# Construct kwargs. Built in every case, but only used by the
		# external provider branch below.
		kwargs = {}
		
		kwargs['imdb_id'] = imdb_id	
		kwargs['show_name'] = mediainfo.show_name
		kwargs['season'] = mediainfo.season
		
		if hasattr(mediainfo, 'ep_num'):
			kwargs['ep_num'] = mediainfo.ep_num
		
		if (query_external):
			# Delegate to the provider registered for this media type.
			mediainfo_ret = DBProvider().GetProvider(mediainfo.type).RetrieveItemFromProvider(**kwargs)
		else:
			# No external lookup: empty object carrying just the id.
			mediainfo_ret = MediaInfo()
			mediainfo_ret.id = imdb_id
		
		# Also parse the LMWT page and extract out any info not set by the meta provider.
		info_div = soup.find('div', 'movie_info')
		
		# First, extract out description (the cell with colspan="2")...
		info = {}
		info['Description:'] = info_div.find('td', { 'colspan': '2' }).text
		
		# Then, ratings....
		info['Rating:'] = info_div.find('li', 'current-rating').text
		
		# Extract out any other info: rows with exactly two cells become
		# label -> value entries, keyed by the text of the first cell.
		for row in info_div.findAll('tr'):
			row_items = row.findAll('td')
			if len(row_items) <> 2 or "colspan" in str(row_items[0]):
				continue
			info[row_items[0].text] = row_items[1].text
		
		# Map available extracted info back to the media info object.
		# First, define the mapping between LMWT items and media info and an additional function
		# to extract out sane info out of the LMWT data.
		# Each value is [attribute name on the media info object, converter].
		# - 'Air Date:' parses dates written like "January 5, 2012".
		# - 'Runtime:' multiplies the leading digits by 60 * 1000 (presumably
		#   minutes to milliseconds - confirm) and yields 0 when the result is
		#   not below sys.maxint. "\d*" can match an empty string, in which
		#   case int() raises and the per-field handler below skips the field.
		# - 'Rating:' doubles the first number found (presumably rescaling a
		#   5-point score to 10 - confirm).
		# - NOTE(review): 'Title:' calls a bare decode_htmlentities while
		#   'Description:' uses Utils.decode_htmlentities. If the bare name is
		#   not in scope the NameError is swallowed below and the title is
		#   never set - confirm which spelling is intended.
		item_map = {
			'Description:' : ['summary', lambda x: Utils.decode_htmlentities(x)], 
			'Air Date:' : ['releasedate', lambda x: datetime.strptime(x, '%B %d, %Y')],
			'Runtime:' : ['duration', lambda x: int(re.search("(\d*)", x).group(0)) * 60 * 1000 if int(re.search("(\d*)", x).group(0)) * 60 * 1000 < sys.maxint else 0],
			'Rating:' : ['rating', lambda x: float(re.search("([\d\.]+)", x).group(0)) * 2],
			'Title:': ['title', lambda x: decode_htmlentities(x)],
		}
		
		# For each extracted item from LMWT...
		for lmwt_item in info.keys():
		
			# Look for matching entry in map; unmapped labels are ignored.
			if lmwt_item not in item_map.keys():
				continue
				
			mi_item = item_map[lmwt_item]
			
			# Defensive check; no entry of item_map as defined above is None.
			if (mi_item is None):
				continue
				
			try:
				# And see if it's already set in the mediaInfo object.
				mi_val = getattr(mediainfo_ret, mi_item[0], None)
				
				# And set it if it's not already. Any falsy value counts as
				# "not set", so truthy provider values are never replaced.
				if (not mi_val):
					setattr(mediainfo_ret, mi_item[0],  mi_item[1](info[lmwt_item]))
						
			except Exception, ex:
				# Best effort per field: a failing converter leaves that
				# attribute untouched and the loop continues.
				pass
				
		return mediainfo_ret