def get_movie_url(movie_data):
    """Search the IMDb site for a movie and return the URL of its page.

    ``movie_data`` is a ``*``-separated string.  Field 1 is used as the
    search query when it is non-empty, otherwise field 0 is passed through
    ``romanize()`` and used instead.  Field 2 is the release year used to
    validate candidates.

    Lookup strategy, in order:

    1. If the search request ends up on a ``/title/.../`` URL, return it.
    2. If the result page links to exactly one distinct title, return it.
    3. If the first linked title passes ``check_imdb_movie_year()``,
       return it.
    4. Otherwise scan the table following the 'Titles (Exact Matches)'
       heading for the first row whose year is within two years of the
       requested one.

    Returns the URL as a string, or ``None`` when nothing matches.  A
    missing or non-numeric year only disables the year-based steps instead
    of raising.

    Raises ``IndexError`` when ``movie_data`` has fewer than two fields.
    """
    data_list = movie_data.split('*')
    # The year is only needed by steps 3 and 4; tolerate its absence so the
    # redirect / single-link shortcuts keep working.
    year_str = data_list[2] if len(data_list) > 2 else None

    params_dict = {'s': 'tt'}
    params_dict['q'] = data_list[1] if data_list[1] else romanize(data_list[0])
    response = get_response('/find?' + urllib.urlencode(params_dict))

    # first check whether response is desired movie page
    re_movie_url = re.compile(r'/title/[\d\w]+/', re.M | re.U | re.I)
    response_url = response.geturl()
    if re_movie_url.search(response_url):
        return response_url

    # then check whether there is single link to movie in the loaded page
    response_str = response.read()
    links_list = []
    seen_links = set()
    for link in re_movie_url.findall(response_str):
        # Keep first-occurrence order; the set makes the membership test O(1).
        if link not in seen_links:
            seen_links.add(link)
            links_list.append(link)
    if len(links_list) == 1:
        # imdb_url_str is a module-level prefix (presumably the IMDb base
        # URL); it is only read here, so no ``global`` declaration is needed.
        return imdb_url_str + links_list[0]

    if year_str is None:
        # Every remaining step compares against the year.
        return None

    # then check 1st link to movie in response page
    if links_list and check_imdb_movie_year(links_list[0], year_str):
        return imdb_url_str + links_list[0]

    # finally try to find movie in exact matches table
    marker_pos = response_str.find('Titles (Exact Matches)')
    if marker_pos == -1:
        return None
    try:
        year = int(year_str)
    except ValueError:
        # Non-numeric year: no row can be validated against it.
        return None
    table_str = get_between(response_str, '<table>', '</table>', marker_pos)
    for row_match in re.finditer(r'<tr>.*?\((?P<year>\d{4})\).*?</tr>',
                                 table_str,
                                 flags=re.I | re.M | re.U | re.S):
        if year - 2 <= int(row_match.group('year')) <= year + 2:
            return imdb_url_str + get_between(row_match.group(0), 'href="', '"')
    return None
def get_movie_url(movie_data):
    """Return the IMDb URL for a movie, or ``None`` if it cannot be found.

    ``movie_data`` is a ``*``-separated string with at least three fields.
    Field 1 is used as the query when it is non-empty, otherwise field 0 is
    passed through ``romanize()``.  Field 2 is sent as the ``year``
    parameter.

    The deanclatworthy.com web API is queried first.  If it fails or its
    JSON answer has no ``'imdburl'`` key, the lookup falls back to
    ``imdb_find_movie2.get_movie_url()``.  Errors from either source are
    treated as "not found" (best-effort lookup).

    Raises ``ValueError`` when ``movie_data`` has fewer than three fields.
    """
    url = 'http://www.deanclatworthy.com/imdb/'
    data_list = movie_data.split('*')
    if len(data_list) < 3:
        raise ValueError('invalid movie data')

    params_dict = {}
    params_dict['q'] = data_list[1] if data_list[1] else romanize(data_list[0])
    params_dict['year'] = data_list[2]

    # first trying to find movie using api
    try:
        response = urllib.urlopen(url + '?' + urllib.urlencode(params_dict))
        try:
            response_dict = json.loads(response.read())
        finally:
            response.close()
        if 'imdburl' in response_dict:
            return response_dict['imdburl']
    except Exception:
        # Network failure or malformed JSON: deliberately ignored so the
        # fallback below still gets a chance.  ``Exception`` (not a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate.
        pass

    # then trying own function
    try:
        movie_url = imdb_find_movie2.get_movie_url(movie_data)
    except Exception:
        # Best-effort: a failing fallback means "not found".
        return None
    return movie_url or None