import re
import bs4
import json as js    # js.dumps() is used by recomend() below
import lurl          # project-local module: lurl.load() fetches a page, lurl.write_html() saves it
import analysis      # project-local module: page-level parsing helpers used by recomend()


def analysis_relation2_fromurl(url):
    """Parse a star page and return its relation list as [{relation_label: name}, ...]."""
    res = []
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup = bs4.BeautifulSoup(data, 'html.parser')
    tags = soup.find_all('ul', attrs={'class': 'slider maqueeCanvas'})
    for tag in tags:
        soup2 = bs4.BeautifulSoup(str(tag), 'html.parser')
        tag2 = soup2.find_all('div', attrs={'class': 'name'})
        if len(tag2) == 0:
            continue
        for i in tag2:
            if not i.em:
                continue
            # The node text is "<relation label><name>" and the <em> holds the name,
            # so the relation label is whatever precedes the name.
            full = i.text
            name = i.em.text
            relations = full[0:len(full) - len(name)]
            res.append({relations: name})
    return res

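# Hedged usage sketch (not part of the original flow): flatten the
# [{relation_label: name}, ...] pairs returned by analysis_relation2_fromurl()
# into a plain name list, the same shape recomend() stores under 'relation'.
# The helper name is hypothetical.
def _flatten_relation_names(pairs):
    names = []
    for pair in pairs or []:
        for _label, person in pair.items():
            names.append(person)
    return names
# _flatten_relation_names([{'妻子': '某某'}, {'儿子': '某某某'}]) -> ['某某', '某某某']
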
def analysis_movieurl(url):
    """Parse a movie page and return the actor names found in its cast list."""
    relation = set()
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    # The cast list appears under one of two markups, depending on the page layout.
    tags1 = soup1.find_all('a', attrs={'class': 'actor-name '})
    if len(tags1) == 0:
        tags2 = soup1.find_all('p', attrs={'class': 'actorName'})
        if len(tags2) == 0:
            return None
        for tag2 in tags2:
            relation.add(tag2.text.strip())
    else:
        for tag1 in tags1:
            relation.add(tag1.text.strip())
    return relation

def analysis_showurl(url, show_path, file_name):
    """
    Parse a show url (and optionally write its html to disk).
    :param url: url to parse
    :param show_path: directory to write the html into
    :param file_name: file name for the written html
    :return: set of host names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a', encoding='utf-8') as f:
            f.write('show_url load failed:' + str(file_name) + '>----->' + url + '\n')
        return None
    # lurl.write_html(file_name, data, show_path)
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('dl', attrs={'class': 'basicInfo-block basicInfo-left'})
    # tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    relation = set()
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Walk the <dt> labels to locate the index of the '主持人' (host) field;
    # num stays at -1 (or is reset to -1) when the field is missing.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    # Split the host field on separators, then strip bracketed footnote digits,
    # stray whitespace and a trailing parenthesized note from each name.
    parts = re.split(r'[,、]', zhuchiren.text.strip('\n'))
    for i in range(len(parts)):
        parts[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', parts[i])
    relation = set(parts)
    return relation

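# Hedged helper sketch: the checks above assume lurl.load() returns None on a
# timeout or failed fetch, so a thin retry wrapper could reduce the logged
# failures. The helper and its retry count are assumptions, not part of lurl's API.
def load_with_retry(url, attempts=3):
    for _ in range(attempts):
        data = lurl.load(url)
        if data is not None:
            return data
    return None
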
def analysis_showurl(url):
    """
    Parse the url and return the host relations for this show url.
    :param url:
    :return: a set of names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('show_url load failed:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('dl', attrs={'class': 'basicInfo-block basicInfo-left'})
    # tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    relation = set()
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Locate the index of the '主持人' (host) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    parts = re.split(r'[,、]', zhuchiren.text.strip('\n'))
    for i in range(len(parts)):
        parts[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', parts[i])
    relation = set(parts)
    return relation

def analysis_movieurl(url, movie_path, file_name):
    """
    Parse a movie url and write its html to disk.
    :param url: movie url
    :param movie_path: directory to write the html into
    :param file_name: file name for the written html
    :return: set of lead-actor names, or None on failure
    """
    relation = set()
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a', encoding='utf-8') as f:
            f.write('movie_url load failed:' + str(file_name) + '>----->' + url + '\n')
        return None
    lurl.write_html(file_name, data, movie_path)
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    num = -1
    if not tag1:
        return None
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Locate the index of the '主 演' (starring) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text != '主 演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主 演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    parts = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(parts)):
        parts[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', parts[i])
    relation = set(parts)
    return relation

def get_movieurl_fromurl(star_url):
    """Collect the movie urls listed in the works block of a star page."""
    url = []
    data = lurl.load(star_url)
    if data is None:
        print('timeout:' + star_url)
        return None
    soup = bs4.BeautifulSoup(data, 'html.parser')
    tags = soup.find_all('div', attrs={'class': 'star-info-block works'})
    for tag in tags:
        soup2 = bs4.BeautifulSoup(str(tag), 'html.parser')
        tag2 = soup2.find_all('ul', attrs={'class': 'slider maqueeCanvas'})
        if len(tag2) == 0:
            continue
        for i in tag2:
            soup3 = bs4.BeautifulSoup(str(i), 'html.parser')
            tag3 = soup3.find_all('a', attrs={'href': True})
            for j in tag3:
                name = j.text.strip('\n')
                item = j['href']
                # Relative hrefs are prefixed with the module-level base url `tmp`.
                if item[0] != '/':
                    movie_url = item
                else:
                    movie_url = tmp + item
                url.append(movie_url)
                # (A commented-out block here used to download each movie html
                # as '<star name>-<movie name>'.)
    return url

def analysis_movieurl_list(url):
    """
    Return the lead-actor names as an ordered, de-duplicated list
    (order of first appearance is kept).
    :param url:
    :return: list of names, or None on failure
    """
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('movie_url load failed:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    num = -1
    if not tag1:
        return None
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Locate the index of the '主 演' (starring) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text != '主 演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主 演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    tmp_list = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(tmp_list)):
        tmp_list[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', tmp_list[i])
    # De-duplicate but keep the order of first appearance.
    relation = list(set(tmp_list))
    relation.sort(key=tmp_list.index)
    return relation

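# Sketch of the order-preserving de-duplication used above, on plain data:
# set() drops duplicates, then sort(key=items.index) restores the order of
# first appearance. The helper name is hypothetical.
def _dedupe_keep_order(items):
    deduped = list(set(items))
    deduped.sort(key=items.index)
    return deduped
# _dedupe_keep_order(['a', 'b', 'a', 'c']) -> ['a', 'b', 'c']
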
def analysis_movieurl(url):
    """
    Parse the url and return the movie relations (lead actors) for this movie url.
    :param url: movie url to parse
    :return: a set of names, or None on failure
    """
    relation = set()
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        with open('log', 'a') as f:
            f.write('movie_url load failed:' + url + '\n')
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    num = -1
    if not tag1:
        return None
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Locate the index of the '主 演' (starring) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text != '主 演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主 演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    parts = re.split(r'[,、]', zhuyan.text.strip('\n'))
    for i in range(len(parts)):
        parts[i] = re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', parts[i])
    relation = set(parts)
    return relation

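# Stand-alone illustration of the cleanup regex used on each name above: the
# character class removes '[', ']', digits, newlines and non-breaking spaces
# anywhere in the string, and the second alternative removes one trailing
# parenthesized note. The sample strings are made up.
def _clean_name(raw):
    return re.sub(r'[\[\d\]\n\xa0]|[\((][^\))]+[\))]$', '', raw)
# _clean_name('张三[1]')      -> '张三'
# _clean_name('李四(饰 某某)') -> '李四'
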
def get_showurl_fromurl(star_url):
    """Collect the show urls listed in the table on a star page."""
    res = []
    data = lurl.load(star_url)
    if data is None:
        print('timeout:' + star_url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tags1 = soup1.find_all('table', attrs={'class': 'cell-module'})
    if len(tags1) == 0:
        return None
    tags1 = tags1[0]
    soup2 = bs4.BeautifulSoup(str(tags1), 'html.parser')
    tags2 = soup2.find_all('a', attrs={'href': True})
    if len(tags2) == 0:
        return None
    for tag2 in tags2:
        # One url per iteration (one show record).
        show_name = tag2.text.strip('\n')
        url = tag2['href']
        if url[0] == '/':
            # Relative href: prefix with the module-level base url `tmp`.
            show_url = tmp + url
        elif url.startswith('http'):
            show_url = url
        else:
            continue
        res.append(show_url)
        # (A commented-out block here used to download each show html as
        # '<star name>-<show name>' and parse it immediately.)
    return res

def analysis_showurl2(url):
    """Parse a show url and return its host names, using the newer infobox markup."""
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    # tag1 = soup1.find('dl', attrs={'class': 'basicInfo-block basicInfo-left'})
    # Changed here: use the newer infobox container instead of the <dl> block.
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    if not tag1:
        return None
    relation = set()
    num = -1
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': True})
    tags3 = soup2.find_all('dd', attrs={'class': True})
    # Locate the index of the '主持人' (host) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text.strip('\n') != '主持人' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text.strip('\n') == '主持人':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuchiren = tags3[num]
    else:
        return None
    names = zhuchiren.text.strip('\n').split('、')
    relation = set(names)
    return relation

def analysis_movieurl2(url):
    """Parse a movie url and return its lead-actor names from the infobox."""
    relation = set()
    data = lurl.load(url)
    if data is None:
        print('timeout:' + url)
        return None
    soup1 = bs4.BeautifulSoup(data, 'html.parser')
    tag1 = soup1.find('div', attrs={'class': 'basic-info cmn-clearfix'})
    num = -1
    if not tag1:
        return None
    soup2 = bs4.BeautifulSoup(str(tag1), 'html.parser')
    tags2 = soup2.find_all('dt', attrs={'class': 'basicInfo-item name'})
    tags3 = soup2.find_all('dd', attrs={'class': 'basicInfo-item value'})
    # Locate the index of the '主 演' (starring) field among the <dt> labels.
    for tag2 in tags2:
        if tag2.text != '主 演' and num == len(tags2) - 2:
            num = -1
        else:
            if tag2.text == '主 演':
                num = num + 1
                break
            else:
                num = num + 1
    if num != -1:
        zhuyan = tags3[num]
    else:
        return None
    names = zhuyan.text.strip('\n').split(',')
    relation = set(names)
    return relation

def recomend(star_url, star_path, movie_path, show_path, path_res):
    """Build one JSON line per star ({star_name: [relation, movie, show]}) and append it to path_res."""
    with open(path_res, 'a', encoding='UTF-8') as f:
        for k, v in star_url.items():
            star_name = k  # star name
            print('=======' + star_name + '=======')
            relation_list = []   # parsed relation list for this star
            movie_url = []       # parsed movie urls for this star
            show_url = []        # parsed show urls for this star
            full = {}            # one output line: {star_name: full_relation}
            full_relation = []   # the full record for this star
            movie_dic = {}       # parsed movie recommendation list
            show_dic = {}        # parsed show recommendation list
            data = lurl.load(v)
            # lurl.write_html(star_name, data, star_path)  # star_path is only used by this commented write
            if data is None:
                with open('log', 'a', encoding='utf-8') as f1:
                    f1.write('star url load failed:')
                    f1.write(k + ':' + v + '\n')
                continue
            # Parse relations, movie urls and show urls from the star page.
            relation_list = analysis.get_relations(data)
            movie_url = analysis.get_movieurl(data)
            show_url = analysis.get_showurl(data)
            # Store the relation result as {'relation': [name, ...]}.
            if relation_list:
                tmp_dict = {}
                tmp_list = []
                print('relation')
                for i in relation_list:
                    for j in i.keys():
                        tmp_list.append(i[j])
                tmp_dict['relation'] = tmp_list
                full_relation.append(tmp_dict)
                print('relation_over')
            # Load and parse the movie url list.
            if movie_url:
                print('movie')
                movie_relation_list = get_movie_relation_list(movie_url)
                # Remove the star's own name from the list.
                if movie_relation_list and star_name in movie_relation_list:
                    movie_relation_list.remove(star_name)
                movie_dic['movie'] = movie_relation_list
                if movie_dic['movie']:
                    full_relation.append(movie_dic)
                print('movie_over')
            # Load and parse the show url list.
            if show_url:
                print('show')
                show_set = get_showset(show_url, show_path, star_name)
                # Added 2017-12-28, not yet fully tested: filter out the star's own name.
                if show_set and star_name in show_set:
                    show_set.remove(star_name)
                show_list = list(show_set) if show_set else []
                show_dic['show'] = show_list
                if show_dic['show']:
                    full_relation.append(show_dic)
                print('show_over')
            if len(full_relation) != 0:
                full[star_name] = full_relation
                data = js.dumps(full, ensure_ascii=False)
                f.write(data + '\n')
            else:
                with open('Null_recommend_list', 'a', encoding='UTF-8') as f3:
                    f3.write(k + ':' + v + '\n')

def recomend(star_url, path):
    """Build one JSON line per star ({star_name: [relation, movie, show]}) and append it to path."""
    with open(path, 'a') as f:
        for k, v in star_url.items():
            star_name = k  # star name
            print('=======' + star_name + '=======')
            relation_list = []   # parsed relation list for this star
            movie_url = []       # parsed movie urls for this star
            show_url = []        # parsed show urls for this star
            full = {}            # one output line: {star_name: full_relation}
            full_relation = []   # the full record for this star
            movie_dic = {}       # parsed movie recommendation list
            show_dic = {}        # parsed show recommendation list
            data = lurl.load(v)
            if data is None:
                with open('log', 'a') as f1:
                    f1.write('star url load failed:')
                    f1.write(k + ':' + v + '\n')
                continue
            # Parse relations, movie urls and show urls from the star page.
            relation_list = analysis.get_relations(data)
            movie_url = analysis.get_movieurl(data)
            show_url = analysis.get_showurl(data)
            # Store the relation result as {'relation': [name, ...]}.
            if relation_list:
                tmp_dict = {}
                tmp_list = []
                print('relation')
                for i in relation_list:
                    for j in i.keys():
                        tmp_list.append(i[j])
                tmp_dict['relation'] = tmp_list
                full_relation.append(tmp_dict)
                print('relation_over')
            # Load and parse the movie url list.
            if movie_url:
                print('movie')
                movie_set = set()
                # Parse each movie url in turn and merge the actor sets.
                for url in movie_url:
                    tmpset = analysis.analysis_movieurl(url)
                    if tmpset is not None and len(tmpset) != 0:
                        movie_set = movie_set | tmpset
                    else:
                        continue
                # Remove the star's own name from the result.
                if movie_set and star_name in movie_set:
                    movie_set.remove(star_name)
                movie_list = list(movie_set)
                movie_dic['movie'] = movie_list
                if movie_dic['movie']:
                    full_relation.append(movie_dic)
                print('movie_over')
            # Load and parse the show url list.
            if show_url:
                print('show')
                show_set = set()
                for url in show_url:
                    tmpset2 = analysis.analysis_showurl(url)
                    if tmpset2 is not None and len(tmpset2) != 0:
                        show_set = show_set | tmpset2
                    else:
                        continue
                # Added 2017-12-28, not yet fully tested: filter out the star's own name.
                if show_set and star_name in show_set:
                    show_set.remove(star_name)
                show_list = list(show_set)
                show_dic['show'] = show_list
                if show_dic['show']:
                    full_relation.append(show_dic)
                print('show_over')
            if len(full_relation) != 0:
                full[star_name] = full_relation
                data = js.dumps(full, ensure_ascii=False)
                f.write(data + '\n')
            else:
                with open('None_recommend_list', 'a') as f3:
                    f3.write(k + ':' + v + '\n')

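# Hedged usage sketch: this two-argument recomend() expects a {star_name: star_url}
# dict and appends one JSON line per star to the output file. The star name, url
# and output path below are placeholders, not values taken from the project.
if __name__ == '__main__':
    stars = {'某明星': 'https://baike.baidu.com/item/某明星'}
    recomend(stars, 'recommend_result')
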