def sim_content(data):
    sim_regx = {}
    if data.get("directors") and data.get("directors") != "":
        sim_regx["directors"] = re.compile(
            u"(" + parse_regx_char(data["directors"]) + ")", re.IGNORECASE)
    # if data.get("starring") and data.get("starring") != "":
    #     sim_regx['starring'] = re.compile(u"(" + data["starring"] + ")", re.IGNORECASE)
    if len(sim_regx) == 0:
        # No directors (or starring) information at all: skip the search.
        return None
    if data.get("area"):
        sim_regx['area'] = re.compile(
            u"(" + "|".join(area_process(data.get("area"))) + ")",
            re.IGNORECASE)
    regx_name = search_preprocess(data.get("title"))
    regx_name = parse_regx_char(regx_name)
    regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
    regx_name = re.compile(regx_name, re.IGNORECASE)
    sim_regx['title'] = regx_name  # match titles containing the preprocessed name
    contents = db.contents.find(sim_regx)
    print('---%s------%s' % (regx_name, data.get("title")))
    if contents.count() > 0:
        poster = db.posters.find({"content_id": str(contents[0]['_id'])})
        if poster.count() > 0:
            p = []
            for x in poster:
                print(x['_id'], data['_id'])
                del x['_id']
                x['content_id'] = data['_id']
                p.append(x)
            return p
    return None
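# Hedged sketch (not the project's real helper): parse_regx_char, used above,
# is assumed to escape regex metacharacters before a title or name is embedded
# in a MongoDB $regex pattern. The real implementation lives elsewhere in the
# repo; an equivalent minimal version could look like this.
def _parse_regx_char_sketch(text):
    """Escape characters that are special inside a regular expression."""
    return re.escape(text) if text else text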
def merge_fields(self, info):
    L = Contents()
    L.title = info.get("name")
    L.summary = info.get("description")
    L.iqiyi_tvId = info.get("tvId")
    L.iqiyi_vid = info.get("vid")
    L.iqiyi_plays_num = info.get("playCount")
    L.iqiyi_albumId = info.get("albumId")
    L.iqiyi_play_url = info.get("url")
    if info.get("duration") and info.get("duration") != "":
        L.duration = info.get("duration") / 60
    L.poster = []
    if info.get("albumImageUrl"):
        L.img_url = info.get("albumImageUrl")
        L.poster.append({"url": info.get("albumImageUrl")})
    if info.get("imageUrl"):
        L.poster.append({"url": info.get("imageUrl")})
    if info.get("videoImageUrl"):
        L.poster.append({"url": info.get("videoImageUrl")})
    if info.get("posterUrl"):
        L.poster.append({"url": info.get("posterUrl")})
    if info.get("tvImageUrl"):
        L.poster.append({"url": info.get("tvImageUrl")})
    if info.get("qualityImageUrl"):
        L.poster.append({"url": info.get("qualityImageUrl")})
    if info.get("issueTime"):
        L.release_date = mictime_to_ymd(info["issueTime"])
    if info.get("crumbList"):
        # Prefer the level-2 breadcrumb as category; fall back to level 3.
        level2 = True
        for x in info.get("crumbList"):
            if int(x["level"]) == 2 and x["title"] != u'VIP会员':
                L.category = x["title"]
                level2 = False
        if level2:
            for x in info.get("crumbList"):
                if int(x["level"]) == 3:
                    L.category = x["title"]
    _temp = []
    for x in info.get("categories") or []:
        if u"地区" in x.get("subName"):
            L.area = area_process(x.get("name"))
        elif (u"类型" in x.get("subName") or u'风格' in x.get("subName")
              or u'分类' in x.get("subName") or u'小学' in x.get("subName")
              or u'高中' in x.get("subName") or u'短片' in x.get("subName")):
            _temp.append(x.get("name"))
        elif u"语种" in x.get("subName"):
            L.language = language_process(x.get("name"))
        elif x.get("subName") == u"年龄段":
            L.age = x.get("name")
    L.tags = ",".join(_temp)
    L.all_episode = info.get("videoCount")
    L.sub_title = info.get("subtitle")
    L.iqiyi_rating_num = info.get("commentCount")
    L.iqiyi_qitanId = info.get("qitanId")

    cast = info.get("cast") or {}

    def _person(x):
        return {
            "name": x.get("name"),
            "iqiyi_id": x.get("id"),
            "avatar": x.get("imageUrl"),
            "iqiyi_userId": x.get("userId"),
            "iqiyi_circleId": x.get("circleId")
        }

    # (cast key, comma-joined names attribute, detail-list attribute)
    cast_fields = [
        ("directors", "directors", "directors_list"),
        ("speakers", "speakers", "speakers_list"),
        ("publishers", "publishers", "publishers_list"),
        ("singers", "singers", "singers_list"),
        ("mainActors", "starring", "starring_list"),
        ("writers", "screenwriters", "screenwriter_list"),  # screenwriters
        ("actors", "actors", "actors_list"),
        ("guests", "guests", "guests_list"),  # guests
        ("hosts", "hosts", "hosts_list"),
    ]
    for key, names_attr, list_attr in cast_fields:
        people = cast.get(key)
        if not people:
            continue
        setattr(L, names_attr, ",".join(x.get("name") for x in people))
        setattr(L, list_attr, [_person(x) for x in people])

    if L.release_date and L.year is None:
        m = re.search(u'(\d{4})', L.release_date)
        if m:
            L.year = m.group(1)
    # Fall back to hosts when no directors were found.
    if not L.directors and not L.directors_list and L.hosts:
        L.directors = L.hosts
        L.directors_list = L.hosts_list
    L.focuses = info.get("focuses")
    L.iqiyi_rating = info.get("score")
    L.created_at = time.time()
    return L.__dict__
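# Hedged sketch: mictime_to_ymd, called in merge_fields above, is assumed to
# turn iQiyi's millisecond-epoch "issueTime" into a YYYY-MM-DD string. The
# real helper is defined elsewhere in the repo; this minimal version only
# illustrates the intent.
def _mictime_to_ymd_sketch(millis):
    """Convert a millisecond UNIX timestamp to a YYYY-MM-DD date string."""
    return time.strftime("%Y-%m-%d", time.localtime(int(millis) / 1000))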
def baike_parser(r, url=None):
    try:
        r = re.sub(u'\xa0', u'', r)  # strip non-breaking spaces from the raw HTML
        page = etree.HTML(r)
    except Exception as e:
        return False
    L = Contents()
    summary = page.xpath(u'//div[@class="lemmaWgt-lemmaSummary lemmaWgt-lemmaSummary-light"]')
    if len(summary) > 0:
        L.summary = summary[0].text
    title = page.xpath(u'//dt[contains(text(),"中文名")]')
    if len(title) > 0:
        L.title = parse_simple(title[0].getnext().text)
    foreign_title = page.xpath(u'//dt[contains(text(),"外文名")]')
    if len(foreign_title) > 0:
        L.foreign_title = parse_simple(foreign_title[0].getnext().text)
    production_company = page.xpath(u'//dt[contains(text(),"出品公司")]')
    if len(production_company) > 0:
        L.production_company = parse_simple(production_company[0].getnext().text)
    producer_country = page.xpath(u'//dt[contains(text(),"制片地区")]')
    if len(producer_country) > 0:
        L.producer_country = area_process(parse_simple(producer_country[0].getnext().text))
    directors_list = page.xpath(u'//dt[contains(text(),"导演")]')
    if len(directors_list) > 0:
        a_tag = directors_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.directors_list = []
            directors = []
            for x in a_tag:
                L.directors_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                directors.append(parse_simple(x.text))
            L.directors = ",".join(set(directors))
        else:
            L.directors = area_process(parse_simple(directors_list[0].getnext().text))
    screenwriter_list = page.xpath(u'//dt[contains(text(),"编剧")]')
    if len(screenwriter_list) > 0:
        a_tag = screenwriter_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.screenwriter_list = []
            screenwriters = []
            for x in a_tag:
                L.screenwriter_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                screenwriters.append(parse_simple(x.text))
            L.screenwriters = ",".join(set(screenwriters))
        else:
            L.screenwriters = area_process(parse_simple(screenwriter_list[0].getnext().text))
    starring_list = page.xpath(u'//dt[contains(text(),"主演")]')
    if len(starring_list) > 0:
        a_tag = starring_list[0].getnext().findall('a')
        if len(a_tag) > 0:
            L.starring_list = []
            starring = []
            for x in a_tag:
                L.starring_list.append({
                    "name": parse_simple(x.text),
                    "baike_id": x.get("data-lemmaid"),
                    "baike_url": u'https://baike.baidu.com' + x.get("href")
                })
                starring.append(parse_simple(x.text))
            L.starring = ",".join(set(starring))
        else:
            L.starring = area_process(parse_simple(starring_list[0].getnext().text))
    alias = page.xpath(u'//dt[contains(text(),"其它译名")]')
    if len(alias) > 0:
        a_tag = alias[0].getnext().findall("a")
        if len(a_tag) > 0:
            L.alias = ",".join([parse_simple(x.text) for x in a_tag if parse_simple(x.text)])
        else:
            L.alias = parse_simple(alias[0].getnext().text)
    types = page.xpath(u'//dt[contains(text(),"类型")]')
    if len(types) > 0:
        L.type = area_process(parse_simple(types[0].getnext().text))
    duration = page.xpath(u'//dt[contains(text(),"片长")]')
    if len(duration) > 0:
        L.duration = area_process(parse_simple(duration[0].getnext().text))
    release_date = page.xpath(u'//dt[contains(text(),"上映时间")]')
    if len(release_date) > 0:
        L.release_date = area_process(parse_simple(release_date[0].getnext().text))
    language = page.xpath(u'//dt[contains(text(),"语言")]')
    if len(language) > 0:
        L.language = language_process(parse_simple(language[0].getnext().text))
    douban_rating = page.xpath(u'//span[contains(@class,"star-text")]')
    if len(douban_rating) > 0:
        L.douban_rating = douban_rating[0].text
    poster = page.xpath(u'//img[@alt="词条图片"]')
    if len(poster) > 0:
        L.poster = [{"url": poster[0].get("src"), "name": poster[0].get("alt")}]
        L.img_url = poster[0].get("src")
    actor_list = page.xpath(u'//ul[@class="actorList"]/li')
    if len(actor_list) > 0:
        starring = L.starring.split(',') if L.starring else []
        L.actor_list = []
        starring_list = []
        for x in actor_list:
            info_a = x.xpath(u'.//dl[@class="info"]/a')[0]
            _temp = {
                "avatar": x.find('img').get("src"),
                "name": info_a.text,
                "baike_id": info_a.get("data-lemmaid"),
                "baidu_url": "https://baike.baidu.com" + info_a.get("href")
            }
            if _temp['name'] in starring:
                starring_list.append(_temp)
            else:
                L.actor_list.append(_temp)
        if starring_list:
            L.starring_list = starring_list
    L.created_at = time.time()
    return L.__dict__
def process(self, task=None):
    """Handle a job task."""
    if task.get("name") is None or task.get("name") == "":
        pass  # disabled for now during testing
    r_mongo = mongo_conn.contents.find({
        'relationship': {
            '$elemMatch': {
                'mediaId': task.get("code"),
                "platform": "gd"
            }
        }
    })
    if r_mongo.count() > 0:
        return self.after_mongo_succ(r_mongo[0]['_id'], task)
    ct = check_title(task['name'])
    if ct:
        return self.after_search_failed(task, category=ct)
    if task.get("actor") and task.get("director"):
        # With full metadata, try to match against MongoDB first; fall back
        # to the Baidu search engine when nothing matches.
        print("task:--name:%s------director:%s---actor:%s---area:%s-" %
              (task.get("name"), task.get("director"), task.get("actor"),
               task.get("region")))
        regx = {}
        if task.get("director") and task.get("director") != "":
            regx["directors"] = re.compile(
                u"(" + "|".join(process_actor(task.get("director")).split(',')) + ")",
                re.IGNORECASE)
        if task.get("actor") and task.get("actor") != "":
            regx['actors'] = re.compile(
                u"(" + "|".join(process_actor(task.get("actor")).split(',')) + ")",
                re.IGNORECASE)
        # if task.get("year"):
        #     regx['year'] = re.compile(u"(" + "|".join(process_actor(task.get("year")).split(',')) + ")", re.IGNORECASE)
        if task.get("region"):
            regx['area'] = re.compile(
                u"(" + "|".join(area_process(task.get("region"))) + ")",
                re.IGNORECASE)
        regx_name = title_preprocess_seed(task.get("name"))
        regx_name = parse_regx_char(regx_name)
        regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
        print('---%s------%s' % (regx_name, task.get("name")))
        regx['title'] = re.compile(regx_name, re.IGNORECASE)
        contents = mongo_conn.contents.find(regx)
        # Fall back to the Baidu search engine when the metadata store has no match.
        if contents.count() == 0:
            return self.baidu(task)
        ls = []
        lss = {}
        for mon in contents:
            sim = lst.ratio(title_preprocess(task.get("name")),
                            title_preprocess(mon['title']))
            print("-sim:%s--title:%s--name:%s------director:%s---actor:%s---region:%s-" %
                  (sim, mon['title'], task.get("name"), task.get("director"),
                   task.get("actor"), task.get("region")))
            if sim < 0.80:
                continue
            ls.append(sim)
            lss[sim] = mon.get("_id")
        if len(ls) == 0:
            return self.baidu(task)
        ls.sort()
        try:
            _id = lss[ls[-1]]
        except Exception as e:
            _id = lss[ls[0]]
        print("^^^^^^^^%s" % regx_name)
        return self.after_mongo_succ(_id, task)
    else:
        return self.baidu(task)
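# Hedged sketch of the fuzzy-title match used in process() above: `lst` is
# assumed to be the python-Levenshtein module, and the candidate whose
# normalised title is most similar to the task title (ratio >= 0.80) wins.
# The helper name below is hypothetical and only isolates the selection step.
def _pick_best_match_sketch(task_title, candidates):
    """Return the _id of the candidate whose title best matches task_title."""
    best_sim, best_id = 0.0, None
    for doc in candidates:
        sim = lst.ratio(title_preprocess(task_title),
                        title_preprocess(doc['title']))
        if sim >= 0.80 and sim > best_sim:
            best_sim, best_id = sim, doc.get("_id")
    return best_id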
def vdetail_parser(self, r):
    data = Contents()
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    year = re.search(u'<span class="year">\((\d{4})\)</span>', r)
    if year:
        data.year = year.group(1)
    title = re.search(u'<span property="v\:itemreviewed">(.*)</span>', r)
    if title:
        data.title = title.group(1)

    def _extract_people(label):
        """Collect the comma-joined names and per-person dicts listed after
        the given info label (e.g. u"导演"); returns (None, None) when absent."""
        el = page.xpath(u'//span[contains(text(),"%s")]' % label)
        if len(el) == 0:
            return None, None
        links = el[0].getnext()
        if links is None:
            return None, None
        persons = []
        names = []
        for x in links.findall('a'):
            entry = {"name": parse_simple(x.text)}
            href = x.get("href")
            if href:
                m = re.search(u'/celebrity/(\d*)/', href)
                if m:
                    entry["doubanid"] = m.group(1)
                entry["douban_url"] = "https://movie.douban.com" + href
            persons.append(entry)
            names.append(entry["name"])
        return ",".join(names), persons

    screenwriters, screenwriter_list = _extract_people(u"编剧")  # screenwriters
    if screenwriter_list is not None:
        data.screenwriters = screenwriters
        data.screenwriter_list = screenwriter_list
    directors, directors_list = _extract_people(u"导演")  # directors
    if directors_list is not None:
        data.directors = directors
        data.directors_list = directors_list
    starring, starring_list = _extract_people(u"主演")  # starring cast
    if starring_list is not None:
        data.starring = starring
        data.starring_list = starring_list

    type_el = page.xpath(u'//span[@property="v:genre"]')  # genres
    mvtype = []
    if len(type_el) > 0:
        for x in type_el:
            mvtype.append(parse_simple(x.text))
    tags = page.xpath(u'//div[@class="tags-body"]/a')
    _temp = []
    for x in tags:
        _temp.append(parse_simple(x.text))
    _temp = _temp + mvtype
    data.tags = ",".join(set(_temp))
    producer_country_el = page.xpath(u'//span[contains(text(),"制片国家/地区:")]')
    if len(producer_country_el) > 0:
        producer_country = page.xpath(
            u'//span[contains(text(),"制片国家/地区:")]/following::text()[1]')[0]
        data.producer_country = area_process(
            split_space(producer_country.replace('/', ',')))
    language_el = page.xpath(u'//span[contains(text(),"语言:")]')
    if len(language_el) > 0:
        language = page.xpath(
            u'//span[contains(text(),"语言:")]/following::text()[1]')[0]
        data.language = language_process(
            split_space(language.replace('/', ',')))
    all_episode = page.xpath(u'//span[contains(text(),"集数:")]')
    if len(all_episode) > 0:
        all_episode = page.xpath(
            u'//span[contains(text(),"集数:")]/following::text()[1]')[0]
        m = re.search(u'(\d{1,})', all_episode.replace(" ", ""))
        if m:
            data.all_episode = m.group(1)
    episode_time = page.xpath(u'//span[contains(text(),"单集片长:")]')
    if len(episode_time) > 0:
        episode = page.xpath(
            u'//span[contains(text(),"单集片长:")]/following::text()[1]')[0]
        m = re.search(u'(\d{1,})', episode.replace(" ", ""))
        if m:
            data.duration = m.group(1)
    season = page.xpath(
        u'//select[@id="season"]/option[@selected="selected"]')  # season number
    if len(season) > 0:
        data.season = season[0].text
    release_date_el = page.xpath(
        u'//span[@property="v:initialReleaseDate"]')  # release / first-air dates
    if len(release_date_el) > 0:
        release_date = ""
        for x in release_date_el:
            release_date = release_date + parse_simple(x.text) + "|"
        release_date = release_date.strip('|')
        m = re.search(u'(\d{4}-\d{2}-\d{2})', release_date.replace(" ", ""))
        if m:
            data.release_date = m.group(1)
        else:
            data.release_date = release_date
    duration_el = page.xpath(u'//span[@property="v:runtime"]')
    if len(duration_el) > 0:
        m = re.search(u'(\d{1,})', duration_el[0].text.replace(" ", ''))
        if m:
            data.duration = m.group(1)  # running time
    alias_al = page.xpath(u'//span[contains(text(),"又名:")]')
    if len(alias_al) > 0:
        alias = page.xpath(
            u'//span[contains(text(),"又名:")]/following::text()[1]')[0]
        data.alias = split_space(alias.replace('/', ','))
    IMDb_el = page.xpath(u'//span[contains(text(),"IMDb链接")]')
    if len(IMDb_el) > 0:
        data.IMDb = IMDb_el[0].getnext().get("href")
    rating = re.search(u'property="v\:average">(\d*\.\d*)</strong>', r)
    if rating:
        data.douban_rating = rating.group(1)
    rating_sum = page.xpath(u'//span[@property="v:votes"]')
    if len(rating_sum) > 0:
        data.douban_rating_sum = rating_sum[0].text
    summary_all = page.xpath(u'//span[@class="all hidden"]')
    summary = page.xpath(u'//span[@property="v:summary"]')
    if len(summary_all) > 0:
        data.summary = ''.join(page.xpath(u'//span[@class="all hidden"]/text()'))
        data.summary = parse_simple(data.summary)
    elif len(summary) > 0:
        data.summary = ''.join(page.xpath(u'//span[@property="v:summary"]/text()'))
        data.summary = parse_simple(data.summary)
    img_url = page.xpath(u'//img[@title="点击看更多海报"]')
    nbgnbg = page.xpath(u'//a[@title="点击看大图" and @class="nbgnbg"]')
    if len(img_url) > 0:
        data.img_url = img_url[0].get("src")
    elif len(nbgnbg) > 0:
        data.img_url = nbgnbg[0].get("href")
    # Decide the top-level category from episode count and tags.
    episodes = int(data.all_episode) if data.all_episode else 0
    if episodes > 1 and (u"动漫" in data.tags or u"动画" in data.tags):
        data.category = u"动漫"
    elif episodes > 1 and (u"综艺" in data.tags or u'真人秀' in data.tags):
        data.category = u'综艺'
    elif episodes > 1:
        data.category = u"电视剧"
    elif u"动漫" in data.tags or u"动画" in data.tags:
        data.category = u'动漫'
    elif u"短片" in data.tags:
        data.category = u'短片'
    else:
        data.category = u'电影'
    m = re.search(u"SUBJECT_ID: *'(\d*)'", r)
    if m:
        data.doubanid = m.group(1)
    print(data.__dict__)
    return data.__dict__
def process(self, task=None):
    """Handle a job task."""
    if task.get("contentName") is None or task.get("contentName") == "":
        pass  # disabled for now during testing
    r_mongo = mongo_conn.contents.find({
        'relationship': {
            '$elemMatch': {
                'mediaId': task.get("mediaId"),
                "platform": task.get("platform")
            }
        }
    })
    if r_mongo.count() > 0:
        return self.after_mongo_succ(r_mongo[0]["_id"], task)
    # Filter out news and other short clips.
    ct = check_title(task['contentName'])
    if ct:
        return self.after_search_failed(task, category=ct)
    if task.get("actor") and task.get("director"):
        # With full metadata, try to match against MongoDB first; fall back
        # to the Baidu search engine when nothing matches.
        print("task:--contentName:%s---contentType:%s---director:%s---actor:%s---area:%s-" %
              (task.get("contentName"), task.get("contentType"),
               task.get("director"), task.get("actor"), task.get("area")))
        regx = {}
        if task.get("director") and task.get("director") != "":
            # Match documents sharing at least one director.
            regx["directors"] = re.compile(
                u"(" + "|".join(process_actor(task.get("director")).split(',')) + ")",
                re.IGNORECASE)
        if task.get("actor") and task.get("actor") != "":
            # Match documents sharing at least one starring actor.
            regx['actors'] = re.compile(
                u"(" + "|".join(process_actor(task.get("actor")).split(',')) + ")",
                re.IGNORECASE)
        if task.get("year"):
            # Match documents sharing at least one year.
            regx['year'] = re.compile(
                u"(" + "|".join(process_actor(task.get("year")).split(',')) + ")",
                re.IGNORECASE)
        if task.get("area"):
            # Match documents sharing at least one area.
            regx['area'] = re.compile(
                u"(" + "|".join(area_process(task.get("area"))) + ")",
                re.IGNORECASE)
        regx_name = title_preprocess_seed(task.get("contentName"))
        regx_name = parse_regx_char(regx_name)
        regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
        regx_name = re.compile(regx_name, re.IGNORECASE)
        regx['title'] = regx_name  # match titles containing the preprocessed name
        contents = mongo_conn.contents.find(regx)
        # Fall back to the Baidu search engine when the metadata store has no match.
        if contents.count() == 0:
            return self.baidu(task)
        ls = []
        lss = {}
        # contentType codes: 100 = regular VOD; 110 = series parent / child episode
        for mon in contents:
            sim = lst.ratio(title_preprocess(task.get("contentName")),
                            title_preprocess(mon['title']))
            print("-sim:%s--title:%s--contentName:%s---contentType:%s---director:%s---actor:%s---area:%s-" %
                  (sim, mon['title'], task.get("contentName"),
                   task.get("contentType"), task.get("director"),
                   task.get("actor"), task.get("area")))
            if sim < 0.80:
                continue
            ls.append(sim)
            lss[sim] = mon.get("_id")
        if len(ls) == 0:
            return self.baidu(task)
        ls.sort()
        try:
            _id = lss[ls[-1]]
        except Exception as e:
            _id = lss[ls[0]]
        print("^^^^^^^^%s" % regx_name)
        return self.after_mongo_succ(_id, task)
    else:
        return self.baidu(task)
def vdetail_parser(self, r):
    try:
        page = etree.HTML(r)
    except Exception as e:
        return False
    L = Contents()
    title = page.xpath(u'//a[@_stat="info:title"]')
    m = re.search(u'\{"id":"(\w*\d*)"', r)
    if m:
        L.qq_id = m.group(1)
    m = re.search(u'&vid=(\w*\d*)&', r)
    if m:
        L.qq_vid = m.group(1)
    if len(title) > 0:
        L.title = title[0].text
    category = page.xpath(u'//span[@class="type"]')
    if len(category) > 0:
        L.category = category[0].text
    area = page.xpath(u'.//span[contains(text(),"地 区:")]')
    if len(area) > 0:
        L.area = area_process(area[0].getnext().text)
    foreign_title = page.xpath(u'//span[@class="title_en"]')
    if len(foreign_title) > 0:
        L.foreign_title = foreign_title[0].text
    qq_play = page.xpath(u'.//a[@_stat="info:playbtn"]')
    if len(qq_play) > 0:
        L.qq_play = qq_play[0].get("href")
    language = page.xpath(u'//span[contains(text(),"语 言:")]')
    if len(language) > 0:
        L.language = language_process(language[0].getnext().text)
    year = page.xpath(u'.//span[contains(text(),"上映时间")]')
    if len(year) > 0 and year[0].getnext().text:
        m = re.search(u'(\d{4})', year[0].getnext().text)
        if m:
            L.year = m.group(1)
    all_episode = page.xpath(u'//span[contains(text(),"总集数:")]')
    if len(all_episode) > 0:
        L.all_episode = all_episode[0].getnext().text
    release_date = page.xpath(u'//span[contains(text(),"出品时间:")]')
    if len(release_date) > 0:
        L.release_date = release_date[0].getnext().text
    if L.release_date and L.year is None:
        try:
            m = re.search(u'(\d{4})', L.release_date)
            if m:
                L.year = m.group(1)
        except Exception as e:
            pass
    year = page.xpath(u'.//span[contains(text(),"首播时间")]')
    if len(year) > 0 and L.year is None:
        m = re.search(u'(\d{4})', year[0].getnext().text)
        if m:
            L.year = m.group(1)
    alias = page.xpath(u'//span[contains(text(),"别 名")]')
    if len(alias) > 0:
        L.alias = alias[0].getnext().text
    tags = page.xpath(u'//a[@class="tag"]')
    if len(tags) > 0:
        _temp = [x.text for x in tags]
        L.tags = ",".join(set(_temp))
    summary = page.xpath(
        u'//span[@class="desc_txt"]/span[@class="txt _desc_txt_lineHight"]')
    if len(summary) > 0:
        L.summary = parse_simple(summary[0].text)
    qq_rating = page.xpath(u'//div[@class="score_v"]/span[@class="score"]')
    if len(qq_rating) > 0:
        L.qq_rating = qq_rating[0].text
    douban_rating = page.xpath(u'//a[@class="score_db"]/span[@class="score"]')
    if len(douban_rating) > 0:
        L.douban_rating = douban_rating[0].text
    poster = page.xpath(u'//img[@_stat="info:poster"]')
    if len(poster) > 0:
        L.poster = []
        if poster[0].get("src"):
            L.poster.append({
                "url": self.parse_imgurl(poster[0].get("src")),
                "name": poster[0].get("alt")
            })
            L.img_url = self.parse_imgurl(poster[0].get("src"))
    # Directors and cast
    actor_list = page.xpath(u'//ul[contains(@class,"actor_list")]/li')
    starring_list = []
    starring = []
    directors_list = []
    directors = []
    if len(actor_list) > 0:
        for actor in actor_list:
            _dic = {}
            actor_avatar = actor.find(u'a')
            if actor_avatar is None:
                continue
            if actor_avatar.find('img') is not None:
                _dic["avatar"] = self.parse_imgurl(actor_avatar.find('img').get("src"))
            _dic["qq_id"] = actor.get("data-id")
            if actor.find("span") is not None:
                _dic["name"] = actor.find("span").text
            _dic["qq_home_page"] = actor_avatar.get("href")
            actor_detail = actor.xpath(u'.//div[@class="actor_detail"]')
            if actor_detail:
                # Occupation
                occupation = actor_detail[0].xpath(u'.//span[contains(text(),"职业")]')
                if occupation:
                    _dic['occupation'] = occupation[0].getnext().text
                # Area
                area = actor_detail[0].xpath(u'.//span[contains(text(),"地区")]')
                if len(area) > 0:
                    _dic['area'] = area[0].getnext().text
            # Short bio
            intro = actor.xpath(u'.//span[@itemprop="description"]')
            if intro:
                _dic["intro"] = intro[0].text
            if actor_avatar.xpath(u'.//span[@class="director"]'):
                # Director
                directors_list.append(_dic)
                directors.append(_dic['name'])
            else:
                # Cast member
                starring_list.append(_dic)
                starring.append(_dic['name'])
    if starring_list:
        L.starring = ','.join(starring)
        L.starring_list = starring_list
    if directors_list:
        L.directors = ','.join(directors)
        L.directors_list = directors_list
    if L.title is None:
        return False
    L.created_at = time.time()
    return L.__dict__
def sync(arges):
    offset = arges["offset"]
    size = arges['size']
    mongo_contents = mongo_conn.contents
    mongo_posters = mongo_conn.posters
    succ = 0
    total = 0
    this_limit = 3000
    this_offset = offset
    step = (size - offset) / this_limit
    print('this_offset:', this_offset, size, size / this_limit)
    for x in xrange(1, step + 1):
        print("offset:", offset + x * this_limit, this_limit)
        db_session = scoped_session(DBSession)
        print(offset + (x - 1) * this_limit, x * this_limit + offset)
        contents = db_session.query(Vod).filter(
            Vod.d_downfrom != "1").order_by(asc(Vod.id)).all()
        for item in contents:
            if item.d_name is None or item.d_name == "" or item.d_pic is None or item.d_pic == "":
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            total += 1
            # Skip items that have no area, or neither director nor starring info.
            if not item.d_area or (not item.d_directed and not item.d_starring):
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            regx_name = title_preprocess_seed(item.d_name)
            regx_name = parse_regx_char(regx_name)
            regx_name = u'.*' + regx_name.replace(u'-', '.*') + ".*"
            print("-----%s-----%s----%s--%s----%s---" %
                  (item.d_name, regx_name, item.d_area, item.d_directed,
                   item.d_starring))
            regx = {}
            # regx["title"] = re.compile(regx_name, re.IGNORECASE)
            regx["title"] = regx_name
            if item.d_directed and item.d_directed != "":
                # regx["directors"] = re.compile(u"(" + "|".join(process_actor(item.d_directed).split(',')) + ")", re.IGNORECASE)
                # Match documents sharing at least one director.
                regx["directors"] = u"(" + "|".join(
                    process_actor(item.d_directed).split(',')) + ")"
            if item.d_starring and item.d_starring != "":
                # regx['actors'] = re.compile(u"(" + "|".join(process_actor(item.d_starring).split(',')) + ")", re.IGNORECASE)
                # Match documents sharing at least one starring actor.
                regx['actors'] = u"(" + "|".join(
                    process_actor(item.d_starring).split(',')) + ")"
            if item.d_area:
                # regx['area'] = re.compile(u"(" + "|".join(area_process(item.d_area)) + ")", re.IGNORECASE)
                regx['area'] = u"(" + "|".join(area_process(item.d_area)) + ")"
            print(json.dumps(regx))
            mongo_rs = mongo_contents.find(regx)
            ls = []
            lss = {}
            print("mongo_rs", mongo_rs.count())
            for mon in mongo_rs:
                sim = lst.ratio(title_preprocess(item.d_name),
                                title_preprocess(mon['title']))
                print("-sim:----%s----%s---%s----regx_name:%s----" %
                      (sim, item.d_name, mon['title'], regx_name))
                if sim < 0.80:
                    continue
                ls.append(sim)
                lss[sim] = mon.get("_id")
                if sim == 1.0:
                    break
            if len(ls) == 0:
                item.d_downfrom = '1'
                db_session.add(item)
                db_session.commit()
                continue
            ls.sort()
            try:
                _id = lss[ls[-1]]
            except Exception as e:
                _id = lss[ls[0]]
            _id = mongo_posters.insert(
                {"url": item.d_pic, "content_id": str(_id)}, check_keys=False)
            print(_id)
            print("-------done--------")
            # return True
            item.d_downfrom = '1'
            db_session.add(item)
            db_session.commit()
        db_session.close()
    print(size, succ)
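# Hedged usage sketch (illustrative values only): sync() expects a dict with
# the MySQL id window to walk, e.g.
#     sync({"offset": 0, "size": 30000})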