Beispiel #1
0
    def check_filename(filename):
        logger.debug('check_filename filename : %s', filename)
        try:
            ret = None
            match1 = re.compile(_REGEX_FILENAME).match(filename)
            match2 = re.compile(_REGEX_FILENAME_NO_EPISODE_NUMBER).match(filename)

            for regex in [_REGEX_FILENAME, _REGEX_FILENAME_NO_EPISODE_NUMBER]:
                match = re.compile(regex).match(filename)
                if match:
                    logger.debug('QQQQQQQQQQQ')
                    ret = {}
                    ret['title'] = match1.group('name')
                    ret['no'] = match1.group('no')
                    ret['date'] = match1.group('date')
                    ret['etc'] = match1.group('etc').replace('.', '')
                    ret['quality'] = match1.group('quality')
                    ret['release'] = None
                    if 'release' in match1.groupdict():
                        ret['release'] = match1.group('release')
                    else:
                        ret['release'] = None
                    if ret['no'] is not None and ret['no'] != '': 
                        ret['no'] = int(ret['no'])
                    else: 
                        ret['no'] = -1
                    return DaumTV.change_filename_continous_episode(ret)
        except Exception as exception: 
            logger.error('Exception:%s', exception)
            logger.error(traceback.format_exc())
Beispiel #2
0
 def change_filename_continous_episode(ret):
     try:
         if ret['title'].find(u'\ud569') == -1:
             return ret
         match = re.compile(_REGEX_FILENAME_RENAME).match(ret['title'])
         if match:
             logger.debug(u'\ud569\ubcf8 : %s', ret['filename'])
             ret['title'] = match.group('title').strip()
             if ret['no'] == -1:
                 ret['no'] = int(match.group('no'))
         return ret
     except Exception as e:
         logger.error('Exception:%s', e)
         logger.error(traceback.format_exc())
Beispiel #3
0
    def get_daum_tv_info(search_name, daum_id=None, on_home=False):
        try:
            entity = {}
            #logger.debug('get_daum_tv_info 1 %s', search_name)
            search_name = DaumTV.get_search_name_from_original(search_name)
            #logger.debug('get_daum_tv_info 2 %s', search_name)

            if daum_id is not None:
                url = 'https://search.daum.net/search?w=tv&q=%s&irk=%s&irt=tv-program&DA=TVP' % (py_urllib.quote(search_name.encode('utf8')), daum_id)
            else:
                url = 'https://search.daum.net/search?w=tv&q=%s' % (py_urllib.quote(search_name.encode('utf8')))

            data = DaumTV.get_html(url)
            match = re.compile(r'irk\=(?P<id>\d+)').search(data)
            root = lxml.html.fromstring(data)
            daum_id = match.group('id') if match else ''
            
            entity = {}
            entity['daum_id'] = daum_id
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/strong')
            if not items: 
                return None
            if len(items) == 1:
                entity['title'] = items[0].text.strip()
                entity['title'] = entity['title'] .replace('?', '').replace(':', '')

            entity['status'] = 0
            # 방송종료, 방송예정
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/span')
            if items:
                if items[0].text.strip() == u'방송종료':
                    entity['status'] = 1
                elif items[0].text.strip() == u'방송예정':
                    entity['status'] = 2
            
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[3]/span')
            # 2019-02-25 방송종료시 정보 없어짐
            if items:
                entity['studio'] = items[0].text.strip()
                try: 
                    entity['broadcast_info'] = items[1].text.strip()
                except: 
                    pass
                try: 
                    entity['broadcast_term'] = items[2].text.strip()
                except: 
                    pass
                try:
                    items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/span')
                except:
                    pass
            else:
                if on_home:
                    logger.debug('on_home : %s', search_name)
                    xml_root = DaumTV.get_show_info_on_home_title(search_name, daum_id=daum_id)
                    home_ret = DaumTV.get_show_info_on_home(xml_root)
                    if home_ret :
                        entity['studio'] = home_ret['studio']
                        entity['broadcast_info'] = home_ret['broadcast_info']
                        entity['broadcast_term'] = home_ret['broadcast_term']
                        #방송예정은 items True가 되어 여기 안온다.

            try:
                match = re.compile(r'(\d{4}\.\d{1,2}\.\d{1,2})~').search(entity['broadcast_term'])
                if match:
                    entity['start_date'] = match.group(1)
            except:
                pass

            items = root.xpath('//*[@id="tv_program"]/div[1]/dl[1]/dd')
            if len(items) == 1:
                entity['genre'] = items[0].text.strip().split(' ')[0]
                entity['genre'] = entity['genre'].split('(')[0].strip()


            items = root.xpath('//*[@id="tv_program"]/div[1]/dl[2]/dd')
            if len(items) == 1:
                entity['summary'] = items[0].text.replace('&nbsp', ' ')

            items = root.xpath('//*[@id="tv_program"]/div[1]/div[1]/a/img')
            if len(items) == 1:
                entity['poster_url'] = items[0].attrib['src'] if items[0].attrib['src'].startswith('http') else f"https:{items[0].attrib['src']}"

            items = root.xpath('//*[@id="clipDateList"]/li')
            entity['episode_list'] = {}

            if len(items) > 300: 
                items = items[len(items)-300:]
            today = int(datetime.now().strftime('%Y%m%d'))

            for item in items:
                try:
                    a_tag = item.xpath('a') 
                    if len(a_tag) == 1:
                        span_tag = a_tag[0].xpath('span[@class="txt_episode"]')
                        if len(span_tag) == 1:
                            if item.attrib['data-clip'] in entity['episode_list']:
                                #같은날짜 같은회차가 두번나올때가 있다. 버그로 보임
                                #EBS 초대석, 2010912 28회
                                if entity['episode_list'][item.attrib['data-clip']][0] == span_tag[0].text.strip().replace(u'회', ''):
                                    pass
                                else:
                                    # 에피소드 넘버가 1차이가 날때만, 마지막꺼와
                                    idx = len(entity['episode_list'][item.attrib['data-clip']]) - 1
                                    _ = abs(int(entity['episode_list'][item.attrib['data-clip']][idx]) - int(span_tag[0].text.strip().replace(u'회', '')))
                                    #2019-06-24
                                    #슬플때사랑한다, 21, 22 순서 
                                    if _ <= 4:
                                        if item.attrib['data-clip'] != '' and today >= int(item.attrib['data-clip']):
                                            entity['last_episode_date'] = item.attrib['data-clip']
                                            entity['last_episode_no'] = span_tag[0].text.strip().replace(u'회', '')
                                        entity['episode_list'][item.attrib['data-clip']].append(span_tag[0].text.strip().replace(u'회', ''))
                                    else:
                                        pass
                                    # Daum에 뜬금없는 에피소드가 끼어져있다
                                    # 무시
                                        
                            else:
                                if item.attrib['data-clip'] != '' and today >= int(item.attrib['data-clip']):
                                    entity['last_episode_date'] = item.attrib['data-clip']
                                    entity['last_episode_no'] = span_tag[0].text.strip().replace(u'회', '')
                                entity['episode_list'][item.attrib['data-clip']] = [span_tag[0].text.strip().replace(u'회', '')]
                except Exception as exception: 
                    logger.error('Exception:%s', exception)
                    logger.error(traceback.format_exc())
            #전체 에피소드 갯수 : len(items)
            #에피소드 dict 갯수  len(entity.episode_list)  
            #정확히 반이면 1일 2회 방송, 1/4이면 1일 4회 방송
            
            # 2019-06-24
            #if len(entity.episode_list) != 0 and len(items) % len(entity.episode_list) == 0:
            #    entity.episode_count_one_day = len(items) / len(entity.episode_list)
            try:
                if len(entity['episode_list']):
                    entity['episode_count_one_day'] = int(round(float(len(items)) / len(entity['episode_list'])))
                    if entity['episode_count_one_day'] == 0:
                        entity['episode_count_one_day'] = 1
                else:
                    entity['episode_count_one_day'] = 1
            except:
                entity['episode_count_one_day'] = 1

            #entity['episode_list_json'] = json.dumps(entity['episode_list'])
            #entity['episode_list_json'] = entity['episode_list']
            #logger.debug(entity['episode_list_json'])
            #entity.save()
            logger.debug('daum tv len(entity.episode_list) : %s %s %s', len(items), len(entity['episode_list']), entity['episode_count_one_day'])
            #logger.debug(entity.episode_list)
            #logger.debug(items)
            return entity  
        except Exception as exception: 
            logger.error('Exception:%s', exception)
            logger.error(traceback.format_exc())
Beispiel #4
0
    def get_show_info_on_home(root):
        try:
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/span/a')
            # 2019-05-13
            #일밤- 미스터리 음악쇼 복면가왕 A 태그 2개
            if len(tags) < 1:
                return
            tag_index = len(tags)-1
            entity = {}
            entity['title'] = tags[tag_index].text
            logger.debug('22222get_show_info_on_home title: %s', entity['title'])
            match = re.compile(r'q\=(?P<title>.*?)&').search(tags[tag_index].attrib['href'])
            if match:
                entity['title'] = py_urllib.unquote(match.group('title'))
            entity['id'] = re.compile(r'irk\=(?P<id>\d+)').search(tags[tag_index].attrib['href']).group('id')

            entity['status'] = 1  
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/span/span')
            if len(tags) == 1:
                if tags[0].text == u'방송종료':
                    entity['status'] = 2
                elif tags[0].text == u'방송예정':
                    entity['status'] = 0
            logger.debug('get_show_info_on_home status: %s', entity['status'])
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div')
            entity['extra_info'] = tags[0].text_content().strip()
            logger.debug('get_show_info_on_home extra_info: %s', entity['extra_info'])

            entity['studio'] = ''
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/a')
            if len(tags) == 1:
                entity['studio'] = tags[0].text
            else:
                tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/span[1]')
                if len(tags) == 1:
                    entity['studio'] = tags[0].text
            logger.debug('get_show_info_on_home studio: %s', entity['studio'])

            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/span')
            entity['extra_info_array'] = [tag.text for tag in tags]
            entity['broadcast_info'] = entity['extra_info_array'][-2].strip()
            entity['broadcast_term'] = entity['extra_info_array'][-1].split(',')[-1].strip()
            entity['year'] = re.compile(r'(?P<year>\d{4})').search(entity['extra_info_array'][-1]).group('year')
            logger.debug('get_show_info_on_home 1: %s', entity['status'])
            #시리즈
            entity['series'] = []
            entity['series'].append({'title':entity['title'], 'id' : entity['id'], 'year' : entity['year']})
            tags = root.xpath('//*[@id="tv_series"]/div/ul/li')

            if tags:
                # 2019-03-05 시리즈 더보기 존재시
                try:
                    more = root.xpath('//*[@id="tv_series"]/div/div/a')
                    url = more[0].attrib['href']
                    if not url.startswith('http'):
                        url = 'https://search.daum.net/search%s' % url
                    logger.debug('MORE URL : %s', url)
                    if more[0].xpath('span')[0].text == u'시리즈 더보기':
                        more_root = HTML.ElementFromURL(url)
                        tags = more_root.xpath('//*[@id="series"]/ul/li')
                except Exception as exception:
                    logger.debug('Not More!')
                    logger.debug(traceback.format_exc())

                for tag in tags:
                    dic = {}
                    dic['title'] = tag.xpath('a')[0].text
                    dic['id'] = re.compile(r'irk\=(?P<id>\d+)').search(tag.xpath('a')[0].attrib['href']).group('id')
                    if tag.xpath('span'):
                        dic['date'] = tag.xpath('span')[0].text
                        dic['year'] = re.compile(r'(?P<year>\d{4})').search(dic['date']).group('year')
                    else:
                        dic['year'] = None
                    entity['series'].append(dic)
                entity['series'] = sorted(entity['series'] , key=lambda k: int(k['id'])) 
            logger.debug('SERIES : %s', len(entity['series']))
            #동명
            entity['equal_name'] = []
            tags = root.xpath(u'//div[@id="tv_program"]//dt[contains(text(),"동명 콘텐츠")]//following-sibling::dd')
            if tags:
                tags = tags[0].xpath('*')
                for tag in tags:
                    if tag.tag == 'a':
                        dic = {}
                        dic['title'] = tag.text
                        dic['id'] = re.compile(r'irk\=(?P<id>\d+)').search(tag.attrib['href']).group('id')
                    elif tag.tag == 'span':
                        match = re.compile(r'\((?P<studio>.*?),\s*(?P<year>\d{4})?\)').search(tag.text)
                        if match:
                            dic['studio'] = match.group('studio')
                            dic['year'] = match.group('year')
                        elif tag.text == u'(동명프로그램)':
                            entity['equal_name'].append(dic)
                        elif tag.text == u'(동명회차)':
                            continue
            logger.debug(entity)
            return entity
        except Exception as exception:
            logger.debug('Exception get_show_info_by_html : %s', exception)
            logger.debug(traceback.format_exc())
Beispiel #5
0
    def get_daum_tv_info(search_name, daum_id = None, on_home = False):
        try:
            entity = {}
            logger.debug('get_daum_tv_info 1 %s', search_name)
            search_name = DaumTV.get_search_name_from_original(search_name)
            logger.debug('get_daum_tv_info 2 %s', search_name)
            if daum_id is not None:
                url = 'https://search.daum.net/search?w=tv&q=%s&irk=%s&irt=tv-program&DA=TVP' % (urllib.quote(search_name.encode('utf8')), daum_id)
            else:
                url = 'https://search.daum.net/search?w=tv&q=%s' % urllib.quote(search_name.encode('utf8'))
            data = DaumTV.get_html(url)
            match = re.compile('irk\\=(?P<id>\\d+)').search(data)
            root = lxml.html.fromstring(data)
            daum_id = match.group('id') if match else ''
            entity = {}
            entity['daum_id'] = daum_id
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/strong')
            if not items:
                return
            if len(items) == 1:
                entity['title'] = items[0].text.strip()
                entity['title'] = entity['title'].replace('?', '').replace(':', '')
            entity['status'] = 0
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/span')
            if items:
                if items[0].text.strip() == u'\ubc29\uc1a1\uc885\ub8cc':
                    entity['status'] = 1
                elif items[0].text.strip() == u'\ubc29\uc1a1\uc608\uc815':
                    entity['status'] = 2
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[3]/span')
            if items:
                entity['studio'] = items[0].text.strip()
                try:
                    entity['broadcast_info'] = items[1].text.strip()
                except:
                    pass

                try:
                    entity['broadcast_term'] = items[2].text.strip()
                except:
                    pass

                try:
                    items = root.xpath('//*[@id="tv_program"]/div[1]/div[2]/span')
                except:
                    pass

            elif on_home:
                logger.debug('on_home : %s', search_name)
                xml_root = DaumTV.get_show_info_on_home_title(search_name, daum_id=daum_id)
                home_ret = DaumTV.get_show_info_on_home(xml_root)
                if home_ret:
                    entity['studio'] = home_ret['studio']
                    entity['broadcast_info'] = home_ret['broadcast_info']
                    entity['broadcast_term'] = home_ret['broadcast_term']
            try:
                match = re.compile('(\\d{4}\\.\\d{1,2}\\.\\d{1,2})~').search(entity['broadcast_term'])
                if match:
                    entity['start_date'] = match.group(1)
            except:
                pass

            items = root.xpath('//*[@id="tv_program"]/div[1]/dl[1]/dd')
            if len(items) == 1:
                entity['genre'] = items[0].text.strip().split(' ')[0]
                entity['genre'] = entity['genre'].split('(')[0].strip()
            items = root.xpath('//*[@id="tv_program"]/div[1]/dl[2]/dd')
            if len(items) == 1:
                entity['summary'] = items[0].text.replace('&nbsp', ' ')
            items = root.xpath('//*[@id="tv_program"]/div[1]/div[1]/a/img')
            if len(items) == 1:
                entity['poster_url'] = 'https:%s' % items[0].attrib['src']
            items = root.xpath('//*[@id="clipDateList"]/li')
            entity['episode_list'] = {}
            if len(items) > 300:
                items = items[len(items) - 300:]
            today = int(datetime.now().strftime('%Y%m%d'))
            for item in items:
                try:
                    a_tag = item.xpath('a')
                    if len(a_tag) == 1:
                        span_tag = a_tag[0].xpath('span[@class="txt_episode"]')
                        if len(span_tag) == 1:
                            if item.attrib['data-clip'] in entity['episode_list']:
                                if entity['episode_list'][item.attrib['data-clip']][0] == span_tag[0].text.strip().replace(u'\ud68c', ''):
                                    pass
                                else:
                                    idx = len(entity['episode_list'][item.attrib['data-clip']]) - 1
                                    _ = abs(int(entity['episode_list'][item.attrib['data-clip']][idx]) - int(span_tag[0].text.strip().replace(u'\ud68c', '')))
                                    if _ <= 4:
                                        if item.attrib['data-clip'] != '' and today >= int(item.attrib['data-clip']):
                                            entity['last_episode_date'] = item.attrib['data-clip']
                                            entity['last_episode_no'] = span_tag[0].text.strip().replace(u'\ud68c', '')
                                        entity['episode_list'][item.attrib['data-clip']].append(span_tag[0].text.strip().replace(u'\ud68c', ''))
                            else:
                                if item.attrib['data-clip'] != '' and today >= int(item.attrib['data-clip']):
                                    entity['last_episode_date'] = item.attrib['data-clip']
                                    entity['last_episode_no'] = span_tag[0].text.strip().replace(u'\ud68c', '')
                                entity['episode_list'][item.attrib['data-clip']] = [span_tag[0].text.strip().replace(u'\ud68c', '')]
                except Exception as e:
                    logger.error('Exception:%s', e)
                    logger.error(traceback.format_exc())

            try:
                if len(entity['episode_list']):
                    entity['episode_count_one_day'] = int(round(float(len(items)) / len(entity['episode_list'])))
                    if entity['episode_count_one_day'] == 0:
                        entity['episode_count_one_day'] = 1
                else:
                    entity['episode_count_one_day'] = 1
            except:
                entity['episode_count_one_day'] = 1

            #entity['episode_list_json'] = json.dumps(entity['episode_list'])
            logger.debug('daum tv len(entity.episode_list) : %s %s %s', len(items), len(entity['episode_list']), entity['episode_count_one_day'])
            return entity
        except Exception as e:
            logger.error('Exception:%s', e)
            logger.error(traceback.format_exc())

        return
Beispiel #6
0
    def get_show_info_on_home(root):
        try:
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/span/a')
            if len(tags) < 1:
                return
            tag_index = len(tags) - 1
            entity = {}
            entity['title'] = tags[tag_index].text
            logger.debug('get_show_info_on_home title: %s', entity['title'])
            match = re.compile('q\\=(?P<title>.*?)&').search(tags[tag_index].attrib['href'])
            if match:
                entity['title'] = urllib.unquote(match.group('title'))
            entity['id'] = re.compile('irk\\=(?P<id>\\d+)').search(tags[tag_index].attrib['href']).group('id')
            entity['status'] = 1
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/span/span')
            if len(tags) == 1:
                if tags[0].text == u'\ubc29\uc1a1\uc885\ub8cc':
                    entity['status'] = 2
                elif tags[0].text == u'\ubc29\uc1a1\uc608\uc815':
                    entity['status'] = 0
            logger.debug('get_show_info_on_home status: %s', entity['status'])
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div')
            entity['extra_info'] = tags[0].text_content().strip()
            logger.debug('get_show_info_on_home extra_info: %s', entity['extra_info'])
            entity['studio'] = ''
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/a')
            if len(tags) == 1:
                entity['studio'] = tags[0].text
            else:
                tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/span[1]')
                if len(tags) == 1:
                    entity['studio'] = tags[0].text
            logger.debug('get_show_info_on_home studio: %s', entity['studio'])
            tags = root.xpath('//*[@id="tvpColl"]/div[2]/div/div[1]/div/span')
            entity['extra_info_array'] = [ tag.text for tag in tags ]
            entity['broadcast_info'] = entity['extra_info_array'][-2].strip()
            entity['broadcast_term'] = entity['extra_info_array'][-1].split(',')[-1].strip()
            entity['year'] = re.compile('(?P<year>\\d{4})').search(entity['extra_info_array'][-1]).group('year')
            logger.debug('get_show_info_on_home 1: %s', entity['status'])
            entity['series'] = []
            entity['series'].append({'title': entity['title'],
             'id': entity['id'],
             'year': entity['year']})
            tags = root.xpath('//*[@id="tv_series"]/div/ul/li')
            if tags:
                try:
                    more = root.xpath('//*[@id="tv_series"]/div/div/a')
                    url = more[0].attrib['href']
                    if not url.startswith('http'):
                        url = 'https://search.daum.net/search%s' % url
                    logger.debug('MORE URL : %s', url)
                    if more[0].xpath('span')[0].text == u'\uc2dc\ub9ac\uc988 \ub354\ubcf4\uae30':
                        more_root = HTML.ElementFromURL(url)
                        tags = more_root.xpath('//*[@id="series"]/ul/li')
                except Exception as e:
                    logger.debug('Not More!')
                    logger.debug(traceback.format_exc())

                for tag in tags:
                    dic = {}
                    dic['title'] = tag.xpath('a')[0].text
                    dic['id'] = re.compile('irk\\=(?P<id>\\d+)').search(tag.xpath('a')[0].attrib['href']).group('id')
                    if tag.xpath('span'):
                        dic['date'] = tag.xpath('span')[0].text
                        dic['year'] = re.compile('(?P<year>\\d{4})').search(dic['date']).group('year')
                    else:
                        dic['year'] = None
                    entity['series'].append(dic)

                entity['series'] = sorted(entity['series'], key=lambda k: int(k['id']))
            logger.debug('SERIES : %s', len(entity['series']))
            entity['equal_name'] = []
            tags = root.xpath(u'//div[@id="tv_program"]//dt[contains(text(),"\ub3d9\uba85 \ucf58\ud150\uce20")]//following-sibling::dd')
            if tags:
                tags = tags[0].xpath('*')
                for tag in tags:
                    if tag.tag == 'a':
                        dic = {}
                        dic['title'] = tag.text
                        dic['id'] = re.compile('irk\\=(?P<id>\\d+)').search(tag.attrib['href']).group('id')
                    elif tag.tag == 'span':
                        match = re.compile('\\((?P<studio>.*?),\\s*(?P<year>\\d{4})?\\)').search(tag.text)
                        if match:
                            dic['studio'] = match.group('studio')
                            dic['year'] = match.group('year')
                        elif tag.text == u'(\ub3d9\uba85\ud504\ub85c\uadf8\ub7a8)':
                            entity['equal_name'].append(dic)
                        elif tag.text == u'(\ub3d9\uba85\ud68c\ucc28)':
                            continue

            logger.debug(entity)
            return entity
        except Exception as e:
            logger.debug('Exception get_show_info_by_html : %s', e)
            logger.debug(traceback.format_exc())

        return