Example #1
def get_naver(s, key, mode='blog'):
    url = 'https://openapi.naver.com/v1/search/%s?query=' % mode
    encText = urllib.parse.quote(key)
    options = '&display=20&sort=date'
    req_url = url + encText + options
    request = urllib.request.Request(req_url)
    request.add_header('X-Naver-Client-Id', s.naver_client_id)
    request.add_header('X-Naver-Client-Secret', s.naver_secret)
    response = urllib.request.urlopen(request)
    rescode = response.getcode()
    if (rescode != 200):
        s.logger.error('[NAVER] Error Code: %d', rescode)
        return None
    response_body = response.read()
    data = response_body.decode('utf-8')
    js = json.loads(data)
    items = int(js["display"])
    for i in range(0, items):
        # http://blog.naver.com/ecampus_kgu?Redirect=Log&logNo=220965327425
        try:
            page_num = get_naver_blog_page_num(js["items"][i]["link"])
        except Exception:
            continue
        # http://blog.naver.com/ecampus_kgu
        user_id = get_naver_blog_user_id(js["items"][i]["bloggerlink"])
        if user_id is None:
            continue
        naver_blog_link = '%s/%s' % (js["items"][i]["bloggerlink"], page_num)
        post_date = get_naver_blog_post_date(js["items"][i]["postdate"])
        # print(js["items"][i]["description"])
        title = js["items"][i]["title"]
        append_google_sheet(s, user_id, naver_blog_link, title, post_date,
                            'NAVER', '블로그')
    return
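Every example on this page hands its results to append_google_sheet, a helper that is not shown here. A minimal sketch of what it could look like, assuming gspread and a pre-opened worksheet on the shared session object s (both are assumptions, not part of the original code):

import gspread

def open_results_worksheet(keyfile, sheet_name):
    # Hypothetical: authorize with a service-account key and open the first tab.
    gc = gspread.service_account(filename=keyfile)
    return gc.open(sheet_name).sheet1

def append_google_sheet(s, user_id, link, title, post_date, site, category=''):
    # Hypothetical sketch: one collected post becomes one row in the sheet.
    # The real helper only has to accept the 6 or 7 positional arguments
    # used in the calls on this page.
    s.worksheet.append_row([user_id, link, title, post_date, site, category])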
Example #2
def get_daum_agora(s, key):

    url = 'http://agora.media.daum.net/nsearch/total?query=%s' % key
    r = get(url)
    if r.status_code != codes.ok:
        s.logger.error('[DAUM AGORA] request error')
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    for sre in soup.find_all(s.match_soup_class(['sResult'])):
        rows = sre.findChildren(['dt', 'dl'])
        for row in rows:
            cells = row.findChildren('dt')
            for cell in cells:
                date = row.find(s.match_soup_class(['date']))
                temp_date = date.text.split(' ')
                post_date = temp_date[0].replace('.', '-')
                if not post_date.startswith('2'):
                    continue
                for a_tag in row.find_all('a'):
                    user_id = a_tag.text  # last text is user_id, so overwrite.
                # print(row.a.text) # title
                # #print(row.a['href'])  # url
                # print(user_id.strip())
                append_google_sheet(s, user_id.strip(), row.a['href'],
                                    'No title', post_date, 'DAUM', '아고라')
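Most scrapers on this page filter tags with s.match_soup_class([...]), which is also not shown. A minimal sketch of such a matcher, assuming it simply builds a predicate for BeautifulSoup's find_all (the function name and the mode argument are assumptions):

from bs4 import BeautifulSoup
from requests import get, codes  # the bare get()/codes used throughout presumably come from requests

def match_soup_class(target, mode='all'):
    # Hypothetical sketch: return a predicate that keeps tags carrying
    # all (or any) of the requested CSS classes.
    def do_match(tag):
        classes = tag.get('class', [])
        if mode == 'all':
            return all(c in classes for c in target)
        return any(c in classes for c in target)
    return do_match

# Usage mirroring the call in get_daum_agora (normally accessed as s.match_soup_class):
r = get('http://agora.media.daum.net/nsearch/total?query=test')
if r.status_code == codes.ok:
    soup = BeautifulSoup(r.text, 'html.parser')
    results = soup.find_all(match_soup_class(['sResult']))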
Example #3
def get_daum(s, key, mode='date'):

    # https://apis.daum.net/search/blog?apikey={apikey}&q=다음&output=json
    url = 'https://apis.daum.net/search/blog?apikey=%s&q=' % (s.daum_app_key)
    encText = urllib.parse.quote(key)
    options = '&result=20&sort=%s&output=json' % mode
    req_url = url + encText + options
    request = urllib.request.Request(req_url)
    try:
        response = urllib.request.urlopen(request)
    except:
        s.logger.error('[DAUM]error: %s %s', key, sys.exc_info()[0])
        return None
    rescode = response.getcode()
    if (rescode != 200):
        s.logger.error('[DAUM] Error Code: %d', rescode)
        return None

    # http://xxx.tistory.com
    p1 = re.compile(r'^http://\w+\.tistory\.com/\d+')
    # https://brunch.co.kr/@xxx/x
    p2 = re.compile(r'^https://brunch\.co\.kr/@\w+/\d+')

    response_body = response.read()
    data = response_body.decode('utf-8')
    res = json.loads(data)
    for i in range(len(res['channel']['item'])):
        # title = res["channel"]['item'][i]['title']
        daum_blog_link = res["channel"]['item'][i]['link']
        # TODO : add duplicated check all functions at once.
        # if (s.check_duplicate_item(daum_blog_link, 'daum')):
        #     continue  # True duplicated
        m = p1.match(daum_blog_link)  # http://xxx.tistory.com
        if m:
            user_id = re.search(r'^http://(.*)\.tistory\.com/\d+',
                                daum_blog_link)
            title, post_date = parse_tistory_page(s, daum_blog_link)
            if title is None or post_date is None:
                continue
            append_google_sheet(s, user_id.group(1), daum_blog_link, title,
                                post_date, 'DAUM', '블로그')
            continue

        m = p2.match(daum_blog_link)  # https://brunch.co.kr/@xxx/x
        if m:
            user_id = re.search(r'^https://brunch\.co\.kr/@(.*)/\d+',
                                daum_blog_link)
            title, post_date = parse_brunch_page(daum_blog_link)
            if title is None or post_date is None:
                continue
            append_google_sheet(s, user_id.group(1), daum_blog_link, title,
                                post_date, 'DAUM', '블로그')
            continue
        else:
            s.logger.info('[drop] %s', daum_blog_link)  # drop

    return
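A possible simplification of the match-then-search pairs above: give each pattern a capture group so a single match both validates the link and extracts the user id. This is only a sketch, not the original code:

import re

TISTORY = re.compile(r'^http://(\w+)\.tistory\.com/\d+')
BRUNCH = re.compile(r'^https://brunch\.co\.kr/@(\w+)/\d+')

def extract_user_id(link):
    # Return the blog user id for tistory/brunch links, or None for anything else.
    for pattern in (TISTORY, BRUNCH):
        m = pattern.match(link)
        if m:
            return m.group(1)
    return None

# extract_user_id('http://foo.tistory.com/123') -> 'foo'
# extract_user_id('https://brunch.co.kr/@bar/7') -> 'bar'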
Example #4
def get_dcinside(s, key):
    url = 'http://search.dcinside.com/post/q/%s' % key
    r = get(url)
    if r.status_code != codes.ok:
        s.logger.error('[DCINSIDE] request error')
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    for thumb in soup.find_all(s.match_soup_class(['thumb_list'])):
        for thumb_txt in thumb.find_all(s.match_soup_class(['thumb_txt'])):
            post_date = thumb_txt.span.string.split()
            append_google_sheet(s, '직접채워야함', thumb_txt.a['href'], 'no title',
                                post_date[0].replace('.', '-'), 'DCINSIDE')
Example #5
def get_ilbe(s, key):
    url = 'https://www.ilbe.com/?act=IS&where=document&is_keyword=%s' % key
    r = get(url)
    if r.status_code != codes.ok:
        s.logger.error('[ILGANBEST] request error')
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    for sre in soup.find_all(s.match_soup_class(['searchResult'])):
        for li in sre.find_all('li'):
            ilbe = li.address.text.split('|')
            # print('User ID: ', ilbe[0])
            # print('post_date: ', ilbe[1].split()[0])
            append_google_sheet(s, ilbe[0], li.a['href'], 'no title',
                                ilbe[1].split()[0], '일간베스트')
Example #6
def get_clien(s, key):
    date_regex = re.compile(r'^\d+:\d+')
    for i in range(1, 10):  # search pages 1-9
        url = 'http://www.clien.net/cs2/bbs/board.php?bo_table=kin&page=%d' % i
        r = get(url)
        if r.status_code != codes.ok:
            s.logger.error('[CLIEN] request error')
            return None

        soup = BeautifulSoup(r.content.decode('utf-8', 'replace'),
                             'html.parser')
        url, user_id, post_date = None, None, None
        for ps in soup.find_all(s.match_soup_class(['mytr'])):
            for td in ps.find_all('td'):
                try:
                    td.a['href']
                    title = td.a.text
                    if key in title:
                        url = 'http://www.clien.net/cs2/%s' % td.a['href'][3:]
                        if url.startswith(
                                'http://www.clien.net/cs2/bbs/board.php?bo_table=kin&sca='
                        ):
                            url = None
                except TypeError:
                    pass

                if user_id is None:
                    spans = td.find_all('span', attrs={'class': 'member'})
                    for span in spans:
                        user_id = span.text
                else:
                    span = td.find('span')
                    matched = date_regex.match(span.text)
                    if matched:
                        post_date = s.today
                    else:
                        if span.text in ('12-30', '12-31'):
                            # TODO : need a better way to handle year rollover
                            post_date = '%s-%s' % (s.last_year, span.text)
                        else:
                            post_date = '%s-%s' % (s.this_year, span.text)

                    if url is not None:
                        append_google_sheet(s, user_id, url, title, post_date,
                                            '클리앙')
                    url, user_id, post_date = None, None, None
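The '12-30'/'12-31' special cases flagged with TODO above (the same pattern appears in get_bobedream below) could be replaced by comparing the parsed month-day against today's date: a board date that would lie in the future is assumed to belong to last year. A sketch of that idea, not part of the original code:

from datetime import date, datetime

def resolve_board_date(month_day, today=None):
    # month_day is a 'MM-DD' string as shown on the board listing.
    # (A leap-day '02-29' would need extra care; strptime parses against 1900.)
    today = today or date.today()
    parsed = datetime.strptime(month_day, '%m-%d').date()
    candidate = parsed.replace(year=today.year)
    if candidate > today:
        candidate = candidate.replace(year=today.year - 1)
    return candidate.isoformat()  # 'YYYY-MM-DD', matching the sheet format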
Example #7
def get_bobedream(s, key):
    for i in range(1, 10):  # search pages 1-9
        url = 'http://www.bobaedream.co.kr/list?code=freeb&s_cate=&maker_no=&model_no=&or_gu=10&or_se=desc&s_selday=&pagescale=30&info3=&noticeShow=&s_select=&s_key=&level_no=&vdate=&type=list&page=%d' % i

        r = get(url)
        if r.status_code != codes.ok:
            s.logger.error('[BOBEDREAM] request error')
            return None

        soup = BeautifulSoup(r.content.decode('utf-8', 'replace'),
                             'html.parser')
        url, user_id, post_date, title = None, None, None, None
        for td in soup.find_all('td'):
            try:
                td.a['href']
                title = td.a.text
                if key in title:
                    url = td.a['href']
            except TypeError:
                txt = str(td)
                if txt.startswith('<td class="date">'):
                    if (td.text.find(':') > 0):
                        post_date = s.today
                    else:
                        # TODO : need a better way to handle year rollover
                        if s.today[5:] in ('12-30', '12-31'):
                            post_date = '%s-%s' % (s.last_year,
                                                   td.text.replace('/', '-'))
                        else:
                            post_date = '%s-%s' % (s.this_year,
                                                   td.text.replace('/', '-'))
                    # print(url, user_id, post_date, title)
                    if (url is not None and
                            not url.startswith('http://www.bobaedream.co.kr')
                            and not url.endswith('%2Flist%3Fcode%3Dfreeb')
                        ):  # ignore ad and popular
                        url = 'http://www.bobaedream.co.kr%s' % url
                        append_google_sheet(s, user_id, url, title, post_date,
                                            '보배드림')
                    url, user_id, post_date, title = None, None, None, None
                else:
                    spans = td.find_all('span', attrs={'class': 'author'})
                    for span in spans:
                        user_id = span.string
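get_clien and get_bobedream above (and get_today_humor and get_insoya below) probe td.a['href'] inside try/except to detect cells that contain a link: td.a is None when the cell has no anchor, so the subscript raises TypeError. An explicit equivalent, shown only for illustration:

from bs4 import BeautifulSoup

def link_cell(td):
    # Explicit version of the try: td.a['href'] / except TypeError idiom.
    a_tag = td.find('a')
    if a_tag is not None and a_tag.has_attr('href'):
        return a_tag['href'], a_tag.text
    return None, None

# Tiny demonstration:
soup = BeautifulSoup('<td><a href="/post/1">hello</a></td>', 'html.parser')
print(link_cell(soup.td))  # ('/post/1', 'hello')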
Example #8
def get_twitter_search(s, key):
    try:
        timeline = s.twitter.search(q=key, result_type='popular', count=20)

        tl = timeline  # twython already returns a parsed dict
        for i in tl['statuses']:
            post_url = None  # avoid reusing the previous tweet's url
            for url in i['entities']['urls']:
                post_url = url['url']
                break  # only the first url is needed
            post_date = get_twitter_post_date(i['created_at'])
            append_google_sheet(s, i['text'], post_url, 'No title', post_date,
                                'TWITTER')
            # print('[USER_CREATED_AT]', i['user']['created_at'])

    except TwythonError as e:
        s.logger.error('TwythonError %s', e)
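get_twitter_post_date is not shown on this page. Twitter's created_at field looks like 'Wed Aug 27 13:08:45 +0000 2008', so a plausible sketch (assuming an English locale for %a/%b) is:

from datetime import datetime

def get_twitter_post_date(created_at):
    # Hypothetical sketch: reduce Twitter's created_at timestamp to the
    # 'YYYY-MM-DD' format the other collectors write to the sheet.
    parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
    return parsed.strftime('%Y-%m-%d')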
Example #9
def get_today_humor(s, key):
    url = 'http://www.todayhumor.co.kr/board/list.php?kind=search&keyfield=subject&keyword=%s&Submit.x=0&Submit.y=0&Submit=검색' % key
    r = get(url)
    if r.status_code != codes.ok:
        s.logger.error('[TodayHumor] request error')
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    for l in soup.find_all(s.match_soup_class(['view'])):
        idx = 0
        temp_url = None
        for o in l.find_all('td'):
            idx += 1
            if idx == 1:
                continue
            # print('idx=', idx, '=>', o.text)
            try:
                o.a['href']
                if temp_url is None:
                    temp_url = o.a['href']
                    u = re.search('(.*)&keyfield(.*)', temp_url)
                    if u is None:
                        temp_url = None
                    else:
                        url = 'http://www.todayhumor.co.kr%s' % u.group(1)
            except TypeError:
                pass
            # print('idx=', idx, 'text=', o.text)
            if idx % 7 == 3:
                title = o.text
            if idx % 7 == 4:
                user_id = o.text
            if idx % 7 == 5:
                temp_date = o.text
                temp_date = temp_date.replace('/', '-')
                rm_hm = temp_date.split()  # rm hour, minute
                post_date = '20%s' % rm_hm[0]

                append_google_sheet(s, user_id, url, title, post_date,
                                    '오늘의 유머')
                user_id, url, title, post_date = None, None, None, None
                temp_url = None
    return None
Example #10
def get_nate_pann(s, key):
    url = 'http://pann.nate.com/search?searchType=A&q=%s' % key
    r = get(url)
    if r.status_code != codes.ok:
        s.logger.error('[Nate] request error')
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    for srch in soup.find_all(s.match_soup_class(['srch_list'])):
        rows = srch.findChildren(['dt', 'dl'])
        for row in rows:
            cells = row.findChildren('dt')
            for cell in cells:
                info = row.text.strip().split('\n')
                title = info[0]
                date = info[-1].replace('.', '-')
                user_id, post_date = get_nate_id_and_date(date, info[-2])
                if user_id is None or post_date is None:
                    continue
                append_google_sheet(s, user_id, row.a['href'], title,
                                    post_date, 'NATE', '판')
Example #11
def get_insoya(s, key):
    for i in range(1, 5):  # search pages 1-4
        url = 'http://www.insoya.com/bbs/zboard.php?id=talkmaple&page=%d&divpage=15' % i

        r = get(url)
        if r.status_code != codes.ok:
            s.logger.error('[INSOYA] request error')
            return None

        soup = BeautifulSoup(r.text, 'html.parser')
        url, user_id, post_date, title = None, None, None, None
        for td in soup.find_all('td'):
            try:
                td.a['href']
                title = td.a.string
                if title is not None and key in title:
                    url = td.a['href']

                if (url is not None
                        and url.startswith('zboard.php?id=talkmaple')):
                    url = 'http://www.insoya.com/bbs/%s' % url
                    # print(url)
            except TypeError:
                spans = td.find_all('span', attrs={'class': 'memberSelect'})
                for span in spans:
                    if url is None:
                        break
                    user_id = span.string
                    # print(user_id)
                txt = str(td)
                if txt.startswith('<td class="eng w_date">'):
                    if url is not None:
                        w_date = str(td.text).split()
                        post_date = '20%s' % w_date[0].replace('.', '-')
                        # print(user_id, url, title, post_date)
                        append_google_sheet(s, user_id, url, title, post_date,
                                            '인소야닷컴')
                    url, user_id, post_date, title = None, None, None, None