Example 1
def _get_news_url_list_by_pat_and_date(pat, date):
    list_ = []
    tag = False
    html = urlopen_and_read(pat % '').decode('gbk', 'ignore')
    # the total number of list pages is embedded as "var maxPage = N;"
    page = int(re.search('var maxPage = (\\d+);', html).group(1))
    soup = Soup(html)
    while True:
        for div in soup.find_all('div', class_='article'):
            url = div.find_all('a')[1]['href']
            # url[21:29] carries the publication date as YYYYMMDD
            if url[21:29] > date:
                continue
            elif url[21:29] == date:
                # drop any query string and deduplicate
                if '?' in url:
                    url = url[:url.find('?')]
                if url not in list_:
                    list_.append(url)
            else:
                # articles are older than the requested date: stop paging
                tag = True
                break
        page -= 1
        # stop when the older-than-date marker was seen or the pages are exhausted
        if tag or page < 1:
            break
        soup = Soup(
            urlopen_and_read(pat % '_%u' % page).decode('gbk', 'ignore'))
    return list_
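Example 1 (and nearly every example below) leans on two helpers defined elsewhere in the crawler: urlopen_and_read, which fetches a URL and returns the raw response bytes, and Soup, a thin wrapper around BeautifulSoup. A minimal sketch of what they might look like, assuming the standard library urllib and the beautifulsoup4 package; the User-Agent header and the timeout default are illustrative, not the original implementation:

from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def Soup(markup):
    # BeautifulSoup wrapper with an explicit parser, matching the Soup(html) calls in the examples
    return BeautifulSoup(markup, 'html.parser')


def urlopen_and_read(url, data=None, timeout=30):
    # Fetch a URL (POSTing `data` when given) and return the raw response bytes;
    # callers decode them with the site-specific charset ('gbk', 'utf-8', ...).
    req = Request(url, data=data, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(req, timeout=timeout).read()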
Example 2
def _work_item(news_module_str, store_module, news_module, type_, url, logger):
    if DEBUG:
        print(url)
    try:
        try_ = TRY_TIME
        while True:
            try:
                html = urlopen_and_read(url).decode(news_module.NEWS_CHARSET, 'ignore')
                news = news_module.match_news(html, url)
                if not news:
                    return
                break
            except Exception:
                # retry the download/parse a limited number of times, then re-raise
                if try_:
                    try_ -= 1
                    continue
                else:
                    raise
        file_name = re.sub(NAME_PAT, '', news.title)
        if not type_:
            try:
                type_ = news_module.get_type(html)
            except Exception:
                type_ = 'temp'
        store_args = news_module_str, type_, file_name
        if not DEBUG:
            store_module.store_news(news, store_args)
        comment_url_args = news.comment_url_args
        web.news.crawl_comments(
            news_module,
            _match_and_store_comments(news_module_str, news_module, store_module),
            store_args, comment_url_args)
    except Exception as e:
        logger.error("\"%s\" happened on '%s' '%s' work_item" % (e, news_module_str, url))
        if DEBUG:
            raise
Example 3
def match_news(html, url):
    soup = Soup(html)
    # the news id in the URL maps to a Changyan comment thread (topic_id)
    sid = re.search('/n(\\d+)', url).group(1)
    url2 = 'http://changyan.sohu.com/node/html?client_id=cyqemw6s1&topicsid=%s' % sid
    topic_id = eval(urlopen_and_read(url2).decode(
        'utf-8', 'ignore'))['listData']['topic_id']
    comment_url_args = (topic_id, )
    title = soup.title.text
    main_content = soup.find('div', {'itemprop': 'articleBody'})
    if not main_content: return
    if main_content.img:
        news_image = main_content.img['src']
    else:
        news_image = None
    content = '\n'.join([
        temp.strip()
        for temp in [item.get_text() for item in main_content.find_all('p')]
        if not re.match('\\s*$', temp)
    ])
    source = soup.find('span', {'itemprop': 'name'}).text
    source_url = soup.find('span', {'itemprop': 'isBasedOnUrl'}).text
    date = soup.find('div', {'itemprop': 'datePublished'}).get_text()
    date = time.strptime(date, "%Y-%m-%d %H:%M:%S")
    date = time.mktime(date)
    return News(url,
                comment_url_args,
                title,
                content,
                source,
                date,
                source_url,
                news_image=news_image)
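match_news packs its result into a News object that the rest of the pipeline passes around. Its constructor signature can be read off the return statement above; a minimal stand-in (assuming Python 3.7+ for the defaults argument), with no fields beyond the ones actually used:

from collections import namedtuple

# Field order mirrors the positional arguments of the return statement above;
# news_image is the only optional field.
News = namedtuple('News', ['url', 'comment_url_args', 'title', 'content',
                           'source', 'date', 'source_url', 'news_image'],
                  defaults=(None,))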
Example 4
def get_news_url_list(date_=None):
    if not date_:
        # default to yesterday; subtracting 86400 s avoids building an
        # invalid day-0 tuple at the start of a month
        date_ = time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400))
    else:
        date_ = time.strftime('%Y-%m-%d', time.strptime(date_, "%Y%m%d"))
    data = urlopen_and_read("http://news.163.com/special/0001220O/news_json.js").decode("gbk", 'ignore')
    data = eval(data[data.find('{'):data.rfind('}')+1])
    data = data['news']
    dict_ = {}
    for newsData in data:
        try:
            type_ = DICT[newsData[0]['c']]
        except (KeyError, IndexError):
            # channel code not recognised in DICT: skip this group
            continue
        dict_.setdefault(type_, [])
        for newsDataItem in newsData:
            if newsDataItem['p'][0:10] > date_:
                continue
            elif newsDataItem['p'][0:10] == date_:
                url = newsDataItem['l']
                if '?' in url:
                    url = url[:url.find('?')]
                if url not in dict_[type_]:
                    dict_[type_].append(url)
            else:
                break
    return dict_
Example 5
def get_news_url_list(date_=None):
    if not date_:
        # default to yesterday; mktime normalizes day-1 across month boundaries
        time_ = time.localtime()
        time1 = time.mktime((time_[0], time_[1], time_[2]-1, 0, 0, 0, 0, 0, 0))
        time2 = time.mktime((time_[0], time_[1], time_[2]-1, 23, 59, 59, 0, 0, 0))
    else:
        time_ = time.strptime(date_, "%Y%m%d")
        time1 = time.mktime((time_[0], time_[1], time_[2], 0, 0, 0, 0, 0, 0))
        time2 = time.mktime((time_[0], time_[1], time_[2], 23, 59, 59, 0, 0, 0))
    # web_ (the site to search) and url_pat (the accepted article URL pattern)
    # are module-level settings; q6 restricts the Baidu news search to that site
    _YESTERDAY_URL_PAT = 'http://news.baidu.com/ns?bt=%u&et=%u&tn=newstitledy&rn=50&q6=%s' % (time1, time2, web_) + '&pn=%u'
    list_ = []
    pn = 0
    while True:
        web = _YESTERDAY_URL_PAT % pn
        html = urlopen_and_read(web).decode('utf-8')
        soup = Soup(html)
        div_list = soup.find_all('div', class_='result')
        for div in div_list:
            web = div.a['href']
            if re.match(url_pat, web):
                if '?' in web:
                    web = web[:web.find('?')]
                if web not in list_:
                    list_.append(web)
        # "下一页" means "next page"; stop when the link is absent
        if not re.search('下一页', html):
            break
        pn += 50
    return {'': list_}
Example 6
def get_news_url_list(date=None):
    if not date:
        # default to yesterday; subtracting 86400 s avoids building an
        # invalid day-0 tuple at the start of a month
        date = time.strftime('%Y%m%d', time.localtime(time.time() - 86400))
    dict_ = {}
    for type_ in DICT_.keys():
        type_id = DICT_[type_]
        page = 1
        while True:
            url_ = URL_PAT % (type_id, date, page)
            try:
                soup = Soup(urlopen_and_read(url_).decode('utf-8'))
                url_list = [
                    item['href'] for item in soup.find(
                        'div', class_='newsList').ul.find_all('a')
                ]
            except Exception:
                # request failed or the list container is missing: no more pages
                break
            if type_ not in dict_:
                dict_[type_] = []
            dict_[type_].extend(url_list)
            page += 1
    return dict_
Example 7
def get_news_url_list(date_=None):
    if not date_:
        # default to yesterday; avoids an invalid day-0 tuple at the start of a month
        date_ = time.strftime('%Y%m%d', time.localtime(time.time() - 86400))
    dict_ = {}
    for type_ in NEWS_URL_DICT.keys():
        url = NEWS_URL_DICT[type_]
        page = 1
        tag = False
        while True:
            data = urlopen_and_read(url % page).decode("utf-8", 'ignore')
            data = eval(data)['result']['data']
            for item in data:
                date = time.localtime(float(item['createtime']))
                date = time.strftime('%Y%m%d', date)
                if date > date_:
                    continue
                elif date == date_:
                    if type_ not in dict_:
                        dict_[type_] = []
                    url_ = item['url'].replace('\\', '')
                    if 'video' not in url_:
                        if '?' in url_:
                            url_ = url_[:url_.find('?')]
                        if url_ not in dict_[type_]:
                            dict_[type_].append(url_)
                else:
                    tag = True
                    break
            if tag:
                break
            page += 1
    return dict_
Example 8
def do_item(item):
    conn = get_conn()
    cur = conn.cursor()
    title = Soup(urlopen_and_read(item[2]).decode(NEWS_CHARSET, 'ignore')).title.text
    # strip the "_新闻_腾讯网" (QQ News) site suffix and NAME_PAT-matched characters
    title = re.sub('(_新闻)_腾讯网', '', title)
    title = re.sub(NAME_PAT, '', title)
    sql = SQL_PAT % (title, item[0])
    cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
    print(title)
Example 9
def do_item(item):
    conn = get_conn()
    cur = conn.cursor()
    title = Soup(urlopen_and_read(item[2]).decode(NEWS_CHARSET,
                                                  'ignore')).title.text
    title = re.sub('(_新闻)_腾讯网', '', title)
    title = re.sub(NAME_PAT, '', title)
    sql = SQL_PAT % (title, item[0])
    cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()
    print(title)
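Examples 8 and 9 assume a few module-level names that are not shown: get_conn, SQL_PAT, NEWS_CHARSET and NAME_PAT. A rough sketch of what that setup could look like, using sqlite3 purely for illustration; the database backend, table, columns, charset and pattern are all assumptions:

import re
import sqlite3

NEWS_CHARSET = 'gbk'                    # charset of the fetched pages (assumed)
NAME_PAT = re.compile(r'[\\/:*?"<>|]')  # characters stripped from titles (assumed)
# NOTE: interpolating values into SQL like the examples do is injection-prone;
# parameterized queries would be safer. The real statement and schema are unknown.
SQL_PAT = "UPDATE news SET title = '%s' WHERE id = %s"


def get_conn():
    # The original backend is not shown; sqlite3 is used here only to make
    # the sketch self-contained and runnable.
    return sqlite3.connect('news.db')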
Example 10
def _get_news_url_list_by_pat_and_date(pat, date):
    list_ = []
    tag = False
    html = urlopen_and_read(pat % '').decode('gbk', 'ignore')
    page = int(re.search('var maxPage = (\\d+);', html).group(1))
    soup = Soup(html)
    while True:
        for div in soup.find_all('div', class_='article'):
            url = div.find_all('a')[1]['href']
            if url[21:29] > date:
                continue
            elif url[21:29] == date:
                if '?' in url:
                    url = url[:url.find('?')]
                if url not in list_:
                    list_.append(url)
            else:
                tag = True
                break
        page -= 1
        # stop when the older-than-date marker was seen or the pages are exhausted
        if tag or page < 1:
            break
        soup = Soup(urlopen_and_read(pat % '_%u' % page).decode('gbk', 'ignore'))
    return list_
Example 11
def match_news(html, url):
    soup = Soup(html)
    sid = re.search('/n(\\d+)', url).group(1)
    url2 = 'http://changyan.sohu.com/node/html?client_id=cyqemw6s1&topicsid=%s' % sid
    topic_id = eval(urlopen_and_read(url2).decode(
        'utf-8', 'ignore'))['listData']['topic_id']
    comment_url_args = (topic_id,)
    title = soup.title.text
    main_content = soup.find('div', {'itemprop': 'articleBody'})
    if not main_content:
        return
    if main_content.img:
        news_image = main_content.img['src']
    else:
        news_image = None
    content = '\n'.join([
        temp.strip()
        for temp in [item.get_text() for item in main_content.find_all('p')]
        if not re.match('\\s*$', temp)
    ])
    source = soup.find('span', {'itemprop': 'name'}).text
    source_url = soup.find('span', {'itemprop': 'isBasedOnUrl'}).text
    date = soup.find('div', {'itemprop': 'datePublished'}).get_text()
    date = time.strptime(date, "%Y-%m-%d %H:%M:%S")
    date = time.mktime(date)
    return News(url, comment_url_args, title, content, source, date,
                source_url, news_image=news_image)
Example 12
def get_news_url_list(date=None):
    if not date:
        # default to yesterday; avoids an invalid day-0 tuple at the start of a month
        date = time.strftime('%Y%m%d', time.localtime(time.time() - 86400))
    dict_ = {}
    for type_ in DICT_.keys():
        type_id = DICT_[type_]
        page = 1
        while True:
            url_ = URL_PAT % (type_id, date, page)
            try:
                soup = Soup(urlopen_and_read(url_).decode('utf-8'))
                url_list = [item['href'] for item in soup.find('div', class_='newsList').ul.find_all('a')]
            except Exception:
                # request failed or the list container is missing: no more pages
                break
            if type_ not in dict_:
                dict_[type_] = []
            dict_[type_].extend(url_list)
            page += 1
    return dict_
Example 13
def crawl_comments(module, todo, store_args, args):
    logger = logging.getLogger('crawlerLog')
    retry = _RETRY_TIME
    try_ = _TRY_TIME
    if module.TYPE:
        page = 1
    else:
        id = None
    while True:
        if module.TYPE:
            url = module.get_comment_page_url(page, args)
        else:
            url = module.get_comment_page_url(id, args)
        try:
            if isinstance(url, tuple):
                url, data = url
                html = urlopen_and_read(url,
                                        data).decode(module.COMMENT_CHARSET,
                                                     'ignore')
            else:
                html = urlopen_and_read(url).decode(module.COMMENT_CHARSET,
                                                    'ignore')
            if retry != _RETRY_TIME:
                retry = _RETRY_TIME
            if try_ != _TRY_TIME and module.TYPE:
                try_ = _TRY_TIME
        except Exception:
            if retry:
                retry -= 1
                continue
            logger.error("'%s' was not accessible" % url)
            if module.TYPE and try_:
                try_ -= 1
                page += 1
                continue
            else:
                logger.error(
                    "'%s' was not accessible and it's not the first failure" %
                    url)
                break
        try:
            data = module.get_comment_source_list(html)
        except Exception:
            break
        if not data:
            break
        # materialize the comment sources; an empty result counts as a failure below
        list_ = list(data)
        if not list_:
            if module.TYPE and try_:
                try_ -= 1
                page += 1
                continue
            else:
                logger.error(
                    "The data from '%s' couldn't be found and it's not the first failure"
                    % url)
                break
        elif module.TYPE:
            try_ = _TRY_TIME
        todo(list_, store_args)
        if module.TYPE:
            page += 1
        else:
            has_next = module.has_next(html)
            if not has_next:
                break
            id = module.get_next_id(html)
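crawl_comments (this example and the identical Example 14 below) is generic over a site module. From the calls it makes, that module has to expose at least the interface sketched here; only the attribute names and call shapes are taken from the code above, the URL and field names are hypothetical:

COMMENT_CHARSET = 'utf-8'
TYPE = 1  # truthy: comment pages are fetched by page number; falsy: by a "next id" cursor


def get_comment_page_url(page_or_id, args):
    # Build the comment-page URL (or a (url, post_data) tuple for POST requests)
    # from the page number / cursor id and the comment_url_args from match_news.
    topic_id, = args
    return 'http://example.com/comments?topic=%s&page=%s' % (topic_id, page_or_id)


def get_comment_source_list(html):
    # Parse one comment page and return an iterable of raw comment records.
    raise NotImplementedError


def has_next(html):
    # Only consulted when TYPE is falsy: is there another page after this one?
    raise NotImplementedError


def get_next_id(html):
    # Only consulted when TYPE is falsy: cursor id for the next request.
    raise NotImplementedError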
Example 14
def crawl_comments(module, todo, store_args, args):
    logger = logging.getLogger('crawlerLog')
    retry = _RETRY_TIME
    try_ = _TRY_TIME
    if module.TYPE:
        page = 1
    else:
        id = None
    while True:
        if module.TYPE:
            url = module.get_comment_page_url(page, args)
        else:
            url = module.get_comment_page_url(id, args)
        try:
            if isinstance(url, tuple):
                url, data = url
                html = urlopen_and_read(url, data).decode(module.COMMENT_CHARSET, 'ignore')
            else:
                html = urlopen_and_read(url).decode(module.COMMENT_CHARSET, 'ignore')
            if retry != _RETRY_TIME:
                retry = _RETRY_TIME
            if try_ != _TRY_TIME and module.TYPE:
                try_ = _TRY_TIME
        except Exception:
            if retry:
                retry -= 1
                continue
            logger.error("'%s' was not accessible" % url)
            if module.TYPE and try_:
                try_ -= 1
                page += 1
                continue
            else:
                logger.error("'%s' was not accessible and it's not the first failure"%url)
                break
        try:
            data = module.get_comment_source_list(html)
        except Exception:
            break
        if not data:
            break
        # materialize the comment sources; an empty result counts as a failure below
        list_ = list(data)
        if not list_:
            if module.TYPE and try_:
                try_ -= 1
                page += 1
                continue
            else:
                logger.error("The data from '%s' couldn't be found and it's not the first failure"%url)
                break
        elif module.TYPE:
            try_ = _TRY_TIME
        todo(list_, store_args)
        if module.TYPE:
            page += 1
        else:
            has_next = module.has_next(html)
            if not has_next:
                break
            id = module.get_next_id(html)