Code Example #1
def get_article_num(tag_id):
    num = 0

    base_url = 'http://www.huxiu.com/tags/{}.html'.format(tag_id)
    api_url = 'http://www.huxiu.com/v2_action/tag_article_list'
    post_data['tag_id'] = tag_id
    page = 1

    try:
        post_data['page'] = page
        common.rand_sleep(5, 5)
        res = s.post(api_url, data=post_data)
        res_data = json.loads(res.text.encode('utf8'))
        total_page = res_data['total_page']
        if total_page == 1:
            common.rand_sleep(5, 5)
            res = s.get(base_url)
            res.encoding = "utf-8"
            soup = BeautifulSoup(res.text, 'html.parser')
            article_box = soup.find('div', class_='related-article')
            article_list = article_box.find_all('li')
            num = len(article_list)
        else:
            # Estimate the article count from the total page count (10 articles per page)
            num = 10 * total_page
    except Exception:
        logging.error('run error', exc_info=True)
    # Return the count on both the success and the failure path (0 by default)
    return num
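
Every example in this listing throttles its requests with common.rand_sleep(a, b) from the projects' shared common module, whose source is not part of these excerpts. The call sites suggest a fixed base delay plus some random jitter; purely as orientation, a minimal sketch of such a helper (the exact signature and jitter semantics are assumptions, not the project's actual code) might look like this:

import random
import time


def rand_sleep(base_seconds, jitter_seconds):
    # Sleep for the base delay plus a random extra delay of up to jitter_seconds,
    # so consecutive requests are not sent at a perfectly regular interval.
    time.sleep(base_seconds + random.uniform(0, jitter_seconds))
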
Code Example #2
File: tuicool2.py Project: simple2source/macugEx
def main():
    # Log in
    l.login()
    # API URL that returns the topic data
    base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
    make_dir(base_path)
    try:
        common.rand_sleep(5, 10)
        res = l.session.get(base_url)
        logging.info('return url {} success'.format(res.url))
        res_data = json.loads(res.text)

        # List of topic categories
        class_list = res_data['cats']
        for class_item in class_list:
            class_path = base_path + '/' + str(class_item['id'])\
             + '_' + class_item['name'].encode('utf8')
            make_dir(class_path)
            # List of topics
            topic_list = class_item['items']
            for topic in topic_list:
                topic_path = class_path + '/' + str(topic['id'])\
                 + '_' + topic['name'].encode('utf8')
                make_dir(topic_path)
                get_articles_in_topic(str(topic['id']), topic_path)

    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
Code Example #3
File: tuicool2.py Project: simple2source/macugEx
def get_article(article_id, abs_file_path):
    '''Fetch an article. Return True on success, False if it was not published within the last week.'''
    article_url = 'http://www.tuicool.com/articles/{}'.format(article_id)
    try:
        print article_url
        common.rand_sleep(5, 10)
        res = l.session.get(article_url)
        logging.info('return url {} success'.format(res.url))
        soup = BeautifulSoup(res.text, 'html.parser')
        title = str(soup.find('div', class_='article_detail_bg').find('h1')\
         .get_text())
        print title
        pub_time = re.sub(re.compile('时间[\s\S]{2}'), '', \
         str(soup.find('span', class_='timestamp').get_text()).strip())
        keywords = [str(item.get_text())\
         for item in soup.find_all('span', class_='new-label')]
        content = str(soup.find('div', class_='article_body'))

        # Only keep articles published within the last week
        timedelta = datetime.date.today()-datetime.datetime\
         .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
        if timedelta.days > 7:
            return False

        with open(abs_file_path, 'w') as f:
            f.write('标题:' + title + '\n')
            f.write('发布时间:' + pub_time + '\n')
            f.write('关键字:' + ', '.join(keywords) + '\n')
            f.write('内容:' + content + '\n')
        return True
    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
        return False
Code Example #4
File: tuicool2.py Project: simple2source/fetch_crwal
def main():
	# Log in
	l.login()
	# API URL that returns the topic data
	base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
	make_dir(base_path)
	try:
		common.rand_sleep(5, 10)
		res = l.session.get(base_url)
		logging.info('return url {} success'.format(res.url))
		res_data = json.loads(res.text)		

		# List of topic categories
		class_list = res_data['cats']
		for class_item in class_list:
			class_path = base_path + '/' + str(class_item['id'])\
				+ '_' + class_item['name'].encode('utf8')
			make_dir(class_path)
			# List of topics
			topic_list = class_item['items']
			for topic in topic_list:
				topic_path = class_path + '/' + str(topic['id'])\
					+ '_' + topic['name'].encode('utf8')
				make_dir(topic_path)
				get_articles_in_topic(str(topic['id']), topic_path)

	except Exception, e:
		print Exception, e
		logging.error('run error', exc_info=True)
Code Example #5
File: segmentfault.py Project: simple2source/macugEx
def main():
	page = 1
	tag_url = 'https://segmentfault.com/tags/all?page={}'

	while 1:
		cur_url = tag_url.format(page)
		common.rand_sleep(5, 10)
		res = s.get(cur_url)
		soup = BeautifulSoup(res.text, 'html.parser')
		tags_list = soup.find_all('section', class_='tag-list__item')
		for tag_section in tags_list:
			tag_name = tag_section.find('h2').find('a').get_text()\
				.encode('utf8').strip()
			# Check whether this tag has already been crawled
			with open('segmentfault_done.txt', 'r') as sdf:
				content = sdf.read()
				if content.find(','+tag_name+',') == -1:
					num = get_article_num(tag_name)
					with open('segmentfault_tags.txt', 'a') as stf:
						stf.write(tag_name+':'+str(num)+'\n')
				else:
					continue
			with open('segmentfault_done.txt', 'a') as f:
				f.write(','+tag_name)

		# Check whether there is a next page
		page += 1
		re_str = r'/tags/all\?page={}'.format(page)
		pat = re.compile(re_str)
		s_r = re.search(pat, res.text)
		if s_r is None:
			break
		else:
			continue
Code Example #6
File: huxiu.py Project: simple2source/fetch_crwal
def get_article_num(tag_id):
	num = 0

	base_url = 'http://www.huxiu.com/tags/{}.html'.format(tag_id)
	api_url = 'http://www.huxiu.com/v2_action/tag_article_list'
	post_data['tag_id'] = tag_id
	page = 1

	try:
		post_data['page'] = page
		common.rand_sleep(5, 5)
		res = s.post(api_url, data=post_data)
		res_data = json.loads(res.text.encode('utf8'))
		total_page = res_data['total_page']
		if total_page == 1:
			common.rand_sleep(5, 5)
			res = s.get(base_url)
			res.encoding = "utf-8"
			soup = BeautifulSoup(res.text, 'html.parser')
			article_box = soup.find('div', class_='related-article')
			article_list = article_box.find_all('li')
			num = len(article_list)
		else:
			# Estimate the article count from the total page count (10 articles per page)
			num = 10*total_page
	except Exception:
		logging.error('run error', exc_info=True)
	# Return the count on both the success and the failure path (0 by default)
	return num
Code Example #7
File: tuicool2.py Project: simple2source/fetch_crwal
def get_article(article_id, abs_file_path):
	'''Fetch an article. Return True on success, False if it was not published within the last week.'''
	article_url = 'http://www.tuicool.com/articles/{}'.format(article_id)
	try:
		print article_url
		common.rand_sleep(5, 10)
		res = l.session.get(article_url)
		logging.info('return url {} success'.format(res.url))
		soup = BeautifulSoup(res.text, 'html.parser')
		title = str(soup.find('div', class_='article_detail_bg').find('h1')\
			.get_text())
		print title
		pub_time = re.sub(re.compile('时间[\s\S]{2}'), '', \
			str(soup.find('span', class_='timestamp').get_text()).strip())
		keywords = [str(item.get_text())\
			for item in soup.find_all('span', class_='new-label')]
		content = str(soup.find('div', class_='article_body'))

		# Only keep articles published within the last week
		timedelta = datetime.date.today()-datetime.datetime\
			.strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
		if timedelta.days > 7:
			return False

		with open(abs_file_path, 'w') as f:
			f.write('标题:' + title + '\n')
			f.write('发布时间:' + pub_time + '\n')
			f.write('关键字:' + ', '.join(keywords) + '\n')
			f.write('内容:' + content + '\n')
		return True
	except Exception, e:
		print Exception, e
		logging.error('run error', exc_info=True)
		return False
Code Example #8
File: v2exemail.py Project: simple2source/macugEx
def run(page):
    r = getpage(page)
    url_list = pageparse(r.text)
    for i in url_list:
        url = i.split('#')[0]
        common.rand_sleep(6, 2)
        ff = get_page_one(url)
        if not sql_sel(url):
            sql_in(url, ff)
Code Example #9
File: tuicool2.py Project: simple2source/macugEx
def get_articles_in_topic(topic_id, topic_path):
    '''Fetch the articles under a topic.'''
    tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
     + '?st=0&lang=1&pn={}'

    # Skip this topic if it has already been fully crawled
    if os.path.exists(topic_path + '/' + 'done'):
        print str(topic_id) + ': done'
        return

    page = 0

    while 1:
        try:
            cur_url = tp_base_url.format(page)
            print cur_url
            common.rand_sleep(5, 10)
            res = l.session.get(cur_url)
            with open('temp.html', 'w') as f:
                f.write(res.text.encode('utf8'))
            logging.info('return url {} success'.format(res.url))
            soup = BeautifulSoup(res.text, 'html.parser')

            # Get the list of articles on this page
            articles_list = soup.find_all('div', class_='single_fake')
            for article in articles_list:
                article_id = str(article.find('a', class_='article-list-title')\
                 .get('href').split('/')[2])
                article_title = str(article.find('a', class_='article-list-title')\
                 .get_text()).strip()
                # Only fetch the article if it has not been saved locally yet
                abs_file_path = topic_path + '/' + article_id
                if not os.path.isfile(abs_file_path):
                    # The fetched article does not meet the criteria; stop crawling this topic
                    if not get_article(article_id, abs_file_path):
                        # Mark this topic as fully crawled
                        with open(topic_path + '/' + 'done', 'w') as f:
                            pass
                        return
                else:
                    continue

            # Check whether there is a next page
            page += 1
            cur_url = tp_base_url.format(page)
            re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                # Mark this topic as fully crawled
                with open(topic_path + '/' + 'done', 'w') as f:
                    pass
                break

        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Code Example #10
def run_company(url, tp):  # Written separately to work around itjuzi's crawl limits on company pages
    url2 = url
    page = 10
    flag = True
    keyword_all = []
    l_list = range(2, 2427)
    l_list2 = copy.copy(l_list)
    for i in l_list2:
        store_path1 = os.path.join(common.root_path, 'juzi', str(i) + '.html')
        if os.path.isfile(store_path1):
            l_list.remove(i)
        else:
            print store_path1
    print len(l_list), 999999999
    print l_list
    aa = common.get_request(url, timeout=18)
    headers2 = {
        'Origin': 'http://www.itjuzi.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent':
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        'Accept': '*/*',
        'Referer': 'http://www.itjuzi.com/company?page=2410',
        'Cookie':
        'grwng_uid=2e824b08-70ef-41f7-a9c8-62ff25d8f920; AWSELB=258D9D590E00B3DE939BD2301A2166BB8314D5BFDDA88D29F0E3F22E0935E83EF1C408A6B613204775BA26EA9BE8555ABB5A13289EDD9FCE01B44987A799A50A15E49578ED9A0D7D28BE3696012F59FED65EA97193',
        'Connection': 'keep-alive'
    }
    url_p = 'https://api.growingio.com/v2/eee5a46c52000d401f969f4535bdaa78/web/pv?stm=1459928218615'
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        headers['refer'] = url
        url = url2 + "?page={}".format(page)
        print url
        headers2['refer'] = url
        aa = common.get_request(url, timeout=18)
        # bb = common.post_request(url_p, headers=headers2, data='6\x86\xf0D\x08`\x96`\\`S$\x15\x82\x01`\x1b\x01\x8d\x90&\x002\x10\x09\x9a\xf8\x08\xc0\x19\x80\x9c\x19QZ\xc8\x0c\xcc\x80FDA\x00\xec\x00q\x80\r$\x00np\xc3\xe0\x07H\xd7\x80\x96"\x03\xa8!\x90 +\x88\xeeX+w\xcb\x8b.\x00\xb4dru\xd6\x84\xd1]\x11\x91`\x8b\xa7\x91\n\x17\xf0\xf5\xcd\xca\xbf0\x01\x9dUc)\xd7\'df\xdchDd\xc6\xa4\x8c\xba\xdc\x08&\xba\x92\x08DATT\xf8\xc8\xf8R`\x00."\x82\xeeY\x02\x19\x00\xb6pd\x0cI\xce\xb8d\xdc\xc8<\x02\xee\x00\x16p\x9c\x8c\x18\xb5\x00\xeeph%\x02D"\xad}bP\x19\x00VJ\x00^PbX\x00\xf6E\x02\x00\x0e"\x00\xf4\xd3\x05s\x10\x00v\x00\x9en\x00N\x14"u\x19\x19s\xb0\x8b\x8b}\xad\x03\xc3c\x13+\xcb3k[\x00\xfck\x00\xe6\x08\x00\xbc\xb8\x9dhn\x006"Q\x9dWE\x87Y\xb8\x00\x8e"7\xa7\xdb\xe6G\xc1\x80\x00\xbe|p\n\x9e\x06\xa0\xd1ht\xfaC\x18X.d\xb3Yl\xf6T\x93\x85\xc6\xe4\xf1\xa3\xbc\xbe\x7f X*\x113\xe0"Q\x18\x9cA&\x82H\xa4\xd2n\x0c\x80>\x08\x01\xb4T\x00\x7f*\x01\xd0\x95\x00\x16\x11\x80W\x0c\x80\x01\x00\x07\xc2P\x04\x90\x00\xaa\x00T-\x00\n\xda\xdc\xa8D\x1d\xed\xca*\xc0J\xc82\xb7\x02\xa5Q\xa9\x80\xe6Yx!\xd8\xe6\xe1\xeb\xc0.W\x11\xb8\xd2c3p-\xe0\xf7U\x86\xdb`#\xd8\x1c\x8e\'3\xad\xb0om\xb8\xcc\xdd\x8fM\x8b\xc3S\x09\xf8C\xd5\xef/\xa7^\x10\x88\x02\xe9\x00')
        # print bb.content
        common.get_request('http://www.itjuzi.com/company/{}'.format(
            random.choice(range(1, 500))),
                           timeout=10)
        store_path = os.path.join(common.root_path, 'juzi',
                                  str(page) + '.html')
        with open(store_path, 'w+') as f:
            f.write(aa.text.encode('utf8'))
        common.rand_sleep(30, 15)
        if len(l_list) == 0:
            flag = False
        else:
            page = random.choice(l_list)
            l_list.remove(page)
    return keyword_all
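
Several of the examples (run_company above, and the cnblog, mux, v2ex, and itjuzi crawlers below) fetch pages through common.get_request(url, ...) instead of calling requests directly. Its implementation is not shown here; based on how callers pass headers= and timeout= and check the result for truthiness (e.g. `if r:` in the itjuzi example), it appears to be a thin wrapper that swallows request errors. A hedged sketch under those assumptions:

import logging

import requests


def get_request(url, headers=None, timeout=10):
    # Hypothetical stand-in for common.get_request: fetch the URL and return the
    # response, or log the error and return None so the caller can skip the page.
    try:
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        logging.error('request to %s failed', url, exc_info=True)
        return None
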
Code Example #11
File: tuicool2.py Project: simple2source/fetch_crwal
def get_articles_in_topic(topic_id, topic_path):
	'''Fetch the articles under a topic.'''
	tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
		+ '?st=0&lang=1&pn={}'
	
	# Skip this topic if it has already been fully crawled
	if os.path.exists(topic_path + '/' + 'done'):
		print str(topic_id) + ': done'
		return

	page = 0
	
	while 1:
		try:
			cur_url = tp_base_url.format(page)
			print cur_url
			common.rand_sleep(5, 10)
			res = l.session.get(cur_url)
			with open('temp.html', 'w') as f:
				f.write(res.text.encode('utf8'))
			logging.info('return url {} success'.format(res.url))
			soup = BeautifulSoup(res.text, 'html.parser')

			# Get the list of articles on this page
			articles_list = soup.find_all('div', class_='single_fake')
			for article in articles_list:
				article_id = str(article.find('a', class_='article-list-title')\
					.get('href').split('/')[2])
				article_title = str(article.find('a', class_='article-list-title')\
					.get_text()).strip()
				# Only fetch the article if it has not been saved locally yet
				abs_file_path = topic_path + '/' + article_id
				if not os.path.isfile(abs_file_path):
					# The fetched article does not meet the criteria; stop crawling this topic
					if not get_article(article_id, abs_file_path):
						# Mark this topic as fully crawled
						with open(topic_path + '/' + 'done', 'w') as f:
							pass
						return
				else:
					continue

			# Check whether there is a next page
			page += 1
			cur_url = tp_base_url.format(page)
			re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
			pat = re.compile(re_str)
			s_r = re.search(pat, res.text)
			if s_r is None:
				# Mark this topic as fully crawled
				with open(topic_path + '/' + 'done', 'w') as f:
					pass
				break

		except Exception, e:
			print Exception, e
			logging.error('run error', exc_info=True)
Code Example #12
def one_page(html, source):
    ll = link_list(html)
    common.rand_sleep(3, 1)
    for i in ll:
        url, title = i.split(',')
        logging.debug('next url is {}'.format(url))
        if not sql_se(source, title):
            r2 = common.get_request(url)
            title2, content, pub_time = page_parse(r2.text)
            common.sql_insert(source, url, title, content, pub_time, '')
        common.rand_sleep(6, 2)
Code Example #13
File: tuicool.py Project: simple2source/macugEx
def count_articles_in_topic(topic_id):
    '''Count the articles published within the last week under a topic.'''
    result = 0
    tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
     + '?st=0&lang=1&pn={}'

    page = 0

    while 1:
        try:
            cur_url = tp_base_url.format(page)
            print cur_url
            common.rand_sleep(5, 10)
            res = l.session.get(cur_url)
            logging.info('return url {} success'.format(res.url))
            soup = BeautifulSoup(res.text, 'html.parser')

            # Get the list of articles on this page
            articles_list = soup.find_all('div', class_='single_fake')
            for article in articles_list:
                article_id = str(article.find('a', class_='article-list-title')\
                 .get('href').split('/')[2])
                article_title = str(article.find('a', class_='article-list-title')\
                 .get_text()).strip()
                pub_time = str(article.find('div', class_='meta-tip')\
                 .find_all('span')[1].get_text()).strip()
                if pub_time.find('201') == -1:
                    pub_time = '2016-' + pub_time + ':00'
                else:
                    return result
                # Check whether the publication time is within the last week
                timedelta = datetime.date.today()-datetime.datetime\
                 .strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
                if timedelta.days <= 7:
                    result += 1
                else:
                    return result

            # Check whether there is a next page
            page += 1
            cur_url = tp_base_url.format(page)
            re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                return result

        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Code Example #14
File: tuicool.py Project: simple2source/fetch_crwal
def count_articles_in_topic(topic_id):
	'''Count the articles published within the last week under a topic.'''
	result = 0
	tp_base_url = 'http://www.tuicool.com/topics/{}'.format(topic_id)\
		+ '?st=0&lang=1&pn={}'

	page = 0
	
	while 1:
		try:
			cur_url = tp_base_url.format(page)
			print cur_url
			common.rand_sleep(5, 10)
			res = l.session.get(cur_url)
			logging.info('return url {} success'.format(res.url))
			soup = BeautifulSoup(res.text, 'html.parser')

			# Get the list of articles on this page
			articles_list = soup.find_all('div', class_='single_fake')
			for article in articles_list:
				article_id = str(article.find('a', class_='article-list-title')\
					.get('href').split('/')[2])
				article_title = str(article.find('a', class_='article-list-title')\
					.get_text()).strip()
				pub_time = str(article.find('div', class_='meta-tip')\
					.find_all('span')[1].get_text()).strip()
				if pub_time.find('201') == -1:
					pub_time = '2016-' + pub_time + ':00'
				else:
					return result
				# Check whether the publication time is within the last week
				timedelta = datetime.date.today()-datetime.datetime\
					.strptime(pub_time, '%Y-%m-%d %H:%M:%S').date()
				if timedelta.days <= 7:
					result += 1
				else:
					return result

			# Check whether there is a next page
			page += 1
			cur_url = tp_base_url.format(page)
			re_str = r'/topics/{}\?st=0&lang=1&pn={}'.format(topic_id, page)
			pat = re.compile(re_str)
			s_r = re.search(pat, res.text)
			if s_r is None:
				return result

		except Exception, e:
			print Exception, e
			logging.error('run error', exc_info=True)
Code Example #15
File: v2exemail.py Project: simple2source/macugEx
def get_page_one(url):
    r2 = common.get_request(url)
    cc = content(r2.text)
    # print cc
    if morepage(r2.text):
        common.rand_sleep(3, 2)
        r3 = common.get_request(url + '?p=1')  # in case there is a second page
        logger.info('{} has two page, try to get page one'.format(url))
        cc += content(r3.text)
    dd = common.re_email(cc)
    print dd
    ee = list(set(dd))
    ff = ','.join(ee)
    return ff
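
get_page_one above relies on common.re_email to pull email addresses out of the concatenated page content before deduplicating them. The project's actual pattern is not shown in these excerpts; a self-contained sketch with a generic email regex (the pattern is an assumption) would be:

import re

EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')


def re_email(text):
    # Return every email-like substring found in the text; callers deduplicate
    # the result with set() as in get_page_one above.
    return EMAIL_RE.findall(text)
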
Code Example #16
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            b2 = soup_2.find_all(
                'a',
                {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})  # article links on the current page
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Code Example #17
def get_all_tags():
    tags_url = 'http://www.huxiu.com/tags'

    common.rand_sleep(5, 5)
    res = s.get(tags_url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, 'html.parser')
    with open('temp.html', 'w') as f0:
        f0.write(res.text)
    tag_boxs = soup.find_all('div', class_='tag-cnt-box')
    with open('huxiu_tags.txt', 'w') as f:
        for box in tag_boxs:
            tags_list = box.find_all('li', class_='transition')
            for tag in tags_list:
                tag_id = tag.find('a').get('href').split('/')[-1].split('.')[0]
                tag_name = tag.find('a').get_text().encode('utf8').strip()
                # print tag_id, tag_name
                f.write(str(tag_id) + ':' + tag_name + '\n')
Code Example #18
File: huxiu.py Project: simple2source/fetch_crwal
def get_all_tags():
	tags_url = 'http://www.huxiu.com/tags'

	common.rand_sleep(5, 5)
	res = s.get(tags_url)
	res.encoding = "utf-8"
	soup = BeautifulSoup(res.text, 'html.parser')
	with open('temp.html', 'w') as f0:
		f0.write(res.text)
	tag_boxs = soup.find_all('div', class_='tag-cnt-box')
	with open('huxiu_tags.txt', 'w') as f:
		for box in tag_boxs:
			tags_list = box.find_all('li', class_='transition')
			for tag in tags_list:
				tag_id = tag.find('a').get('href').split('/')[-1].split('.')[0]
				tag_name = tag.find('a').get_text().encode('utf8').strip()
				# print tag_id, tag_name
				f.write(str(tag_id) + ':' + tag_name + '\n')
Code Example #19
File: mux.py Project: simple2source/macugEx
def main():
    source = 'mux'
    page = 1
    flag = True
    url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
    while flag:
        try:
            print url
            res = common.get_request(url)
            logging.info('return url {} success'.format(res.url))
            print res.url
            soup = BeautifulSoup(res.text, 'html.parser')
            with open('temp.html', 'w+') as f:
                f.write(res.text.encode('utf8'))
            articles = soup.find_all('div', class_='artical_inner')
            for item in articles:
                contents = item.contents
                article_url = contents[9].a.get('href')
                article_title = str(contents[3].a.get('title')).strip()
                if not common.select(article_url, source):
                    pub_time = time.strftime('%Y-%m-%d',\
                        time.strptime(str(contents[5].get_text()).split('|')[-1].strip(), '%Y年%m月%d日'))
                    keyword = str(
                        contents[5].get_text()).split('|')[-2].strip()
                    content = get_content(
                        common.get_request(article_url).text)
                    print article_title
                    common.sql_insert(source, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = r'http://mux.baidu.com/\?page_id=10\S+paged={}'.format(
                page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                flag = False
            else:
                url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
Code Example #20
File: cnblog.py Project: simple2source/fetch_crwal
def main(blog_name):
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            b2 = soup_2.find_all('a', {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})  # article links on the current page
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title, content, pub_time, keyword)
                    common.rand_sleep(6, 1)
            page += 1
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False
            else:
                url_1 = ppp.group()
            common.rand_sleep(7, 1)
        except Exception, e:
            print Exception, e
            logging.error('run error', exc_info=True)
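
Several crawlers above (one_page, the cnblog main, and the mux main below) persist each article with common.sql_insert(source, url, title, content, pub_time, keyword). The schema and database driver are not part of these excerpts; purely to illustrate the call signature, here is a self-contained sqlite3 stand-in (the real project presumably targets MySQL, and the table layout below is an assumption):

import sqlite3


def sql_insert(source, url, title, content, pub_time, keyword, db_path='articles.db'):
    # Hypothetical stand-in for common.sql_insert: store one scraped article row.
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            'CREATE TABLE IF NOT EXISTS articles '
            '(source TEXT, url TEXT, title TEXT, content TEXT, pub_time TEXT, keyword TEXT)')
        conn.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?)',
                     (source, url, title, content, pub_time, keyword))
        conn.commit()
    finally:
        conn.close()
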
Code Example #21
File: itjuziemail.py Project: simple2source/macugEx
def main():
    rootdir = os.getcwd()
    print rootdir
    try:
        company_list_dir = os.path.join(rootdir, 'juzi')
        # for subdir, dirs, files in os.walk(company_list_dir):
        #     for file in files:
        #         logger.info('current file is {}'.format(file))
        #         fff = os.path.join(subdir, file)
        #         with open(fff) as f:
        #             ff = f.read()
        #         url_list = find_all_link(ff)
        #         print url_list
        # for i in url_list:
        for num in xrange(35770, 36000):
            i = 'http://www.itjuzi.com/company/' + str(num)
            try:
                logger.info('current url is {}'.format(i))
                juzi_id = i.replace('http://www.itjuzi.com/company/', '')
                if not sql_sel(juzi_id):
                    logger.info('try to insert {} into mysql'.format(juzi_id))
                    gs_fp = os.path.join(rootdir, 'juzicompany')
                    if not os.path.exists(gs_fp):
                        os.makedirs(gs_fp)
                    job_id = str(juzi_id)
                    job_id = job_id.rjust(5, '0')
                    store_path = os.path.join(gs_fp, job_id[0:3], job_id + '.html')
                    father_dir = os.path.dirname(store_path)
                    if not os.path.exists(father_dir):
                        os.makedirs(father_dir)
                    r = common.get_request(i)
                    if r:
                        with open(store_path, 'w+') as f:
                            f.write(r.text)
                        ll = parse_page(r.text)
                        sql_in(juzi_id, ll)
                        common.rand_sleep(5, 2)
            except:
                logger.error('something wrong ', exc_info=True)
    except:
        logger.error('something wrong ', exc_info=True)
Code Example #22
File: tuicool2.py Project: simple2source/fetch_crwal
	def login(self):
		'''Return the cookie_str.'''
		# First obtain the authenticity_token (CSRF token)
		common.rand_sleep(5, 10)
		res = self.session.get(self.url)
		soup = BeautifulSoup(res.text, 'html.parser')
		authenticity_token = soup.find('meta', attrs={'name': 'csrf-token'})\
			['content']
		print 'authenticity_token: ' + authenticity_token
		self.req_params['authenticity_token'] = authenticity_token

		# Simulate a login with the username and password
		common.rand_sleep(5, 10)
		res = self.session.post(self.url, data=self.req_params,\
			verify=False)
		cookie = requests.utils.dict_from_cookiejar(self.session.cookies)
		cookie_str = "; ".join([str(x)+"="+str(y) for x, y in cookie.items()])
		self.cookie_str = cookie_str
		self.session.headers['Cookie'] = cookie_str
		print 'cookie_str: ' + cookie_str
		return cookie_str
Code Example #23
File: tuicool.py Project: simple2source/macugEx
    def login(self):
        '''Return the cookie_str.'''
        # First obtain the authenticity_token (CSRF token)
        common.rand_sleep(5, 10)
        res = self.session.get(self.url)
        soup = BeautifulSoup(res.text, 'html.parser')
        authenticity_token = soup.find('meta', attrs={'name': 'csrf-token'})\
         ['content']
        print 'authenticity_token: ' + authenticity_token
        self.req_params['authenticity_token'] = authenticity_token

        # Simulate a login with the username and password
        common.rand_sleep(5, 10)
        res = self.session.post(self.url, data=self.req_params,\
         verify=False)
        cookie = requests.utils.dict_from_cookiejar(self.session.cookies)
        cookie_str = "; ".join(
            [str(x) + "=" + str(y) for x, y in cookie.items()])
        self.cookie_str = cookie_str
        self.session.headers['Cookie'] = cookie_str
        print 'cookie_str: ' + cookie_str
        return cookie_str
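
The two login() methods above are bound to an object `l` that the tuicool main() functions create before crawling (l.login(), l.session.get(...)). The constructor is not shown in these excerpts; the method bodies only tell us that the object carries a url, a requests Session, a req_params dict of form fields, and a cookie_str. A minimal sketch of such a class, with the class name, constructor arguments, and form field names all assumed rather than taken from the project:

import requests


class Login(object):
    def __init__(self, login_url, email, password):
        self.url = login_url
        self.session = requests.Session()
        self.cookie_str = ''
        # Form fields posted together with the CSRF token that login() adds later.
        self.req_params = {'email': email, 'password': password}


# Hypothetical usage mirroring the main() examples (credentials are placeholders):
# l = Login('http://www.tuicool.com/login', 'user@example.com', 'secret')
# l.login()
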
Code Example #24
File: tuicool.py Project: simple2source/fetch_crwal
def main():
	# Log in
	l.login()
	# API URL that returns the topic data
	base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
	try:
		common.rand_sleep(5, 10)
		res = l.session.get(base_url)
		logging.info('return url {} success'.format(res.url))
		res_data = json.loads(res.text)		

		result = {}

		# List of topic categories
		class_list = res_data['cats']
		with open('articles_count0.txt', 'w') as f0:
			for class_item in class_list:
				class_id_name = str(class_item['id'])\
					+ '_' + class_item['name'].encode('utf8')
				# List of topics
				topic_list = class_item['items']
				for topic in topic_list:
					topic_id_name = class_id_name + '_' + str(topic['id'])\
						+ '_' + topic['name'].encode('utf8')
					num = count_articles_in_topic(str(topic['id']))
					result[topic_id_name] = num
					print topic_id_name, num
					f0.write(topic_id_name + ': ' + str(num) + '\n')
		# Sort by topic name
		result = collections.OrderedDict(sorted(\
			result.items(), key = lambda t: t[0]))
		with open('articles_count.txt', 'w') as f:
			for topic_id, num in result.iteritems():
				f.write(topic_id + ': ' + str(num) + '\n')

	except Exception, e:
		print Exception, e
		logging.error('run error', exc_info=True)
Code Example #25
File: tuicool.py Project: simple2source/macugEx
def main():
    # Log in
    l.login()
    # API URL that returns the topic data
    base_url = 'http://www.tuicool.com/topics/my_hot?id=1'
    try:
        common.rand_sleep(5, 10)
        res = l.session.get(base_url)
        logging.info('return url {} success'.format(res.url))
        res_data = json.loads(res.text)

        result = {}

        # List of topic categories
        class_list = res_data['cats']
        with open('articles_count0.txt', 'w') as f0:
            for class_item in class_list:
                class_id_name = str(class_item['id'])\
                 + '_' + class_item['name'].encode('utf8')
                # List of topics
                topic_list = class_item['items']
                for topic in topic_list:
                    topic_id_name = class_id_name + '_' + str(topic['id'])\
                     + '_' + topic['name'].encode('utf8')
                    num = count_articles_in_topic(str(topic['id']))
                    result[topic_id_name] = num
                    print topic_id_name, num
                    f0.write(topic_id_name + ': ' + str(num) + '\n')
        # Sort by topic name
        result = collections.OrderedDict(sorted(\
         result.items(), key = lambda t: t[0]))
        with open('articles_count.txt', 'w') as f:
            for topic_id, num in result.iteritems():
                f.write(topic_id + ': ' + str(num) + '\n')

    except Exception, e:
        print Exception, e
        logging.error('run error', exc_info=True)
Code Example #26
def run(url, tp):
    aa = common.get_request(url, timeout=8)
    url2 = url
    page = 1
    flag = True
    keyword_all = []
    while flag:
        print url
        keyword_list = extract(aa.text, tp)
        print keyword_list
        print '---------------'
        keyword_all.extend(keyword_list)
        print keyword_all
        if aa.text.find(u'下一页') < 0:
            flag = False
        else:
            page += 1
            headers['refer'] = url
            url = url2 + "?page={}".format(page)
            print url
            aa = common.get_request(url, headers=headers, timeout=8)
        common.rand_sleep(9, 4)
    return keyword_all
Code Example #27
File: segmentfault.py Project: simple2source/macugEx
def get_article_num(tag_name):
	num = 0
	url = 'https://segmentfault.com/t/{}/blogs'.format(urllib.quote(tag_name))

	try:
		common.rand_sleep(5, 10)
		res = s.get(url)
		soup = BeautifulSoup(res.text, 'html.parser')
		pagination = soup.find('ul', class_='pagination')
		if pagination is None:
			article_list = soup.find_all('section', class_='stream-list__item')
			num = len(article_list)
		else:
			url += '?page=1000'
			common.rand_sleep(5, 10)
			res = s.get(url)
			soup = BeautifulSoup(res.text, 'html.parser')
			pagination = soup.find('ul', class_='pagination')
			total_page = pagination.find('li', class_='active').find('a').get_text()\
				.encode('utf8')
			num = int(total_page) * 15
	except Exception:
		logging.error('run error', exc_info=True)
	# Return the count on both the success and the failure path (0 by default)
	return num