class ArticleSpider(AsySpider):
    coll = get_collection(DB, 'code_pyhome', 'motor')

    @gen.coroutine
    def update_doc(self, url, data_dict):
        """update doc in mongo"""
        yield ArticleSpider.coll.update({'source_url': url}, {'$set': data_dict}, True)

    @gen.coroutine
    def handle_html(self, url, html):
        if 'python' in url:  # only save python code
            print(url)
            # article_id = url.rsplit('/', 2)[-1]
            data = parse_sharejs(url, html)
            yield self.update_doc(url, data)

    def save_html(self, url, html):
        """http://www.sharejs.com/codes/javascript/9067"""
        kind = url.rsplit('/', 2)[1]  # kind is the top-level category, as opposed to tag_list
        article_id = url.rsplit('/', 2)[-1]
        filename = './sharejs_html/' + kind + '_' + article_id + '.html'
        try:
            with open(filename, 'wb') as f:
                f.write(html)
                print('saving file', filename)
        except IOError:
            # create the output directory on first write, then retry once
            # (the original silently dropped the page here)
            if not os.path.exists('./sharejs_html'):
                os.makedirs('./sharejs_html')
                with open(filename, 'wb') as f:
                    f.write(html)
def migrate(coll_name='article', limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, coll_name)
    posts = []
    posts_tags = []
    index = 0
    for doc in coll.find().batch_size(1000):
        title = doc.get('title')
        if not exist_or_insert(title):
            doc_id = doc.get('_id')
            post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
            index += 1
            if index > limit:
                break
            posts.append(replace_post(doc))
            posts_tags.append(
                {"tag_id": 1, "post_id": post_id}
            )
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
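# Hedged usage sketch (not from the source): the "meta"/"data" envelope with
# "version": "003" that migrate() builds matches the Ghost blog import format,
# so the result can be dumped to a JSON file for import. The file name and
# limit below are illustrative; the write mirrors incr_migrate() further down.
import io
import json

def dump_migration(filename='ghost_import.json'):
    res = migrate('article', limit=100)
    data = json.dumps(res, ensure_ascii=False, indent=4)
    with io.open(filename, 'w', encoding='utf8') as outfile:
        outfile.write(unicode(data))  # Python 2, as elsewhere in this repo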
def tag_migrate(limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, 'code')
    posts = []
    tags_id_map = {}
    posts_tags = []
    tag_id = 1000
    index = 0
    for doc in coll.find().batch_size(1000):
        # print(doc.get('title'))
        index += 1
        if index > limit:
            break
        posts.append(replace_post(doc))
        post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
        tag_list = doc.get('tag_list')
        tag = tag_list[0] if tag_list else ''
        tag = remove_china_char(tag)
        if tag:
            save_tag = tag.replace(' ', '-').lower()
            save_tag = find_first_tag(save_tag)
            if len(save_tag) > 10:
                posts_tags.append(
                    {"tag_id": 1, "post_id": post_id}
                )
                continue
            if save_tag not in tags_id_map:
                tag_id += 1
                TAGS.append({
                    "id": tag_id,
                    "name": save_tag,
                    "slug": save_tag,
                    "description": ""
                })
                tags_id_map[save_tag] = tag_id
            posts_tags.append(
                {"tag_id": tags_id_map[save_tag], "post_id": post_id}
            )
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
def save_to_mongo(db_name, col_name, doc_path):
    articles = get_collection(db_name, col_name)  # collection articles
    max_cnt = 100
    index = 0
    for path in get_all_files(doc_path):
        print(path)
        index += 1
        if index > max_cnt:
            return
        with open(path, 'r') as f:
            html = f.read()
            data = parse_jb51(html)
            file_id = os.path.basename(path).rsplit('.', 1)[0]
            data['source'] = 'http://www.jb51.net/article/%s.htm' % file_id
            data['source_id'] = file_id
            print(data.get('source_id'))
            print(data.get('title'))
            articles.update(
                {'source_id': data.get('source_id')},
                {'$set': data},
                True
            )
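# Hedged usage sketch (database name and directory are illustrative, not from
# the source): upsert a directory of previously crawled jb51 pages.
if __name__ == '__main__':
    save_to_mongo('test', 'article', './jb51_html')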
def migrate(coll_name='article_pyhome', skip=0, limit=10):
    res = {"meta": {"exported_on": cur_timestamp(), "version": "003"}}
    coll = get_collection(DB, coll_name)
    posts = []
    posts_tags = []
    slug_set = set()
    for doc in coll.find().skip(skip).limit(limit):
        title = doc.get('title')
        slug = title.lower().strip()
        if slug and (slug not in slug_set):
            slug_set.add(slug)
            doc['slug'] = slug
            if not exist_or_insert(slug):
                doc_id = doc.get('_id')
                post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
                posts.append(replace_post(doc))
                posts_tags.append({"tag_id": 1, "post_id": post_id})
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
class User(object):
    __collection__ = 'user'
    col = get_collection(CONFIG.MONGO.DATABASE, __collection__, 'motor')
    _fields = {
        'email',
        'password',
    }

    @classmethod
    @coroutine
    def insert(cls, email, password, password2):
        user = yield cls.col.find_one({'email': email})
        form = RegisterForm(email=email, password=password, password2=password2)
        if (not user) and form.validate():
            # bcrypt is CPU-bound, so hash on the executor instead of the IOLoop
            password_hashed = yield executor.submit(
                bcrypt.hashpw, utf8(password), bcrypt.gensalt())
            yield cls.col.insert({'email': email, 'password': password_hashed})
        else:
            raise ValidationError("insert error")

    @classmethod
    @coroutine
    def check_password(cls, email, check_password):
        user = yield cls.col.find_one({'email': email})
        if not user:  # unknown email: fail the check instead of crashing
            raise Return(False)
        password = user.get('password')
        password_hashed = yield executor.submit(
            bcrypt.hashpw, utf8(check_password), utf8(password))
        raise Return(password == password_hashed)
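# Hedged usage sketch (assumes a running tornado IOLoop; the function name is
# illustrative): register a user, then verify the stored hash round-trips.
@coroutine
def register_then_verify(email, password):
    # insert() raises ValidationError when the form is invalid or the email exists
    yield User.insert(email, password, password)
    ok = yield User.check_password(email, password)
    raise Return(ok)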
def xianguo_spider(q, coll_name='tech', max_news_num=1000):
    _COLL = get_collection(DB, coll_name)
    while True:
        while not q.empty():
            url, data_dict = q.get()
            try:
                html = fetch(url, data_dict)
            except Exception as e:
                print(e)
                continue
            if not html or html == 'null':  # xianguo may return null
                return
            o = json.loads(html)
            to_save = ['source', 'content', 'url', 'title', 'time', 'brief']
            id_list = []
            for i in o:
                d = {}
                docid = i.get('id')
                id_list.append(docid)
                section_id = i.get('user').get('id')
                source = i.get('user').get('username')
                content = i.get('linkcontent').get('content')
                url = i.get('linkcontent').get('originalurl')
                title = i.get('linkcontent').get('title')
                time = i.get('time')
                if time is None or time == 'None':
                    time = 0
                brief = i.get('content')
                # collect the fields named in to_save out of the local namespace
                for k, v in list(locals().items()):
                    if k in to_save:
                        d[k] = v
                _COLL.update({'_id': int(docid)}, {'$set': d}, True)
            maxid = min(id_list)
            form_dict = dict(devicemodel='motorola-XT1079', isShowContent=1,
                             maxid=int(maxid), sectionid=int(section_id),
                             sectiontype=0, version=77, count=25,
                             udid=355456060447393, devicetype=5, isThumb=0)
            print('new url', form_dict.get('maxid'), form_dict.get('sectionid'))
            for i in id_list:
                if _COLL.find_one({'_id': int(i)}):
                    print('************Finish#############')
                    return
            q.put((URL, form_dict))  # put a tuple
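# Hedged usage sketch (not from the source): seed the queue with one request and
# let the spider page backwards via maxid. URL is the module-level endpoint the
# spider already uses; sectionid=1 and maxid=0 for the first page are assumptions.
from Queue import Queue  # Python 2 stdlib, matching the rest of this repo

q = Queue()
q.put((URL, dict(sectionid=1, sectiontype=0, count=25, version=77, maxid=0,
                 devicemodel='motorola-XT1079', devicetype=5,
                 udid=355456060447393, isShowContent=1, isThumb=0)))
xianguo_spider(q, coll_name='tech')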
def incr_migrate(coll_name, limit):
    coll = get_collection(DB, coll_name)
    all_cnt = coll.find().count()
    start = 0
    index = 0
    while start < all_cnt:
        res = migrate(coll_name, start, limit)
        filename = str(index) + '_article.json'
        # print(json.dumps(res, indent=4))
        with io.open(filename, 'w+', encoding='utf8') as outfile:
            data = json.dumps(res, ensure_ascii=False, encoding='utf8', indent=4)
            outfile.write(unicode(data))
        index += 1
        start += limit
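# Hedged usage sketch (collection name and chunk size are illustrative):
# export the whole collection in chunks of 500 posts, producing
# 0_article.json, 1_article.json, ... for batch import.
if __name__ == '__main__':
    incr_migrate('article', 500)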
def migrate(coll_name, limit=10):
    coll = get_collection(DB, coll_name)
    # gen_tag_id()  # gen tag first
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    posts = []
    posts_tags = []
    index = 0
    slug_set = set()
    for doc in coll.find().sort('time', -1).batch_size(1000):
        if is_python_article(doc):
            title = doc.get('title')
            if not exist_or_insert(title):
                doc_id = doc.get('_id')
                index += 1
                if index > limit:
                    break
                slug = doc.get('title')
                if len(slug) > 30:
                    slug = slug[0:30]
                doc['title'] = slug
                if slug not in slug_set:
                    slug_set.add(slug)
                    posts.append(replace_post(doc))
                    posts_tags.append(
                        {"tag_id": TAGS[0].get('id'), "post_id": int(doc_id)}
                    )
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
def migrate(coll_name, limit=10):
    coll = get_collection(DB, coll_name)
    # gen_tag_id()  # gen tag first
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    posts = []
    posts_tags = []
    index = 0
    slug_set = set()
    for doc in coll.find().sort('time', -1).batch_size(1000):
        title = doc.get('title')
        if not exist_or_insert(title):
            doc_id = doc.get('_id')
            index += 1
            if index > limit:
                break
            slug = doc.get('title')
            if len(slug) > 30:
                slug = slug[0:30]
            doc['title'] = slug
            if slug not in slug_set:
                slug_set.add(slug)
                posts.append(replace_post(doc))
                posts_tags.append(
                    {"tag_id": TAGS[0].get('id'), "post_id": int(doc_id)}
                )
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
def migrate(coll_name, limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, coll_name)
    posts = []
    posts_tags = []
    index = 0
    title_set = set()
    for doc in coll.find().batch_size(1000):
        title = doc.get('title')
        slug = title.lower().strip()
        if slug not in title_set:
            title_set.add(slug)
            if not exist_or_insert(slug):
                doc_id = doc.get('_id')
                post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
                index += 1
                if index > limit:
                    break
                posts.append(replace_post(doc))
                posts_tags.append(
                    {"tag_id": 1000, "post_id": post_id}
                )
    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
class WechatPost(object):
    __collection__ = 'wechat_post'
    col = get_collection(CONFIG.MONGO.DATABASE, __collection__, 'motor')

    @classmethod
    @coroutine
    def query(cls, condition={}, order_by=None, limit=None, skip=None):
        try:
            cursor = cls.col.find(condition)
            if order_by:
                cursor.sort(order_by)
            if limit:
                cursor.limit(limit)
            if skip:
                cursor.skip(skip)
            posts = []
            for doc in (yield cursor.to_list(length=limit)):
                post = cls.to_dict(doc)
                posts.append(post)
            raise Return(posts)
        except ValueError:
            traceback.print_exc()
            raise Return([])

    @classmethod
    @coroutine
    def count(cls, condition={}):
        cnt = yield cls.col.find(condition).count()
        raise Return(cnt)

    @classmethod
    def to_dict(cls, doc):
        post = bson_to_json(doc)
        pre_url = 'http://read.html5.qq.com/image?src=forum&q=5&r=0&imgflag=7&imageUrl='
        post['image'] = pre_url + post['cdn_url']
        post['date'] = datestr_from_stamp(post['ori_create_time'], '%Y-%m-%d')
        return ObjectDict(post)
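# Hedged usage sketch using only the methods defined above (the sort key follows
# pymongo's (field, direction) pair convention; the function name is illustrative):
@coroutine
def latest_posts(n=10):
    posts = yield WechatPost.query({}, order_by=[('ori_create_time', -1)], limit=n)
    total = yield WechatPost.count({})
    raise Return((posts, total))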
def test():
    coll = get_collection('test', 'Articles')
    doc = coll.find_one({'_id': ObjectId('5649e9edea282e17fa5511f7')})
    print(type(doc))
    pprint(doc)
<a href="/4" id="cat-4" class="list-group-item">生活·家居</a>
<a href="/5" id="cat-5" class="list-group-item">学习·工具</a>
<a href="/6" id="cat-6" class="list-group-item">历史·读书</a>
<a href="/7" id="cat-7" class="list-group-item">金融·理财</a>
<a href="/8" id="cat-8" class="list-group-item">电影·音乐</a>
<a href="/9" id="cat-9" class="list-group-item">美食·菜谱</a>
<a href="/10" id="cat-10" class="list-group-item">外语·教育</a>
<a href="/11" id="cat-11" class="list-group-item">宠物·休闲</a>
<a href="/12" id="cat-12" class="list-group-item">健康·医疗</a>
<a href="/13" id="cat-13" class="list-group-item">时尚·购物</a>
<a href="/14" id="cat-14" class="list-group-item">公司·宣传</a>
<a href="/15" id="cat-15" class="list-group-item">游戏·娱乐</a>
"""

COL = get_collection(CONFIG.MONGO.DATABASE, 'wechat_name')


def wechat_list():
    for _id in range(1, 16):
        url = 'http://www.iwgc.cn/%d' % _id
        page = 1
        res = []
        while True:
            page_url = url + '/p/' + str(page)
            html = requests.get(page_url).text
            detail_list = extract_all('<div class="detail">', '</div>', html)
            name_list = [extract('title="', '"', tag) for tag in detail_list]
            if not name_list:
                break
def __init__(self, urls, concurrency=10, results=None, **kwargs):
    super(Jb51Spider, self).__init__(urls, concurrency, results, **kwargs)
    self.db = get_collection(DB, 'article_pyhome', 'motor')  # change coll
def __init__(self, urls, concurrency=10, results=None, **kwargs):
    super(Jb51Spider, self).__init__(urls, concurrency, results, **kwargs)
    self.db = get_collection(DB, 'article', 'motor')  # change coll
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Check whether an article has already been uploaded to the blog.

Articles are matched by their title field; uploaded titles are stored
in the `uploaded` collection.
"""
import _env
from lib._db import get_collection
from config.config import CONFIG

DB = CONFIG.MONGO.DATABASE
_COLL = get_collection(DB, 'uploaded')


def is_uploaded(title):
    doc = _COLL.find_one({'title': title})
    return doc is not None


def insert_uploaded(title):
    _COLL.update(
        {'title': title},
        {'$set': {'title': title}},
        True
    )
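# Hedged usage sketch: guard an upload pipeline with the two helpers above
# (the upload step itself is assumed, not shown in the source).
def upload_once(title):
    if is_uploaded(title):
        return False
    # ... push the article to the blog here ...
    insert_uploaded(title)
    return True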