Example #1
class ArticleSpider(AsySpider):
    coll = get_collection(DB, 'code_pyhome', 'motor')

    @gen.coroutine
    def update_doc(self, url, data_dict):
        """update doc in mongo"""
        yield ArticleSpider.coll.update({'source_url': url},
                                        {'$set': data_dict}, True)

    @gen.coroutine
    def handle_html(self, url, html):
        if 'python' in url:  # only save python code
            print(url)
            # article_id = url.rsplit('/', 2)[-1]
            data = parse_sharejs(url, html)
            yield self.update_doc(url, data)

    def save_html(self, url, html):
        """http://www.sharejs.com/codes/javascript/9067"""
        kind = url.rsplit('/', 2)[1]  # kind is the top-level category, as opposed to tag_list
        article_id = url.rsplit('/', 2)[-1]
        filename = './sharejs_html/' + kind + '_' + article_id + '.html'

        # make sure the output directory exists before writing,
        # so the first file is not lost to an IOError
        if not os.path.exists('./sharejs_html'):
            os.makedirs('./sharejs_html')

        with open(filename, 'wb') as f:
            f.write(html)
            print('saving file', filename)
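For readers unfamiliar with the positional True in the update() calls throughout these examples: it is the legacy PyMongo upsert flag. A minimal sketch of the same operation with the modern PyMongo API (database and collection names below are illustrative):

from pymongo import MongoClient

coll = MongoClient()['test_db']['test_coll']  # hypothetical names
# insert the document if no match exists, otherwise apply the $set update
coll.update_one({'source_url': 'http://example.com/1'},
                {'$set': {'title': 'demo'}},
                upsert=True)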
Example #2
def migrate(coll_name='article', limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, coll_name)

    posts = []
    posts_tags = []
    index = 0

    for doc in coll.find().batch_size(1000):
        title = doc.get('title')
        if not exist_or_insert(title):
            doc_id = doc.get('_id')
            post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
            index += 1
            if index > limit:
                break

            posts.append(replace_post(doc))
            posts_tags.append(
                {"tag_id": 1, "post_id": post_id}
            )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
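The meta/data envelope that migrate() assembles looks like the Ghost blog import format (the "003" version string suggests as much). A minimal illustrative instance, with made-up values:

example_export = {
    "meta": {"exported_on": 1449406233000, "version": "003"},
    "data": {
        "posts": [{"id": 1, "title": "demo", "slug": "demo"}],
        "tags": [{"id": 1, "name": "python", "slug": "python", "description": ""}],
        "posts_tags": [{"tag_id": 1, "post_id": 1}],
        "users": [{"id": 1, "name": "admin", "slug": "admin"}],
    },
}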
Example #3
def tag_migrate(limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, 'code')

    posts = []
    tags_id_map = {}
    posts_tags = []
    tag_id = 1000
    index = 0

    for doc in coll.find().batch_size(1000):
        #print(doc.get('title'))
        index += 1
        if index > limit:
            break

        posts.append(replace_post(doc))
        post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
        tag_list = doc.get('tag_list')
        tag = tag_list[0] if tag_list else ''
        tag = remove_china_char(tag)
        if tag:
            save_tag = tag.replace(' ', '-').lower()
            save_tag = find_first_tag(save_tag)
            if len(save_tag) > 10:
                posts_tags.append(
                    {"tag_id": 1, "post_id": post_id}
                )
                continue

            if save_tag not in tags_id_map:
                tag_id += 1
                TAGS.append({
                    "id": tag_id,
                    "name": save_tag,
                    "slug": save_tag,
                    "description": ""
                })
                tags_id_map[save_tag] = tag_id
            posts_tags.append(
                {"tag_id": tags_id_map[save_tag], "post_id": post_id}
            )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
Example #4
def save_to_mongo(db_name, col_name, doc_path):
    articles = get_collection(db_name, col_name)  # collection articles

    max_cnt = 100
    index = 0

    for path in get_all_files(doc_path):
        print(path)
        index += 1
        if index > max_cnt:
            return

        with open(path, 'r') as f:
            html = f.read()
            data = parse_jb51(html)

            file_id = os.path.basename(path).rsplit('.', 1)[0]
            data['source'] = 'http://www.jb51.net/article/%s.htm' % file_id
            data['source_id'] = file_id

            print(data.get('source_id'))
            print(data.get('title'))

            articles.update(
                {'source_id': data.get('source_id')},
                {
                    '$set': data
                },
                True
            )
Example #5
def tag_migrate(limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, 'code')

    posts = []
    tags_id_map = {}
    posts_tags = []
    tag_id = 1000
    index = 0

    for doc in coll.find().batch_size(1000):
        #print(doc.get('title'))
        index += 1
        if index > limit:
            break

        posts.append(replace_post(doc))
        post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
        tag_list = doc.get('tag_list')
        tag = tag_list[0] if tag_list else ''
        tag = remove_china_char(tag)
        if tag:
            save_tag = tag.replace(' ', '-').lower()
            save_tag = find_first_tag(save_tag)
            if len(save_tag) > 10:
                posts_tags.append(
                    {"tag_id": 1, "post_id": post_id}
                )
                continue

            if save_tag not in tags_id_map:
                tag_id += 1
                TAGS.append({
                    "id": tag_id,
                    "name": save_tag,
                    "slug": save_tag,
                    "description": ""
                })
                tags_id_map[save_tag] = tag_id
            posts_tags.append(
                {"tag_id": tags_id_map[save_tag], "post_id": post_id}
            )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": users
    }
    res["data"] = data
    return res
Example #6
def migrate(coll_name='article_pyhome', skip=0, limit=10):
    res = {"meta": {"exported_on": cur_timestamp(), "version": "003"}}
    coll = get_collection(DB, coll_name)

    posts = []
    posts_tags = []
    slug_set = set()

    for doc in coll.find().skip(skip).limit(limit):
        title = doc.get('title') or ''
        slug = title.lower().strip()

        if slug and (slug not in slug_set):
            slug_set.add(slug)
            doc['slug'] = slug

            if not exist_or_insert(slug):
                doc_id = doc.get('_id')
                post_id = int(doc['source_url'].rsplit('/',
                                                       1)[1].split('.')[0])

                posts.append(replace_post(doc))
                posts_tags.append({"tag_id": 1, "post_id": post_id})

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
Example #7
class User(object):
    __collection__ = 'user'
    col = get_collection(CONFIG.MONGO.DATABASE, __collection__, 'motor')
    _fields = {
        'email',
        'password',
    }

    @classmethod
    @coroutine
    def insert(cls, email, password, password2):
        user = yield cls.col.find_one({'email': email})
        form = RegisterForm(email=email,
                            password=password,
                            password2=password2)

        if (not user) and form.validate():
            password_hashed = yield executor.submit(bcrypt.hashpw,
                                                    utf8(password),
                                                    bcrypt.gensalt())
            yield cls.col.insert({'email': email, 'password': password_hashed})
        else:
            raise ValidationError("insert error")

    @classmethod
    @coroutine
    def check_password(cls, email, check_password):
        user = yield cls.col.find_one({'email': email})
        if not user:  # unknown email, no stored hash to compare against
            raise Return(False)
        password = user.get('password')
        password_hashed = yield executor.submit(bcrypt.hashpw,
                                                utf8(check_password),
                                                utf8(password))
        raise Return(password == password_hashed)
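The comparison in check_password works because bcrypt embeds its salt inside the stored hash: re-hashing the candidate password with the stored hash as the salt argument reproduces the stored hash exactly on a match. A minimal standalone check:

import bcrypt

stored = bcrypt.hashpw(b'secret', bcrypt.gensalt())
assert bcrypt.hashpw(b'secret', stored) == stored  # correct password
assert bcrypt.hashpw(b'wrong', stored) != stored   # wrong password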
Example #8
def xianguo_spider(q, coll_name='tech', max_news_num=1000):
    _COLL = get_collection(DB, coll_name)
    while True:
        while not q.empty():
            url, data_dict = q.get()
            try:
                html = fetch(url, data_dict)
            except Exception as e:
                print(e)
                continue

            if not html or html == 'null':  # xianguo may return null
                return

            o = json.loads(html)
            to_save = ['source', 'content', 'url', 'title', 'time', 'brief']
            id_list = []

            for i in o:
                d = {}
                docid = i.get('id')
                id_list.append(docid)
                section_id = i.get('user').get('id')
                source = i.get('user').get('username')
                content = i.get('linkcontent').get('content')
                url = i.get('linkcontent').get('originalurl')
                title = i.get('linkcontent').get('title')
                time = i.get('time')
                if time is None or time == 'None':
                    time = 0

                brief = i.get('content')
                for k, v in list(locals().items()):
                    if k in to_save:
                        d[k] = v

                _COLL.update({'_id': int(docid)}, {'$set': d}, True)
            if not id_list:  # empty result page, nothing left to crawl
                return
            maxid = min(id_list)

            form_dict = dict(devicemodel='motorola-XT1079',
                             isShowContent=1,
                             maxid=int(maxid),
                             sectionid=int(section_id),
                             sectiontype=0,
                             version=77,
                             count=25,
                             udid=355456060447393,
                             devicetype=5,
                             isThumb=0)
            print('new url', form_dict.get('maxid'),
                  form_dict.get('sectionid'))

            for i in id_list:
                if _COLL.find_one({'_id': int(i)}):
                    print('************Finish#############')
                    return

            q.put((URL, form_dict))  # put a tuple
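The locals() scan above works but is fragile: any local variable whose name happens to appear in to_save gets saved along with the intended fields. An explicit extraction function avoids that; this is a sketch, assuming the same item shape as the JSON handled above:

def extract_fields(item):
    """Explicit alternative to filtering locals() against to_save."""
    link = item.get('linkcontent', {})
    t = item.get('time')
    return {
        'source': item.get('user', {}).get('username'),
        'content': link.get('content'),
        'url': link.get('originalurl'),
        'title': link.get('title'),
        'time': 0 if t in (None, 'None') else t,
        'brief': item.get('content'),
    }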
Example #9
def incr_migrate(coll_name, limit):
    coll = get_collection(DB, coll_name)
    all_cnt = coll.find().count()
    start = 0
    index = 0

    while start < all_cnt:
        res = migrate(coll_name, start, limit)
        filename = str(index) + '_article.json'
        # print(json.dumps(res, indent=4))
        with io.open(filename, 'w+', encoding='utf8') as outfile:
            data = json.dumps(res,
                              ensure_ascii=False,
                              encoding='utf8',
                              indent=4)
            outfile.write(unicode(data))

        index += 1
        start += limit
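Note that incr_migrate is Python 2 code (the unicode() builtin and the encoding= keyword to json.dumps). A Python 3 equivalent of the dump step might look like this sketch:

import io
import json

def dump_result(res, filename):
    # json.dumps in Python 3 takes no encoding argument and returns str
    with io.open(filename, 'w', encoding='utf8') as outfile:
        outfile.write(json.dumps(res, ensure_ascii=False, indent=4))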
Example #10
def migrate(coll_name, limit=10):
    coll = get_collection(DB, coll_name)
    # gen_tag_id()    # gen tag first
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }

    posts = []
    posts_tags = []
    index = 0

    slug_set = set()
    for doc in coll.find().sort('time', -1).batch_size(1000):
        if is_python_article(doc):
            title = doc.get('title')
            if not exist_or_insert(title):
                doc_id = doc.get('_id')
                index += 1
                if index > limit:
                    break
                slug = doc.get('title')
                if len(slug) > 30:
                    slug = slug[0:30]
                doc['title'] = slug
                if slug not in slug_set:
                    slug_set.add(slug)
                    posts.append(replace_post(doc))
                    posts_tags.append(
                        {"tag_id": TAGS[0].get('id'), "post_id": int(doc_id)}
                    )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
Example #11
def migrate(coll_name, limit=10):
    coll = get_collection(DB, coll_name)
    # gen_tag_id()    # gen tag first
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }

    posts = []
    posts_tags = []
    index = 0

    slug_set = set()
    for doc in coll.find().sort('time', -1).batch_size(1000):
        title = doc.get('title')
        if not exist_or_insert(title):
            doc_id = doc.get('_id')
            index += 1
            if index > limit:
                break
            slug = doc.get('title')
            if len(slug) > 30:
                slug = slug[0:30]
            doc['title'] = slug
            if slug not in slug_set:
                slug_set.add(slug)
                posts.append(replace_post(doc))
                posts_tags.append(
                    {"tag_id": TAGS[0].get('id'), "post_id": int(doc_id)}
                )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
Example #12
def migrate(coll_name, limit=10):
    res = {
        "meta": {
            "exported_on": cur_timestamp(),
            "version": "003"
        }
    }
    coll = get_collection(DB, coll_name)

    posts = []
    posts_tags = []
    index = 0
    title_set = set()

    for doc in coll.find().batch_size(1000):
        title = doc.get('title') or ''
        slug = title.lower().strip()
        if slug not in title_set:
            title_set.add(slug)

            if not exist_or_insert(slug):
                doc_id = doc.get('_id')
                post_id = int(doc['source_url'].rsplit('/', 1)[1].split('.')[0])
                index += 1
                if index > limit:
                    break

                posts.append(replace_post(doc))
                posts_tags.append(
                    {"tag_id": 1000, "post_id": post_id}
                )

    data = {
        "posts": posts,
        "tags": TAGS,
        "posts_tags": posts_tags,
        "users": USERS
    }
    res["data"] = data
    return res
Example #13
class WechatPost(object):
    __collection__ = 'wechat_post'
    col = get_collection(CONFIG.MONGO.DATABASE, __collection__, 'motor')

    @classmethod
    @coroutine
    def query(cls, condition={}, order_by=None, limit=None, skip=None):
        try:
            cursor = cls.col.find(condition)
            if order_by:
                cursor.sort(order_by)
            if limit:
                cursor.limit(limit)
            if skip:
                cursor.skip(skip)

            posts = []
            for doc in (yield cursor.to_list(length=limit)):
                post = cls.to_dict(doc)
                posts.append(post)
            raise Return(posts)
        except ValueError:
            traceback.print_exc()
            raise Return([])

    @classmethod
    @coroutine
    def count(cls, condition={}):
        cnt = yield cls.col.find(condition).count()
        raise Return(cnt)

    @classmethod
    def to_dict(cls, doc):
        post = bson_to_json(doc)
        pre_url = 'http://read.html5.qq.com/image?src=forum&q=5&r=0&imgflag=7&imageUrl='
        post['image'] = pre_url + post['cdn_url']
        post['date'] = datestr_from_stamp(post['ori_create_time'], '%Y-%m-%d')
        return ObjectDict(post)
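A minimal sketch of driving WechatPost.query from Tornado's IOLoop; the surrounding handler code is assumed, not shown in the source:

from tornado import gen, ioloop

@gen.coroutine
def show_latest():
    posts = yield WechatPost.query(condition={}, limit=10)
    for post in posts:
        print(post.get('date'), post.get('image'))

ioloop.IOLoop.current().run_sync(show_latest)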
Example #14
def test():
    # assumes: from bson.objectid import ObjectId; from pprint import pprint
    coll = get_collection('test', 'Articles')
    doc = coll.find_one({'_id': ObjectId('5649e9edea282e17fa5511f7')})
    print(type(doc))  # find_one returns a plain dict, or None if no match
    pprint(doc)
Example #15
<a href="/4" id="cat-4" class="list-group-item">生活&middot;家居</a>
<a href="/5" id="cat-5" class="list-group-item">学习&middot;工具</a>
<a href="/6" id="cat-6" class="list-group-item">历史&middot;读书</a>
<a href="/7" id="cat-7" class="list-group-item">金融&middot;理财</a>
<a href="/8" id="cat-8" class="list-group-item">电影&middot;音乐</a>
<a href="/9" id="cat-9" class="list-group-item">美食&middot;菜谱</a>
<a href="/10" id="cat-10" class="list-group-item">外语&middot;教育</a>
<a href="/11" id="cat-11" class="list-group-item">宠物&middot;休闲</a>
<a href="/12" id="cat-12" class="list-group-item">健康&middot;医疗</a>
<a href="/13" id="cat-13" class="list-group-item">时尚&middot;购物</a>
<a href="/14" id="cat-14" class="list-group-item">公司&middot;宣传</a>
<a href="/15" id="cat-15" class="list-group-item">游戏&middot;娱乐</a>

"""

COL = get_collection(CONFIG.MONGO.DATABASE, 'wechat_name')


def wechat_list():
    for _id in range(1, 16):
        url = 'http://www.iwgc.cn/%d' % _id
        page = 1
        res = []

        while True:
            page_url = url + '/p/' + str(page)
            html = requests.get(page_url).text
            detail_list = extract_all('<div class="detail">', '</div>', html)
            name_list = [extract('title="', '"', tag) for tag in detail_list]
            if not name_list:  # no entries on this page: past the last one
                break
            res.extend(name_list)
            page += 1  # advance, otherwise the loop refetches page 1 forever
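extract and extract_all are project helpers that are not shown in this snippet; a plausible minimal implementation (substring slicing between two markers, an assumption rather than the project's actual code):

def extract(begin, end, html):
    """Return the text between the first begin/end marker pair, or ''."""
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    stop = html.find(end, start)
    return html[start:stop] if stop >= 0 else ''

def extract_all(begin, end, html):
    """Return the text between every begin/end marker pair."""
    results, pos = [], 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            return results
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            return results
        results.append(html[start:stop])
        pos = stop + len(end)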
Example #17
def xianguo_spider(q, coll_name='tech', max_news_num=1000):
    _COLL = get_collection(DB, coll_name)
    while True:
        while not q.empty():
            url, data_dict = q.get()
            try:
                html = fetch(url, data_dict)
            except Exception as e:
                print(e)
                continue

            if not html or html == 'null':    # xianguo may return null
                return

            o = json.loads(html)
            to_save = ['source', 'content',
                       'url', 'title', 'time', 'brief']
            id_list = []

            for i in o:
                d = {}
                docid = i.get('id')
                id_list.append(docid)
                section_id = i.get('user').get('id')
                source = i.get('user').get('username')
                content = i.get('linkcontent').get('content')
                url = i.get('linkcontent').get('originalurl')
                title = i.get('linkcontent').get('title')
                time = i.get('time')
                if time is None or time == 'None':
                    time = 0

                brief = i.get('content')
                for k, v in list(locals().items()):
                    if k in to_save:
                        d[k] = v

                _COLL.update(
                    {'_id': int(docid)},
                    {
                        '$set': d
                    },
                    True
                )
            if not id_list:  # empty result page, nothing left to crawl
                return
            maxid = min(id_list)

            form_dict = dict(
                devicemodel='motorola-XT1079',
                isShowContent=1,
                maxid=int(maxid),
                sectionid=int(section_id),
                sectiontype=0,
                version=77,
                count=25,
                udid=355456060447393,
                devicetype=5,
                isThumb=0
            )
            print('new url', form_dict.get('maxid'), form_dict.get('sectionid'))

            for i in id_list:
                if _COLL.find_one({'_id': int(i)}):
                    print('************Finish#############')
                    return

            q.put((URL, form_dict))    # put a tuple
Example #18
    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        super(Jb51Spider, self).__init__(urls, concurrency, results, **kwargs)
        self.db = get_collection(DB, 'article_pyhome', 'motor')  # change coll
Example #19
    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        super(Jb51Spider, self).__init__(urls, concurrency, results, **kwargs)
        self.db = get_collection(DB, 'article', 'motor')    # change coll
Example #20
#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""用来判断是不是已经上传到博客了
通过title字段判断,存储在表uploaded
"""

import _env
from lib._db import get_collection
from config.config import CONFIG


DB = CONFIG.MONGO.DATABASE
_COLL = get_collection(DB, 'uploaded')


def is_uploaded(title):
    return _COLL.find_one({'title': title}) is not None


def insert_uploaded(title):
    _COLL.update(
        {'title': title},
        {
            '$set': {'title': title}
        },
        True
    )
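get_collection itself is imported from lib._db throughout these examples but never shown. A plausible minimal implementation, assuming PyMongo for the default driver and Motor when 'motor' is passed (the connection details are guesses, not the project's actual code):

import pymongo
import motor

def get_collection(db_name, coll_name, driver='pymongo'):
    if driver == 'motor':
        client = motor.MotorClient('localhost', 27017)  # async, for Tornado coroutines
    else:
        client = pymongo.MongoClient('localhost', 27017)  # blocking
    return client[db_name][coll_name]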