Example #1
from datetime import datetime

# EPOCH is assumed to be a module-level reference time defined elsewhere in
# the original project, e.g.:
# EPOCH = datetime(1970, 1, 1)  # naive, matching the naive dt parsed below

def _tokenize_title(hostname, api_key):
    """ Fetch all blog entries and build the data for an inverted index. """
    _, total = tumblr.getTotalPosts(hostname, api_key)
    docs = []
    if total:
        offset = 0
        limit = 50
        while offset < total:
            res = tumblr.getPosts(hostname, api_key, offset, limit)
            if res:
                for post in res['response']['posts']:
                    dt = datetime.strptime(post['date'], '%Y-%m-%d %H:%M:%S %Z')
                    # Newer posts score higher (for now).
                    # XXX The score could be assigned per document-token pair
                    # instead of per document. XXX
                    score = (dt - EPOCH).total_seconds()
                    title = post.get('title', '')
                    nouns = _extract_nouns(title) if title else set()
                    # Document ID, title (original string), tokens, score value.
                    doc = {'id': str(post['id']), 'title': title,
                           'tokens': nouns, 'score': score}
                    docs.append(doc)
            offset += limit
    return docs
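
The tumblr wrapper and _extract_nouns are project-local helpers that do not appear in this excerpt. For orientation, here is a minimal sketch of what the wrapper might look like against the Tumblr v2 REST API; the URL layout and response envelope are Tumblr's documented ones, but the function bodies are guesses at the original module:

import requests

def getPosts(hostname, api_key, offset, limit):
    # Tumblr API v2: GET /v2/blog/{hostname}/posts, paged via offset/limit.
    r = requests.get('https://api.tumblr.com/v2/blog/%s/posts' % hostname,
                     params={'api_key': api_key, 'offset': offset, 'limit': limit})
    return r.json() if r.status_code == 200 else None

def getTotalPosts(hostname, api_key):
    # Fetch a single post just to read the blog name and total_posts count.
    res = getPosts(hostname, api_key, 0, 1)
    if res:
        return res['response']['blog']['name'], res['response']['total_posts']
    return None, 0

Likewise, _extract_nouns presumably runs the title through a morphological analyzer and keeps only the nouns. A sketch assuming the janome tokenizer (the analyzer actually used is not shown in the source):

from janome.tokenizer import Tokenizer

_tokenizer = Tokenizer()

def _extract_nouns(text):
    # Keep the surface form of every token whose part of speech is 名詞 (noun).
    return {t.surface for t in _tokenizer.tokenize(text)
            if t.part_of_speech.split(',')[0] == '名詞'}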
Example #2
def updateAll(config, hostname):
    """ Re-sync the Solr index with the blog: upsert all current posts,
    then delete documents that no longer exist upstream. """
    api_key = config.get("Target", "api_key")
    (blog_name, total) = tumblr.getTotalPosts(hostname, api_key)
    if total:
        docids_new = set()
        offset = 0
        limit = 50
        # Insert & update: walk the blog in pages of `limit` posts.
        while offset < total:
            res = tumblr.getPosts(hostname, api_key, offset, limit)
            if res:
                for post in res['response']['posts']:
                    doc = _makeDoc(post)
                    docids_new.add(doc['id'])
                    _postToSolr(doc)
                _commit()
            offset += limit
        # Remove deleted documents: anything indexed previously but
        # absent from this crawl was deleted upstream.
        docids_old = _getAllDocIds(blog_name)
        docids_deleted = docids_old - docids_new
        for docid in docids_deleted:
            _deleteFromSolr(str(docid))
        _commit()
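
_postToSolr, _deleteFromSolr, and _commit are also not shown in this excerpt. A minimal sketch of how they could be written against Solr's JSON update handler, assuming a local Solr core named blog (the URL and core name are placeholders, not from the source):

import requests

SOLR_UPDATE_URL = 'http://localhost:8983/solr/blog/update'  # hypothetical core

def _postToSolr(doc):
    # The JSON update handler accepts a list of documents to add or overwrite.
    requests.post(SOLR_UPDATE_URL, json=[doc]).raise_for_status()

def _deleteFromSolr(docid):
    # Delete-by-id through the same handler.
    requests.post(SOLR_UPDATE_URL, json={'delete': {'id': docid}}).raise_for_status()

def _commit():
    # Commit so pending adds and deletes become visible to searchers.
    requests.post(SOLR_UPDATE_URL, json={'commit': {}}).raise_for_status()

Note that if a document field holds a Python set (like the tokens field built in Example #1), it would need converting to a list before JSON serialization.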