Esempio n. 1
0
def _tokenize_title(hostname, api_key):
    """Fetch every blog post and build the records for an inverted index.

    Posts are requested from ``tumblr.getPosts`` in pages of 50 until the
    total reported by ``tumblr.getTotalPosts`` has been covered. A page
    whose result is falsy contributes no records.

    Args:
        hostname: Blog host name, passed through to the ``tumblr`` helpers.
        api_key: API key, passed through to the ``tumblr`` helpers.

    Returns:
        A list with one dict per post, holding ``'id'`` (the post id as a
        string), ``'title'`` (the original title string), ``'tokens'``
        (the result of ``_extract_nouns`` on the title, or an empty set
        when the title is empty) and ``'score'`` (seconds between
        ``EPOCH`` and the post date). The list is empty when the reported
        total is falsy.
    """
    _, total = tumblr.getTotalPosts(hostname, api_key)
    records = []
    if not total:
        return records

    page_size = 50
    start = 0
    while start < total:
        page = tumblr.getPosts(hostname, api_key, start, page_size)
        start += page_size
        entries = page['response']['posts'] if page else []
        for entry in entries:
            posted_at = datetime.strptime(entry['date'],
                                          '%Y-%m-%d %H:%M:%S %Z')
            # For now, newer posts simply score higher.
            # XXX The score could be assigned per (document, token) pair
            # instead of per document. XXX
            recency = (posted_at - EPOCH).total_seconds()
            heading = entry.get('title', '')
            if heading:
                terms = _extract_nouns(heading)
            else:
                terms = set()
            # Document id, title (original string), tokens, score value.
            records.append({'id': str(entry['id']), 'title': heading,
                            'tokens': terms, 'score': recency})
    return records
Esempio n. 2
0
def updateAll(config, hostname):
    """Synchronise the Solr index with the current posts of a blog.

    Every post returned by ``tumblr.getPosts`` is converted with
    ``_makeDoc`` and sent to ``_postToSolr``, with a ``_commit`` after each
    page of 50. Afterwards, ids returned by ``_getAllDocIds`` that were not
    seen during the crawl are removed with ``_deleteFromSolr``.

    The removal phase is skipped when any page could not be fetched.
    Otherwise the posts on the missing page would be absent from the set of
    seen ids and would be wrongly deleted from the index.

    Args:
        config: Object whose ``get("Target", "api_key")`` yields the API key.
        hostname: Blog host name, passed through to the ``tumblr`` helpers.

    Returns:
        None. Nothing is done when the reported post total is falsy.
    """
    import logging

    api_key = config.get("Target", "api_key")
    (blog_name, total) = tumblr.getTotalPosts(hostname, api_key)
    if not total:
        return

    docids_new = set()
    fetch_failed = False
    offset = 0
    limit = 50
    # insert & update
    while offset < total:
        res = tumblr.getPosts(hostname, api_key, offset, limit)
        if res:
            for post in res['response']['posts']:
                doc = _makeDoc(post)
                docids_new.add(doc['id'])
                _postToSolr(doc)
            _commit()
        else:
            # Remember the gap: docids_new is now incomplete.
            fetch_failed = True
        offset += limit

    if fetch_failed:
        # An incomplete crawl cannot tell "deleted" from "not fetched",
        # so leave the existing documents alone.
        logging.getLogger(__name__).warning(
            "updateAll(%s): at least one page of posts could not be "
            "fetched; skipping removal of deleted documents", hostname)
        return

    # remove deleted documents
    docids_old = _getAllDocIds(blog_name)
    docids_deleted = docids_old - docids_new
    for docid in docids_deleted:
        _deleteFromSolr(str(docid))
    _commit()
except getopt.GetoptError as e:
    onError(1, str(e))

# Script body: validate the command line, translate the parsed options into
# flags, then prepare directories, authenticate and fetch the posts.

# Report an error through onError when the script was started without any
# command-line arguments.
if len(sys.argv) == 1:  # no options passed
    onError(2, 2)
    
# Defaults for the boolean switches; the option loop below turns them on.
verbose = False
keepGoing = False
writeLog = False
    
# Walk the (option, argument) pairs in myopts.
# NOTE(review): myopts is presumably the option list produced by
# getopt.getopt() in the try block that precedes this fragment - confirm.
for option, argument in myopts:
    if option in ('-b', '--blog'):  # blog to process, taken from the argument
        blog = argument
    elif option in ('-k', '--keepgoing'):  # flag forwarded to getPosts
        keepGoing = True
    elif option in ('-l', '--log'):  # flag forwarded to getPosts
        writeLog = True
    elif option in ('-v', '--verbose'):  # verbose output
        verbose = True
    elif option in ('-h', '--help'):  # display help text
        usage(0)

# NOTE(review): blog is only bound above when -b/--blog was supplied. Unless
# it is given a default earlier in the script (not visible here), the call
# below would raise NameError - confirm.
# defaultDownloadDir, subDir, gifDir and videoDir are read here, so they must
# already be defined earlier; gifDir and videoDir are rebound by the result.
mainDir, downloadDir, gifDir, videoDir = checkDirectories(defaultDownloadDir, subDir, blog, gifDir, videoDir, verbose)

client = authenticateClient(verbose)

# The result is stored in posts; it is not used in the visible part of the
# script.
posts = getPosts(client, blog, mainDir, downloadDir, gifDir, videoDir, keepGoing, writeLog, verbose)



except getopt.GetoptError as e:
    onError(1, str(e))

# Script body: validate the command line, translate the parsed options into
# flags, then prepare directories, authenticate and fetch the posts.

# Report an error through onError when the script was started without any
# command-line arguments.
if len(sys.argv) == 1:  # no options passed
    onError(2, 2)

# Defaults for the boolean switches; the option loop below turns them on.
verbose = False
keepGoing = False
writeLog = False

# Walk the (option, argument) pairs in myopts.
# NOTE(review): myopts is presumably the option list produced by
# getopt.getopt() in the try block that precedes this fragment - confirm.
for option, argument in myopts:
    if option in ('-b', '--blog'):  # blog to process, taken from the argument
        blog = argument
    elif option in ('-k', '--keepgoing'):  # flag forwarded to getPosts
        keepGoing = True
    elif option in ('-l', '--log'):  # flag forwarded to getPosts
        writeLog = True
    elif option in ('-v', '--verbose'):  # verbose output
        verbose = True
    elif option in ('-h', '--help'):  # display help text
        usage(0)

# NOTE(review): blog is only bound above when -b/--blog was supplied. Unless
# it is given a default earlier in the script (not visible here), the call
# below would raise NameError - confirm.
# defaultDownloadDir, subDir, gifDir and videoDir are read here, so they must
# already be defined earlier; gifDir and videoDir are rebound by the result.
mainDir, downloadDir, gifDir, videoDir = checkDirectories(
    defaultDownloadDir, subDir, blog, gifDir, videoDir, verbose)

client = authenticateClient(verbose)

# The result is stored in posts; it is not used in the visible part of the
# script.
posts = getPosts(client, blog, mainDir, downloadDir, gifDir, videoDir,
                 keepGoing, writeLog, verbose)