Code example #1
File: pixnet.py  Project: ryanchao2012/marginalbear
# Known project imports (module paths taken from example #5 below);
# RetrievalEvaluate, JiebaPosWeight, and pos_idf_jaccard_similarity are
# assumed to come from the project's retrieval/ranking modules.
from core.tokenizer import OpenCCTokenizer, JiebaTokenizer
from core.utils import clean_query

topn = 10  # assumed module-level constant; its value is not shown in the original


def algorithm(raw):
    # Normalize the raw input and detect its content type.
    query, ctype = clean_query(raw)
    # Tokenize the query, dropping whitespace-only tokens.
    words = [
        w for w in OpenCCTokenizer(JiebaTokenizer()).cut(query)
        if w.word.strip()
    ]
    # Rank candidate comments against the tokenized query.
    comments = RetrievalEvaluate(
        'ccjieba',
        pweight=JiebaPosWeight.weight,
        title_ranker=pos_idf_jaccard_similarity).retrieve(words)
    candidates = []
    unique_cache = set()  # a set makes the duplicate check O(1)
    rank = 0
    for cmt in comments:
        # Keep only the first occurrence of each comment body.
        if cmt.body not in unique_cache:
            rank += 1
            unique_cache.add(cmt.body)
            candidates.append({
                'rank': rank,
                'score': cmt.score,
                'answer': cmt.body
            })
            if rank >= topn:
                break
    return candidates
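A minimal usage sketch, assuming the module-level pieces above (the query string is borrowed from example #3):

if __name__ == '__main__':
    for c in algorithm('安安幾歲住哪'):
        print('[{rank}] <{score:.2f}> {answer}'.format(**c))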
Code example #2
# Same imports as example #1; topn is again assumed to be defined at
# module level.
def algorithm(raw):
    # reply = MessengerBot(string, 'messenger', 'slack').retrieve()
    query, ctype = clean_query(raw)
    words = [
        w for w in OpenCCTokenizer(JiebaTokenizer()).cut(query)
        if w.word.strip()
    ]
    comments = RetrievalEvaluate(
        'ccjieba',
        pweight=JiebaPosWeight.weight,
        title_ranker=pos_idf_jaccard_similarity).retrieve(words)
    candidates = []
    unique_cache = set()
    rank = 0
    for cmt in comments:
        if cmt.body not in unique_cache:
            rank += 1
            unique_cache.add(cmt.body)
            # Format each unique answer as "[rank] <score> body".
            candidates.append('[{}] <{:.2f}> {}'.format(
                rank, cmt.score, cmt.body))
            if rank >= topn:
                break

    reply = '\n'.join(candidates)

    return reply
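Unlike example #1, this variant returns the ranked answers as one newline-joined string, a convenient shape for chat bots that post a single reply, e.g. print(algorithm('安安幾歲住哪')).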
Code example #3
File: query_chat.py  Project: ifengc/marginalbear
import sys
from configparser import RawConfigParser

# Project imports as in example #5; PsqlAbstract and RetrievalEvaluate are
# assumed to live in the project's database and retrieval modules (their
# exact paths are not shown in the original file).
from core.tokenizer import OpenCCTokenizer, JiebaTokenizer
from core.utils import clean_query

config_parser = RawConfigParser()
config_parser.read('../config.ini')

# Hand the credentials from config.ini to the shared Psql base class.
PsqlAbstract.set_database_info(
    config_parser.get('global', 'dbuser'),
    config_parser.get('global', 'dbname'),
    config_parser.get('global', 'dbpassword')
)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(
            '''
            usage: python query_chat.py <query-string>
            ex: python query_chat.py 安安幾歲住哪
            '''
        )
        sys.exit(0)
    raw = sys.argv[1]
    query, ctype = clean_query(raw)
    words = [w for w in OpenCCTokenizer(JiebaTokenizer()).cut(query) if w.word.strip()]
    print(words)
    comments = RetrievalEvaluate('ccjieba').retrieve(words)
    # response = comment.body

    for i, cmt in enumerate(comments, 1):
        print('[{}] <{:.2f}> {}'.format(i, cmt.score, cmt.body))
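query_chat.py expects a config.ini one directory up with a [global] section holding the database credentials; a minimal sketch, with placeholder values:

[global]
dbuser = marginalbear
dbname = marginalbear
dbpassword = your-password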

Code example #4
import fileinput
import json

# Project imports as in example #5.
from core.tokenizer import OpenCCTokenizer, JiebaTokenizer
from core.utils import clean_query, to_halfwidth, Word

tokenizer = 'ccjieba'

for line in fileinput.input():
    fields = json.loads(line)
    id_field = fields['id']
    title_field = fields['title']

    # Locate the closing bracket of the leading tag on a half-width copy,
    # so the full-width "］" is matched too; the indices line up assuming
    # to_halfwidth maps characters one to one.
    title_half = to_halfwidth(title_field)

    idx = title_half.find(']')
    if idx > 0:
        # Drop the "[tag]" prefix from the original title.
        title = title_field[idx + 1:]
    else:
        title = title_field

    title_cleaned, ctype = clean_query(title)

    if ctype == 'text':
        wds = OpenCCTokenizer(JiebaTokenizer()).cut(title_cleaned)
        words = [w for w in wds if w.word.strip()]
    else:
        # Non-text titles (e.g. a bare URL) become a single 'url' token.
        words = [Word(title_cleaned, 'url')]

    tokenized = ' '.join([w.word.strip() for w in words])
    # Escape backslashes and double quotes for the quoted TSV output below.
    tokenized = tokenized.replace('\\', '\\\\')
    tokenized = tokenized.replace('"', '\\"').strip()
    grammar = ' '.join([w.pos.strip() for w in words]).strip()

    print(
        '"{ctype}"\t"{tokenizer}"\t"{tokenized}"\t"{grammar}"\t"{retrieval_count}"\t"{post}"\t"{quality}"'
        .format(ctype=ctype,
                tokenizer=tokenizer,
                tokenized=tokenized,
                grammar=grammar,
                # The original snippet is truncated after this point; the
                # remaining keyword arguments are assumptions inferred
                # from the placeholder names.
                retrieval_count=0,
                post=id_field,
                quality=0.0))
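The fully quoted, tab-separated fields with escaped backslashes and double quotes suggest these rows feed a quoted TSV bulk import (for instance PostgreSQL's COPY); the actual loader is not shown in the snippet.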
Code example #5
import fileinput
import json

from core.tokenizer import (OpenCCTokenizer, JiebaTokenizer)
from core.utils import (clean_query, to_halfwidth, Word, aggregate_comment)

tokenizer = 'ccjieba'

for line in fileinput.input():
    fields = json.loads(line)
    id_field = fields['id']
    comment_field = fields['comment']

    comments = aggregate_comment(comment_field)

    outputs = []

    for cmt in comments:
        content, ctype = clean_query(cmt['comment'])
        if ctype == 'text':
            wds = OpenCCTokenizer(JiebaTokenizer()).cut(content)
            words = [w for w in wds if w.word.strip()]
        else:
            # Non-text comments (e.g. a bare URL) become a single 'url' token.
            words = [Word(content, 'url')]

        tokenized = ' '.join([w.word.strip() for w in words])
        # Escape backslashes and double quotes, as in example #4.
        tokenized = tokenized.replace('\\', '\\\\')
        tokenized = tokenized.replace('"', '\\"').strip()
        grammar = ' '.join([w.pos.strip() for w in words]).strip()
        # cmt['tokenized'] = tokenized
        # cmt['grammar'] = grammar
        # cmt['ctype'] = ctype

        # The original snippet is truncated here; appending the cleaned
        # fields as a dict is an assumption based on the commented-out
        # lines above.
        outputs.append(
            {'ctype': ctype, 'tokenized': tokenized, 'grammar': grammar})
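As in example #4, each comment runs through the ccjieba tokenizing pipeline and is escaped for quoted TSV output; the commented-out assignments suggest an earlier revision attached tokenized, grammar, and ctype back onto each comment dict rather than collecting a separate outputs list.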