def algorithm(raw):
    """Retrieve the top-n unique candidate answers for a raw query.

    Args:
        raw: the raw query string from the user.

    Returns:
        list[dict]: up to ``topn`` entries, each with keys ``rank``
        (1-based position), ``score`` (retrieval score) and ``answer``
        (the comment body), in retrieval order with duplicate bodies
        removed.
    """
    # Only the cleaned query text is used here; the content type is ignored.
    query, _ctype = clean_query(raw)
    words = [
        w for w in OpenCCTokenizer(JiebaTokenizer()).cut(query)
        if w.word.strip()  # drop whitespace-only tokens
    ]
    comments = RetrievalEvaluate(
        'ccjieba',
        pweight=JiebaPosWeight.weight,
        title_ranker=pos_idf_jaccard_similarity).retrieve(words)

    candidates = []
    seen = set()  # set gives O(1) dedup vs. the original O(n) list scan
    rank = 0
    for cmt in comments:
        if cmt.body in seen:
            continue
        rank += 1
        seen.add(cmt.body)
        candidates.append({
            'rank': rank,
            'score': cmt.score,
            'answer': cmt.body,
        })
        # topn appears to be a module-level constant — TODO confirm it is
        # defined in this module's scope.
        if rank >= topn:
            break
    return candidates
def algorithm(raw):
    """Build a newline-joined reply listing the top-n unique answers.

    Args:
        raw: the raw query string from the user.

    Returns:
        str: one line per unique candidate, formatted
        ``[rank] <score> body``, joined with newlines.
    """
    # Only the cleaned query text is used here; the content type is ignored.
    query, _ctype = clean_query(raw)
    words = [
        w for w in OpenCCTokenizer(JiebaTokenizer()).cut(query)
        if w.word.strip()  # drop whitespace-only tokens
    ]
    comments = RetrievalEvaluate(
        'ccjieba',
        pweight=JiebaPosWeight.weight,
        title_ranker=pos_idf_jaccard_similarity).retrieve(words)

    candidates = []
    seen = set()  # set gives O(1) dedup vs. the original O(n) list scan
    rank = 0
    for cmt in comments:
        if cmt.body in seen:
            continue
        rank += 1
        seen.add(cmt.body)
        candidates.append('[{}] <{:.2f}> {}'.format(
            rank, cmt.score, cmt.body))
        # topn appears to be a module-level constant — TODO confirm it is
        # defined in this module's scope.
        if rank >= topn:
            break
    return '\n'.join(candidates)
# Load PostgreSQL credentials from the shared project config before any
# retrieval code touches the database.
config_parser = RawConfigParser()
config_parser.read('../config.ini')
PsqlAbstract.set_database_info(
    config_parser.get('global', 'dbuser'),
    config_parser.get('global', 'dbname'),
    config_parser.get('global', 'dbpassword')
)

if __name__ == '__main__':
    # Guard clause: exactly one CLI argument (the query string) is required.
    if len(sys.argv) < 2:
        print(
            '''
        usage: python query_chat <query-string>
        ex: python query_chat 安安幾歲住哪
        '''
        )
        sys.exit(0)

    raw_query = sys.argv[1]
    cleaned, ctype = clean_query(raw_query)
    tokens = [
        w for w in OpenCCTokenizer(JiebaTokenizer()).cut(cleaned)
        if bool(w.word.strip())
    ]
    print(tokens)
    comments = RetrievalEvaluate('ccjieba').retrieve(tokens)
    # Print each retrieved comment as "[rank] <score> body".
    for idx, cmt in enumerate(comments, 1):
        print('[{}] <{:.2f}> {}'.format(idx, cmt.score, cmt.body))
tokenizer = 'ccjieba' for line in fileinput.input(): fields = json.loads(line) id_field = fields['id'] title_field = fields['title'] title_half = to_halfwidth(title_field) idx = title_half.find(']') if idx > 0: title = title_field[idx + 1:] else: title = title_field title_cleaned, ctype = clean_query(title) if ctype == 'text': wds = OpenCCTokenizer(JiebaTokenizer()).cut(title_cleaned) words = [w for w in wds if bool(w.word.strip())] else: words = [Word(title_cleaned, 'url')] tokenized = ' '.join([w.word.strip() for w in words]) tokenized = tokenized.replace('\\', '\\\\') tokenized = tokenized.replace('"', '\\"').strip() grammar = ' '.join([w.pos.strip() for w in words]).strip() print( '"{ctype}"\t"{tokenizer}"\t"{tokenized}"\t"{grammar}"\t"{retrieval_count}"\t"{post}"\t"{quality}"' .format(ctype=ctype,
from core.tokenizer import (OpenCCTokenizer, JiebaTokenizer) from core.utils import (clean_query, to_halfwidth, Word, aggregate_comment) tokenizer = 'ccjieba' for line in fileinput.input(): fields = json.loads(line) id_field = fields['id'] comment_field = fields['comment'] comments = aggregate_comment(comment_field) outputs = [] for cmt in comments: content, ctype = clean_query(cmt['comment']) if ctype == 'text': wds = OpenCCTokenizer(JiebaTokenizer()).cut(content) words = [w for w in wds if bool(w.word.strip())] else: words = [Word(content, 'url')] tokenized = ' '.join([w.word.strip() for w in words]) tokenized = tokenized.replace('\\', '\\\\') tokenized = tokenized.replace('"', '\\"').strip() grammar = ' '.join([w.pos.strip() for w in words]).strip() # cmt['tokenized'] = tokenized # cmt['grammar'] = grammar # cmt['ctype'] = ctype outputs.append(