def extract_review():
    """Parse the review file into spell-corrected, phrase-merged review records.

    This function takes no arguments; it relies on names defined outside of
    it: ``app`` (key under which the records are stored), ``app_files``
    (path that is opened for reading), ``StoreNum`` (falsy selects the iOS
    column layout, truthy the Android one) and the helpers
    ``extractSentenceWords``, ``build_phrase`` and ``replace_digit``.

    Every input line must consist of exactly six fields separated by
    ``"******"``. Field 0 is the star rating and field 1 the review text;
    date/version are fields 3/4 for iOS and fields 2/3 for Android.
    Lines with a different field count are logged and skipped.

    Returns:
        dict: ``{app: [record, ...]}`` where each record is a dict with the
        keys ``"review"``, ``"date"``, ``"rate"`` and ``"version"``.
    """
    # Fallback rating used when the rating field is missing or not numeric.
    default_rate = 2.0

    timed_reviews = {}
    num_docs = 0
    num_words = 0
    timed_reviews[app] = []
    with open(app_files) as fin:
        # Stream the file instead of loading every line into memory first.
        for l_id, line in enumerate(fin):
            line = line.strip()
            terms = line.split("******")
            if len(terms) != 6:
                logging.error("review format error at %s in %s", app, line)
                continue

            if not StoreNum:  # for ios
                date = terms[3]
                version = terms[4]
            else:  # for android
                date = terms[2]
                version = terms[3]

            review_o = terms[1]
            review_p, wc = extractSentenceWords(review_o, repeat=True)
            for list_text in review_p:
                for index, value in enumerate(list_text):
                    corrected = pycorrector.en_correct(value)
                    # Some pycorrector releases return
                    # (corrected_text, details) rather than a plain string;
                    # keep only the corrected text in that case.
                    if isinstance(corrected, tuple):
                        corrected = corrected[0]
                    list_text[index] = corrected
            review = list(build_phrase(review_p))
            review = [list(replace_digit(s)) for s in review]

            # If the star rating is missing, use the average rating 2 instead.
            rate = default_rate
            if re.match(r'\d*\.?\d+', terms[0]):
                try:
                    rate = float(terms[0])
                except ValueError:
                    # re.match only checks a prefix, so a field such as
                    # "5 stars" passes the guard but is not a valid float.
                    rate = default_rate

            timed_reviews[app].append({
                "review": review,
                "date": date,
                "rate": rate,
                "version": version,
            })
            num_docs += 1
            num_words += wc
            if l_id % 1000 == 0:
                logging.info("processed %d docs of %s", l_id, app)
    logging.info("total read %d reviews, %d words.", num_docs, num_words)
    return timed_reviews
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo of English spelling correction with pycorrector.en_correct,
    first on a whole sentence and then word by word.
"""
import sys

# Make the package in the parent directory importable when run from here.
sys.path.append("..")
import pycorrector

if __name__ == '__main__':
    # Correct a complete sentence in one call.
    sentence = "what happending ? how to speling it can you gorrect it"
    corrected_sentence = pycorrector.en_correct(sentence)
    print(sentence, '=>', corrected_sentence)

    # Correct the same vocabulary one word at a time.
    words = (
        'what', 'hapenning', 'how', 'to', 'speling',
        'it', 'you', 'can', 'gorrect', 'it',
    )
    for word in words:
        corrected_word = pycorrector.en_correct(word)
        print(word, '=>', corrected_word)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo of pycorrector.en_correct on a sentence and on a list of
    sentences, unpacking the (corrected_text, details) result.
"""
import sys

# Make the package in the parent directory importable when run from here.
sys.path.append("..")
import pycorrector

if __name__ == '__main__':
    # 1. Demonstrate correcting a single English sentence.
    sent = "what happending? how to speling it, can you gorrect it?"
    corrected_text, details = pycorrector.en_correct(sent)
    print(sent, '=>', corrected_text)
    print(details)
    print()

    # 2. Demonstrate correcting a list of English sentences; only the
    #    sentences for which details were returned are printed.
    sent_lst = [
        'what hapenning?',
        'how to speling it',
        'gorrect',
        'i know',
    ]
    for sent in sent_lst:
        corrected_text, details = pycorrector.en_correct(sent)
        if not details:
            continue
        print('[error] ', sent, '=>', corrected_text, details)
    print()

    # 3. Demonstrate a custom English dictionary.
    from pycorrector.en_spell import EnSpell
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo of word-by-word English spelling correction with
    pycorrector.en_correct.
"""
import sys

# Make the package in the parent directory importable when run from here.
sys.path.append("..")
import pycorrector

if __name__ == '__main__':
    # Each token is corrected independently and printed next to its result.
    tokens = (
        'what',
        'hapenning',
        'how',
        'to',
        'speling',
        'it',
        'you',
        'can',
        'gorrect',
        'it',
    )
    for token in tokens:
        suggestion = pycorrector.en_correct(token)
        print(token, '=>', suggestion)