def __init__(self, f):
    Dict.__init__(self)
    self.mp3File = MP3(f, ID3=EasyID3)
    self['fileName'] = f
    self['srcName'] = f
def test_article(article, svm, pos_dict, neg_dict):
    # Word segmentation on raw articles
    file_context = str(article)
    seg_list = jieba.cut(file_context.strip(), cut_all=False)
    test_data = seg_article(article, seg_list)
    # Pre-judge based on dictionary
    dict = Dict(file_context, seg_list)
    factor = dict.calculate_factor(test_data, pos_dict, neg_dict)
    # SVM's prediction
    result = []
    for each in test_data:
        if each != '':
            result.append(svm.predict(each))
    # Calculate points and normalize
    polar = np.mean(result)
    final_score = 0.7 * polar + 0.3 * factor
    if final_score < 0.5:
        return '-1'
    elif final_score == 0.5:
        return '0'
    else:
        return '1'
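# A minimal, self-contained sketch of the scoring step in test_article() above,
# assuming the per-segment SVM votes and the dictionary factor are already
# available as plain numbers. The 0.7/0.3 weights and the 0.5 threshold mirror
# the function above; the helper name and the example values are illustrative only.
import numpy as np

def combine_scores(svm_votes, dict_factor, w_svm=0.7, w_dict=0.3):
    polar = np.mean(svm_votes)  # average of the per-segment SVM predictions
    final_score = w_svm * polar + w_dict * dict_factor
    if final_score < 0.5:
        return '-1'  # negative
    elif final_score == 0.5:
        return '0'   # neutral
    return '1'       # positive

# combine_scores([1, 0, 1, 1], 0.4) -> 0.7*0.75 + 0.3*0.4 = 0.645 -> '1'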
def run(self):
    super().run()
    # Split the input file into num_core many files.
    self.prep_input()
    # Get slang, stop word and emoticon dicts.
    # NOTE: For now, we load these dicts here (shared between threads),
    # but we load one enchant dict per thread, for concurrency reasons.
    # We could also load these dicts once per thread, but that would need
    # some adjustments.
    dict = Dict()
    slang_dict = dict.get_slang()
    stop_words = dict.get_stopwords()
    emoji_dict = dict.get_emoticon()
    # Process the input files.
    ts = [
        threading.Thread(target=self.checker,
                         args=(i, slang_dict, stop_words, emoji_dict))
        for i in range(self.cores)
    ]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    # Merge the num_core output files into one, delete the split files.
    self.merge_and_delete()
def run(self):
    super().run()
    dict = Dict()
    slang_dict = dict.get_slang()
    stop_words = dict.get_stopwords()
    emoji_dict = dict.get_emoticon()
    d = enchant.Dict("en_US")
    self.prep_input()
    # dictionary defined in MMST __init__
    share = floor(self.nb / self.cores)
    ts = [
        threading.Thread(target=self.checker,
                         args=(i, d, slang_dict, stop_words, emoji_dict))
        for i in range(self.cores)
    ]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    print("merging")
    self.merge_outputs()
def load_db(self, SQLiteDB):
    if self.path1 == ".":
        return False
    # print self.path + SQLiteDB
    # Always instantiate the Dict object first, then the Word object.
    self.dict_obj = Dict(self.path1 + SQLiteDB)
    self.word_obj = Word()
def dict_test():
    x = Dict()
    x['dict1'] = {'1': 1, '2': 2, '3': 3}
    # x['dict1', '5'] = '5'
    # del x['dict1', '5']
    # for i in x.ergodic('dict1'):
    #     print(i)
    print(x['dict1'])
def run(self):
    super().run()
    spec_sign = "[@_!#$%^&*()<>?/\\|}{~:];'-"
    # init emoticon dict
    d = Dict()
    emot_dict = d.get_emoticon()
    # normalize words
    output = open(self.output, 'w+')
    with open(self.input, mode='r') as input:
        for line in input:
            words = line.split()
            i = 0
            while i < len(words):
                word = words[i]
                # remove emoticon spacing
                word_given = ''
                word_nospace = ''
                while word in spec_sign and len(word) == 1 and i < len(words):
                    word = words[i]
                    word_given += word + ' '
                    word_nospace += word
                    if word_nospace in emot_dict:
                        output.write(word_nospace + ' ')
                        word_given = ''
                        word_nospace = ''
                    i += 1
                if len(word_given) > 0:
                    output.write(word_given)
                else:
                    # question mark
                    if word[-1] == '?':
                        word = word[:-1] + ' ' + '?'
                    output.write(word + ' ')
                    i += 1
            output.write('\n')
    output.close()
def makeVocabulary(filename, size):
    vocab = Dict([
        Constants.PAD_WORD, Constants.UNK_WORD,
        Constants.BOS_WORD, Constants.EOS_WORD
    ])
    with open(filename) as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word.lower())  # Lowercase all words
    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print("Created dictionary of size %d (pruned from %d)" %
          (vocab.size(), originalSize))
    return vocab
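# Usage sketch for makeVocabulary(); the corpus path and the 50000-word cap are
# hypothetical, and Dict/Constants are assumed to come from the same package as
# the function above.
# vocab = makeVocabulary('data/train.tok.txt', 50000)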
def run(self):
    super().run()
    # get emoticon dict
    d = Dict()
    dict = d.get_emoticon()
    # replace emoticons in input file
    output = open(self.output, 'w+')
    with open(self.input, mode='r') as input:
        for line in input:
            for word in line.split():
                if word in dict:
                    output.write(dict[word] + ' ')
                else:
                    output.write(word + ' ')
            output.write('\n')
    output.close()
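# A tiny, self-contained illustration of the token-wise replacement performed
# above. The emoticon mapping here is made up for the example; the real mapping
# comes from Dict().get_emoticon().
emot = {':)': 'smile', ':(': 'sad'}
line = "great game :) terrible ending :("
print(' '.join(emot.get(w, w) for w in line.split()))
# -> great game smile terrible ending sad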
def run(self):
    super().run()
    # init English dict
    self.en_dict = enchant.Dict("en_US")
    d = Dict()
    self.slang_dict = d.get_slang()
    self.emoticon_dict = d.get_emoticon()
    # normalize words
    output = open(self.output, 'w+')
    with open(self.input, mode='r') as input:
        for line in input:
            for word in line.split():
                if not self.en_dict.check(word):
                    # Collapse runs of repeated characters before normalizing.
                    l = [''.join(g) for _, g in groupby(word)]
                    if len(l) <= 10:
                        word, _ = self.get_norm_string(l, 0)
                output.write(word + ' ')
            output.write('\n')
    output.close()
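# What the groupby step above produces: consecutive identical characters are
# grouped into runs, so elongated words can be shortened before spell-checking.
# The example word is made up; the <= 10 check above limits the number of runs.
from itertools import groupby

word = "cooooolll"
runs = [''.join(g) for _, g in groupby(word)]
print(runs)       # ['c', 'ooooo', 'lll']
print(len(runs))  # 3 runs, well under the 10-run limit used above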
def test_attr(self):
    d = Dict()
    d.key = 'value'
    self.assertTrue('key' in d)
    self.assertEqual(d['key'], 'value')
def test_key(self):
    d = Dict()
    d['key'] = 'value'
    self.assertEqual(d.key, 'value')
def test_e2c_words(capfd):
    # capfd is a built-in pytest fixture that captures output written to the
    # stdout/stderr file descriptors.
    Dict(['Test'])  # Invoke the program with the keyword to translate
    out, err = capfd.readouterr()  # Grab the captured stdout and stderr
    assert '测试' in out  # Check that the expected translation appears in the output
sendmessage('Start to capture')
# import pyscreenshot as ImageGrab

# 1. Screenshot
cmd = 'scrot -s -q 100 /tmp/foo.png ; xclip -selection c -t image/png < /tmp/foo.png'
os.system(cmd)

# 2. OCR
# Define config parameters.
# '-l eng' for using the English language
# '--oem 1' for using LSTM OCR Engine
config = ('-l eng --oem 1 --psm 3')
im = cv2.imread('/tmp/foo.png', cv2.IMREAD_COLOR)
# print(im.size)
# Run tesseract OCR on image
text = pytesseract.image_to_string(im, config=config)
if len(text) == 0:
    exit()
# print(text)
input_str = text.split()
# print(input_str)

# 3. Translation
dc = Dict(input_str)
result = dc.translate()
print(result)

# 4. Output Result
sendmessage(result)
cmd_add_to_a = 'echo ' + '\"' + result + '\"' + '| xclip'
# print(cmd_add_to_a)
os.system(cmd_add_to_a)
def test_c2e_sentences(capfd):
    Dict(['我爱你'])
    out, err = capfd.readouterr()
    assert 'I love you' in out
def test_c2e_words(capfd):
    Dict(['测试'])
    out, err = capfd.readouterr()
    assert 'Test' in out
from classParser import Parser
from dict import Dict
from classesWordRootType import Word, Root
from classValidator import Validator
import re

parser = Parser(Dict())
for i in range(1, 16):
    fileName = str(i) + '.txt'
    inFile = open(fileName, 'r', encoding='utf8').read()
    formula = parser.parse(inFile)
    print(i, inFile)
    print(formula.words)
    print(i, formula.get_schema())
    print(i, formula.types)
    print(i, formula.is_valid())
    print(i, formula.get_value())
    print(formula.words[1], formula.words[1].get_type())
    print(formula.words[1].get_type().get_correlation())
    print(formula.words[-1], formula.words[-1].get_type())
    print(formula.words[-1].get_type().get_correlation())
    print('\n')
def test_init(self):
    d = Dict(a=1, b='test')
    self.assertEqual(d.a, 1)
    self.assertEqual(d.b, 'test')
    self.assertTrue(isinstance(d, dict))
def test_keyerror(self):
    d = Dict()
    with self.assertRaises(KeyError):
        value = d['empty']
cfg.param_file = 'data/rmrb_ngram_changed.json' if cfg.use_re else 'data/rmrb_ngram_nochanged.json'
# Generate n-gram parameters
# param_file = 'data/rmrb_ngram_changed.json' if cfg.use_re else 'data/rmrb_ngram_nochanged.json'
print("Loading model parameters calculated from rmrb ... ")
if os.path.exists(cfg.param_file):
    f = open(cfg.param_file, 'r', encoding='utf-8')
    params = json.load(f)
    f.close()
else:
    params = get_ngram_prob(cfg)
    f = open(cfg.param_file, 'w', encoding='utf-8')
    json.dump(params, f)
    f.close()

test_targets, dicts = get_test_sets()
dicts = Dict(dicts, data_structure="set")

# Simple 2-gram model from rmrb-train
model_simple = HMM_word(params['p3'], '<BOS>', '<EOS>')
# Build an HMM model
model_hmm = HMM(params['p2'], params['p1'])

results = []
model_rmrb = model_simple if args.score == 'Markov' else model_hmm

print("Test on rmrb train subset")
for sen in tqdm(test_targets[:10000:100]):
    # Get candidates
    ori_sen = ''.join(sen)
    nums, words, cands = get_proposals(ori_sen, dicts, cfg)
def test_dev():
    # Load test set
    nlpcc_f = open('data/nlpcc2016-wordseg-dev.dat', 'r', encoding='utf-8')
    lines = nlpcc_f.readlines()
    lines = [changenum(line) for line in lines]
    lines = [line.strip().split() for line in lines]
    nlpcc_f.close()

    # Get dict from rmrb
    _, dicts = get_test_sets()
    dicts = Dict(dicts, data_structure="ac")  # or "set"

    # Model from rmrb
    cfg = Config()
    params = get_ngram_prob(cfg)

    print("Simple 2-gram model trained from rmrb, test on nlpcc-dev, with re-match")
    # Simple 2-gram model from rmrb-train
    model_rmrb = HMM_word(params['p3'], '<BOS>', '<EOS>')
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_rmrb.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # Simple n-gram model from weibo-train
    print("Simple 2-gram model from nlpcc-train, test on nlpcc-dev, with re-match")
    filename = 'weibo_model/nlpcc_train.replace-2gram'
    with open(filename, 'r', encoding='utf-8') as f:
        dict_lines = f.readlines()
        dict_lines = [l.strip().split('\t') for l in dict_lines]
    probs = {}
    for l in dict_lines:
        if len(l) < 2:
            continue
        probs[l[0]] = float(l[1])
    model_weibo_train = HMM_word(probs)

    # Test with simple 2-gram model
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_weibo_train.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # Load test set without number and English replacement
    nlpcc_f = open('data/nlpcc2016-wordseg-dev.dat', 'r', encoding='utf-8')
    lines = nlpcc_f.readlines()
    lines = [line.strip().split() for line in lines]
    nlpcc_f.close()

    # Simple n-gram model from weibo-train
    print("Simple 2-gram model from nlpcc-train, test on nlpcc-dev, without re-match")
    filename = 'weibo_model/nlpcc_train.mod-2gram'
    with open(filename, 'r', encoding='utf-8') as f:
        dict_lines = f.readlines()
        dict_lines = [l.strip().split('\t') for l in dict_lines]
    probs = {}
    for l in dict_lines:
        if len(l) < 2:
            continue
        probs[l[0]] = float(l[1])
    model_weibo_train = HMM_word(probs)

    # Test with simple 2-gram model
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_weibo_train.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # Model from rmrb
    cfg = Config()
    cfg.use_re = 0
    params = get_ngram_prob(cfg)

    # Simple 2-gram model from rmrb-train
    print("Simple 2-gram model from rmrb, test on nlpcc-dev, without re-match")
    model_rmrb = HMM_word(params['p3'], '<BOS>', '<EOS>')
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_rmrb.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)
def test_attrerror(self):
    d = Dict()
    with self.assertRaises(AttributeError):
        value = d.empty
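# A minimal sketch of a Dict class that would satisfy the attribute/key tests
# above (test_init, test_attr, test_key, test_keyerror, test_attrerror). This
# is an assumption about the class under test, not its actual source.
class Dict(dict):
    def __init__(self, **kw):
        super().__init__(**kw)

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError("'Dict' object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        self[key] = value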
def test_e2c_words(capfd):
    Dict(['Test'])
    out, err = capfd.readouterr()
    assert '测试' in out
def test_e2c_sentences(capfd):
    Dict(['I', 'Love', 'You'])
    out, err = capfd.readouterr()
    assert '我爱你' in out
        p2[keysN] = t_numN[keysN] / t_numN_[keysN_]

    # calc p3: p(w|w,..,w)
    for keysN in w_numN.keys():
        tmp = keysN.split()
        keysN_ = " ".join(tmp[:-1])
        p3[keysN] = w_numN[keysN] / w_numN_[keysN_]

    return {
        "p1": p1,
        "p2": p2,
        "p3": p3,
    }


def get_ngram_prob(cfg):
    lines = readfile(cfg)
    return calc_prob_fun(lines, cfg)


if __name__ == '__main__':
    cfg = Config()
    superline = readfile(cfg)
    # fs = get_prob_fun(superline, cfg)
    dict = Dict(["中共", "总书记"], "set")
    # s = "迈向,,,充满123希望的word新世纪,一九九八新年讲话。"
    # s = "刚刚看到的一段话:“你特别烦的时候先保持冷静或者看一部开心的电影者喝一大杯水不要试图跟朋友聊天朋友是跟你分享快乐的人而不是分享你痛苦的人不要做一个唠唠叨叨的抱怨者从现在起要学会自己去化解去承受”送给和我一样最近有点烦闷的人"
    s = "中共中央总书记、国家主席江泽民"
    # s = "你好吗"
    digit, english, pro = get_proposals(s, dict, cfg)
    print(pro)