Example #1
    def __init__(self, f):
        Dict.__init__(self)

        self.mp3File = MP3(f, ID3=EasyID3)

        self['fileName'] = f
        self['srcName'] = f
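The snippet above wraps a mutagen MP3 object. As a minimal sketch of how the MP3/EasyID3 combination is typically used on its own (the path is a placeholder):

from mutagen.mp3 import MP3
from mutagen.easyid3 import EasyID3

audio = MP3("song.mp3", ID3=EasyID3)  # "song.mp3" is a placeholder path
print(audio.info.length)              # track length in seconds
audio["title"] = ["New Title"]        # EasyID3 values are lists of strings
print(audio["title"])
audio.save()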
Example #3
def test_article(article, svm, pos_dict, neg_dict):
    # Word segmentation on the raw article
    file_context = str(article)
    seg_list = list(jieba.cut(file_context.strip(), cut_all=False))
    test_data = seg_article(article, seg_list)

    # Pre-judge based on the dictionary
    word_dict = Dict(file_context, seg_list)
    factor = word_dict.calculate_factor(test_data, pos_dict, neg_dict)

    # SVM prediction
    result = []
    for each in test_data:
        if each != '':
            result.append(svm.predict(each))

    # Combine the two scores (weighted average)
    polar = np.mean(result)
    final_score = 0.7 * polar + 0.3 * factor
    if final_score < 0.5:
        return '-1'
    elif final_score == 0.5:
        return '0'
    else:
        return '1'
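One caveat in the snippet above: jieba.cut returns a generator, so materializing it with list() (as done here) is what lets seg_list be consumed by both seg_article and Dict. A small illustration:

import jieba

seg = jieba.cut("这是一个测试", cut_all=False)  # returns a generator
tokens = list(seg)   # materialize once so the tokens can be reused
print(tokens)        # e.g. ['这是', '一个', '测试']
print(list(seg))     # [] -- the generator is already exhausted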
Example #4
    def run(self):
        super().run()

        # split input file into num_core many files
        self.prep_input()

        # get slang, stop words and emoticon dicts
        # NOTE: For now, we load these dicts here (shared between threads),
        # but we load one enchant dict per thread, for concurrency reasons.
        # We could also load these dicts once per thread, but that would
        # require some adjustments.
        d = Dict()
        slang_dict = d.get_slang()
        stop_words = d.get_stopwords()
        emoji_dict = d.get_emoticon()

        # process input files
        ts = [
            threading.Thread(target=self.checker,
                             args=(i, slang_dict, stop_words, emoji_dict))
            for i in range(self.cores)
        ]

        for t in ts:
            t.start()

        for t in ts:
            t.join()

        # merge num_core output files into one, delete the split files
        self.merge_and_delete()
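The pattern in run() — spawn one thread per core, share read-only dicts, join, then merge — reduces to a small self-contained sketch (the worker logic and names here are illustrative, not the project's checker):

import threading

def worker(idx, shared, results):
    # Threads only read the shared dict and write to their own slot,
    # so no lock is required.
    results[idx] = sum(shared.values()) + idx

shared = {"a": 1, "b": 2}   # stands in for the slang/stop-word/emoticon dicts
cores = 4
results = [None] * cores
threads = [threading.Thread(target=worker, args=(i, shared, results))
           for i in range(cores)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)  # [3, 4, 5, 6]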
Example #5
    def run(self):
        super().run()

        dicts = Dict()
        slang_dict = dicts.get_slang()
        stop_words = dicts.get_stopwords()
        emoji_dict = dicts.get_emoticon()
        d = enchant.Dict("en_US")

        self.prep_input()

        # dictionary defined in MMST __init__
        share = floor(self.nb / self.cores)

        ts = [
            threading.Thread(target=self.checker,
                             args=(i, d, slang_dict, stop_words, emoji_dict))
            for i in range(self.cores)
        ]

        for t in ts:
            t.start()

        for t in ts:
            t.join()

        print("merging")

        self.merge_outputs()
Example #6
    def load_db(self, SQLiteDB):
        if self.path1 == ".":
            return False
        # print(self.path + SQLiteDB)
        # Always initiate the Dict object first, then the Word object
        self.dict_obj = Dict(self.path1 + SQLiteDB)
        self.word_obj = Word()
Example #7
def dict_test():
    x = Dict()
    x['dict1'] = {'1': 1, '2': 2, '3': 3}
    # x['dict1','5']='5'
    # del x['dict1','5']
    # for i in x.ergodic('dict1'):
    #     print(i)
    print(x['dict1'])
Example #8
    def run(self):
        super().run()

        spec_sign = r"[@_!#$%^&*()<>?/\|}{~:];'-"

        # init emoticon dict
        d = Dict()
        emot_dict = d.get_emoticon()

        # normalize words
        output = open(self.output, 'w+')
        with open(self.input, mode='r') as input:
            for line in input:
                words = line.split()

                i = 0
                while i < len(words):
                    word = words[i]

                    # remove emoticon spacing
                    word_given = ''
                    word_nospace = ''
                    while word in spec_sign and len(
                            word) == 1 and i < len(words):
                        word = words[i]
                        word_given += word + ' '
                        word_nospace += word
                        if word_nospace in emot_dict:
                            output.write(word_nospace + ' ')
                            word_given = ''
                            word_nospace = ''
                        i += 1

                    if len(word_given) > 0:
                        output.write(word_given)
                    else:
                        # question mark
                        if word[-1] == '?':
                            word = word[:-1] + ' ' + '?'

                        output.write(word + ' ')
                        i += 1

                output.write('\n')

        output.close()
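The inner while loop above re-joins emoticons that were split into single characters. A self-contained re-implementation sketch of the same idea (the emot_dict entry is hypothetical; the real one comes from Dict().get_emoticon()):

emot_dict = {':)': ':)'}  # hypothetical entry

def join_spaced_emoticons(line, emot_dict,
                          spec_sign=r"[@_!#$%^&*()<>?/\|}{~:];'-"):
    # Collapse a run of single special characters into one token when the
    # joined run is a known emoticon, mirroring the loop above.
    out, run = [], ''
    for tok in line.split():
        if len(tok) == 1 and tok in spec_sign:
            run += tok
            if run in emot_dict:
                out.append(run)
                run = ''
        else:
            if run:                      # flush an unmatched run as-is
                out.append(' '.join(run))
                run = ''
            out.append(tok)
    if run:
        out.append(' '.join(run))
    return ' '.join(out)

print(join_spaced_emoticons("nice : ) thanks", emot_dict))  # nice :) thanks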
Example #9
def makeVocabulary(filename, size):
    vocab = Dict([
        Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
        Constants.EOS_WORD
    ])

    with open(filename) as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word.lower())  # Lowercase all words

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print("Created dictionary of size %d (pruned from %d)" %
          (vocab.size(), originalSize))

    return vocab
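The Dict used here looks like an OpenNMT-style vocabulary class rather than the attribute-dict seen elsewhere on this page. A rough sketch of just the interface makeVocabulary relies on (add, size, prune); the real class also tracks indices and preserves special tokens, which this sketch omits:

from collections import Counter

class Dict:
    def __init__(self, special=()):
        self.labels = []
        self.freq = Counter()
        for tok in special:
            self.add(tok)

    def add(self, label):
        if label not in self.freq:
            self.labels.append(label)
        self.freq[label] += 1

    def size(self):
        return len(self.labels)

    def prune(self, size):
        # Keep the `size` most frequent labels in a new Dict.
        pruned = Dict()
        for label, _ in self.freq.most_common(size):
            pruned.add(label)
        return pruned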
Example #10
    def run(self):
        super().run()

        # get emoticon dict
        d = Dict()
        emot_dict = d.get_emoticon()

        # replace emoticons in input file
        output = open(self.output, 'w+')
        with open(self.input, mode='r') as input:
            for line in input:
                for word in line.split():
                    if word in emot_dict:
                        output.write(emot_dict[word] + ' ')
                    else:
                        output.write(word + ' ')

                output.write('\n')

        output.close()
Example #11
    def run(self):
        super().run()

        # init english dict
        self.en_dict = enchant.Dict("en_US")
        d = Dict()
        self.slang_dict = d.get_slang()
        self.emoticon_dict = d.get_emoticon()

        # normalize words
        output = open(self.output, 'w+')
        with open(self.input, mode='r') as input:
            for line in input:
                for word in line.split():
                    if not self.en_dict.check(word):
                        l = [''.join(g) for _, g in groupby(word)]
                        if len(l) <= 10:
                            word, _ = self.get_norm_string(l, 0)

                    output.write(word + ' ')

                output.write('\n')

        output.close()
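The groupby expression above splits a word into runs of repeated characters, which the project's get_norm_string helper presumably searches over; the len(l) <= 10 guard bounds that search. For example:

from itertools import groupby

runs = [''.join(g) for _, g in groupby("coooolll")]
print(runs)  # ['c', 'oooo', 'lll'] -- one entry per run of equal letters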
Example #12
    def test_attr(self):
        d = Dict()
        d.key = 'value'
        self.assertTrue('key' in d)
        self.assertEqual(d['key'], 'value')
Example #13
    def test_key(self):
        d = Dict()
        d['key'] = 'value'
        self.assertEqual(d.key, 'value')
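Examples #12, #13, #19, #20 and #23 on this page all exercise the same attribute-access dict. A minimal sketch that satisfies those tests (the real class may differ in details):

class Dict(dict):
    def __init__(self, **kw):
        super().__init__(**kw)

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            # missing attribute access must raise AttributeError, not KeyError
            raise AttributeError("'Dict' object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        self[key] = value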
Example #14
def test_e2c_words(capfd):  # capfd is a built-in pytest fixture that captures output at the file-descriptor level
    Dict(['Test'])  # invoke the translator with the word to look up
    out, err = capfd.readouterr()  # readouterr() returns the captured stdout and stderr
    assert '测试' in out  # check that the expected translation appears in the output
Example #15
sendmessage('Start to capture')
# import pyscreenshot as ImageGrab
# 1. Screenshot
cmd = 'scrot -s -q 100 /tmp/foo.png ; xclip -selection c -t image/png < /tmp/foo.png'
os.system(cmd)
# 2. OCR
# Define config parameters.
# '-l eng'  for using the English language
# '--oem 1' for using LSTM OCR Engine
config = '-l eng --oem 1 --psm 3'
im = cv2.imread('/tmp/foo.png', cv2.IMREAD_COLOR)
# print(im.size)
# Run tesseract OCR on image
text = pytesseract.image_to_string(im, config=config)
if len(text) == 0:
    exit()
# print(text)
input_str = text.split()
# print(input_str)
# 3. Translation
dc = Dict(input_str)
result = dc.translate()
print(result)

# 4. Output Result
sendmessage(result)
cmd_add_to_a = 'echo "' + result + '" | xclip'
# print(cmd_add_to_a)
os.system(cmd_add_to_a)
Example #16
def test_c2e_sentences(capfd):
    Dict(['我爱你'])
    out, err = capfd.readouterr()
    assert 'I love you' in out
Example #17
def test_c2e_words(capfd):
    Dict(['测试'])
    out, err = capfd.readouterr()
    assert 'Test' in out
Example #18
from classParser import Parser
from dict import Dict
from classesWordRootType import Word, Root
from classValidator import Validator
import re

parser = Parser(Dict())
for i in range(1, 16):
    fileName = str(i) + '.txt'
    with open(fileName, 'r', encoding='utf8') as f:
        inFile = f.read()
    formula = parser.parse(inFile)
    print(i, inFile)
    print(formula.words)
    print(i, formula.get_schema())
    print(i, formula.types)
    print(i, formula.is_valid())
    print(i, formula.get_value())
    print(formula.words[1], formula.words[1].get_type())
    print(formula.words[1].get_type().get_correlation())
    print(formula.words[-1], formula.words[-1].get_type())
    print(formula.words[-1].get_type().get_correlation())
    print('\n')
Example #19
    def test_init(self):
        d = Dict(a=1, b='test')
        self.assertEqual(d.a, 1)
        self.assertEqual(d.b, 'test')
        self.assertTrue(isinstance(d, dict))
Example #20
    def test_keyerror(self):
        d = Dict()
        with self.assertRaises(KeyError):
            value = d['empty']
Example #21
    cfg.param_file = 'data/rmrb_ngram_changed.json' if cfg.use_re else 'data/rmrb_ngram_nochanged.json'
    # Generate n-gram parameters
    # param_file = 'data/rmrb_ngram_changed.json' if cfg.use_re else 'data/rmrb_ngram_nochanged.json'
    print("Loading model parameters calculated from rmrb ... ")
    if os.path.exists(cfg.param_file):
        with open(cfg.param_file, 'r', encoding='utf-8') as f:
            params = json.load(f)
    else:
        params = get_ngram_prob(cfg)
        with open(cfg.param_file, 'w', encoding='utf-8') as f:
            json.dump(params, f)
    test_targets, dicts = get_test_sets()

    dicts = Dict(dicts, data_structure="set")

    # Simple 2-gram model from rmrb-train
    model_simple = HMM_word(params['p3'], '<BOS>', '<EOS>')

    # Build an HMM model
    model_hmm = HMM(params['p2'], params['p1'])
    results = []

    model_rmrb = model_simple if args.score == 'Markov' else model_hmm

    print("Test on rmrb train subset")
    for sen in tqdm(test_targets[:10000:100]):
        # Get candidates
        ori_sen = ''.join(sen)
        nums, words, cands = get_proposals(ori_sen, dicts, cfg)
Example #22
def test_dev():
    # load test set
    nlpcc_f = open('data/nlpcc2016-wordseg-dev.dat', 'r', encoding='utf-8')
    lines = nlpcc_f.readlines()
    lines = [changenum(line) for line in lines]
    lines = [line.strip().split() for line in lines]
    nlpcc_f.close()

    # get dict from rmrb
    _, dicts = get_test_sets()
    dicts = Dict(dicts, data_structure="ac")  # or "set"

    # model from rmrb
    cfg = Config()
    params = get_ngram_prob(cfg)

    print(
        "Simple 2-gram model trained from rmrb, test on nlpcc-dev, with re-match"
    )
    # Simple 2-gram model from rmrb-train
    model_rmrb = HMM_word(params['p3'], '<BOS>', '<EOS>')
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_rmrb.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # Simple n-gram model from weibo-train
    print(
        "Simple 2-gram model from nlpcc-train, test on nlpcc-dev, with re-match"
    )
    filename = 'weibo_model/nlpcc_train.replace-2gram'
    with open(filename, 'r', encoding='utf-8') as f:
        dict_lines = f.readlines()
        dict_lines = [l.strip().split('\t') for l in dict_lines]
        probs = {}
        for l in dict_lines:
            if len(l) < 2:
                continue
            probs[l[0]] = float(l[1])
    model_weibo_train = HMM_word(probs)
    # Test with Simple 2-gram model
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_weibo_train.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # load test set without number and english replace
    nlpcc_f = open('data/nlpcc2016-wordseg-dev.dat', 'r', encoding='utf-8')
    lines = nlpcc_f.readlines()
    lines = [line.strip().split() for line in lines]
    nlpcc_f.close()
    # Simple n-gram model from weibo-train
    print(
        "Simple 2-gram model from nlpcc-train, test on nlpcc-dev, without re-match"
    )
    filename = 'weibo_model/nlpcc_train.mod-2gram'
    with open(filename, 'r', encoding='utf-8') as f:
        dict_lines = f.readlines()
        dict_lines = [l.strip().split('\t') for l in dict_lines]
        probs = {}
        for l in dict_lines:
            if len(l) < 2:
                continue
            probs[l[0]] = float(l[1])
    model_weibo_train = HMM_word(probs)

    # Test with Simple 2-gram model
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_weibo_train.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)

    # model from rmrb
    cfg = Config()
    cfg.use_re = 0
    params = get_ngram_prob(cfg)

    # Simple 2-gram model from rmrb-train
    print("Simple 2-gram model from rmrb, test on nlpcc-dev, without re-match")
    model_rmrb = HMM_word(params['p3'], '<BOS>', '<EOS>')
    results = []
    for line in tqdm(lines):
        ori_line = ''.join(line)
        res = model_rmrb.find(ori_line)
        results.append(res)
    evaluateSet(results, lines)
Example #23
    def test_attrerror(self):
        d = Dict()
        with self.assertRaises(AttributeError):
            value = d.empty
Example #25
def test_e2c_sentences(capfd):
    Dict(['I', 'Love', 'You'])
    out, err = capfd.readouterr()
    assert '我爱你' in out
Example #26
		p2[keysN] = t_numN[keysN] / t_numN_[keysN_]

	# calc p3: p(w|w,..,w)
	for keysN in w_numN.keys():
		tmp = keysN.split()
		keysN_ = " ".join(tmp[:-1])
		p3[keysN] = w_numN[keysN] / w_numN_[keysN_]

	return {
		"p1": p1,
		"p2": p2,
		"p3": p3,
	}

def get_ngram_prob(cfg):
	lines = readfile(cfg)
	return calc_prob_fun(lines, cfg)



if __name__ == '__main__':
	cfg = Config()
	superline = readfile(cfg)
	# fs = get_prob_fun(superline, cfg)
	dict = Dict(["中共","总书记"],"set")
	# s = "迈向,,,充满123希望的word新世纪,一九九八新年讲话。"
	# s = "刚刚看到的一段话:“你特别烦的时候先保持冷静或者看一部开心的电影者喝一大杯水不要试图跟朋友聊天朋友是跟你分享快乐的人而不是分享你痛苦的人不要做一个唠唠叨叨的抱怨者从现在起要学会自己去化解去承受”送给和我一样最近有点烦闷的人"
	s = "中共中央总书记、国家主席江泽民"
	# s  = "你好吗"
	digit, english, pro = get_proposals(s, d, cfg)
	print(pro)
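The p2/p3 tables above estimate conditional n-gram probabilities as count(full n-gram) / count(prefix). A tiny worked example with hypothetical counts:

# Hypothetical counts mirroring the w_numN / w_numN_ tables:
w_numN = {"中共 中央": 3}    # occurrences of the full bigram
w_numN_ = {"中共": 12}       # occurrences of the 1-gram prefix

key = "中共 中央"
prefix = " ".join(key.split()[:-1])
p = w_numN[key] / w_numN_[prefix]
print(p)  # 0.25 -- estimated p(中央 | 中共)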