コード例 #1
0
ファイル: markov.py プロジェクト: joelburton/pyplay
def make_chains(in_string):
    """Make markov chains from text in in_string.

    Returns a defaultdict mapping each Bigram of adjacent words to the
    list of words observed to follow that pair; the text's final bigram
    additionally maps to None as an end-of-text marker.
    """
    chains = defaultdict(list)
    words = in_string.strip().split()

    # Fewer than two words cannot form a bigram: return the empty map
    # instead of crashing. (The original relied on the loop variable `i`
    # leaking out of the for loop, which raises NameError whenever the
    # loop body never runs — i.e. for any text of fewer than 3 words.)
    if len(words) < 2:
        return chains

    for i in range(len(words) - 2):
        chains[Bigram(words[i], words[i + 1])].append(words[i + 2])

    # Add marker for end-of-text on the last bigram. Indexing from the
    # end also covers the 2-word case the original loop-variable trick
    # could not handle.
    chains[Bigram(words[-2], words[-1])].append(None)

    return chains
コード例 #2
0
 def __init__(self, lambda_1=0.67):
     """Interpolated language model mixing unigram and bigram scores.

     lambda_1 and lambda_2 = 1 - lambda_1 are the interpolation
     weights of the two component models.
     """
     self.unigram = Unigram()
     self.bigram = Bigram()
     # languageModel.py expects a ``word_dict`` attribute on this
     # object; forward the bigram model's dictionary under that name.
     self.word_dict = self.bigram.word_dict
     self.lambda_1, self.lambda_2 = lambda_1, 1 - lambda_1
コード例 #3
0
 def __init__(self):
     """Initialize an empty document store, term index and bigram index.

     vecs and consts start as None — presumably they are built lazily
     elsewhere; the modified flag tracks pending changes (confirm
     against the rest of the class).
     """
     self.docs = {}
     self.index = {}
     self.vecs = None
     self.consts = None
     self.modified = False
     self.bigram_index = Bigram()
コード例 #4
0
 def __init__(self):
     """Hold one model per n-gram order plus their interpolation weights."""
     self.unigram_model = Unigram()
     self.bigram_model = Bigram()
     self.trigram_model = Trigram()
     # Interpolation weights: the trigram gets half the mass and the
     # remainder is split evenly between the lower-order models.
     self.unigram_lambda = 0.25
     self.bigram_lambda = 0.25
     self.trigram_lambda = 0.5
コード例 #5
0
ファイル: huntag.py プロジェクト: zbxzc35/HunTag
def main_bigramTrain(options, input):
    """Accumulate tag-transition statistics from *input* and write the
    trained bigram model to options.bigramModelFile.

    Each sentence contributes the sequence of its tokens' tag fields.
    """
    model = Bigram(0.000000000000001)
    for sentence, _ in sentenceIterator(input):
        model.obsSequence([token[options.tagField] for token in sentence])
    model.count()
    model.writeToFile(options.bigramModelFile)
コード例 #6
0
def main():
    """Fetch the corpus once, then run the unigram and bigram
    pipelines over it in turn."""
    raw_data = get_data()
    for model_cls in (Unigram, Bigram):
        model_cls(raw_data).main()
コード例 #7
0
def main():
    """Train a bigram model and print three probability estimates for
    the file named by the first command-line argument."""
    model = Bigram()
    model.train()
    print(sys.argv[1])
    unsmooth, laplace, good_turing = model.test(sys.argv[1])
    # Banner strings are kept exactly as the original emitted them.
    results = (
        ("------Unsmooth Probability---------", unsmooth),
        ("------Laplace Smooth Prob---------", laplace),
        ("------Good Turing Prob---------", good_turing),
    )
    for banner, prob in results:
        print(banner)
        print('{:.60f}'.format(prob))
コード例 #8
0
ファイル: markov.py プロジェクト: joelburton/pyplay
def make_text(chains):
    """Generate markov-chain-generated text from chains.

    Starts from a random bigram and keeps printing sampled successor
    words until the end-of-text marker (None) is drawn.
    """
    current = choice(list(chains))
    print(current, end=' ')

    follower = choice(chains[current])
    while follower is not None:
        print(follower, end=' ')
        current = Bigram(current.word2, follower)
        follower = choice(chains[current])

    print()
コード例 #9
0
 def get_bigrams(self):
     """Return a list with one Bigram per adjacent word pair in this
     document's lower-cased token list, each tagged with the doc id."""
     return [Bigram(self.doc_id, pair)
             for pair in nltk.bigrams(self.low_case_words_list)]
コード例 #10
0
from hmm import HMM
from bigram import Bigram
import time
import re
import codecs

# Script entry point: segment TESTFILE with the bigram model, one line
# at a time, reporting progress every 1,000 lines.
# NOTE(review): TESTFILE is not imported in the visible import block —
# presumably it comes from config.py; confirm the import exists
# elsewhere in the file.
if __name__ == "__main__":

    print('将对配置文件config.py中TESTFILE=' + TESTFILE + '进行分词...')
    start = time.time()

    # The corpus is GBK-encoded with Windows (CRLF) line endings.
    with codecs.open(TESTFILE, 'r', 'gbk') as f:
        text = f.read()
    lines = text.split('\r\n')

    bigram = Bigram()
    lines_segs = []
    # Segment the text line by line.
    for i, line in enumerate(lines):
        line_segs = []
        if line != '':
            # Feed this line to the bigram segmentation model.
            line_segs = bigram.cut(line)
        else:
            # NOTE(review): an empty line is stored as the empty string
            # '' rather than an empty list — confirm downstream
            # consumers handle both shapes.
            line_segs = line

        lines_segs.append(line_segs)
        # Print elapsed time every 1,000 lines (original comment had a
        # typo: 没 for 每).
        if i % 1000 == 0:
            print(str(i) + '/' + str(len(lines)), time.time() - start)
コード例 #11
0
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

# Read the run parameters from the first line of input.txt:
# V and N select the model, S_FACTOR is the smoothing delta, then the
# training and test file names. (Exact semantics of V come from the
# assignment spec — V == 3 selects the build-your-own-model path.)
inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]), int(inputs[1]),
                                            float(inputs[2]), inputs[3],
                                            inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # Bind the instance to its own name instead of clobbering the BYOM
    # class: the original `BYOM = BYOM(...)` rebinding made the class
    # unreachable afterwards.
    byom = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    byom.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    TRIGRAM.execute()
t2 = time()

print(f"execution time: {t2 - t1}s")
コード例 #12
0
 def __init__(self):
     """Create the component unigram and bigram models."""
     self.unigram = Unigram()
     self.bigram = Bigram()
コード例 #13
0
ファイル: hmm.py プロジェクト: iseesaw/Pinyin2ChineseChars
 def __init__(self):
     """Load persisted model parameters and attach a bigram model."""
     # NOTE(review): load_param presumably reads parameter tables from
     # disk before the bigram model is built — confirm against hmm.py;
     # the call order is preserved in case it matters.
     self.load_param()
     self.bigram = Bigram()
 def __init__(self):
     """Combine unigram and bigram scores with a fixed coefficient."""
     self.unigram = Unigram()
     self.bigram = Bigram()
     self.coef = 0.5
     # Announce the bigram:unigram weighting (1 : coef) at startup.
     print("W(bigram):W(unigram) coefficient is 1 :", self.coef)
コード例 #15
0
ファイル: problem5.py プロジェクト: samanthaks/ML
    ratios = np.arange(0.05, 1.05, 0.05)
    unigram_accuracies = []
    tfidf_accuracies = []
    bigram_accuracies = []
    for r in ratios:
        unigram_perceptron = Unigram(train_ratio=r)
        unigram_accuracy = unigram_perceptron.accuracy
        unigram_accuracies.append(unigram_accuracy)
        print(r, "unigram_perceptron", unigram_accuracy)

        tfidf_perceptron = Tfidf(train_ratio=r)
        tfidf_accuracy = tfidf_perceptron.accuracy
        tfidf_accuracies.append(tfidf_accuracy)
        print(r, "tfidf_perceptron", tfidf_accuracy)

        bigram_perceptron = Bigram(train_ratio=r)
        bigram_accuracy = bigram_perceptron.accuracy
        bigram_accuracies.append(bigram_accuracy)
        print(r, "bigram_perceptron", bigram_accuracy)

    pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
    pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb"))
    pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb"))
    # unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
    # tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
    # bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    num_samples = ratios * 1000000
    ax1.scatter(num_samples, unigram_accuracies, c='b', label='Unigrams')