def make_chains(in_string):
    """Make markov chains from text in in_string.

    Returns a dict mapping Bigram(w1, w2) -> list of words that can follow
    that pair in the source text. A None entry marks end-of-text.
    """
    chains = defaultdict(list)
    words = in_string.strip().split()
    # Guard: fewer than 3 words produces no trigrams (and the end-of-text
    # marker below would have no bigram to attach to) — return empty chains.
    if len(words) < 3:
        return chains
    for i in range(len(words) - 2):
        chains[Bigram(words[i], words[i + 1])].append(words[i + 2])
    # Add marker for end-of-text. This must happen ONCE, after the loop,
    # on the final bigram only — appending None inside the loop would let
    # generation terminate after any bigram, not just the last one.
    chains[Bigram(words[-2], words[-1])].append(None)
    return chains
def __init__(self, lambda_1=0.67):
    """Build an interpolated model from a unigram and a bigram component.

    lambda_1 weights the unigram term; the bigram weight is its complement.
    """
    # Interpolation weights: the two must sum to 1.
    self.lambda_1 = lambda_1
    self.lambda_2 = 1 - lambda_1
    self.unigram = Unigram()
    self.bigram = Bigram()
    # just needed for languageModel.py to work
    self.word_dict = self.bigram.word_dict
def __init__(self):
    """Initialize an empty document store with lazily-built derived data."""
    self.docs = {}            # document storage, keyed per caller's scheme
    self.index = {}           # term index over self.docs
    self.vecs = None          # vector representations — built on demand
    self.consts = None        # per-doc constants — built on demand
    self.modified = False     # dirty flag: set when docs change after indexing
    self.bigram_index = Bigram()
def __init__(self):
    """Set up the three n-gram component models and their mixture weights."""
    self.unigram_model = Unigram()
    self.bigram_model = Bigram()
    self.trigram_model = Trigram()
    # Interpolation weights (sum to 1); the trigram term dominates.
    self.unigram_lambda = 0.25
    self.bigram_lambda = 0.25
    self.trigram_lambda = 0.5
def main_bigramTrain(options, input):
    """Train a bigram tag model over the sentences in *input* and save it.

    Reads the tag field configured in options.tagField from every token,
    feeds each sentence's tag sequence to the model, then writes the
    counted model to options.bigramModelFile.
    """
    model = Bigram(1e-15)  # tiny floor probability for unseen transitions
    for sentence, _ in sentenceIterator(input):
        model.obsSequence([token[options.tagField] for token in sentence])
    model.count()
    model.writeToFile(options.bigramModelFile)
def main():
    """Run the unigram and bigram pipelines over the same raw corpus."""
    corpus = get_data()
    # Same construct-then-run protocol for each model, in order.
    for model_cls in (Unigram, Bigram):
        model_cls(corpus).main()
def main():
    """Train a bigram model and print three smoothed probabilities for argv[1]."""
    model = Bigram()
    model.train()
    query = sys.argv[1]
    print(query)
    unsmooth, laplace, good_turing = model.test(query)
    # Each probability printed under its banner, to 60 decimal places.
    for banner, prob in (
        ("------Unsmooth Probability---------", unsmooth),
        ("------Laplace Smooth Prob---------", laplace),
        ("------Good Turing Prob---------", good_turing),
    ):
        print(banner)
        print('{:.60f}'.format(prob))
def make_text(chains):
    """Generate markov-chain-generated text from chains."""
    # Seed with a random bigram key; generation ends at a None marker.
    current = choice(list(chains))
    print(current, end=' ')
    while (next_word := choice(chains[current])) is not None:
        print(next_word, end=' ')
        # Slide the window: second word of the pair plus the new word.
        current = Bigram(current.word2, next_word)
    print()
def get_bigrams(self):
    """Return a Bigram object for every adjacent word pair in this document."""
    return [Bigram(self.doc_id, pair)
            for pair in nltk.bigrams(self.low_case_words_list)]
from hmm import HMM
from bigram import Bigram
import time
import re
import codecs

if __name__ == "__main__":
    print('将对配置文件config.py中TESTFILE=' + TESTFILE + '进行分词...')
    start = time.time()
    # Test corpus is GBK-encoded with CRLF line endings.
    with codecs.open(TESTFILE, 'r', 'gbk') as f:
        lines = f.read().split('\r\n')
    bigram = Bigram()
    lines_segs = []
    # Segment the text line by line.
    for i, line in enumerate(lines):
        # Non-empty lines go through the bigram segmentation model;
        # empty lines pass through unchanged.
        segmented = bigram.cut(line) if line != '' else line
        lines_segs.append(segmented)
        # Report progress and elapsed time every 1000 lines.
        if i % 1000 == 0:
            print(str(i) + '/' + str(len(lines)), time.time() - start)
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

# Config line: vocabulary choice, n-gram order, smoothing factor, file paths.
inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]), int(inputs[1]),
                                            float(inputs[2]), inputs[3],
                                            inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # FIX: the original did `BYOM = BYOM(...)`, rebinding the BYOM class
    # name to its own instance (class-name shadowing). Use a distinct
    # name for the instance so the class remains usable afterwards.
    byom_model = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    byom_model.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    TRIGRAM.execute()
t2 = time()
print(f"execution time: {t2 - t1}s")
def __init__(self):
    """Create the unigram and bigram component models."""
    self.unigram = Unigram()
    self.bigram = Bigram()
def __init__(self):
    """Load model parameters, then build the bigram component.

    NOTE(review): load_param() presumably populates state the Bigram
    model or later calls depend on — keep it first; confirm before
    reordering.
    """
    self.load_param()
    self.bigram = Bigram()
def __init__(self):
    """Build the unigram/bigram pair and set their mixing coefficient."""
    # Relative weight of the unigram score against the bigram score.
    self.coef = 0.5
    self.unigram = Unigram()
    self.bigram = Bigram()
    print("W(bigram):W(unigram) coefficient is 1 :", self.coef)
# Sweep the train/test split ratio and record each classifier's accuracy.
ratios = np.arange(0.05, 1.05, 0.05)
unigram_accuracies = []
tfidf_accuracies = []
bigram_accuracies = []
for r in ratios:
    # Train all three perceptron variants at this ratio, in the same
    # order as before so the printed progress log is unchanged.
    for label, cls, acc_list in (
        ("unigram_perceptron", Unigram, unigram_accuracies),
        ("tfidf_perceptron", Tfidf, tfidf_accuracies),
        ("bigram_perceptron", Bigram, bigram_accuracies),
    ):
        accuracy = cls(train_ratio=r).accuracy
        acc_list.append(accuracy)
        print(r, label, accuracy)

# FIX: the original passed open(..., "wb") directly to pickle.dump,
# never closing the handles; use context managers so files are closed
# (and flushed) deterministically.
for fname, accs in (
    ("unigram_accuracies.pkl", unigram_accuracies),
    ("tfidf_accuracies.pkl", tfidf_accuracies),
    ("bigram_accuracies.pkl", bigram_accuracies),
):
    with open(fname, "wb") as fh:
        pickle.dump(accs, fh)
# unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
# tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
# bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))

fig = plt.figure()
ax1 = fig.add_subplot(111)
# presumably the full corpus is 1,000,000 samples — ratio scales to a
# sample count for the x-axis; TODO confirm against the data loader.
num_samples = ratios * 1000000
ax1.scatter(num_samples, unigram_accuracies, c='b', label='Unigrams')