def get_test_feature(text_file, standardize_file, output_file):
    param = json.loads(open(standardize_file, 'r').read())
    text = load_text(text_file)
    labels = [0] * len(text)
    feat = [({'name': 'label', 'type': '{0, 1}'}, labels)]
    ''' Call functions to extract features and add to data. '''
    feat += avg_sen_len(text)
    feat += w2v_sim(text)
    feat += ngram(text, './models/3.binlm', 'tri')
    feat += ngram(text, './models/4.binlm', 'quad')
    # text = load_raw_text(text_file)
    # pos_feat = extract_pos_feat(text)
    # with open('temp_pos', 'w') as f:
    #     for line in pos_feat:
    #         f.write(line)
    # pos_labels = load_text('temp_pos')
    # feat += ngram(pos_labels, './models/pos3.binlm', 'pos-tri')
    # feat += ngram(pos_labels, './models/pos4.binlm', 'pos-quad')
    ''' Output the libsvm file. '''
    arff_dump('temp_feat.arff', feat, param=param)
    data, label = load_arff('temp_feat.arff')
    to_libsvm(data, label, output_file)
def run(train, nmax, reps, out):
    # Get probabilities with arbitrary precision
    fh = open(train)
    ngram.set_fractions(True)
    probs_ap = ngram.probabilities(
        ngram.good_turing(ngram.ngram(nmax, filters.unk(filters.shakespeare(fh)))))
    # Get probabilities with logs
    fh = open(train)
    ngram.set_fractions(False)
    probs_log = ngram.probabilities(
        ngram.good_turing(ngram.ngram(nmax, filters.unk(filters.shakespeare(fh)))))
    # Make sentences
    sentence_generation(train, out, nmax, reps, probs_ap, probs_log)
def ngram_plot(values, top=10):
    """
    Plot an ngram
    :param values: Is either a string or an array containing strings
    :param top: Is the number of results you want returned (optional, defaults to 10)
    :return: A matplotlib plot of the actual n-gram
    e.g. ngram_plot(A1:10)
    """
    counts = ngram(values)
    max_freq = counts[0][1]  # the max frequency is the value of the first word
    words = []
    freq = []
    for key, val in counts:
        # add each word and its frequency to the bar chart, stopping after `top` entries
        words.append(key)
        y_pos = np.arange(len(words))
        freq.append(val)
        plt.bar(y_pos, freq, align='center')
        plt.xticks(y_pos, words)
        plt.ylabel('Frequency')
        plt.title('Common Ngrams')
        top -= 1
        if top == 0:
            break
    return plt
def add_model(self, model_name):
    self.model_results[model_name] = {}
    #self.model_results[model_name]["batch count"] = 0
    self.model_results[model_name]["count"] = 0
    self.model_results[model_name]["BLEU_count"] = 0
    self.model_results[model_name]["BLEU_count_arr"] = []
    self.model_results[model_name]["BLEU"] = 0
    self.model_results[model_name]["BLEU_arr"] = []
    self.model_results[model_name]["context_len_arr"] = []
    self.model_results[model_name]["context BLEU"] = 0
    self.model_results[model_name]["gen_list_temp"] = []
    self.model_results[model_name]["self BLEU count"] = 0
    self.model_results[model_name]["self BLEU"] = 0
    self.model_results[model_name]["self BLEU arr"] = []
    self.model_results[model_name]["self BLEU count arr"] = []
    self.model_results[model_name]["perplexity"] = 0
    self.model_results[model_name]["ngram"] = ngram()
    self.model_results[model_name]["ngram count"] = 0
    self.model_results[model_name]["ngram count arr"] = []
    self.model_results[model_name]["unigram"] = 0
    self.model_results[model_name]["bigram"] = 0
    self.model_results[model_name]["unigram arr"] = []
    self.model_results[model_name]["bigram arr"] = []
    self.model_results[model_name]["time_sum"] = 0
    self.model_results[model_name]["time_count"] = 0
def checkPlagiarism(file):
    isPlagiarized = False
    grams = fileOpen(file)
    ngrams = ngram(grams, 9)
    ngrams = [' '.join(i) for i in ngrams]
    for i in range(len(ngrams)):
        driver = webdriver.Firefox()
        toSearch = ngrams[i].encode('utf-8')
        driver.get("http://google.com")
        search = driver.find_element_by_name('q')
        search.send_keys(ngrams[i])
        search.send_keys(Keys.RETURN)
        googleResult = googleSearch(toSearch)
        search = driver.find_element_by_name('q')
        search.send_keys(ngrams[i])
        search.send_keys(Keys.RETURN)
        for result in googleResult:
            similarity = getSimilarity(toSearch, strip_tags(result.description))
            if similarity >= 70:
                print("This file was plagiarized!")
                isPlagiarized = True
                driver.quit()
                return isPlagiarized
        driver.quit()
    print("This file is original with no evidence of plagiarism.")
    return isPlagiarized
def add_model(self, model_name):
    self.model_results[model_name] = {}
    self.model_results[model_name]["batch count"] = 0
    self.model_results[model_name]["count"] = 0
    self.model_results[model_name]["BLEU_count"] = 0
    self.model_results[model_name]["BLEU_count_arr"] = []
    self.model_results[model_name]["BLEU"] = 0
    self.model_results[model_name]["BLEU_arr"] = []
    self.model_results[model_name]["context_len_arr"] = []
    self.model_results[model_name]["context BLEU"] = 0
    self.model_results[model_name]["gen_list_temp"] = []
    self.model_results[model_name]["self BLEU count"] = 0
    self.model_results[model_name]["self BLEU"] = 0
    self.model_results[model_name]["token hit"] = 0
    self.model_results[model_name]["word type hit"] = 0
    self.model_results[model_name]["topic hit"] = 0
    self.model_results[model_name]["exact token hit"] = 0
    self.model_results[model_name]["exact word type hit"] = 0
    self.model_results[model_name]["exact topic hit"] = 0
    self.model_results[model_name]["perplexity"] = 0
    self.model_results[model_name]["ngram"] = ngram()
    self.model_results[model_name]["unigram"] = 0
    self.model_results[model_name]["bigram"] = 0
    self.model_results[model_name]["time_sum"] = 0
    self.model_results[model_name]["time_count"] = 0
def renew_ngram(self):
    for model_name, model in self.model_results.items():
        model["batch count"] += 1
        unigram, bigram = model["ngram"].diversity_n()
        #print(model["ngram"].diversity_n())
        model["unigram"] += unigram
        model["bigram"] += bigram
        model["ngram"] = ngram()
def main():
    import ngram

    sentence = 'I am an NLPer'
    word_2gram = ngram.word_ngram(sentence, 2)
    char_2gram = ngram.ngram(sentence, 2)
    print(word_2gram)
    print(char_2gram)
def renew_ngram(self, head_idx):
    for model_name, model in self.model_results.items():
        model["ngram count"] += 1
        model["ngram count arr"][head_idx] += 1
        unigram, bigram = model["ngram"].diversity_n()
        #print(model["ngram"].diversity_n())
        model["unigram"] += unigram
        model["bigram"] += bigram
        model["unigram arr"][head_idx] += unigram
        model["bigram arr"][head_idx] += bigram
        model["ngram"] = ngram()
def getwords(doc):
    # Candidate category keywords (in Chinese): pre-pregnancy, pregnancy, postpartum,
    # fever, prenatal education, mother and fetus, disease, nutrition, nursing,
    # pregnant women, expectant mothers, childbirth.
    menus = [
        "备孕", "怀孕", "产后",
        "发烧", "胎教", "母胎",
        "疾病", "营养", "护理",
        "疾病", "孕妇", "孕妈",
        "生育",
    ]
    tg = ngram(menus, min_sim=0.0)
    words = tg.getSimilarStrings(doc.encode("utf8")).keys()
    return dict([(w, 1) for w in words])
def main():
    import ngram

    sentence1 = 'paraparaparadise'
    sentence2 = 'paragraph'
    X = set(ngram.ngram(sentence1, 2))
    Y = set(ngram.ngram(sentence2, 2))
    XYunion = X | Y
    XYintersection = X & Y
    XYdifference = X - Y
    print(X)
    print(Y)
    print(XYunion)
    print(XYintersection)
    print(XYdifference)
    if 'se' in X and 'se' in Y:
        print("'se' is in X and Y")
    else:
        print("'se' is not in X and Y")
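# A minimal self-contained check of the exercise above, assuming plain
# character bigrams (the char_ngram helper here is hypothetical, not the
# ngram module used above). It shows the expected results: the intersection
# is {'pa', 'ar', 'ra', 'ap'}, X - Y is {'ad', 'di', 'is', 'se'}, and 'se'
# appears in X but not in Y.
def char_ngram(text, n):
    return [text[i:i + n] for i in range(len(text) - n + 1)]

X = set(char_ngram('paraparaparadise', 2))
Y = set(char_ngram('paragraph', 2))
print(X & Y)                 # {'pa', 'ar', 'ra', 'ap'}
print(X - Y)                 # {'ad', 'di', 'is', 'se'}
print('se' in X, 'se' in Y)  # True False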
def makengramtext(n, fname):
    fngram = fname[:-4] + '_{}-gram.txt'.format(n)
    inittext(fngram)
    with open(fname, mode='r', encoding='utf-8') as f:
        for line in tqdm.tqdm(f):
            line = line.rstrip()
            line = re.sub(' ', 'R', line)
            ngram_list = ngram.ngram(line, n)
            with open(fngram, mode='a') as fn:
                fn.write(' '.join(ngram_list) + '\n')
def compareAuthors(authors, compareDict, tg_dict):
    # this dict contains how many texts (that we are comparing against the corpus) each author has written
    authorMade = {}
    # The dict that contains which authors have had their text attributed to whom
    resultDict = {}
    for value in compareDict:
        textToCompare = value["text"]
        realAuthor = value["user_id"]
        # Compare value
        compareTo = [textToCompare]
        com = ngram.ngram(compareTo)
        (gram, workList) = com.total_ngram(compareTo)
        dataDist = {}
        # We do the actual testing
        for author in authors:
            sum = 0.0
            tg = tg_dict[author]
            for word in workList:
                sum += tg.propability(word, 0)
            dataDist[author] = sum
        value = -1
        for an in dataDist.keys():
            newValue = dataDist[an]
            if newValue > value:
                value = newValue
                author = an
        print "Real author:", realAuthor
        print "Most likely author:", author
        if not authorMade.has_key(author):
            authorMade[author] = [realAuthor]
        else:
            authorMade[author].append(realAuthor)
        # We take score of the attributions
        if not resultDict.has_key(realAuthor):
            resultDict[realAuthor] = {author: 1}
        elif not resultDict[realAuthor].has_key(author):
            resultDict[realAuthor][author] = 1
        else:
            resultDict[realAuthor][author] += 1
    return (authorMade, resultDict)
def makeNgram(filename):
    worker = JSON.workOnJSON()
    dict = worker.read_JSON_file(filename)
    authorDict = {}
    authorWrittenDict = {}
    tg_dict = {}
    authorNameDirec = {}
    num = 0
    for entry in dict:
        author = entry["user_id"]
        id = entry["post_id"]
        value = {"user_id": author, "text": entry["text"]}
        if authorDict.has_key(author):
            authorDict[author].append(value)
            authorWrittenDict[author].append(id)
        else:
            authorDict[author] = [value]
            authorWrittenDict[author] = [id]
    newAuthorDict = {}
    authorTexts = {}
    for authorName in authorDict.keys():
        author = authorDict[authorName]
        newAuthorDictTemp = {}
        listOfEntries = [entry["text"] for entry in author]
        newAuthorDict[authorName] = listOfEntries
        text = ''.join(listOfEntries)
        tg = ngram.ngram(listOfEntries)
        tg.corp = text
        tg.newRemember()
        tg_dict[authorName] = tg
    return (newAuthorDict, tg_dict)
def __init__(self, corpus, n):
    # corpus: tagged training corpus, formatted as
    #   [[('Hello', 'NNP'), ('world', 'NN'), ('!', '.')], [...], ...]
    # n: the n of the n-gram language model
    # The POS-tagging task is defined by:
    # 1. transition: an n-gram model over tags
    # 2. emission: P(word | pos)
    # 3. initial distribution: P('START') = 1.0

    # Preprocess the corpus: add start and end markers to every sentence
    brown_tags_words = []
    for sent in corpus:
        brown_tags_words.append(('START', 'START'))
        brown_tags_words.extend([(tag[:2], word) for word, tag in sent])
        brown_tags_words.append(('END', 'END'))

    # Estimate the emission probabilities from the corpus (conditional frequencies)
    cfd_tagwords = ConditionalFreqDist(brown_tags_words)
    # P(W = word | condition = pos)
    cpd_tagwords = ConditionalProbDist(cfd_tagwords, MLEProbDist)
    emission = {
        tag: {word: cpd_tagwords[tag].prob(word) for word in cfd_tagwords[tag]}
        for tag in cpd_tagwords
    }

    # Estimate the transition probabilities from the corpus via the n-gram model
    tags = [[tag for _, tag in sent] for sent in corpus]
    transition = Transition(ngram(tags, n))

    # Define the initial distribution: a sentence starts with START with probability 1
    initial_distribution = {('START', ): 1.0}

    # Build the POS tagger as an HMM
    HMM.__init__(self, initial_distribution, transition, emission, n)
def nlist(arr, n=2):
    nlist = []
    for text in arr:
        nlist.extend(ngram(text, n))
    return nlist
async def mimic(ctx, word, num_words=0):
    """Mimics Owen given a starting word."""
    sentence = ngram.ngram(counts, word, num_words)
    await ctx.send(sentence)
def __init__(self, fileName):
    self.sList = readXml(fileName)
    self.n = ngram.ngram()
    self.usedDist = 1
def differenceSet(a, b):
    product = productSet(a, b)
    result = []
    for w in a:
        if w not in product:
            result.append(w)
    return result


if __name__ == '__main__':
    from ngram import ngram

    sentence1 = "paraparaparadise"
    sentence2 = "paragraph"
    X = ngram(2, sentence=sentence1, mode='char')
    Y = ngram(2, sentence=sentence2, mode='char')
    union = unionSet(X, Y)
    product = productSet(X, Y)
    difference1 = differenceSet(X, Y)
    difference2 = differenceSet(Y, X)
    print("unionSet is {0}".format(union))
    print("productSet is {0}".format(product))
    print("differenceSet X-Y is {0}".format(difference1))
    print("differenceSet Y-X is {0}".format(difference2))
    print("'se' is in X : {0}".format(str('se' in X)))
    print("'se' is in Y : {0}".format(str('se' in Y)))
import ngram
import re
import string
from preprocess import *
from stemming.porter2 import stem
import nltk
import numpy as np

if __name__ == "__main__":
    model = ngram.ngram(3)
    # Train the model on the preprocessed text file
    with open("preprocessed/train_set.txt", 'r') as train_set:
        model.train(train_set.read())
    with open("Holmes.machine_format.questions.txt", 'r') as questions_machine, \
            open("preprocessed/questions.txt", 'r') as questions_set, \
            open("Holmes.machine_format.answers.txt", 'r') as ans_machine, \
            open("preprocessed/answers.txt", 'r') as answers_set:
        # prepare questions for preprocessing
        questions = questions_machine.read().split('\n')
        # format answers for easier comparison
        answers = ans_machine.read().split('\n')
        questions_set = questions_set.read().split('\n')
        answers_set = answers_set.read().split('\n')
        i = 0
        sentences = []
        words = ""
        rights = 0
def getModels():
    tru_unigram = ngram.ngram('true.train', 1, ngram.Smooth.GOOD_TURING, True)
    fal_unigram = ngram.ngram('false.train', 1, ngram.Smooth.GOOD_TURING, True)
    tru_uni_rl = ngram.ngram('true.train', 1, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_uni_rl = ngram.ngram('false.train', 1, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    tru_bigram = ngram.ngram('true.train', 2, ngram.Smooth.GOOD_TURING, True)
    fal_bigram = ngram.ngram('false.train', 2, ngram.Smooth.GOOD_TURING, True)
    tru_bi_rl = ngram.ngram('true.train', 2, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_bi_rl = ngram.ngram('false.train', 2, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    #trial code
    tru_trigram = ngram.ngram('true.train', 3, ngram.Smooth.GOOD_TURING, True)
    fal_trigram = ngram.ngram('false.train', 3, ngram.Smooth.GOOD_TURING, True)
    tru_tri_rl = ngram.ngram('true.train', 3, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    fal_tri_rl = ngram.ngram('false.train', 3, ngram.Smooth.GOOD_TURING, True, ngram.Direction.RL)
    tru_quadgram = ngram.ngram('true.train', 4, ngram.Smooth.GOOD_TURING, True)
    fal_quadgram = ngram.ngram('false.train', 4, ngram.Smooth.GOOD_TURING, True)
    return [tru_unigram, fal_unigram, tru_bigram, fal_bigram, tru_trigram, fal_trigram,
            tru_quadgram, fal_quadgram, tru_uni_rl, fal_uni_rl, tru_bi_rl, fal_bi_rl,
            tru_tri_rl, fal_tri_rl]
def train(self, tokens):
    bigram = ngram(tokens)
    for s1, s2 in bigram:
        self.chains.setdefault(s1, [])
        self.chains[s1].append(s2)
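# A standalone sketch of the idea behind train() above, assuming ngram(tokens)
# yields consecutive token pairs; build_chains and generate are hypothetical
# names used only for illustration, not part of the original class.
import random

def build_chains(tokens):
    chains = {}
    for s1, s2 in zip(tokens, tokens[1:]):  # consecutive (bigram) pairs
        chains.setdefault(s1, []).append(s2)
    return chains

def generate(chains, start, length=10):
    # random walk over the chains, starting from `start`
    out = [start]
    for _ in range(length - 1):
        followers = chains.get(out[-1])
        if not followers:
            break
        out.append(random.choice(followers))
    return ' '.join(out)

chains = build_chains("the cat sat on the mat".split())
print(generate(chains, "the"))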
import ngram

str1 = "paraparaparadise"
str2 = "paragraph"
tool = ngram.ngram()
list1 = tool.literaln(str1, 2)
list2 = tool.literaln(str2, 2)
# Call rate Labeling
import ngram
import pandas as pd

ng = ngram.ngram()
fl = ng.search(r'C:\Users\student\nlp_project\bok_project\ngram\ngram_data')
for doc_num in range(len(fl)):
    datas = ng.select_file(doc_num)
    if doc_num == 0:
        ngram_df = datas[['date', 'ngram']]
    else:
        ngram_df = pd.concat([ngram_df, datas[['date', 'ngram']]])
ngram_df.reset_index()[['date', 'ngram']].to_json('total_ngram.json')
def __ngram(x):
    if skipgram:
        return skip_bigram(x, max_step=skipgram)
    if n > 1:
        return ngram(x, n=n)
    return x
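# skip_bigram is not defined in the snippet above; this is a minimal sketch of
# what such a helper commonly computes (ordered token pairs at most max_step
# positions apart); it is an assumption, not the original implementation.
def skip_bigram_sketch(tokens, max_step=2):
    pairs = []
    for i in range(len(tokens)):
        for j in range(i + 1, min(i + 1 + max_step, len(tokens))):
            pairs.append((tokens[i], tokens[j]))
    return pairs

print(skip_bigram_sketch(['a', 'b', 'c', 'd'], max_step=2))
# [('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), ('c', 'd')]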
def __init__(self):
    # Conversion of some Turkish language specific characters
    self.charList = {
        "\x80": "c", "\x8e": "a", "\x99": "o", "\xa3": "u",
        "\x81": "", "\x89": "", "\x93": "", "\x94": "",
        "\xa1": "i", "\xad": "", "\xb0": "", "\xb1": "",
        "\xb3": "", "\xba": "", "\xe3": "", "\xe4": "a",
        chr(0xe2): "o",  # I can't delete the 0xe2 character!!! :S
        "é": "e", "ğ": "g", "ü": "u", "ş": "s", "ı": "i", "ö": "o", "ç": "c",
        "Ğ": "G", "Ü": "U", "Ş": "S", "İ": "I", "Ö": "O", "Ç": "C",
        "!": "", ",": "", ".": "", ";": "", ":": "", ")": "", "(": "",
        "'": "", '"': "", "1": "", "2": "", "3": "", "4": "", "5": "",
        "6": "", "7": "", "8": "", "9": "", "0": "", "-": "", "_": "",
        "?": "", "%": "", "$": "", "&": "", "/": "", "\\": ""
    }  # This set is not complete!!
    self.n = ngram.ngram()
    self.feedList = []
param = None
if len(sys.argv) > 4:
    standardize_file = sys.argv[4]
    param = json.loads(open(standardize_file, 'r').read())
text = load_text(text_file)
labels = [int(line.strip()) for line in open(label_file, 'r')]
data = [({'name': 'label', 'type': '{0, 1}'}, labels)]
assert len(labels) == len(text)
''' Call functions to extract features and add to data. '''
data += avg_sen_len(text)
data += w2v_sim(text)
data += ngram(text, './models/3.binlm', 'tri')
data += ngram(text, './models/4.binlm', 'quad')
#print pos_file
#pos_labels = load_text(pos_file)
#data += ngram(pos_labels, './models/pos3.binlm', 'pos-tri')
#data += ngram(pos_labels, './models/pos4.binlm', 'pos-quad')
''' Output the arff file. '''
param = arff_dump(output_file, data, param=param)
if len(sys.argv) <= 4:
    f = open('param.json', 'w')
    f.write(json.dumps(param, indent=2))
pos_file = './data/' + pos_file
param = None
if len(sys.argv) > 4:
    standardize_file = sys.argv[4]
    param = json.loads(open(standardize_file, 'r').read())
text = load_text(text_file)
labels = [int(line.strip()) for line in open(label_file, 'r')]
data = [({'name': 'label', 'type': '{0, 1}'}, labels)]
assert len(labels) == len(text)
''' Call functions to extract features and add to data. '''
data += avg_sen_len(text)
data += w2v_sim(text)
data += ngram(text, './models/3.binlm', 'tri')
data += ngram(text, './models/4.binlm', 'quad')
#print pos_file
#pos_labels = load_text(pos_file)
#data += ngram(pos_labels, './models/pos3.binlm', 'pos-tri')
#data += ngram(pos_labels, './models/pos4.binlm', 'pos-quad')
''' Output the arff file. '''
param = arff_dump(output_file, data, param=param)
if len(sys.argv) <= 4:
    f = open('param.json', 'w')
    f.write(json.dumps(param, indent=2))
    f.close()
import ngram
import list_subtraction

a = "paraparaparadise"
b = "paragraph"
x = sorted(set(ngram.ngram(a, 2)))
y = sorted(set(ngram.ngram(b, 2)))
print("x:", x)
print("y:", y)
w = x + y
"""
Contents of the ngram function:
def ngram(text, n):  # slide a window of n characters
    x = []
    for y in range(len(text)):
        if y == len(text) - n + 1:
            break
        x.append(text[y:y + n])
    return x
"""
f = sorted(set(w))
# A set holds only unique elements (no duplicated values),
# but the original order is not preserved.
s = sorted(set(list_subtraction.list_subtraction(w, f)))
c = sorted(set(list_subtraction.list_subtraction(x, s)))
def test_ngram_twogram(self):
    import ngram
    cnts = ngram.ngram(n=2, k=1, document='test.txt')
    self.assertTrue(cnts.has_key("a a"),
                    'ngram does not contain appropriate keys' + str(cnts))
    self.assertEqual(cnts["a a"], 2,
                     'ngram count is incorrect' + str(cnts))
    self.assertEqual(len(cnts), 1,
                     'ngram should only contain top k counts' + str(cnts))