def updata_searchengine(Knowledge, SeIndexDir):
    # Remove the old index and create a fresh index directory.
    os.system('rm -rf %s' % SeIndexDir)
    os.system('mkdir %s' % SeIndexDir)
    # Collect only the normalized questions from the knowledge base.
    QuestionNorm = []
    for KID, item in Knowledge.items():
        for question in item[0]:
            if int(question.IsNorm) == 1:
                QuestionNorm.append(question)
    questions = []
    for item in QuestionNorm:
        questions.append({'question': item.Question, 'questionID': item.QuestionID})
    # Build the Whoosh index: the question text is searchable, the ID is stored.
    schema = Schema(title=TEXT(stored=True), quesId=ID(stored=True))
    ix = create_in(SeIndexDir, schema)
    writer = ix.writer()
    for i in questions:
        word = PProc.cut_for_search(i['question'])
        PProc.syn_wordlist(word)
        writer.add_document(title=u' '.join(word), quesId=u'%s' % i['questionID'])
    writer.commit()

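# A minimal sketch of querying the index built above, using Whoosh's standard
# open_dir/QueryParser API. It assumes PProc.cut_for_search is the same
# tokenizer used at index time; the helper name is illustrative.
from whoosh.index import open_dir
from whoosh.qparser import QueryParser

def search_questions(SeIndexDir, query_text, limit=5):
    ix = open_dir(SeIndexDir)
    with ix.searcher() as searcher:
        # Tokenize the query the same way documents were tokenized.
        terms = u' '.join(PProc.cut_for_search(query_text))
        query = QueryParser('title', ix.schema).parse(terms)
        return [(hit['quesId'], hit['title'])
                for hit in searcher.search(query, limit=limit)]
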
def processConversation(conversation):
    global bag_of_words
    bag_of_words = {}
    # Split into sentences, tokenize, strip punctuation/stopwords, then
    # count case-folded term frequencies with NLTK's FreqDist.
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)
    bag_of_words = FreqDist(word.lower() for word in filtered)

def ProcessRow_chat(row):
    result = []
    for question in row:
        # Strip Chinese and ASCII punctuation.
        punctuation = [u',', u'?', u'。', u';', u'!', u'“', u'”', u'’', u'‘',
                       u',', u'.', u'!', '?']
        for i in punctuation:
            question = question.replace(i, '')
        lenOfQ = len(question)
        if lenOfQ == 0:
            continue
        # Record the character offsets of the '=' wildcard markers.
        listOfWildcard = []
        for i in range(lenOfQ):
            if question[i] == '=':
                listOfWildcard.append(i)
        word, tag = PProc.withtag_cut(question)
        # Re-insert each wildcard as its own token at the matching offset.
        for idx in listOfWildcard:
            imid = 0
            for idxOfword, item in enumerate(word):
                if imid == idx:
                    word.insert(idxOfword, u'=')
                    tag.insert(idxOfword, 'wc')
                    break
                imid += len(item)
        word, tag = PProc.wordtag_process(word, tag)
        result += QuestionGet_chat(word, tag)
    return result

def WriteAiml_how(knowledge, string_how, ID, fw_match):
    if len(knowledge) < 2:
        return
    question = []
    for item in knowledge:
        question.append(item.Question)
    # The second question encodes "key>verb"; split and normalize both parts.
    parm = question[1]
    key = parm.split('>')[0]
    verb = parm.split('>')[1]
    keyList = PProc.withtag_cut(key)[0]
    PProc.syn_wordlist(keyList)
    verb = PProc.syn_word(verb)

    def write_entry(words):
        # Fill the AIML template and append it to the match file.
        stringx = string_how.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
        stringx = stringx.replace('KEY', ' '.join(words).encode('utf-8'))
        stringx = stringx.replace('VERB', verb.encode('utf-8'))
        stringx = stringx.replace('VALUE', (u'match-what|%s' % ID).encode('utf-8'))
        fw_match.write(stringx)

    if '0' not in keyList:
        write_entry(keyList)
    else:
        # '0' is a placeholder: emit one pattern with it as the AIML
        # wildcard '*' and a second pattern with it removed entirely.
        keyList[keyList.index('0')] = '*'
        write_entry(keyList)
        keyList.remove('*')
        write_entry(keyList)

def WriteAiml_what(knowledge, string_what, ID, fw_match):
    if len(knowledge) < 2:
        return
    key = knowledge[1].Question
    keyList, tag = PProc.withtag_cut(key)
    PProc.syn_wordlist(keyList)
    # Fill the AIML template with the knowledge ID and segmented key words.
    stringx = string_what.replace('KNOW', (u'KNOWLEDGE %s' % ID).encode('utf-8'))
    stringx = stringx.replace('KEY', ' '.join(keyList).encode('utf-8'))
    stringx = stringx.replace('VALUE', (u'match-what|%s' % ID).encode('utf-8'))
    fw_match.write(stringx)

def main():
    vocab = pp.initVocab()
    output = pp.indexToLabel("output.mat", vocab)
    labelfile = "tag.txt"
    f = open(labelfile, "wb")
    for i in range(len(output)):
        # Pick the most probable class index for this sample and map it
        # back to its vocabulary label.
        prob = output['data'][i]
        idx = np.argmax(prob)
        tag = vocab[idx]
        f.write(tag + '\n')
    f.close()

def processConversation(conversation, category):
    global bag_of_words, documentClass
    bag_of_words = {}
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)
    for word in filtered:
        if word in bag_of_words:
            bag_of_words[word] = int(bag_of_words[word]) + 1
        else:
            bag_of_words[word] = 1
    # total = len(filtered)
    # bag_of_words = calculateFrequencies(total)
    addTermFrequency(bag_of_words)

def Process1(df):
    pri_id = "企业名称"
    res = pd.DataFrame()
    res[pri_id] = df[pri_id].unique()
    # Normalize currencies.
    df = prep.Convert_money(df)
    # Registered-capital features (max, min, mean, variance).
    res = pd.merge(res, fea.GetValAvg(df, pri_id, "注册资金(元)"), on=pri_id)
    res = pd.merge(res, fea.GetValMaxMin(df, pri_id, "注册资金(元)"), on=pri_id)
    res = pd.merge(res, fea.GetValVar(df, pri_id, "注册资金(元)"), on=pri_id)
    # Categorical feature counts.
    num_fea = ["注册资金(元)", "出资比例"]
    cat_fea = [col for col in df.columns if col != pri_id and col not in num_fea]
    for col in cat_fea:
        res = pd.merge(res, fea.GetCategroicalCount(df, pri_id, col), on=pri_id)
    # Null counts for the legal-representative and chief-representative flags.
    res = pd.merge(res, fea.GetValNaCount(df, pri_id, "法定代表人标志", "姓名"), on=pri_id)
    res = pd.merge(res, fea.GetValNaCount(df, pri_id, "首席代表标志", "姓名"), on=pri_id)
    # Count of people holding each job title.
    res = pd.merge(res, fea.CatRowsToCols(df, pri_id, "职务", "姓名"), on=pri_id)
    # Capital-contribution-ratio features (max, min, mean, variance).
    res = pd.merge(res, fea.GetValAvg(df, pri_id, "出资比例"), on=pri_id)
    res = pd.merge(res, fea.GetValMaxMin(df, pri_id, "出资比例"), on=pri_id)
    res = pd.merge(res, fea.GetValVar(df, pri_id, "出资比例"), on=pri_id)
    return res

def preProc(self):
    dataCleaner = PreProcess(self.df)
    self.df = dataCleaner.df
    # Alert the user that preprocessing finished.
    tkMessageBox.showinfo("K Means Clustering",
                          "Preprocessing completed successfully!")

def top_k_words_of(platform, k=20, cmt_num=0, by_category=False, cate_name="",
                   show_cmt_of_words=False):
    cmt_corpus = PreProcess.get_review_corpus_by(platform=platform, num=cmt_num,
                                                 by_category=by_category,
                                                 cate_name=cate_name)
    # Rank words both by raw frequency and by TF-IDF.
    tpk_words1 = top_k_fre_of(cmt_corpus, k=k)
    tpk_words2 = tf_idf_topk(cmt_corpus, k=k)
    if show_cmt_of_words:
        sentence_list = []
        for cmt in cmt_corpus:
            sentence_list += cut_ch_sentence(cmt)
        from collections import defaultdict
        # Map each top word to every sentence that contains it.
        words_comment_dir1 = defaultdict(list)
        wc_dir2 = defaultdict(list)
        for words in tpk_words1:
            for sentence in filter(lambda sent: words in sent, sentence_list):
                words_comment_dir1[words].append(sentence)
        for words in tpk_words2:
            for sentence in filter(lambda sent: words in sent, sentence_list):
                wc_dir2[words].append(sentence)
        for key in words_comment_dir1.keys():
            print("------------------%s------------------\n" % key,
                  words_comment_dir1[key], "\n")
        print("=========================================")
        for key in wc_dir2.keys():
            print("-------------------%s------------------\n" % key,
                  wc_dir2[key], "\n")

def get_aspect_json(platform, k=20, cmt_num=0, by_category=False, cate_name=""):
    import json
    cmt_corpus = PreProcess.get_review_corpus_by(platform=platform, num=cmt_num,
                                                 by_category=by_category,
                                                 cate_name=cate_name)
    tpk_words = tf_idf_topk(cmt_corpus, k=k)
    sentence_list = []
    for cmt in cmt_corpus:
        sentence_list += cut_ch_sentence(cmt)
    # Build a two-level tree: category -> top words -> matching sentences.
    words_tree = dict()
    words_tree["name"] = cate_name
    words_tree["child"] = []
    for words in tpk_words:
        comment_dic = dict()
        comment_dic["child"] = []
        comment_dic["name"] = words
        for sentence in filter(lambda sent: words in sent, sentence_list):
            comment_dic["child"].append(sentence)
        words_tree["child"].append(comment_dic)
    return json.dumps(words_tree)

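# A minimal usage sketch for get_aspect_json; the platform and category
# names below are hypothetical placeholders.
import json

tree = json.loads(get_aspect_json("jd", k=10, by_category=True, cate_name=u"手机"))
for aspect in tree["child"]:
    # Each child holds a top word and the sentences that mention it.
    print(aspect["name"], len(aspect["child"]))
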
def _F_Clsuter_Geo():
    # Return the cached feature frame if it already exists.
    if os.path.exists(data_path + "data/_F_geo.feather"):
        df = feather.read_dataframe(data_path + "data/_F_geo.feather")
        return df
    # Merge uid/geo_code from the operation and transaction tables.
    geo_info = pd.concat(
        (op_info[[pri_id, 'geo_code']], trans_info[[pri_id, 'geo_code']]))
    geo_info['pos'] = geo_info['geo_code'].apply(_F.Decode)
    # Copy to avoid chained-assignment warnings when adding cluster_id.
    temp = geo_info[geo_info['pos'] != -1].copy()
    res = [x for x in temp['pos'].values]
    X = np.asarray(res)
    # Cluster the decoded coordinates into 20 regions.
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=20, random_state=2018).fit(X)
    temp['cluster_id'] = kmeans.labels_
    # Count each user's activity per cluster and pivot to one row per UID.
    t = temp.groupby(['UID', 'cluster_id'])['pos'].count().reset_index().rename(
        columns={'pos': 'cluster_count'})
    c = pd.pivot_table(t, index='UID', columns='cluster_id',
                       values='cluster_count').fillna(0).reset_index()
    # Rename the pivoted columns.
    _Prep = _P.Process()
    c = _Prep.RenameColumns(c, [pri_id], 'cluster')
    # Persist the result.
    feather.write_dataframe(c, data_path + "data/_F_geo.feather")
    return c

def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)
    # Map option flags to script variants before conversion.
    if 'siddhamUnicode' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamUnicode'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'siddhamUnicode' in preoptions and src == 'Siddham':
        src = 'SiddhamUnicode'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'
    # Apply each requested pre-processing pass by name.
    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)
    transliteration = Convert.convertScript(txt, src, tgt)
    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(transliteration)
    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)
    return transliteration

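# A minimal usage sketch, assuming 'Devanagari' and 'Tamil' are among the
# script names accepted by Convert.convertScript; the sample text and the
# empty option lists are illustrative only.
result = convert('Devanagari', 'Tamil', u'धर्म', nativize=True,
                 preoptions=[], postoptions=[])
print(result)
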
def removeRibosomalRNA(fastq1, outfile):
    '''Remove ribosomal RNA using SortMeRNA'''
    if PARAMS['data_type'] == 'metatranscriptome':
        tool = pp.runSortMeRNA(
            fastq1, outfile,
            **{**PARAMS, 'fn_suffix': '_deadapt.' + FASTQ1_SUFFIX})
        tool.run(**PARAMS)
    else:
        assert PARAMS['data_type'] == 'metagenome', \
            'Unrecognised data type: {}'.format(PARAMS['data_type'])
        # Metagenome data is passed through untouched: symlink the
        # paired/singleton fastq files to the output names.
        inf1 = fastq1
        inf2 = P.snip(inf1, '.fastq.1.gz') + '.fastq.2.gz'
        inf3 = P.snip(inf1, '.fastq.1.gz') + '.fastq.3.gz'
        outf1 = outfile
        outf2 = P.snip(outf1, '.fastq.1.gz') + '.fastq.2.gz'
        outf3 = P.snip(outf1, '.fastq.1.gz') + '.fastq.3.gz'
        symlink(inf1, outf1)
        if os.path.exists(inf2):
            symlink(inf2, outf2)
        if os.path.exists(inf3):
            symlink(inf3, outf3)

def assignment_fairea(cands, pos, fitness, G, weight_probability=[1, 0, 0],
                      version=4, kk=0, local=False):
    matched_1 = PP.pre_assignment(cands, pos, fitness, G)
    if weight_probability[2] == 1:
        if kk != 0:
            matched_1.extend(
                PP.support_group_assignment(cands, pos, fitness, G, p=kk))
        else:
            matched_1.extend(
                PP.support_group_assignment(cands, pos, fitness, G))
    i = -1
    while True:
        i += 1
        positions, final_matched = select_positions(G, pos, cands, fitness,
                                                    weight_probability, i + 1,
                                                    version=version, local=local)
        if len(final_matched) < len(positions):
            final_matched, G = Hu.Hungarian(
                cands, positions, final_matched, G, fitness,
                weight_probability=weight_probability, version=2, local=local)
            if final_matched == []:
                return [], []
        if len(final_matched) == len(pos):
            break
    final_matched = list(final_matched)
    final_matched.extend(matched_1)
    return G, set(final_matched)

def SaveToFolder(gtImage, sliceNum, imgNumFolder):
    # Resample the requested slice and save it as a PNG test image.
    sampledImg = preProc.SampleTest1(gtImage[:, :, sliceNum])
    sampledImgArr = sitk.GetArrayFromImage(sampledImg)
    path = './TrainingImages'
    cv2.imwrite(os.path.join(path, 'testImage{0}.png'.format(imgNumFolder)),
                sampledImgArr)

def assignment_max_weight(cands, pos, fitness, G, weight_probability=[1, 0, 0],
                          version=3):
    matched_1 = PP1.pre_assignment(cands, pos, fitness, G)
    if weight_probability[2] == 1:
        matched_1.extend(PP1.support_group_assignment(cands, pos, fitness, G))
    # Build the weighted bipartite graph and take its maximum-weight matching.
    bi_G = nx.Graph()
    bi_edges, edge_weights, l = UW.p_based_weight(pos, cands, fitness, G)
    bi_G.add_weighted_edges_from(bi_edges)
    final_matched = nx.max_weight_matching(bi_G)
    final_matched = list(final_matched)
    final_matched.extend(matched_1)
    final_matched = set(final_matched)
    return G, final_matched

def processConversation(conversation, category):
    global bag_of_words, documentClass
    bag_of_words = {}
    sentences = conversation.split(".")
    tokenized = PreProcess.tokenize_sentences(sentences)
    filtered = PreProcess.RemovePunctAndStopWords(tokenized)
    # Count raw term frequencies, then normalize them.
    for word in filtered:
        if word in bag_of_words:
            bag_of_words[word] = int(bag_of_words[word]) + 1
        else:
            bag_of_words[word] = 1
    total = len(filtered)
    bag_of_words = calculateFrequencies(total)
    # Fold this conversation's counts into the running per-category model.
    if category in documentClass:
        documentClass[category] = merge_two_dicts(documentClass[category],
                                                  bag_of_words)
    else:
        documentClass[category] = bag_of_words

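# merge_two_dicts is referenced above but not shown here; a minimal sketch
# assuming the conventional copy-and-update idiom, where values from the
# second dict win on key collisions.
def merge_two_dicts(x, y):
    merged = x.copy()
    merged.update(y)
    return merged
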
def application(file_path):
    data = PP.image_process(file_path)
    label = ''
    if len(data) == 0:
        print("Recognition failed; please supply a clearer image.")
    else:
        print("Recognizing......")
        # Classify each segmented character and append its predicted value.
        for i in range(len(data)):
            preValue = restore_model(data[i:i + 1])[0]
            label += str(preValue)
        print("Result: " + label)

def Encode(input_dir, output_dir, codebook_name, pixel_size, start):
    global codebook
    # Load the codebook, stored as the repr of a Python dict.
    with open(codebook_name, 'r') as f:
        codebook = ast.literal_eval(f.read())
    num = 1
    PreProcess.dir_check(output_dir, emptyflag=True)
    compress_rate = []
    for fname in os.listdir(input_dir):
        img_path = os.path.join(input_dir, fname)
        # Binarize the grayscale image to 0/1 pixels.
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        ret, img = cv2.threshold(img, 0.5, 1, cv2.THRESH_BINARY)
        height = len(img)
        width = len(img[0])
        # Encode, order the code blocks by position, and serialize to bits.
        codevalue = encode(img, height=height, width=width)
        codevalue = dict(sorted(codevalue.items(),
                                key=lambda item: (item[0][0], item[0][1])))
        codevalue = tobinary(codevalue, height, width)
        output_path = os.path.join(output_dir, fname[0:fname.rfind('.bmp')]) + '.tt'
        original_pixel = height * width * len(format(pixel_size - 1, 'b'))
        final_pixel = len(codevalue)
        with open(output_path, 'wb') as g:
            g.write(codevalue.encode())
        end = datetime.datetime.now()
        compress_rate.append(original_pixel / final_pixel)
        print('\rSaving encoding results for picture %d, program has run %s, '
              'the mean compression ratio is %0.2f'
              % (num, end - start, np.mean(compress_rate)), end='')
        num = num + 1
    return np.mean(compress_rate)

def Decode(input_dir, output_dir, original_img_dir, codebook_name, start):
    # Load the codebook and invert it for decoding.
    with open(codebook_name, 'r') as f:
        codebook = ast.literal_eval(f.read())
    global decodebook
    decodebook = {v: k for k, v in codebook.items()}
    num = 1
    PreProcess.dir_check(output_dir, emptyflag=True)
    error_rate_total = []
    for fname in os.listdir(input_dir):
        tt_path = os.path.join(input_dir, fname)
        if os.path.splitext(tt_path)[1] != '.tt':
            continue
        with open(tt_path, 'rb') as g:
            tt = g.read()
        # Decode the bitstream and rescale to 0/255 for writing as a bitmap.
        img = decode(tt)
        ret, img = cv2.threshold(img, 0.5, 255, cv2.THRESH_BINARY)
        img_original_path = os.path.join(original_img_dir,
                                         fname[0:fname.rfind('.tt')]) + '.bmp'
        img_original = cv2.imread(img_original_path, cv2.IMREAD_GRAYSCALE)
        output_path = os.path.join(output_dir, fname[0:fname.rfind('.tt')]) + '.bmp'
        cv2.imwrite(output_path, img)
        # Compare against the original to track reconstruction fidelity.
        error_rate = fidelity(img_original, img)
        error_rate_total.append(error_rate)
        end = datetime.datetime.now()
        print('\rSaving decoding results for picture %d, SNR is %0.2f, '
              'the mean SNR is %0.2f, the program has run %s'
              % (num, error_rate, np.mean(error_rate_total), end - start), end='')
        num = num + 1
    return np.mean(error_rate_total)

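# A minimal driver sketch for the codec above; the directory and codebook
# paths are hypothetical, and pixel_size=2 assumes binary (1-bit) images.
import datetime

start = datetime.datetime.now()
ratio = Encode('images/', 'encoded/', 'codebook.txt', pixel_size=2, start=start)
snr = Decode('encoded/', 'decoded/', 'images/', 'codebook.txt', start=start)
print('\nmean compression ratio %.2f, mean SNR %.2f' % (ratio, snr))
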
def pre_process():
    global resized, processed_images
    try:
        cut_images = CutUp.box_extraction(resized)
        resized = None
    except IndexError:
        resized = None
        return 'We couldn\'t detect all of the gridlines.'
    for cut_image in cut_images:
        processed_images.append(
            PreProcess.pre_process(cut_image, b=7, by_mass=False, boundary=8))
    return 'success'

def _F_GeoCode(encode_type="LabelEncode", n=3): if os.path.exists(data_path + "data/_F_geo_code.feather"): df = feather.read_dataframe(data_path + "data/_F_geo_code.feather") return df # 取每个用户经常活跃的topN geo_code geo_info = pd.concat( (op_info[[pri_id, 'geo_code', 'day']], trans_info[[pri_id, 'geo_code', 'day']])) temp = _F.TopNGeo_code(geo_info, pri_id, 'day', n) # 编码 _Prep = _P.Process() temp = _Prep.CatColConvert(temp, pri_id, encode_type) # 持久化 feather.write_dataframe(temp, data_path + "data/_F_geo_code.feather") return temp
def learn(self, text_df):
    """Spark transformation to learn the adjacent terms of a given ngram"""
    ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
    ngram_df = ngram.transform(text_df)
    # Create the ngram -> adjacent-term mappings, group them by ngram, and
    # collect the model to the driver.
    self.ngram_model = ngram_df.rdd \
        .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
        .flatMap(lambda xs: [x for x in xs]) \
        .map(lambda y: (y[0], [y[1]])) \
        .reduceByKey(lambda a, b: a + b).collect()
    # The model is already a local list, so take the keys directly.
    self.model_keys = [x[0] for x in self.ngram_model]

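# PreProcess.generate_adjacent_terms is assumed to yield (ngram, next_term)
# pairs from the ordered list of space-joined ngrams that Spark's NGram
# produces; a minimal sketch of that assumed contract:
def generate_adjacent_terms(ngrams):
    pairs = []
    for i in range(len(ngrams) - 1):
        # The term following an ngram is the last token of the next ngram.
        pairs.append((ngrams[i], ngrams[i + 1].split()[-1]))
    return pairs
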
def update(loss, reviews):
    global model
    lr = 0.000003
    # Update the 'W' parameter of the model.
    for (userid, productid) in loss:
        for word in reviews[(userid, productid)][:-1]:
            model["W"][words.index(word)] -= lr * loss[(userid, productid)]
    # Update the 'U' parameter of the model.
    for user in users:
        userid = users[user]
        productlist = pre.getproductlist(userid)
        for product in productlist:
            factor = np.zeros(D)
            for word in reviews[(userid, product)][:-1]:
                factor += model["P"][words.index(word)]
            model["U"][userid] -= lr * loss[(userid, product)] * (
                factor * model["V"][product])
    # Update the 'V' parameter of the model.
    for product in products:
        productid = products[product]
        userlist = pre.getreviewers(productid)
        for user in userlist:
            factor = np.zeros(D)
            for word in reviews[(user, productid)][:-1]:
                factor += model["P"][words.index(word)]
            model["V"][productid] -= lr * loss[(user, productid)] * (
                factor * model["U"][user])
    # Update the 'P' parameter of the model.
    for wordid in range(len(words)):
        for (userid, productid) in reviews:
            if words[wordid] in reviews[(userid, productid)][:-1]:
                model["P"][wordid] -= lr * loss[(userid, productid)] * (
                    model["U"][userid] * model["V"][productid])

def convert(src, tgt, txt, nativize, preoptions, postoptions):
    txt = PreProcess.PreProcess(txt, src, tgt)
    # Map option flags to script variants before conversion.
    if 'siddhammukta' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhamap' in postoptions and tgt == 'Siddham':
        tgt = 'SiddhamDevanagari'
    if 'siddhammukta' in preoptions and src == 'Siddham':
        src = 'SiddhamDevanagari'
    if 'LaoNative' in postoptions and tgt == 'Lao':
        tgt = 'Lao2'
    if 'egrantamil' in preoptions and src == 'Grantha':
        src = 'GranthaGrantamil'
    if 'egrantamil' in postoptions and tgt == 'Grantha':
        tgt = 'GranthaGrantamil'
    if 'nepaldevafont' in postoptions and tgt == 'Newa':
        tgt = 'Devanagari'
    if 'ranjanalantsa' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False
    if 'ranjanawartu' in postoptions and tgt == 'Ranjana':
        tgt = 'Tibetan'
        nativize = False
    # Apply each requested pre-processing pass by name.
    for options in preoptions:
        txt = getattr(PreProcess, options)(txt)
    transliteration = Convert.convertScript(txt, src, tgt)
    if nativize:
        transliteration = PostOptions.ApplyScriptDefaults(transliteration, src, tgt)
        if tgt != 'Tamil':
            transliteration = PostProcess.RemoveDiacritics(transliteration)
        else:
            transliteration = PostProcess.RemoveDiacriticsTamil(transliteration)
    for options in postoptions:
        transliteration = getattr(PostProcess, options)(transliteration)
    # Tamil -> IPA is delegated to an external web service.
    if src == "Tamil" and tgt == "IPA":
        r = requests.get("http://anunaadam.appspot.com/api?text=" + txt + "&method=2")
        r.encoding = r.apparent_encoding
        transliteration = r.text
    return transliteration

def setup():
    global testParagraphs
    global trainingParagraphs
    global happySadScoredWords
    testParagraphs = []
    trainingParagraphs = []
    happySadScoredWords = []
    print("Loading Corpus...")
    testParagraphs, trainingParagraphs = PreProcess.getRatedParagraphs()
    # Show the rating distribution of the training paragraphs.
    for rating in range(1, 6):
        print(len([r for r in trainingParagraphs if r["overAllRating"] == rating]))
    print("Loading Happy/Sad Words...")
    happySadScoredWords = HappySad.loadHSWords("./words/happyAndSadWords3.txt")

def chooseFile(self, item):
    # Locate the clicked item's index in the list widget.
    for index in range(self.listWidget.count()):
        if self.listWidget.item(index).text() == item.text():
            self.itemIndex = index
    preProcess = PreProcess.PreProcess()
    content = preProcess.getArticleContent(repertory + "/" + item.text())
    # Split into sentences, either from plain text or from XML.
    if self.method == 1:
        self.sents = preProcess.getSents(content)
        size = len(self.sents)
    else:
        size, self.sents = preProcess.getXMLsents(content)
    self.labelRest.setText('0/' + str(size))
    self.file = item.text()
    self.newSent = []
    self.pushButton_save.setDisabled(True)

def RunPreprocess():
    print "---PreProcess"
    PreProcess.PreProcess()
    print "---PreProcess1"
    PreProcess1.PreProcess1()
    print "---PreProcess2"
    PreProcess2.PreProcess2()
    print "---PreProcess3"
    PreProcess3.PreProcess3()
    print "---PreProcess4,40"
    PreProcess4.PreProcess4(40)
    print "---PreProcess4,30"
    PreProcess4.PreProcess4(30)
    print "---PreProcess4Base,40"
    PreProcess4Base.PreProcess4Base(40)
    print "---PreProcess4Base,30"
    PreProcess4Base.PreProcess4Base(30)

def match(gen, fitness, cand, cands, pos, m, G):
    final_matched = []
    gender = nx.get_node_attributes(G, 'att')
    # Build a bipartite graph restricted to candidates of gender `gen`.
    edges = []
    bi_G = nx.Graph()
    for (u, v), w in fitness.items():
        if v in cand and cands[u] == gen:
            edges.append((u, v, w))
    bi_G.add_weighted_edges_from(edges)
    matched = nx.max_weight_matching(bi_G)
    # Orient each matched pair as (position, candidate) and sort by fitness.
    l = {}
    for (u, v) in matched:
        if u in cand:
            u, v = v, u
        l[(u, v)] = fitness[(u, v)]
    l = {k: v for k, v in sorted(l.items(), key=lambda item: item[1], reverse=True)}
    l = list(l.keys())
    # Accept at most m of the best pairs, shrinking the remaining problem.
    i, j = 0, 0
    while j <= len(l) - 1 and i < m:
        (u, v) = l[j]
        j += 1
        if u in cands:
            u, v = v, u
        if u in pos and v in cands:
            final_matched.append((u, v))
            i += 1
            gender[u] = gen
            pos.remove(u)
            del cands[v]
            # Drop all fitness entries touching the assigned pair.
            remove_list = []
            for (a, b) in fitness.keys():
                if a == v or b == u:
                    remove_list.append((a, b))
            for item in remove_list:
                del fitness[item]
    nx.set_node_attributes(G, gender, 'att')
    final_matched.extend(PP.pre_assignment(cands, pos, fitness, G))
    return final_matched

def setup():
    global testReviews
    global trainingReviews
    global Iclassifiers
    global IEx1features
    global IIclassifiers
    global IIEx1features
    print("Loading Corpus...")
    testReviews, trainingReviews = PreProcess.getRatedReviews()
    # Show the rating distribution of the training reviews.
    for rating in range(1, 6):
        print(len([r for r in trainingReviews if r["overAllRating"] == rating]))
    print("IGNORE////////////////")
    # Get the classifiers from Exercise 1 to compute a rating per paragraph.
    Iclassifiers, IEx1features = Exercise1.partI(ClassifierRunner.naiveBayes)
    IIclassifiers, IIEx1features = Exercise1.partII(ClassifierRunner.maxEnt)
    print("END IGNORE////////////////")

def partI(classifier):
    print("PART I Classify by author")
    print("Loading Corpus...")
    testReviews, trainingReviews = PreProcess.getByAuthor()
    authorTagTraining = [(e["text"], e["author"]) for e in trainingReviews]
    authorTagTesting = [(e["text"], e["author"]) for e in testReviews]
    # Naive Bayes gets the bucketed (discretized) feature variants; other
    # classifiers get the raw numeric features.
    featureExtractors = []
    if classifier == ClassifierRunner.naiveBayes:
        featureExtractors.append(HappySad.featureNumericScore)
        featureExtractors.append(HappySad.featureHitCountBucketed)
        featureExtractors.append(AuthorshipFeatures.typeTokenRatioBucketed)
        featureExtractors.append(AuthorshipFeatures.vocabSizeBucketed)
    else:
        featureExtractors.append(HappySad.featureNumericScore)
        featureExtractors.append(HappySad.featureHitCount)
        featureExtractors.append(AuthorshipFeatures.typeTokenRatio)
        featureExtractors.append(AuthorshipFeatures.vocabSize)
    # Baseline run: most-common-tag classifier.
    print("Running Baseline")
    trainedBaseline = ClassifierRunner.runNfoldCrossValidation(
        ClassifierRunner.mostCommonTag, authorTagTraining, featureExtractors, 4)
    predictionsBaseline = [c[2] for c in trainedBaseline]
    truthsBaseline = [c[3] for c in trainedBaseline]
    predictionsTesting, bAcc = ClassifierRunner.predictTagged(
        trainedBaseline[0][0], featureExtractors, authorTagTesting)
    truthsTesting = [c[1] for c in authorTagTesting]
    bRMS = Evaluator.rmsBinaryDifference(predictionsTesting, truthsTesting)
    print("BaseLine RMS Error:", bRMS)
    # Our classifier run.
    trainedClassifiers = ClassifierRunner.runNfoldCrossValidation(
        classifier, authorTagTraining, featureExtractors, 4)
    predictions = [c[2] for c in trainedClassifiers]
    truths = [c[3] for c in trainedClassifiers]
    print("Running most accurate trained classifier on test set")
    predictionsTesting, cAcc = ClassifierRunner.predictTagged(
        trainedClassifiers[0][0], featureExtractors, authorTagTesting)
    truthsTesting = [c[1] for c in authorTagTesting]
    cRMS = Evaluator.rmsBinaryDifference(predictionsTesting, truthsTesting)
    Evaluator.createConfusionMatrix([t for d, t in authorTagTraining],
                                    predictionsTesting, truthsTesting)
    print("Our RMS Error:", cRMS)
    print("Accuracy improvement over baseline:", cAcc - bAcc)
    print("RMS Error reduction from baseline:", bRMS - cRMS)

def main(dataPath):
    PreProcess.csvPath = dataPath
    # One-time pipeline steps, kept for reference:
    # PreProcess.ReadFilesToDataFrame()
    # PreProcess.SplitToPictureAndFacesAnswers()
    # BadParticipantsRemove.RemoveParticipantsNotReportPANAS()
    # PreProcess.GetTestRates()
    # PreProcess.SaveTests()
    # PreProcess.LoadTestsWithScores()
    # PreProcess.GetTestMovie()
    # PreProcess.addUserInfoToTest()
    # PreProcess.SaveTests()
    PreProcess.LoadTestsWithScores()
    Graphs.ShowFalsePositiveGraphs(
        PreProcess.TestsWithScores.loc[PreProcess.TestsWithScores['video'] == 'calm'])
    # Graphs.ShowDifferenceBetweenTests(PreProcess.TestsWithScores)
    # firstCalm = PreProcess.TestsWithScores.loc[PreProcess.TestsWithScores['userId'] == 1]
    Graphs.GraphByQuestionType(PreProcess.TestsWithScores)
    Graphs.CalmMinusPositiveGraph(PreProcess.TestsWithScores)

def compile_string_as_string(program, debug=0, opt_vec=0, sim_end_time_fs=100000,
                             top_module=""):
    """ This is a helper function """
    preProcess = PreProcess()
    preProcess.load_source_from_string(program)
    preProcess.preprocess_text()  # comments and includes and defines and undefs
    if debug:
        preProcess.print_text()
    data = "".join(preProcess.text)
    parser = new_Verilog_EBNF_parser()
    try:
        parsed_data = parser.parseString(data, True)
    except ParseException, err:
        print "err.line is ", err.line
        print "col is ", err.column
        # Walk the statement boundaries so the error location can be
        # printed with a caret under the offending column.
        text_lines = err.line.split(";")
        line_num = 0
        char_count = 0
        last_line = None
        print_next_line = False
        for line in text_lines:
            line += ";"
            line_num += 1
            if print_next_line:
                print "[%3d] %s" % (line_num, line)
                break
            if (char_count + len(line)) >= err.column:
                if last_line:
                    print "[%3d] %s" % (line_num - 1, last_line)
                print "[%3d] %s" % (line_num, line)
                print " " + " " * (err.column - char_count - 1) + "^"
                print_next_line = True
            else:
                last_line = line
                char_count += len(line)
        print err
        return None

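# A minimal usage sketch; the Verilog source below is illustrative and
# assumes the parser accepts a plain module declaration.
program = """
module top;
  initial $display("hello");
endmodule
"""
compile_string_as_string(program, debug=1)
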
# The opening of this statement is reconstructed from the parallel
# test_labels block below; it reads the training labels from file.
data_labels = [
    float(line.rstrip('\n')) for line in open('csvData/train_label.data')
]
rawData = open('csvData/test.data', 'rb')
temp = np.loadtxt(rawData, delimiter=',')
# Prepend a bias column of ones to the test features.
testset = np.c_[np.ones(len(temp)), temp]
test_labels = [
    float(line.rstrip('\n')) for line in open('csvData/test_label.data')
]

###
# Pre-process: bucketize the features and normalize the labels.
PP = PreProcess.PreProcess(data, n_buckets=10, func='boolean')  # ,swap_labels=True
data = PP.fit(data)
testset = PP.fit(testset)
data_labels = PP.processLabels(data_labels)
test_labels = PP.processLabels(test_labels)

# Cross-validation defaults.
best_C = 2
best_ro = 0.01
best_accuracy = 0
best_epoch = 10
best_g0 = 1.001
'''
for C in [4, 2, 0.5, 0.25, 0.125]:  # ,0.0625,0.03125]:
if file.endswith(".csv"): filename = (file).split('.') #Split.callSplit(inputdir,filename[0]) #print file #headers = LabelEncoding.convertlabels(trainingdir+'/'+file,outputdir) for file in os.listdir(trainingdir): if file.endswith(".csv"): #print file headers = LabelEncoding.convertlabels(trainingdir+'/'+file,outputdir) labelledFilesPath = outputdir+'/labelled' #Using one hot encoder for file in os.listdir(labelledFilesPath): if file.endswith(".csv"): PreProcess.convertallAttributes(labelledFilesPath+'/'+file,outputdir) preprocessedFilesPath = outputdir+'/preprocessed' outputfile=open(outputdir+'/ClassifiersResults.txt','a') truelabelPath = inputdir+'/truelabels' for file in os.listdir(labelledFilesPath): if file.endswith(".csv"): filename = (file).split('.') #trainSet,trainSetTrueLabel = EvaluatingClassifiers.callClassifiers(preprocessedFilesPath+'/'+file,truelabelPath+'/'+file,filename[0],outputdir,inputdir,filename[1]) trainSet,trainSetTrueLabel = EvaluatingClassifiers.callClassifiers(preprocessedFilesPath+'/'+file,truelabelPath+'/'+file,filename[0],outputdir) #LearningCurve.plotLearningCurve(trainSet, trainSetTrueLabel,outputdir+'/LearningCurve',filename[0])