def write_sens_for_reading(syns_wn, syns_ox, filename_output):
    for i_wn in range(len(syns_wn)):
        for i_ox in range(len(syns_ox)):
            defi_wn = syns_wn[i_wn].definition()
            defi_ox = syns_ox[str(i_ox)]["d"]
            value = defi_wn + "\t" + defi_ox
            FileProcess.append_value_to_file(value, filename_output)
def parse_ox_wn_defi_to_input(word):
    defis_wn = WordnetHandler.get_definitions_for_word(word)
    defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)
    for defi_wn in defis_wn:
        for defi_ox in defis_ox:
            value = defi_wn + "\t" + defi_ox
            FileProcess.append_value_to_file(value, __filename_input_sen__)
def map_wordnet_EVD():
    print "loading EVD"
    dict_EVD = EVDParser.readEVDFile()
    print "loading WN"
    dict_wn = WordnetProcessForEVD.read_nouns()

    for key, values in dict_wn.items():
        # key format: "<word>-<definition>=<lemma>-<lemma>-..."
        key_lemmas = key.split("=")[1]
        key_lemmas = key_lemmas.split("-")
        key = key.split("=")[0]
        key_definition = key.split("-")[1]
        key = key.split("-")[0]

        # only process entries with at least one lemma starting with "b"
        test_flag = 0
        for lemma in key_lemmas:
            if lemma[:1] == "b":
                test_flag = 1
        if test_flag == 0:
            continue

        print "map_wordnet_EVD " + key
        vi_means = get_EVD_means(key, key_lemmas, values, dict_EVD)
        ox_means = get_Ox_means(key, key_lemmas)
        means = get_best_mean(vi_means, ox_means, 2)

        ########################################################################
        # get greatest duplicated mean
        # if len(values) == 1:
        #     means = vi_means
        #     item_count = [(item, count) for item, count in collections.Counter(vi_means).items() if count > 1]
        #     if len(item_count) > 0:
        #         means = [max(item_count, key=itemgetter(1))[0]]
        #         items_2 = [item for item, count in collections.Counter(vi_means).items() if count > 2]
        #         for item in items_2:
        #             means.append(item)
        #         means = list(set(means))
        # else:
        #     item_count = [(item, count) for item, count in collections.Counter(vi_means).items() if count > 1]
        #     if len(item_count) == 0:
        #         continue
        #     means = [max(item_count, key=itemgetter(1))[0]]
        #     items_2 = [item for item, count in collections.Counter(vi_means).items() if count > 2]
        #     for item in items_2:
        #         means.append(item)
        #     means = list(set(means))
        ########################################################################

        if len(means) > 0:
            means = [means[0]]
        means.insert(0, key + "-" + key_definition)
        filename = "Results/EVD/wn_evd_b_0_1.csv"
        FileProcess.append_result_to_excel_file(filename, means)
def get_synset_gloss(synset, filename):
    # collect lemma names, the definition, and the usage examples on one line
    result = ""
    for lemma in synset.lemmas():
        gloss = lemma.name().replace("_", " ")
        result += gloss + ". "
    result += synset.definition() + ". "
    for example in synset.examples():
        result += example + "."
    FileProcess.append_value_to_file(result, filename)

    # recurse through the whole hyponym tree
    for hypo in synset.hyponyms():
        get_synset_gloss(hypo, filename)
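# Hypothetical usage of get_synset_gloss: dump glosses for the whole hyponym
# subtree under dog.n.01. The output file name is a placeholder.
from nltk.corpus import wordnet as wn

get_synset_gloss(wn.synset('dog.n.01'), "Results/dog_glosses.txt")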
def create_input_sens_test(dict_ox):
    # skip everything before "blockage": the test set starts at that word
    flag_can_go = False
    for word in dict_ox:
        if word == "blockage":
            flag_can_go = True
        if not flag_can_go:
            continue
        if len(dict_ox[word]) == 0:
            continue

        defis_wn = WordnetHandler.get_definitions_for_word(word)
        defis_ox = OxfordParser.get_definitions_of_word_for_svm(word)

        if len(defis_ox) == 1 and len(defis_wn) == 1:
            continue

        if len(defis_ox) == 1 and len(defis_wn) > 1:
            # context column: all WordNet definitions, tab-joined
            all_defi_wn = ""
            for defi_wn in defis_wn:
                all_defi_wn += defi_wn + "\t"
            if all_defi_wn != "":
                all_defi_wn = all_defi_wn[:-1]
            for defi_wn in defis_wn:
                for defi_ox in defis_ox:
                    value = defi_wn + "\t" + defi_ox + "\t" + all_defi_wn
                    FileProcess.append_value_to_file(value, __filename_input_sen_test__)
        else:
            # context column: all Oxford definitions, tab-joined
            for defi_wn in defis_wn:
                all_defi_ox = ""
                for defi_ox in defis_ox:
                    all_defi_ox += defi_ox + "\t"
                if all_defi_ox != "":
                    all_defi_ox = all_defi_ox[:-1]
                for defi_ox in defis_ox:
                    value = defi_wn + "\t" + defi_ox + "\t" + all_defi_ox
                    FileProcess.append_value_to_file(value, __filename_input_sen_test__)
def cal_features_from_sens_write_to_file(filename_sens, filename_output):
    f = open(filename_sens, 'r')
    line = f.readline()
    while line:
        if len(line) > 0:
            feature_values = ""
            sens = line.split("\t")
            sen_1 = sens[0]
            sen_2 = sens[1]
            feature_values += str(Literal.levenshtein_in_context(sen_1, sen_2, sens)) + "\t"
            # feature_values += str(ShallowSyntactic.jaccard_POS_in_context(sen_1, sen_2, sens)) + "\t"
            feature_values += str(WordnetBased.wordnet_based_in_context(sen_1, sen_2, sens, 0))
            # feature_values += str(WordnetBased.wordnet_based_in_context(sen_1, sen_2, sens, 1))
            FileProcess.append_value_to_file(feature_values, filename_output)
        line = f.readline()
    f.close()
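# Minimal hypothetical driver for the extractor above; both file names are
# placeholders. Each input line is "definition_1<TAB>definition_2[<TAB>context...]"
# and each output line holds the two feature scores, tab-separated.
cal_features_from_sens_write_to_file("Results/input_sens.txt", "Results/features.txt")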
def create_input_sen_via_ox_vn(dict_vn, dict_ox):
    for word in dict_ox:
        if len(dict_ox[word]) == 0 or word not in dict_vn:
            continue

        word_syns_vn = dict_vn[word]
        word_syns_ox = dict_ox[word]
        # skip the trivial one-to-one case
        if len(word_syns_ox) == 1 and len(word_syns_vn) == 1:
            continue

        for i_vn in word_syns_vn:
            syn_vn = word_syns_vn[i_vn]

            # context column: all Oxford definitions, tab-joined
            all_defi_ox = ""
            for i_ox in word_syns_ox:
                syn_ox = word_syns_ox[i_ox]
                if "tv" not in syn_ox:
                    continue
                all_defi_ox += syn_ox['d'] + "\t"
            if all_defi_ox != "":
                all_defi_ox = all_defi_ox[:-1]

            flag_can_use = False
            for i_ox in word_syns_ox:
                syn_ox = word_syns_ox[i_ox]
                if "tv" not in syn_ox:
                    continue
                value = syn_vn['d'] + "\t" + syn_ox['d'] + "\t" + all_defi_ox
                if check_tv_similar(syn_vn['tv'], syn_ox['tv']) == 1:
                    FileProcess.append_value_to_file(value, __filename_input_sen__)
                    FileProcess.append_value_to_file("1", __filename_input_gs__)
                    flag_can_use = True
                elif flag_can_use:
                    # emit negative pairs only after a positive pair was found
                    FileProcess.append_value_to_file(value, __filename_input_sen__)
                    FileProcess.append_value_to_file("0", __filename_input_gs__)
seed = 2017
np.random.seed(seed)

'''
data = load_iris()
idx = np.random.permutation(150)
X = data.data[idx]
y = data.target[idx]
print("Iris data shape and format:")
print(type(X))
print(type(y))
print(y.shape)
'''

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
X_test, y_test = fipr.load_csv("test_file.csv", True)
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)

training_data = X_train
training_labels = y_train.flatten()
test_data = X_test
test_labels = y_test.flatten()
test_data_even = X_test_even
test_labels_even = y_test_even.flatten()
def compareVietNetAndOxford(dict_VietNet, dict_Oxford):
    for WORD in dict_Oxford:
        if len(dict_Oxford[WORD]) == 0:
            continue

        wn_words = wn.synsets(WORD, pos="n")
        if not wn_words:
            continue
        if not dict_VietNet.has_key(WORD):
            continue

        arr_VietNet = dict_VietNet[WORD]
        arr_Oxford = dict_Oxford[WORD]
        matrix_similarity = [[0 for x in range(len(arr_Oxford))] for x in range(len(wn_words))]

        for iWn in range(len(wn_words)):
            definitionWn = wn.synset(wn_words[iWn].name()).definition()

            # pick the VietNet sense whose definition is close to the WordNet
            # gloss by edit distance
            vietNet = {}
            for iVietNet in arr_VietNet:
                levenshtein_vn_wn = Util.levenshtein(arr_VietNet[iVietNet]["d"], definitionWn)
                if levenshtein_vn_wn < len(definitionWn) / 2.0:
                    vietNet = arr_VietNet[iVietNet]
                    break
            if not vietNet.has_key("tv"):
                vietNet["tv"] = ""
            viet_net_tv = vietNet["tv"]

            for iOxford in range(len(arr_Oxford)):
                oxford = arr_Oxford[str(iOxford)]
                if not oxford.has_key("tv"):
                    continue

                # strip punctuation before comparing the translation word sets
                vietNet_tv = viet_net_tv.replace(";", "")
                oxford_tv = oxford["tv"].encode("utf-8")
                oxford_tv = oxford_tv.replace(";", "")
                oxford_tv = oxford_tv.replace(",", "")
                oxford_tv = oxford_tv.replace("/", " ")

                arr_tv_oxford = set(oxford_tv.split(" "))
                arr_tv_vietnet = set(vietNet_tv.split(" "))
                jaccard = jaccard_distance(arr_tv_oxford, arr_tv_vietnet)

                # mark a match when the Jaccard distance is below the threshold
                matrix_similarity[iWn][iOxford] = 1 if jaccard < 0.95 else 0

            matrix_similarity[iWn].insert(0, viet_net_tv + "<>" + definitionWn.encode("utf-8"))

        # row header: the Oxford senses for this word
        arrRowDict = [WORD]
        for i in range(len(dict_Oxford[WORD])):
            if not dict_Oxford[WORD][str(i)].has_key("tv"):
                dict_Oxford[WORD][str(i)]["tv"] = "-"
            if not dict_Oxford[WORD][str(i)].has_key("d") or dict_Oxford[WORD][str(i)]["d"] == None:
                dict_Oxford[WORD][str(i)]["d"] = "-"
            arrRowDict.append(dict_Oxford[WORD][str(i)]["tv"].encode("utf-8") + "<>" + dict_Oxford[WORD][str(i)]["d"].encode("utf-8"))

        FileProcess.append_to_excel_file("Results/parameters/VN_Ox/" + "compare_VN_Ox_2_2.1.csv", arrRowDict, matrix_similarity)
def create_input_sen_via_gold_data(dict_vn, dict_ox, dict_gold):
    for word in dict_ox:
        if len(dict_ox[word]) == 0 or word not in dict_gold:
            continue
        # stop at "blockage": create_input_sens_test resumes from this word
        if word == "blockage":
            return
        if word not in dict_vn:
            continue

        word_syns_vn = dict_vn[word]
        word_syns_ox = dict_ox[word]
        if len(word_syns_ox) == 1 and len(word_syns_vn) == 1:
            continue

        if len(word_syns_ox) == 1 and len(word_syns_vn) > 1:
            # context column: all VietNet definitions, tab-joined
            all_defi_vn = ""
            for i_vn in word_syns_vn:
                syn_vn = word_syns_vn[i_vn]
                if "tv" not in syn_vn:
                    continue
                all_defi_vn += syn_vn['d'] + "\t"
            if all_defi_vn != "":
                all_defi_vn = all_defi_vn[:-1]

            for i_vn in word_syns_vn:
                syn_vn = word_syns_vn[i_vn]
                for i_ox in word_syns_ox:
                    syn_ox = word_syns_ox[i_ox]
                    if "tv" not in syn_ox:
                        continue
                    value = syn_vn['d'] + "\t" + syn_ox['d'] + "\t" + all_defi_vn
                    label = "1" if dict_gold[word][int(i_vn)][int(i_ox)] == "1" else "0"
                    FileProcess.append_value_to_file(value, __filename_input_sen__)
                    FileProcess.append_value_to_file(label, __filename_input_gs__)
        else:
            for i_vn in word_syns_vn:
                syn_vn = word_syns_vn[i_vn]

                # context column: all Oxford definitions, tab-joined
                all_defi_ox = ""
                for i_ox in word_syns_ox:
                    syn_ox = word_syns_ox[i_ox]
                    if "tv" not in syn_ox:
                        continue
                    all_defi_ox += syn_ox['d'] + "\t"
                if all_defi_ox != "":
                    all_defi_ox = all_defi_ox[:-1]

                for i_ox in word_syns_ox:
                    syn_ox = word_syns_ox[i_ox]
                    if "tv" not in syn_ox:
                        continue
                    value = syn_vn['d'] + "\t" + syn_ox['d'] + "\t" + all_defi_ox
                    label = "1" if dict_gold[word][int(i_vn)][int(i_ox)] == "1" else "0"
                    FileProcess.append_value_to_file(value, __filename_input_sen__)
                    FileProcess.append_value_to_file(label, __filename_input_gs__)
def print_matrix(matrix):
    # pretty-print a 2d matrix with tab-aligned columns
    s = [[str(e) for e in row] for row in matrix]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    print '\n'.join(fmt.format(*row) for row in s)


def similarity_by_synsets_synsets_nbest_withword_average(WORD, dict_words):
    # dictionary data
    dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words, WORD)

    # wordnet data
    wn_words = wn.synsets(WORD, pos='n')
    wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words)

    # matrix for similarity: dict_words vs wn_words
    matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

    # calculate the 2d matrix of p: wordnet sense x dictionary sense
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            p_iWnWord_iDictWord = 0.
            arr_p_word = []
            for dict_synset in dict_words_synsets[iDictWord]:
                p_dictNoun_wnNouns = 0  # some nouns don't have synsets
                arr_p = []
                for wn_synset in wn_words_synsets[iWnWord]:
                    p_max = dict_synset.path_similarity(wn_synset)
                    if p_max is None:
                        continue
                    arr_p.append(p_max)

                # average the nBest highest path similarities
                arr_p = sorted(arr_p, reverse=True)
                nBest = 8
                count = 0.0001  # avoids division by zero when arr_p is empty
                for i in range(min(nBest, len(arr_p))):
                    p_dictNoun_wnNouns += arr_p[i]
                    count += 1
                arr_p_word.append(p_dictNoun_wnNouns / count)

            # weight the best synset pair by 5, the rest by 1, then average
            arr_p_word = sorted(arr_p_word, reverse=True)
            nBest = 10
            count = 5
            for i in range(min(nBest, len(arr_p_word))):
                if i == 0:
                    p_iWnWord_iDictWord += arr_p_word[i] * 5.
                else:
                    p_iWnWord_iDictWord += arr_p_word[i] * 1.
                count += 1
            matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord / count

    print "----------------------------------------------------"
    print_matrix(matrix_similarity)

    # reverse direction: swap the roles of dictionary and wordnet
    wn_words = dict_words
    wn_words_synsets = get_nbest_synsets_n_v_with_word(wn_words, WORD)
    dict_words = wn.synsets(WORD, pos='n')
    dict_words_synsets = WordnetProcess.get_synsets_n_v(WORD, dict_words)

    matrix_similarity_reverse = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

    # calculate the 2d matrix of p, reversed
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            p_iWnWord_iDictWord = 0.
            arr_p_word = []
            for dict_synset in dict_words_synsets[iDictWord]:
                p_dictNoun_wnNouns = 0  # some nouns don't have synsets
                arr_p = []
                for wn_synset in wn_words_synsets[iWnWord]:
                    p_max = dict_synset.path_similarity(wn_synset)
                    if p_max is not None:
                        arr_p.append(p_max)

                arr_p = sorted(arr_p, reverse=True)
                nBest = 8
                count = 0.0001
                for i in range(min(nBest, len(arr_p))):
                    p_dictNoun_wnNouns += arr_p[i]
                    count += 1
                arr_p_word.append(p_dictNoun_wnNouns / count)

            arr_p_word = sorted(arr_p_word, reverse=True)
            nBest = 10
            count = 5
            for i in range(min(nBest, len(arr_p_word))):
                if i == 0:
                    p_iWnWord_iDictWord += arr_p_word[i] * 5.
                else:
                    p_iWnWord_iDictWord += arr_p_word[i] * 1.
                count += 1
            matrix_similarity_reverse[iWnWord][iDictWord] = p_iWnWord_iDictWord / count

    print "----------------------------------------------------"
    print_matrix(matrix_similarity_reverse)

    # swap back and average the two directions
    dict_words = wn_words
    wn_words = wn.synsets(WORD, pos='n')
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = (matrix_similarity[iWnWord][iDictWord] + matrix_similarity_reverse[iDictWord][iWnWord]) / 2

    print "----------------------------------------------------"
    print_matrix(matrix_similarity)

    # Jaccard distance between glosses, restricted to lemmatized content words
    matrix_similarity_jaccard = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]
    content_tags = ('NN', 'NNS', 'JJ', '', 'VB', 'VBN', 'VBD', 'RB')
    for iWnWord in range(len(wn_words)):
        tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(wn.synset(wn_words[iWnWord].name()).definition()))
        words = [word for word, pos in tagged_sent if pos in content_tags]
        words = [wordnet_lemmatizer.lemmatize(word) for word in words]
        wn_set = set(words)

        for iDictWord in range(len(dict_words)):
            if not dict_words[str(iDictWord)].has_key("d") or dict_words[str(iDictWord)]["d"] is None:
                matrix_similarity_jaccard[iWnWord][iDictWord] = 1
                continue
            tagged_sent = POSWrapper.pos_tag(nltk.wordpunct_tokenize(dict_words[str(iDictWord)]["d"]))
            words = [word for word, pos in tagged_sent if pos in content_tags]
            words = [wordnet_lemmatizer.lemmatize(word) for word in words]
            dict_set = set(words)
            matrix_similarity_jaccard[iWnWord][iDictWord] = jaccard_distance(wn_set, dict_set)

    # combine: 10 parts synset similarity, 2 parts (1 - jaccard distance)
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            matrix_similarity[iWnWord][iDictWord] = (matrix_similarity[iWnWord][iDictWord] * 10 + 2 * (1 - matrix_similarity_jaccard[iWnWord][iDictWord])) / 12

    print "----------------------------------------------------"
    print_matrix(matrix_similarity)

    # write file
    # col: WordNet definitions
    for i in range(len(wn_words)):
        matrix_similarity[i].insert(0, wn.synset(wn_words[i].name()).definition())
    # row: dictionary Vietnamese translations
    arrRowDict = ["--"]
    for i in range(len(dict_words)):
        if not dict_words[str(i)].has_key('tv') or dict_words[str(i)]['tv'] is None:
            dict_words[str(i)]['tv'] = "--"
        arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'))
    FileProcess.write_to_excel_file("Results/" + WORD + "_synsets_synsets_nbest_withword_average.csv", arrRowDict, matrix_similarity)
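# Hypothetical invocation, assuming dict_ox is the Oxford-style sense dict
# keyed by stringified indices with the "d" (definition) and "tv" (Vietnamese
# translation) fields used above.
similarity_by_synsets_synsets_nbest_withword_average("bank", dict_ox["bank"])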
print("File open") print("Receiving data...") i = 0 while True: data = conn.recv(1024) i += 1 if not data: break file.write(data) print("File receive") conn.close() conn, addr = sock.accept() FileProcess.readFromExcel(fileInput) FileProcess.supervisorInRoom() FileProcess.supervisorOutRoom() FileProcess.writeToExcel(fileOutput) FileProcess.connectToDatabase() with open(fileOutput, "rb") as file: data = file.read(1024) i = 0 while data: conn.send(data) #print(f"Sent {data!r}") i += 1 data = file.read(1024) print("File send")
def cal_features_and_write_to_file_for(syns_wn, syns_ox, filename_output):
    if len(syns_ox) == 1 and len(syns_wn) > 1:
        # cal all features between the single ox syn and every wn syn
        syns_values_in_row = []
        for i_wn in range(len(syns_wn)):
            syn_wn = syns_wn[i_wn]
            syn_ox = syns_ox[str(0)]
            feature_values = cal_feature_values_for(syn_wn, syn_ox)
            syns_values_in_row.append(feature_values)

        # cal root values of each feature across the row
        arr_root_values_of_feature = []
        for i_feature in range(len(syns_values_in_row[0])):
            root = root_values_of_a_feature_in_row(syns_values_in_row, i_feature)
            arr_root_values_of_feature.append(root)

        # normalize each feature by its root value and write the svm row
        for i_wn in range(len(syns_wn)):
            for i_ox in range(len(syns_ox)):
                feature_values_for_svm = ""
                feature_values_1_syn = syns_values_in_row[i_wn]
                for i_feature in range(len(feature_values_1_syn)):
                    root_value = arr_root_values_of_feature[i_feature]
                    feature_value = feature_values_1_syn[i_feature]
                    feature_value_for_svm = feature_value / root_value
                    feature_values_for_svm += str(feature_value_for_svm) + "\t"
                if feature_values_for_svm != "":
                    feature_values_for_svm = feature_values_for_svm[:-1]
                FileProcess.append_value_to_file(feature_values_for_svm, filename_output)
    else:
        for i_wn in range(len(syns_wn)):
            # cal all features between every ox syn and this wn syn
            syns_values_in_row = []
            for i_ox in range(len(syns_ox)):
                syn_wn = syns_wn[i_wn]
                syn_ox = syns_ox[str(i_ox)]
                feature_values = cal_feature_values_for(syn_wn, syn_ox)
                syns_values_in_row.append(feature_values)

            # cal root values of each feature across the row
            arr_root_values_of_feature = []
            for i_feature in range(len(syns_values_in_row[0])):
                root = root_values_of_a_feature_in_row(syns_values_in_row, i_feature)
                arr_root_values_of_feature.append(root)

            # normalize each feature by its root value and write the svm row
            for i_ox in range(len(syns_ox)):
                feature_values_for_svm = ""
                feature_values_1_syn = syns_values_in_row[i_ox]
                for i_feature in range(len(feature_values_1_syn)):
                    root_value = arr_root_values_of_feature[i_feature]
                    feature_value = feature_values_1_syn[i_feature]
                    feature_value_for_svm = feature_value / root_value
                    feature_values_for_svm += str(feature_value_for_svm) + "\t"
                if feature_values_for_svm != "":
                    feature_values_for_svm = feature_values_for_svm[:-1]
                FileProcess.append_value_to_file(feature_values_for_svm, filename_output)
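# A plausible sketch of the helper used above, assuming "root" means the
# per-feature maximum across the row (an assumption, not confirmed by this
# excerpt), guarded so the caller's division never hits zero.
def root_values_of_a_feature_in_row(syns_values_in_row, i_feature):
    root = max(values[i_feature] for values in syns_values_in_row)
    return root if root != 0 else 1.0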
def write_label_for_svm(syns_wn, syns_ox, dict_gold):
    for i_wn in range(len(syns_wn)):
        for i_ox in range(len(syns_ox)):
            FileProcess.append_value_to_file(dict_gold[i_wn][i_ox], __filename_input_gs_train__)
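# Hypothetical call with a made-up 2x2 gold matrix of "1"/"0" labels; the
# row = WordNet sense, col = Oxford sense structure is inferred from the
# indexing above, and syns_wn / syns_ox are assumed to be in scope.
dict_gold_example = [["1", "0"],
                     ["0", "1"]]
write_label_for_svm(syns_wn, syns_ox, dict_gold_example)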
import Edge
import FileProcess
import KruskalAlgorithm
import Graph
import SteinerTree

# Read the input file
FILENAME = "Data/" + input("Enter the file's name : Data/")
inputFile = open(FILENAME, "r")

# Parse the file into graph components
graph_data = FileProcess.fileProcess(inputFile.read())
inputFile.close()

nodesNum = graph_data[0]       # number of nodes
edgesNum = graph_data[1]       # number of edges
terminalsNum = graph_data[2]   # number of terminal nodes
connectedNodes = graph_data[3]
Edges = graph_data[4]
terminalNodes = graph_data[5]

# Build the minimum spanning tree, then refine it into a Steiner tree
newEdges = KruskalAlgorithm.KruskalAlgorithm(Edges, connectedNodes, nodesNum, edgesNum)
print(Graph.calculateCost(newEdges))  # MST cost
newEdges = SteinerTree.buildSteinerTree(nodesNum, terminalNodes, connectedNodes, newEdges)
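# Hedged follow-up: report the refined tree's cost as well, mirroring the MST
# print above (assumes buildSteinerTree returns an edge list compatible with
# Graph.calculateCost, as its reuse of newEdges suggests).
print(Graph.calculateCost(newEdges))  # Steiner tree cost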
def main():
    # open and load csv files
    time_load_start = time.clock()
    X_train, y_train = fipr.load_csv("train_file.csv", True)
    X_test, y_test = fipr.load_csv("test_file.csv", True)
    #y_train = y_train.flatten()
    #y_test = y_test.flatten()
    time_load_end = time.clock()
    print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

    X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)

    training_data = X_train
    training_labels = y_train
    test_data = X_test
    test_labels = y_test
    test_data_even = X_test_even
    test_labels_even = y_test_even

    # build the stacked denoising autoencoder with one 100-unit hidden layer
    sDA = StackedDA([100])

    # start counting time for training
    time_train_start = time.clock()
    print('Pre-training...')
    # pre-train the SDA on the first 1000 samples
    sDA.pre_train(training_data[:1000], noise_rate=0.3, epochs=100)

    print('Training Network...')
    # add the supervised final layer, then fine-tune the whole network
    sDA.finalLayer(training_data, training_labels, epochs=500)
    sDA.fine_tune(training_data, training_labels, epochs=500)

    # print training time
    time_train_end = time.clock()
    print("Training finished, training time: %g seconds \n" % (time_train_end - time_train_start))

    # start counting time for testing
    time_test_start = time.clock()
    print('Testing performance...')
    # predict using the SDA
    y_pred = sDA.predict(test_data).argmax(1)

    # print simple precision metric to the console
    print('Accuracy: ' + str(fipr.compute_accuracy(y_test, y_pred)))

    # print testing time
    time_test_end = time.clock()
    print("Testing finished, testing time: %g seconds \n" % (time_test_end - time_test_start))

    # evaluate on the class-balanced ("even") test set
    y_pred_even = sDA.predict(test_data_even).argmax(1)
    print('Accuracy on EVEN set: ' + str(fipr.compute_accuracy(y_test_even, y_pred_even)))

    return sDA
def append_params_and_result_to_file(values):
    FileProcess.append_result_to_excel_file(result_file_name, values)
# Import libs
import sys
import time

import numpy as np
import matplotlib.pyplot as plt

from RBFN import RBFN
import FileProcess as fipr

# Start counting time
start_time = time.clock()

# Open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
X_test, y_test = fipr.load_csv("test_file.csv", True)
y_train = y_train.flatten()
y_test = y_test.flatten()
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

# Training the network
'''
x = np.linspace(0, 10, 100)
y = np.sin(x)
'''

# start counting time for training
time_train_start = time.clock()

# start training
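# The script stops right before the fit. A hedged continuation sketch,
# assuming the RBFN class exposes the common fit/predict pair and a
# hidden_shape/sigma constructor (those arguments are assumptions, not
# confirmed by this excerpt).
model = RBFN(hidden_shape=70, sigma=1.0)
model.fit(X_train, y_train)

time_train_end = time.clock()
print("Training finished, training time: %g seconds" % (time_train_end - time_train_start))

# threshold the real-valued RBFN output to get binary class predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)
print("Accuracy: " + str(fipr.compute_accuracy(y_test, y_pred)))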
# -*- coding: utf-8 -*-
# @Time    : 2020/10/26 9:30
# @Author  : SanZhi
# @File    : index.py
# @Software: PyCharm
import FileProcess
import getdoc2vec
import get_tSNE
import get_DBscan

# pipeline: preprocess -> doc2vec -> t-SNE -> DBSCAN -> JSON output
predata, namespace = FileProcess.preprocess(r'use5.csv')
DocData = getdoc2vec.do_doc2vec(predata)
TsneData = get_tSNE.getTsneData(DocData, namespace)
# FileProcess.write_json('1-20alltsne.json', TsneData)
# FileProcess.write_json('1-20peopleList.json', namespace)
DBscanData = get_DBscan.getDbscanData(TsneData)
FileProcess.write_json('1-20top7.json', DBscanData)
def main():
    # open and load csv files
    time_load_start = time.clock()
    X_train, y_train = fipr.load_csv("train_file.csv", True)
    X_test, y_test = fipr.load_csv("test_file.csv", True)
    y_train = y_train.flatten()
    y_test = y_test.flatten()
    time_load_end = time.clock()
    print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

    X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)
    y_test_even = y_test_even.flatten()

    # scale features to encourage gradient descent convergence
    X_train = fipr.scale_features(X_train, 0.0, 1.0)
    X_test = fipr.scale_features(X_test, 0.0, 1.0)
    X_test_even = fipr.scale_features(X_test_even, 0.0, 1.0)

    # pair each sample with its label
    Pattern_train = []
    for i, sample_train in enumerate(X_train):
        Pattern_train.append([sample_train, y_train[i]])

    Pattern_test = []
    for j, sample_test in enumerate(X_test):
        Pattern_test.append([sample_test, y_test[j]])

    Pattern_test_even = []
    for k, sample_test_even in enumerate(X_test_even):
        Pattern_test_even.append([sample_test_even, y_test_even[k]])

    # Teach network XOR function (for test only)
    '''pat = [
        [[0, 0], [0]],
        [[0, 1], [1]],
        [[1, 0], [1]],
        [[1, 1], [0]]
    ]
    print(pat)
    # create a network with two input, two hidden, and one output nodes
    n = NN(2, 2, 1)
    # train it with some patterns
    n.train(pat)
    # test it
    n.test(pat)'''

    # Test on Iris data
    #pattern = irisdemo()

    # create a network with 200 input, 4 hidden, and 1 output nodes
    n = NN(200, 4, 1)

    # start counting time for training
    time_train_start = time.clock()

    # train it with some patterns
    n.train(Pattern_train)

    # print training time
    time_train_end = time.clock()
    print("Training finished, training time: %g seconds \n" % (time_train_end - time_train_start))

    # start counting time for testing
    time_test_start = time.clock()

    # test it
    n.test(Pattern_test)

    # print testing time
    time_test_end = time.clock()
    print("Testing finished, testing time: %g seconds \n" % (time_test_end - time_test_start))

    # test on the class-balanced ("even") data set
    n.test(Pattern_test_even)
    lmbda - the regularization term
    model_file - the name of the file to store the final classification model
    """
    # Start counting time
    start_time = time.clock()

    # Set parameters
    alpha = 0.01
    lmbda = 0
    maxiter = 100

    # open and load csv files
    time_load_start = time.clock()
    X_train, y_train = fipr.load_csv("train_file.csv", True)
    X_test, y_test = fipr.load_csv("test_file.csv", True)
    y_train = y_train.flatten()
    y_test = y_test.flatten()
    time_load_end = time.clock()
    print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

    X_test_even, y_test_even = fipr.load_csv("test_file_even.csv", True)
    y_test_even = y_test_even.flatten()

    # scale features to encourage gradient descent convergence
    X_train = fipr.scale_features(X_train, 0.0, 1.0)
    X_test = fipr.scale_features(X_test, 0.0, 1.0)
    X_test_even = fipr.scale_features(X_test_even, 0.0, 1.0)
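    # The excerpt ends after scaling. Given the alpha, lmbda, and maxiter
    # parameters set above, a regularized gradient-descent trainer presumably
    # follows; train_logistic_regression below is a hypothetical helper name,
    # not an API confirmed by this file.
    theta = train_logistic_regression(X_train, y_train, alpha=alpha,
                                      lmbda=lmbda, maxiter=maxiter)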
### Import Library of Gaussian Naive Bayes model
from sklearn.naive_bayes import GaussianNB
import sys
import time

import numpy as np
import FileProcess as fipr

# Start counting time
start_time = time.clock()

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file_400atb.csv", True)
X_test, y_test = fipr.load_csv("test_file_400atb.csv", True)
y_train = y_train.flatten()
y_test = y_test.flatten()
time_load_end = time.clock()
print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

X_test_even, y_test_even = fipr.load_csv("test_file_400atb_even.csv", True)
y_test_even = y_test_even.flatten()

# Create a Gaussian Classifier
model = GaussianNB()

# start counting time for training
time_train_start = time.clock()

# Train the model using the training sets
model.fit(X_train, y_train)
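# The excerpt ends after fit; a minimal evaluation sketch mirroring the other
# drivers in this repo (sklearn's standard predict API plus
# fipr.compute_accuracy, which those drivers use).
time_train_end = time.clock()
print("Training finished, training time: %g seconds \n" % (time_train_end - time_train_start))

y_pred = model.predict(X_test)
print("Accuracy: " + str(fipr.compute_accuracy(y_test, y_pred)))

y_pred_even = model.predict(X_test_even)
print("Accuracy on EVEN set: " + str(fipr.compute_accuracy(y_test_even, y_pred_even)))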
def similarity_by_synsets_synsets_nbest_withword(WORD, dict_words):
    # dictionary data
    dict_words_synsets = get_nbest_synsets_n_v_with_word(dict_words, WORD)
    print "dict-word_synsets"
    print dict_words_synsets

    # wordnet data
    wn_words = wn.synsets(WORD, pos='n')
    print "wn_words -------"
    print wn_words
    wn_words_synsets = WordnetProcess.get_synsets_n_v(WORD, wn_words)
    print wn_words_synsets

    # matrix for similarity: dict_words vs wn_words
    matrix_similarity = [[0 for x in range(len(dict_words))] for x in range(len(wn_words))]

    # calculate the 2d matrix of p: wordnet sense x dictionary sense
    for iWnWord in range(len(wn_words)):
        for iDictWord in range(len(dict_words)):
            p_iWnWord_iDictWord = 0.
            arr_p_word = []
            for dict_synset in dict_words_synsets[iDictWord]:
                p_dictNoun_wnNouns = 0  # some nouns don't have synsets
                arr_p = []
                for wn_synset in wn_words_synsets[iWnWord]:
                    p_max = dict_synset.path_similarity(wn_synset)
                    if p_max is None:
                        continue
                    arr_p.append(p_max)

                # average the nBest highest path similarities
                arr_p = sorted(arr_p, reverse=True)
                nBest = 3
                count = 0.0001  # avoids division by zero when arr_p is empty
                for i in range(min(nBest, len(arr_p))):
                    p_dictNoun_wnNouns += arr_p[i]
                    count += 1
                arr_p_word.append(p_dictNoun_wnNouns / count)

            # weight the best synset pair by 10, the rest by 1, then average
            arr_p_word = sorted(arr_p_word, reverse=True)
            nBest = 40
            count = 5
            for i in range(min(nBest, len(arr_p_word))):
                if i == 0:
                    p_iWnWord_iDictWord += arr_p_word[i] * 10.
                else:
                    p_iWnWord_iDictWord += arr_p_word[i] * 1.
                count += 1
            matrix_similarity[iWnWord][iDictWord] = p_iWnWord_iDictWord / count

    # pretty-print the similarity matrix with tab-aligned columns
    print "----------------------------------------------------"
    s = [[str(e) for e in row] for row in matrix_similarity]
    lens = [max(map(len, col)) for col in zip(*s)]
    fmt = '\t'.join('{{:{}}}'.format(x) for x in lens)
    print '\n'.join(fmt.format(*row) for row in s)

    # write file
    # col: WordNet definitions
    for i in range(len(wn_words)):
        matrix_similarity[i].insert(0, wn.synset(wn_words[i].name()).definition())
    # row: dictionary Vietnamese translations
    arrRowDict = ["--"]
    for i in range(len(dict_words)):
        if not dict_words[str(i)].has_key('tv') or dict_words[str(i)]['tv'] is None:
            dict_words[str(i)]['tv'] = "--"
        arrRowDict.append(dict_words[str(i)]["tv"].encode('utf8'))
    FileProcess.write_to_excel_file("Results/" + WORD + "_synsets_synsets_nbest_withword.csv", arrRowDict, matrix_similarity)
import urllib
import time

import tensorflow as tf
import FileProcess as fipr
from Mnist import Mnist

mnist = Mnist()
sess = tf.InteractiveSession()

# Start counting time
start_time = time.clock()

# open and load csv files
time_load_start = time.clock()
X_train, y_train = fipr.load_csv("train_file.csv", True)
#X_test, y_test = fipr.load_csv("test_file.csv", True)
#y_train = y_train.flatten()
#y_test = y_test.flatten()
time_load_end = time.clock()
#print("Loading finished, loading time: %g seconds" % (time_load_end - time_load_start))

training_data = X_train
training_labels = y_train

# inspect the raw label array before any conversion
print(type(training_labels))
print(type(training_labels[0, 0]))
print(training_labels.shape)
print('original labels:')
print(training_labels[3])
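# Hedged sketch of a plausible next step: convert the (n, 1) integer label
# column into one-hot rows for a TensorFlow softmax layer. num_classes = 2 is
# an assumption (binary labels, as in the other drivers here).
import numpy as np

num_classes = 2
labels_flat = training_labels.flatten().astype(int)
labels_one_hot = np.zeros((labels_flat.shape[0], num_classes))
labels_one_hot[np.arange(labels_flat.shape[0]), labels_flat] = 1
print('one-hot labels:')
print(labels_one_hot[3])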