def __init__(self, path_list):
    t0 = time()
    UP_pages = allPages(path_list)
    feature_matrix = []
    y = []
    # get features and labels: one TF-IDF vector per page
    for page in UP_pages.pages:
        tfidf_vector = []
        for key in page.tfidf:
            tfidf_vector.append(page.tfidf[key])
        feature_matrix.append(tfidf_vector)
    y = UP_pages.category
    self.y = np.array(y)
    X = np.array(feature_matrix)
    X = scale(X)
    # hold out 99% of the pages for evaluation and fit a 1-NN classifier on the rest
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.99, random_state=2)
    clf = neighbors.KNeighborsClassifier(1)
    clf.fit(X_train, y_train)  # fit on the training split only; fitting on all of X would leak the test points
    labels = clf.predict(X_test)
    right = 0
    for i in range(len(labels)):
        if labels[i] == y_test[i]:
            right += 1
    print "accuracy is " + str(float(right) / float(len(y_test)))
    # select
    print("done in %0.3fs." % (time() - t0))
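
# For reference, a minimal sketch (not part of the original class) of the same
# 1-NN evaluation using accuracy_score and the newer model_selection API, which
# replaced sklearn.cross_validation; `X` and `y` stand for the scaled TF-IDF
# matrix and label list built above.
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=2)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, y_train)
print("accuracy is " + str(accuracy_score(y_test, clf.predict(X_test))))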
def __init__(self, data_path, dataset):
    self.pages = allPages([data_path])
    self.dataset = dataset
    prefix = data_path
    self.file_set = []
    for page in self.pages.pages:
        self.file_set.append(page.path)
def get_trans_mat(self, dataset):
    self.pages = allPages(
        ["../../Crawler/Apr17_samples/{0}/".format(dataset)],
        "new_{}".format(dataset),
        mode="raw"
    )
    trans_mat = np.zeros((self.max_class_num + 1, self.max_class_num + 1))
    total_links = 0
    count_list = {}
    print len(self.pages.pages)
    for page in self.pages.pages:
        path = page.path.replace("../../Crawler", "../Crawler")
        class_id = self.class_dict[path]
        cluster_id = self.cluster_dict[path]
        if class_id == -1:
            continue
        # print page.path , group
        if class_id not in count_list:
            count_list[class_id] = 1
        else:
            count_list[class_id] += 1
        link_dict = page.getAnchor()
        if cluster_id not in self.class_xpath:
            self.class_xpath[cluster_id] = {}
            self.class_xpath_link[cluster_id] = {}
        for xpath, links in link_dict.iteritems():
            # initialize add xpath to class_xpath
            if xpath not in self.class_xpath[cluster_id]:
                self.class_xpath[cluster_id][xpath] = []
                self.class_xpath_link[cluster_id][xpath] = []
            for link in links:
                if self.check_intralink(link):
                    tag = self.annotate(link, self.gold_map_dict)
                    if tag != -1:
                        total_links += 1
                        trans_mat[class_id, tag] += 1
                        self.class_xpath[cluster_id][xpath].append(tag)
                        self.class_xpath_link[cluster_id][xpath].append(link)
                    # if group == tag:
                    #     print link , page.path
    # print trans_mat
    # trans_mat = normalize(trans_mat, norm='l1', axis=1)
    print count_list
    for i in range(self.max_class_num + 1):
        for j in range(self.max_class_num + 1):
            if i not in count_list:
                trans_mat[i, j] = 0
            else:
                trans_mat[i, j] = float(trans_mat[i, j]) / float(count_list[i])
    print "total_links has " + str(total_links)
    self.trans_mat = trans_mat
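
# A hedged sketch (not from the original code): get_trans_mat() stores the
# average number of annotated intra-links per source-class page, so its rows
# are not probabilities. If a row-stochastic transition matrix is wanted, as
# the commented-out normalize() call above hints, each row can be L1-normalized.
# `crawler` is a hypothetical instance on which get_trans_mat() has already run.
from sklearn.preprocessing import normalize

prob_mat = normalize(crawler.trans_mat, norm='l1', axis=1)  # every non-zero row now sums to 1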
def __init__(self, dataset, date="Apr17",path_list=None,num_clusters=None, num_sample=1000): if path_list is None and num_clusters is None: self.t0 = time() else: self.t0 = time() self.dataset = dataset self.date = date self.UP_pages = allPages(path_list,dataset,date) self.path_list = self.UP_pages.path_list self.num_clusters = num_clusters self.features = self.UP_pages.features
def __init__(self, dataset, date="Apr17", path_list=None, num_clusters=None, num_sample=1000): if path_list is None and num_clusters is None: self.t0 = time() else: self.t0 = time() self.dataset = dataset self.date = date self.UP_pages = allPages(path_list, dataset, date) self.path_list = self.UP_pages.path_list self.num_clusters = num_clusters self.features = self.UP_pages.features
def __init__(self, dataset, date="Apr17",path_list=None,num_clusters=None, num_samples=None, debug=False): if path_list is None and num_clusters is None: self.t0 = time() else: self.t0 = time() self.dataset = dataset self.date = date print "debug mode ", debug, dataset self.UP_pages = allPages(path_list,dataset,date,num_samples,debug=debug) self.path_list = self.UP_pages.path_list self.num_clusters = num_clusters if num_samples is not None: self.num_samples = int(num_samples) else: self.num_samples = None self.features = self.UP_pages.features
print type(s), " type of s" tmp = self.jaccard(schema,s) if tmp < min_jaccard: min_jaccard = tmp cid = id self.pattern[cid].append(index) if __name__ == "__main__": dataset = sys.argv[1] mode = sys.argv[2] #dataset = "rottentomatoes" data_pages = allPages(["../Crawler/July30_samples/{}/".format(dataset)],dataset,date="July30",mode="c_baseline") #with open("cluster_temp","wb") as outfile: # pickle.dump(outfile,data_pages) c_baseline = baseline(data_pages,dataset) print data_pages.ground_truth if mode == "cv": c_baseline.cv() elif mode == "train": c_baseline.run() c_baseline.MDL() c_baseline.clustering() c_baseline.pages.Evaluation() ''' for page in c_baseline.pages.pages: print page.anchor_xpath_set
import gensim, logging
from pages import allPages
from gensim.models import word2vec
from nltk.tokenize import WordPunctTokenizer

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#===============================================================================
# sentences = [['first', 'second', 'sentence'], ['second', 'sentence']]
# model = gensim.models.Word2Vec(sentences, min_count=1, workers=3)
# print(model.similarity('first', 'sentence'))
#===============================================================================
#sentences = word2vec.LineSentence('comment/comment_table_cleaned.txt')
#sentences = sentences.decode('latin-1').encode('utf8')

print("Program Starts")
# each "sentence" is the depth-first xpath sequence of one crawled page
sentences = []
UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
for page in UP_pages.pages:
    sentences.append(page.dfs_xpaths_list)
print len(sentences)

model = gensim.models.Word2Vec(sentences, min_count=5, window=20, size=100, workers=5)
#print("The lengh of sentences is ")
#print(str(sentences.len()))
#model = gensim.models.Word2Vec.load('../model/MedHelp_tokenizer.model')
#model.train(sentences)
#b = model.most_similar(positive=['feminism'], topn=1)
#print(b)
model.save('./Data/word2vec.model')
#print(model['nurse'])
#print(model.most_similar(['nurse'],topn=3))
#print(model.most_similar(['agree'],topn=10))
#print(model.most_similar(['cancer'],topn=8))
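
# A minimal sketch (not in the original script) of loading the saved model and
# querying xpath similarity, following the commented-out most_similar calls
# above. The query xpath is a hypothetical token and must occur in the training
# sequences; with gensim >= 1.0 the vectors are accessed through model.wv.
model = gensim.models.Word2Vec.load('./Data/word2vec.model')
query_xpath = '/html/body/div'  # hypothetical example token
print(model.wv.most_similar([query_xpath], topn=3))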
from page import Page
from pages import allPages

# this program aims to find out the xpath frequency differences between two page objects.
if __name__ == '__main__':
    UP_pages = allPages(["../Crawler/crawl_data/showXpath/"])
    a = UP_pages.pages[0]  # less
    b = UP_pages.pages[1]  # more
    print a.path + "\t" + b.path
    dif_dict = {}
    for item in a.xpaths:
        dif_dict[item] = a.xpaths[item] - b.xpaths[item]
    # Top 10 i > j
    g_sorted_result_dict = sorted(dif_dict.iteritems(), key=lambda d: d[1], reverse=True)
    l_sorted_result_dict = sorted(dif_dict.iteritems(), key=lambda d: d[1], reverse=False)
    for i in range(10):
        print str(g_sorted_result_dict[i][0]) + "\t" + str(g_sorted_result_dict[i][1])
    for i in range(10):
        print str(l_sorted_result_dict[i][0]) + "\t" + str(l_sorted_result_dict[i][1])
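
    # A hedged variant (not in the original script): if an xpath occurs in page a
    # but never in page b, the subtraction above raises KeyError. Taking the union
    # of both xpath sets and using dict.get with a default of 0 covers xpaths that
    # appear in only one of the two pages.
    all_xpaths = set(a.xpaths) | set(b.xpaths)
    dif_dict = dict((item, a.xpaths.get(item, 0) - b.xpaths.get(item, 0)) for item in all_xpaths)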
import matplotlib.pyplot as plt
from pages import allPages
from sklearn.preprocessing import normalize

'''
a = [3,3,3,2,2,1]
c = Counter(a)
x, y = [], []
for item in c:
    x.append(item)
    y.append(c[item])
plt.plot(x,y)
plt.show()
'''
#pages = allPages(["../Crawler/test_data/zhihu/"],dataset="rottentomatoes",mode="raw")
pages = allPages(["../Crawler/Mar15_samples/asp/"], dataset="asp", mode="read")
tf_matrix = []
log_tf_matrix = []
for index, page in enumerate(pages.pages):
    if index == 1 or index == 989:
        print page.path
    # L1-normalized TF vector built from the page's selected TF-IDF features
    vector = []
    for key in page.selected_tfidf:
        vector.append(page.selected_tfidf[key])
    tf_vector = normalize(vector, norm='l1')[0]
    tf_matrix.append(tf_vector)
    # the same, built from the log-scaled TF-IDF features
    vector = []
    for key in page.selected_logtfidf:
        vector.append(page.selected_logtfidf[key])
            s = self.mapping[id]
            print type(schema), " type of schema"
            print type(s), " type of s"
            tmp = self.jaccard(schema, s)
            if tmp < min_jaccard:
                min_jaccard = tmp
                cid = id
        self.pattern[cid].append(index)


if __name__ == "__main__":
    dataset = sys.argv[1]
    mode = sys.argv[2]
    #dataset = "rottentomatoes"
    data_pages = allPages(["../Crawler/July30_samples/{}/".format(dataset)], dataset, date="July30", mode="c_baseline")
    #with open("cluster_temp","wb") as outfile:
    #    pickle.dump(outfile,data_pages)
    c_baseline = baseline(data_pages, dataset)
    print data_pages.ground_truth
    if mode == "cv":
        c_baseline.cv()
    elif mode == "train":
        c_baseline.run()
        c_baseline.MDL()
        c_baseline.clustering()
        c_baseline.pages.Evaluation()
    '''
    for page in c_baseline.pages.pages:
        print page.anchor_xpath_set
        pair_dict = {}
        for key1 in xpaths_dict:
            for key2 in xpaths_dict[key1]:
                print xpaths_dict[key1][key2]
                p = float(co_dict[key1][key2]) / float(N_pages)
                print p
                xpaths_dict[key1][key2] = p * xpaths_dict[key1][key2]
                if xpaths_dict[key1][key2] == 0:
                    pair_dict["(" + key1 + "," + key2 + ")"] = 0
                else:
                    pair_dict["(" + key1 + "," + key2 + ")"] = math.log(xpaths_dict[key1][key2]) * p
        bigram_list = []
        top = 1000
        pair_list = sorted(pair_dict.iteritems(), key=lambda x: x[1], reverse=True)
        for i in range(top):
            print pair_list[i][0] + "\t" + str(pair_list[i][1])
            [path1, path2] = pair_list[i][0].replace("(", "").replace(")", "").split(",")
            print str(df[path1]) + "\t" + str(df[path2])
            bigram_list.append([path1, path2])
        print bigram_list
        return bigram_list


if __name__ == "__main__":
    pages = allPages(["../Crawler/test_data/stackexchange/"])
    b = bigram(pages)
    b.find_bigram()
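
# A hedged worked example (not part of the original module) of the pair score
# computed above: if an xpath pair co-occurs in 40 out of N_pages = 100 pages
# (p = 0.4) and its raw weight in xpaths_dict is 50, the weighted value becomes
# 0.4 * 50 = 20 and the score is log(20) * 0.4, so pairs that are both frequent
# and widely co-occurring rank highest.
import math

p = 40.0 / 100.0
score = math.log(p * 50) * p
print("example pair score: " + str(score))  # roughly 1.198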