Example #1
	def __init__(self, path_list):
		t0 = time()
		UP_pages = allPages(path_list)
		feature_matrix = []
		y =[]
		# get features and labels
		for page in UP_pages.pages:
			tfidf_vector = []
			for key in page.tfidf:
				tfidf_vector.append(page.tfidf[key])
			feature_matrix.append(tfidf_vector)	

		y = UP_pages.category
		self.y = np.array(y)
		X = np.array(feature_matrix)
		X = scale(X)

		X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.99, random_state=2)
		
		clf = neighbors.KNeighborsClassifier(1)
		clf.fit(X,y)
		labels = clf.predict(X_test)
		right = 0
		for i in range(len(labels)):
			if labels[i] == y_test[i]:
				right += 1
		print "accuracy is "+ str(float(right)/float(len(y_test)))
		# select  
		print("done in %0.3fs." % (time() - t0))		
Example #2
    def __init__(self, path_list):
        t0 = time()
        UP_pages = allPages(path_list)
        feature_matrix = []
        y = []
        # get features and labels
        for page in UP_pages.pages:
            tfidf_vector = []
            for key in page.tfidf:
                tfidf_vector.append(page.tfidf[key])
            feature_matrix.append(tfidf_vector)

        y = UP_pages.category
        self.y = np.array(y)
        X = np.array(feature_matrix)
        X = scale(X)

        X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.99, random_state=2)

        clf = neighbors.KNeighborsClassifier(1)
        clf.fit(X, y)
        labels = clf.predict(X_test)
        right = 0
        for i in range(len(labels)):
            if labels[i] == y_test[i]:
                right += 1
        print "accuracy is " + str(float(right) / float(len(y_test)))
        # select
        print ("done in %0.3fs." % (time() - t0))
Example #3
	def __init__(self, data_path,dataset):
		self.pages = allPages([data_path])
		self.dataset = dataset
		prefix = data_path 
		self.file_set = []
		for page in self.pages.pages:
			self.file_set.append(page.path)
Example #4
    def __init__(self, data_path, dataset):
        self.pages = allPages([data_path])
        self.dataset = dataset
        prefix = data_path
        self.file_set = []
        for page in self.pages.pages:
            self.file_set.append(page.path)
Example #5
    def get_trans_mat(self, dataset):
        self.pages = allPages(
            ["../../Crawler/Apr17_samples/{0}/".format(dataset)],
            "new_stackexchange",
            mode="raw")
        trans_mat = np.zeros((self.max_class_num + 1, self.max_class_num + 1))
        total_links = 0
        count_list = {}
        print len(self.pages.pages)
        for page in self.pages.pages:
            path = page.path.replace("../../Crawler", "../Crawler")
            class_id = self.class_dict[path]
            cluster_id = self.cluster_dict[path]
            if class_id == -1:
                continue
                # print page.path , group
            if class_id not in count_list:
                count_list[class_id] = 1
            else:
                count_list[class_id] += 1
            link_dict = page.getAnchor()

            if cluster_id not in self.class_xpath:
                self.class_xpath[cluster_id] = {}
                self.class_xpath_link[cluster_id] = {}

            for xpath, links in link_dict.iteritems():
                # initialize add xpath to class_xpath
                if xpath not in self.class_xpath[cluster_id]:
                    self.class_xpath[cluster_id][xpath] = []
                    self.class_xpath_link[cluster_id][xpath] = []

                for link in links:
                    if self.check_intralink(link):
                        tag = self.annotate(link, self.gold_map_dict)

                        if tag != -1:
                            total_links += 1
                            trans_mat[class_id, tag] += 1
                            self.class_xpath[cluster_id][xpath].append(tag)
                            self.class_xpath_link[cluster_id][xpath].append(
                                link)

                        # if group == tag:
                    #	print link , page.path
                    # print trans_mat
                    # trans_mat = normalize(trans_mat,norm='l1', axis=1)
        print count_list
        for i in range(self.max_class_num + 1):
            for j in range(self.max_class_num + 1):
                if i not in count_list:
                    trans_mat[i, j] = 0
                else:
                    trans_mat[i, j] = float(trans_mat[i, j]) / float(
                        count_list[i])

        print "total_links has " + str(total_links)
        self.trans_mat = trans_mat
Example #6
    def get_trans_mat(self, dataset):
        self.pages = allPages(
            ["../../Crawler/Apr17_samples/{0}/".format(dataset)], "new_{}".format(dataset), mode="raw"
        )
        trans_mat = np.zeros((self.max_class_num + 1, self.max_class_num + 1))
        total_links = 0
        count_list = {}
        print len(self.pages.pages)
        for page in self.pages.pages:
            path = page.path.replace("../../Crawler", "../Crawler")
            class_id = self.class_dict[path]
            cluster_id = self.cluster_dict[path]
            if class_id == -1:
                continue
                # print page.path , group
            if class_id not in count_list:
                count_list[class_id] = 1
            else:
                count_list[class_id] += 1
            link_dict = page.getAnchor()

            if cluster_id not in self.class_xpath:
                self.class_xpath[cluster_id] = {}
                self.class_xpath_link[cluster_id] = {}

            for xpath, links in link_dict.iteritems():
                # initialize add xpath to class_xpath
                if xpath not in self.class_xpath[cluster_id]:
                    self.class_xpath[cluster_id][xpath] = []
                    self.class_xpath_link[cluster_id][xpath] = []

                for link in links:
                    if self.check_intralink(link):
                        tag = self.annotate(link, self.gold_map_dict)

                        if tag != -1:
                            total_links += 1
                            trans_mat[class_id, tag] += 1
                            self.class_xpath[cluster_id][xpath].append(tag)
                            self.class_xpath_link[cluster_id][xpath].append(link)

                        # if group == tag:
                    # 	print link , page.path
                    # print trans_mat
                    # trans_mat = normalize(trans_mat,norm='l1', axis=1)
        print count_list
        for i in range(self.max_class_num + 1):
            for j in range(self.max_class_num + 1):
                if i not in count_list:
                    trans_mat[i, j] = 0
                else:
                    trans_mat[i, j] = float(trans_mat[i, j]) / float(count_list[i])

        print "total_links has " + str(total_links)
        self.trans_mat = trans_mat
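The two get_trans_mat variants above accumulate raw link counts in trans_mat[class_id, tag] and then divide each row by the number of pages observed for that class, so a row ends up holding the average number of outgoing links per page of that class rather than a probability distribution. A standalone sketch of that normalization step, with a hypothetical three-class count matrix standing in for the crawled data:

import numpy as np

# Hypothetical raw link counts between three page classes.
trans_mat = np.array([[4.0, 2.0, 0.0],
                      [1.0, 0.0, 3.0],
                      [0.0, 0.0, 0.0]])
# Pages observed per class; class 2 never appeared in the sample.
count_list = {0: 2, 1: 4}

for i in range(trans_mat.shape[0]):
    if i not in count_list:
        trans_mat[i, :] = 0.0                  # unseen class: zero the row
    else:
        trans_mat[i, :] /= float(count_list[i])

print(trans_mat)   # each row: average outgoing links per page of that class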
Example #7
    def __init__(self, dataset, date="Apr17", path_list=None, num_clusters=None, num_sample=1000):
        if path_list is None and num_clusters is None:
            self.t0 = time()
        else:
            self.t0 = time()
            self.dataset = dataset
            self.date = date
            self.UP_pages = allPages(path_list, dataset, date)
            self.path_list = self.UP_pages.path_list
            self.num_clusters = num_clusters
            self.features = self.UP_pages.features
Example #8
    def __init__(self,
                 dataset,
                 date="Apr17",
                 path_list=None,
                 num_clusters=None,
                 num_sample=1000):
        if path_list is None and num_clusters is None:
            self.t0 = time()
        else:
            self.t0 = time()
            self.dataset = dataset
            self.date = date
            self.UP_pages = allPages(path_list, dataset, date)
            self.path_list = self.UP_pages.path_list
            self.num_clusters = num_clusters
            self.features = self.UP_pages.features
Example #9
    def __init__(self, dataset, date="Apr17", path_list=None, num_clusters=None, num_samples=None, debug=False):
        if path_list is None and num_clusters is None:
            self.t0 = time()
        else:

            self.t0 = time()
            self.dataset = dataset
            self.date = date
            print "debug mode ", debug, dataset
            self.UP_pages = allPages(path_list,dataset,date,num_samples,debug=debug)
            self.path_list = self.UP_pages.path_list
            self.num_clusters = num_clusters
            if num_samples is not None:
                self.num_samples = int(num_samples)
            else:
                self.num_samples = None
            self.features = self.UP_pages.features
Example #10
                    print type(s), " type of s"
                    tmp = self.jaccard(schema,s)
                    if  tmp < min_jaccard:
                        min_jaccard = tmp
                        cid = id
            self.pattern[cid].append(index)





if __name__ == "__main__":
    dataset = sys.argv[1]
    mode = sys.argv[2]
    #dataset = "rottentomatoes"
    data_pages = allPages(["../Crawler/July30_samples/{}/".format(dataset)],dataset,date="July30",mode="c_baseline")
    #with open("cluster_temp","wb") as outfile:
    #    pickle.dump(outfile,data_pages)
    c_baseline = baseline(data_pages,dataset)
    print data_pages.ground_truth
    if mode == "cv":
        c_baseline.cv()
    elif mode == "train":
        c_baseline.run()
        c_baseline.MDL()
        c_baseline.clustering()
        c_baseline.pages.Evaluation()

    '''
    for page in c_baseline.pages.pages:
        print page.anchor_xpath_set
Example #11
import gensim, logging
from pages import allPages
from gensim.models import word2vec
from nltk.tokenize import WordPunctTokenizer
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#===============================================================================
# sentences = [['first', 'second','sentence'], ['second', 'sentence']]
# model = gensim.models.Word2Vec(sentences, min_count=1,workers=3)
# print(model.similarity('first','sentence'))
#===============================================================================
#sentences = word2vec.LineSentence('comment/comment_table_cleaned.txt')
#sentences = sentences.decode('latin-1').encode('utf8')
print("Program Starts")
sentences = []
UP_pages = allPages(["../Crawler/crawl_data/Questions/"])
for page in UP_pages.pages:
	sentences.append(page.dfs_xpaths_list)
print len(sentences)
model = gensim.models.Word2Vec(sentences,min_count=5, window=20, size=100,workers=5)
#print("The lengh of sentences is ")
#print(str(sentences.len()))
#model = gensim.models.Word2Vec.load('../model/MedHelp_tokenizer.model')
#model.train(sentences)
#b = model.most_similar(positive=['feminism'], topn=1)
#print(b)
model.save('./Data/word2vec.model')
#print(model['nurse'])
#print(model.most_similar(['nurse'],topn=3))
#print(model.most_similar(['agree'],topn=10))
#print(model.most_similar(['cancer'],topn=8))
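Example #11 treats each page's depth-first xpath list as a sentence and trains Word2Vec over the xpath vocabulary, then saves the model. A small sketch of loading the saved model and querying it afterwards; the query token is purely illustrative, and with recent gensim releases the vectors are accessed through model.wv rather than the commented-out model.most_similar calls above:

import gensim

model = gensim.models.Word2Vec.load('./Data/word2vec.model')

# Illustrative query: any xpath token present in the trained vocabulary works.
query = '/html/body/div'
if query in model.wv:
    for xpath, score in model.wv.most_similar(query, topn=5):
        print(xpath, score)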
Example #12
from page import Page
from pages import allPages
# this program aims to find out the xpath frequency differences between two page objects.

if __name__ == '__main__':
    UP_pages = allPages(["../Crawler/crawl_data/showXpath/"])

    a = UP_pages.pages[0]  # less
    b = UP_pages.pages[1]  # more
    print a.path + "\t" + b.path
    dif_dict = {}
    for item in a.xpaths:
        dif_dict[item] = a.xpaths[item] - b.xpaths[item]

    # Top 10 i > j
    g_sorted_result_dict = sorted(dif_dict.iteritems(),
                                  key=lambda d: d[1],
                                  reverse=True)
    l_sorted_result_dict = sorted(dif_dict.iteritems(),
                                  key=lambda d: d[1],
                                  reverse=False)
    for i in range(10):
        print str(g_sorted_result_dict[i][0]) + "\t" + str(
            g_sorted_result_dict[i][1])
    for i in range(10):
        print str(l_sorted_result_dict[i][0]) + "\t" + str(
            l_sorted_result_dict[i][1])
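Example #12 (repeated as Example #16 below) diffs the two frequency dictionaries by direct subscripting, which raises a KeyError whenever an xpath occurs in page a but not in page b. If the two dictionaries are not guaranteed to share the same key set, collections.Counter subtraction expresses the same diff safely; the xpath counts here are hypothetical:

from collections import Counter

# Hypothetical xpath frequencies for two pages.
a_xpaths = {'/html/body/div': 5, '/html/body/div/a': 2}
b_xpaths = {'/html/body/div': 3, '/html/body/span': 4}

diff = Counter(a_xpaths)
diff.subtract(Counter(b_xpaths))   # keeps negative counts, never raises KeyError

# xpaths that page a uses more often come first, then those page b uses more.
for xpath, delta in sorted(diff.items(), key=lambda kv: kv[1], reverse=True):
    print("%s\t%d" % (xpath, delta))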
Example #13
import matplotlib.pyplot as plt
from pages import allPages
from sklearn.preprocessing import normalize
'''
a = [3,3,3,2,2,1]
c = Counter(a)
x, y  =[], []
for item in c:
    x.append(item)
    y.append(c[item])
plt.plot(x,y)
plt.show()



'''
#pages = allPages(["../Crawler/test_data/zhihu/"],dataset="rottentomatoes",mode="raw")
pages = allPages(["../Crawler/Mar15_samples/asp/"],dataset="asp",mode="read")

tf_matrix = []
log_tf_matrix = []
for index, page in enumerate(pages.pages):
    if index == 1 or index == 989:
        print page.path
        vector = []
        for key in page.selected_tfidf:
            vector.append(page.selected_tfidf[key])
        tf_vector = normalize(vector,norm='l1')[0]
        tf_matrix.append(tf_vector)

        vector = []
        for key in page.selected_logtfidf:
            vector.append(page.selected_logtfidf[key])
Example #14
import matplotlib.pyplot as plt
from pages import allPages
from sklearn.preprocessing import normalize
'''
a = [3,3,3,2,2,1]
c = Counter(a)
x, y  =[], []
for item in c:
    x.append(item)
    y.append(c[item])
plt.plot(x,y)
plt.show()



'''
#pages = allPages(["../Crawler/test_data/zhihu/"],dataset="rottentomatoes",mode="raw")
pages = allPages(["../Crawler/Mar15_samples/asp/"], dataset="asp", mode="read")

tf_matrix = []
log_tf_matrix = []
for index, page in enumerate(pages.pages):
    if index == 1 or index == 989:
        print page.path
        vector = []
        for key in page.selected_tfidf:
            vector.append(page.selected_tfidf[key])
        tf_vector = normalize(vector, norm='l1')[0]
        tf_matrix.append(tf_vector)

        vector = []
        for key in page.selected_logtfidf:
            vector.append(page.selected_logtfidf[key])
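Examples #13 and #14 pass a plain Python list to normalize, which older scikit-learn releases accepted (with a deprecation warning) by reshaping it to a single row; recent releases reject 1-D input, so the reshape has to be explicit. A minimal sketch with a stand-in tf vector:

import numpy as np
from sklearn.preprocessing import normalize

vector = [3.0, 1.0, 0.0, 4.0]      # stand-in tf counts for one page

# Reshape to one row so current scikit-learn accepts it, then take the row back.
tf_vector = normalize(np.asarray(vector).reshape(1, -1), norm='l1')[0]
print(tf_vector)                   # l1-normalized: [0.375, 0.125, 0.0, 0.5]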
Example #15
                    s = self.mapping[id]
                    print type(schema), " type of schema"
                    print type(s), " type of s"
                    tmp = self.jaccard(schema, s)
                    if tmp < min_jaccard:
                        min_jaccard = tmp
                        cid = id
            self.pattern[cid].append(index)


if __name__ == "__main__":
    dataset = sys.argv[1]
    mode = sys.argv[2]
    #dataset = "rottentomatoes"
    data_pages = allPages(["../Crawler/July30_samples/{}/".format(dataset)],
                          dataset,
                          date="July30",
                          mode="c_baseline")
    #with open("cluster_temp","wb") as outfile:
    #    pickle.dump(outfile,data_pages)
    c_baseline = baseline(data_pages, dataset)
    print data_pages.ground_truth
    if mode == "cv":
        c_baseline.cv()
    elif mode == "train":
        c_baseline.run()
        c_baseline.MDL()
        c_baseline.clustering()
        c_baseline.pages.Evaluation()
    '''
    for page in c_baseline.pages.pages:
        print page.anchor_xpath_set
Example #16
from page import Page
from pages import allPages
# this program aims to find out the xpath frequency differences between two page objects.

if __name__=='__main__':
	UP_pages = allPages(["../Crawler/crawl_data/showXpath/"])

	a = UP_pages.pages[0] # less
	b = UP_pages.pages[1] # more
	print a.path + "\t" + b.path
	dif_dict = {}
	for item in a.xpaths:
		dif_dict[item] = a.xpaths[item]-b.xpaths[item]

	# Top 10 i > j
	g_sorted_result_dict= sorted(dif_dict.iteritems(), key=lambda d:d[1], reverse = True)
	l_sorted_result_dict= sorted(dif_dict.iteritems(), key=lambda d:d[1], reverse = False)
	for i in range(10):
		print str(g_sorted_result_dict[i][0]) + "\t" + str(g_sorted_result_dict[i][1])
	for i in range(10):
		print str(l_sorted_result_dict[i][0]) + "\t" + str(l_sorted_result_dict[i][1])
Example #17


		pair_dict = {}
		for key1 in xpaths_dict:
			for key2 in xpaths_dict[key1]:
				print xpaths_dict[key1][key2]
				p = float(co_dict[key1][key2])/float(N_pages)
				print p
				xpaths_dict[key1][key2] = p*xpaths_dict[key1][key2]
				
				if xpaths_dict[key1][key2] == 0:
					pair_dict["("+key1+","+key2+")"] = 0
				else:
					pair_dict["("+key1+","+key2+")"] = math.log(xpaths_dict[key1][key2]) * p
		bigram_list = []
		top = 1000
		pair_list = sorted(pair_dict.iteritems(),key=lambda x:x[1],reverse=True)
		for i in range(top):
			print pair_list[i][0] + "\t" + str(pair_list[i][1])
			[path1, path2] = pair_list[i][0].replace("(","").replace(")","").split(",")
			print str(df[path1]) + "\t" + str(df[path2])
			bigram_list.append([path1,path2])

		print bigram_list
		return bigram_list

if __name__ == "__main__":
	pages = allPages(["../Crawler/test_data/stackexchange/"])
	b = bigram(pages)
	b.find_bigram()