import numpy as np
from scipy.spatial import distance

import ReadingFile


def pagerank(comatrix, n):
    """Compute one topic-sensitive PageRank vector per topic."""
    TC = ReadingFile.reading2("doc-topics.txt")
    comatrixtrans = comatrix.transpose()
    alpha = 0.2       # weight of the link-following term
    beta = 0.6        # weight of the topic-teleportation term
    gamma = 0.2       # weight of the uniform-teleportation term
    mindistance = 1e-5

    # Topic teleportation vectors: uniform over the documents of each topic.
    Tele = dict()
    for key in TC:
        Tele[key] = np.zeros(n)
        for value in TC.get(key):
            Tele[key][int(value)] = 1.0 / float(len(TC.get(key)))

    TPageRank = dict()
    Rvector = np.ones(n) * (1.0 / n)
    P0 = Rvector
    for key in TC:
        # Power iteration: r <- alpha*M^T r + beta*t_k + gamma*p0, repeated
        # until two consecutive vectors are within mindistance of each other.
        TPageRank[key] = Rvector
        while True:
            tempvector = TPageRank[key]
            TPageRank[key] = (alpha * comatrixtrans.dot(TPageRank[key])
                              + beta * Tele[key] + gamma * P0)
            if distance.euclidean(TPageRank[key], tempvector) <= mindistance:
                break
    return TPageRank
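# A minimal self-contained sketch (toy numbers, not the project's real
# transition.txt / doc-topics.txt data) of the fixed-point update used above:
#   r <- alpha * M^T r + beta * t + gamma * p0
def _demo_tspr_update():
    M = np.array([[0.0, 0.5, 0.5],   # toy row-stochastic transition matrix
                  [1.0, 0.0, 0.0],
                  [0.5, 0.5, 0.0]])
    t = np.array([0.5, 0.5, 0.0])    # teleport to the two docs of one topic
    p0 = np.ones(3) / 3              # uniform fallback distribution
    r = p0.copy()
    for _ in range(50):              # fixed iteration count instead of a tolerance
        r = 0.2 * M.T.dot(r) + 0.6 * t + 0.2 * p0
    return r                         # sums to 1 because 0.2 + 0.6 + 0.2 = 1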
def main():
    comatrix = ReadingFile.reading("transition.txt")
    Rvector = GPR(comatrix)
    # Write the global PageRank scores, one "doc_id score" pair per line.
    with open("GPR-10.txt", "w") as f:
        count = 1
        for item in Rvector:
            temp = str(count) + " " + str(item)
            f.write("%s\n" % temp)
            count = count + 1
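# GPR is defined elsewhere in the project. A minimal sketch of what a global
# PageRank with uniform teleportation could look like (the 0.8/0.2 damping
# split, the tolerance, and the name gpr_sketch are assumptions):
def gpr_sketch(comatrix, damping=0.8, tol=1e-5):
    n = comatrix.shape[0]
    p0 = np.ones(n) / n
    r = p0.copy()
    while True:
        r_next = damping * comatrix.transpose().dot(r) + (1 - damping) * p0
        if distance.euclidean(r_next, r) <= tol:
            return r_next
        r = r_next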
def query_topic2(TPageRank, n):
    """Combine the per-topic PageRank vectors into one vector per query,
    weighted by the user-specific topic distribution (PTSPR)."""
    QTD = ReadingFile.reading3("user-topic-distro.txt")
    TSPR = dict()
    for key in QTD:
        TSPR[key] = np.zeros(n)
        # The last field of QTD[key] is not used as a topic weight.
        for topic in range(0, len(QTD[key]) - 1):
            t = str(topic + 1)
            TSPR[key] = float(QTD[key][topic]) * TPageRank[t] + TSPR[key]
    return TSPR
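# A sketch of the combination performed above, with made-up numbers: a query
# whose topic distribution is 70% topic 1 / 30% topic 2 ends up with the
# rank vector 0.7*r1 + 0.3*r2 (which still sums to 1).
def _demo_query_combination():
    r1 = np.array([0.6, 0.3, 0.1])   # toy PageRank vector for topic 1
    r2 = np.array([0.1, 0.2, 0.7])   # toy PageRank vector for topic 2
    return 0.7 * r1 + 0.3 * r2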
import pickle
import sys
from collections import deque

import pymongo

import ReadingFile
# Spider is the crawler class defined elsewhere in this project.


def main():
    url_queue = ReadingFile.read_file()
    if not url_queue:
        # No saved queue yet: start the crawl from the seed URL.
        m = Spider('https://www.zhihu.com/people/du-du-du-91/followees')
        url_queue = deque()
    else:
        m = Spider(url_queue.popleft() + '/followees')
    next_people = m.get_data()
    zhihu_data = m.zhihu_dict()

    # Use a queue to implement BFS over the followee graph.
    for people in next_people:
        url_queue.append(people)

    # Connect to MongoDB.
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu
    collection = db.data_collection

    # Insert this profile, skipping duplicates by user name.
    user_name = zhihu_data['name']
    if not collection.find_one({"name": user_name}):
        collection.insert_one(zhihu_data)

    # BFS
    try:
        # Cap the number of profiles crawled.
        count = 1
        while url_queue and count < 100000:
            m = Spider(url_queue.popleft() + '/followees')
            next_people = m.get_data()
            zhihu_data = m.zhihu_dict()
            user_name = zhihu_data['name']
            for people in next_people:
                url_queue.append(people)
            # Check duplicates before inserting.
            if not collection.find_one({"name": user_name}):
                collection.insert_one(zhihu_data)
            count += 1
        if url_queue:
            print('have reached the maximum iteration {}'.format(count))
        else:
            print('the queue is exhausted')
    except Exception:
        print('error found in ' + m.url)
    # Persist the frontier so the crawl can resume later.
    with open('zhihu_queue.pickle', 'wb') as f:
        pickle.dump(url_queue, f, pickle.HIGHEST_PROTOCOL)
    sys.exit()
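# ReadingFile.read_file() is expected to restore the queue pickled above; a
# minimal sketch of that counterpart (only the pickle path mirrors the code
# above, the function name and the rest are assumptions):
def read_queue_sketch(path='zhihu_queue.pickle'):
    import os
    if not os.path.exists(path):
        return None          # first run: main() falls back to the seed URL
    with open(path, 'rb') as f:
        return pickle.load(f)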
def main():
    comatrix = ReadingFile.reading("transition.txt")
    n = comatrix.shape[0]
    TPageRank = pagerank(comatrix, n)
    TSPR = query_topic(TPageRank, n)
    UTSPR = query_topic2(TPageRank, n)
    # Write the QTSPR and PTSPR rank vectors for user 2, query 2.
    with open("QTSPR-U2Q2-10.txt", "w") as f:
        count = 1
        for item in TSPR["22"]:
            temp = str(count) + " " + str(item)
            f.write("%s\n" % temp)
            count = count + 1
    with open("PTSPR-U2Q2-10.txt", "w") as f:
        count = 1
        for item in UTSPR["22"]:
            temp = str(count) + " " + str(item)
            f.write("%s\n" % temp)
            count = count + 1
def clear(key_params=None):
    """Load the training corpus and run the preprocessing pipeline on it."""
    if key_params is None:   # avoid a mutable default argument
        key_params = []
    X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
    X = PreprocessData.prepare_data(X, mode='save', key_features=key_params)
    return X, y
from sklearn import metrics
from sklearn.svm import SVC

import PreprocessData
import ReadingFile


def check_score(X, y, clf):
    # Reconstructed from the calls below: score a fitted classifier on (X, y).
    predicted = clf.predict(X)
    return metrics.accuracy_score(y, predicted)


# In[2]:

X, y = clear()
svm_clf = SVC(kernel='linear')
svm_clf.fit(X, y)


# In[3]:

# test feature transformation
scores = []
X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(X, y, svm_clf))
X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(PreprocessData.prepare_features(X), y, svm_clf))
X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(
    check_score(PreprocessData.prepare_data(X, mode='save'), y, svm_clf))
scores


# In[6]:

# test feature importances
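# A minimal sketch of what the cell above might go on to do: for a linear
# SVC the learned weights can serve as feature-importance scores (binary
# case, dense X assumed; the top-10 cutoff is an arbitrary illustration).
import numpy as np

coef = np.abs(svm_clf.coef_).ravel()   # per-feature |weight|
top10 = np.argsort(coef)[::-1][:10]    # indices of the 10 heaviest features
list(zip(top10, coef[top10]))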