コード例 #1
0
ファイル: QTSPR.py プロジェクト: jessyli/PageRank
def pagerank(comatrix, n):
    """Compute one topic-sensitive PageRank vector per topic.

    For every topic key read from ``doc-topics.txt`` the update

        r <- alpha * M^T * r + beta * p_topic + gama * p_0

    is applied, starting from the uniform vector, until two successive
    iterates are no more than ``mindistance`` apart (Euclidean distance).
    ``p_topic`` puts equal weight on every index listed for the topic and
    ``p_0`` is the uniform vector.

    Args:
        comatrix: Transition matrix.  It is transposed once and combined with
            the rank vector through ``*``.
            NOTE(review): presumably a ``scipy.sparse`` matrix, so that ``*``
            is a matrix-vector product -- confirm against ReadingFile.reading.
        n: Length of every rank and teleportation vector.

    Returns:
        dict mapping each topic key to its converged rank vector.
    """
    TC = ReadingFile.reading2("doc-topics.txt")

    alpha = 0.2  # weight of the link-following term
    beta = 0.6   # weight of the topic teleportation vector
    gama = 0.2   # weight of the uniform vector
    mindistance = 1e-5

    # alpha * M^T does not depend on the topic or the iteration, so scale the
    # matrix once instead of on every update.
    scaled_transition = alpha * comatrix.transpose()

    uniform = np.ones(n) * (1.0 / n)
    uniform_term = gama * uniform

    TPageRank = dict()
    for key in TC:
        members = TC.get(key)
        teleport = np.zeros(n)
        member_count = len(members)
        if member_count:
            weight = 1.0 / float(member_count)
            for value in members:
                teleport[int(value)] = weight
        topic_term = beta * teleport

        rank = uniform
        while True:
            previous = rank
            # Terms are added in the same order as before the refactoring so
            # the floating-point results are unchanged.
            rank = scaled_transition * previous + topic_term + uniform_term
            # "not >" (rather than "<=") also stops when the distance is NaN.
            if not distance.euclidean(rank, previous) > mindistance:
                break
        TPageRank[key] = rank
    return TPageRank
コード例 #2
0
ファイル: GlobalPageRank.py プロジェクト: jessyli/PageRank
def main():
    """Run ``GPR`` on ``transition.txt`` and save the result to ``GPR-10.txt``.

    Every output line has the form ``<position> <value>``, where position is
    the 1-based index of the value within the vector returned by ``GPR``.
    """
    import sys
    print(sys.path)
    ranks = GPR(ReadingFile.reading("transition.txt"))
    with open("GPR-10.txt", "w") as out:
        for position, value in enumerate(ranks, 1):
            out.write("%s %s\n" % (position, value))
コード例 #3
0
ファイル: QTSPR.py プロジェクト: jessyli/PageRank
def query_topic2(TPageRank, n):
    """Mix the per-topic rank vectors using ``user-topic-distro.txt``.

    Args:
        TPageRank: Mapping from topic id (the strings "1", "2", ...) to that
            topic's rank vector.
        n: Length of the rank vectors.

    Returns:
        dict with one entry per key of the distribution file; each value is
        the sum of the topic vectors weighted by that key's entries.
    """
    QTD = ReadingFile.reading3("user-topic-distro.txt")
    TSPR = {}
    for key in QTD:
        weights = QTD[key]
        blended = np.zeros(n)
        # The last entry of each record is not used; topic ids are 1-based.
        for topic_id, weight in enumerate(weights[:-1], 1):
            blended = float(weight) * TPageRank[str(topic_id)] + blended
        TSPR[key] = blended
    return TSPR
コード例 #4
0
ファイル: ZhihuSpider.py プロジェクト: deFang/ZhihuSpider
def main():
    """Crawl Zhihu followee pages breadth-first and store profiles in MongoDB.

    The URL queue comes from ``ReadingFile.read_file``; when that yields
    nothing the crawl starts from a fixed seed profile.  Each crawled profile
    is inserted into ``zhihu.data_collection`` unless a document with the same
    ``name`` is already present.  If anything is raised inside the crawl loop,
    the traceback is printed, the remaining queue is pickled to
    ``zhihu_queue.pickle`` and the process exits.
    """
    url_queue = ReadingFile.read_file()
    if not url_queue:
        # seed url
        m = Spider('https://www.zhihu.com/people/du-du-du-91/followees')
        url_queue = deque()
    else:
        m = Spider(url_queue.popleft() + '/followees')
    next_people = m.get_data()
    zhihu_data = m.zhihu_dict()

    # The FIFO queue is what makes the traversal breadth-first.
    url_queue.extend(next_people)

    # Connect to MongoDB.
    client = pymongo.MongoClient("localhost", 27017)
    db = client.zhihu
    collection = db.data_collection

    user_name = zhihu_data['name']
    # Skip profiles that are already stored.
    # NOTE(review): Collection.insert is deprecated in PyMongo 3 and gone in
    # PyMongo 4 -- switch to insert_one once the installed driver is confirmed.
    if not collection.find_one({"name": user_name}):
        collection.insert(zhihu_data)

    # BFS
    try:
        # Upper bound on the number of profiles crawled in one run.
        count = 1
        while url_queue and count < 100000:
            m = Spider(url_queue.popleft() + '/followees')
            next_people = m.get_data()
            zhihu_data = m.zhihu_dict()
            user_name = zhihu_data['name']
            url_queue.extend(next_people)
            # Skip profiles that are already stored.
            if not collection.find_one({"name": user_name}):
                collection.insert(zhihu_data)
            count += 1
        # Single-argument print(...) behaves the same on Python 2 and 3.
        if url_queue:
            print('have reached the maximum iteration {}'.format(count))
        else:
            print('the queue is exhausted')

    except BaseException:
        # Deliberately catches everything (Ctrl-C included) so the pending
        # queue is always saved; the traceback is printed so the cause of the
        # failure is no longer lost.
        import traceback
        traceback.print_exc()
        print("error find in " + m.url)
        with open('zhihu_queue.pickle', 'wb') as f:
            pickle.dump(url_queue, f, pickle.HIGHEST_PROTOCOL)
        sys.exit()
コード例 #5
0
ファイル: QTSPR.py プロジェクト: jessyli/PageRank
def main():
    """Compute the topic-sensitive rankings and dump the entries for key "22".

    The per-topic vectors from ``pagerank`` are combined by ``query_topic``
    and ``query_topic2``; the vector stored under key ``"22"`` of each result
    is written to its own file, one ``<position> <value>`` line per element
    with 1-based positions.
    """
    comatrix = ReadingFile.reading("transition.txt")
    n = comatrix.shape[0]
    topic_ranks = pagerank(comatrix, n)
    # Both tables are computed before any output file is opened.
    outputs = (
        ("QTSPR-U2Q2-10.txt", query_topic(topic_ranks, n)),
        ("PTSPR-U2Q2-10.txt", query_topic2(topic_ranks, n)),
    )
    for path, table in outputs:
        with open(path, "w") as out:
            for position, value in enumerate(table["22"], 1):
                out.write("%s %s\n" % (position, value))
コード例 #6
0
def clear(key_params=None):
    """Load ``TRAIN_CORPUS.csv`` and return preprocessed features with labels.

    Args:
        key_params: Passed to ``PreprocessData.prepare_data`` as
            ``key_features``.  ``None`` (the default) stands for a new empty
            list, so no list object is shared between calls.

    Returns:
        Tuple ``(X, y)``: the result of ``prepare_data`` in ``'save'`` mode and
        the labels exactly as returned by ``ReadingFile.read_csv``.
    """
    if key_params is None:
        key_params = []
    X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
    X = PreprocessData.prepare_data(X, mode='save', key_features=key_params)
    return X, y
コード例 #7
0
    return metrics.accuracy_score(y, predicted)


# In[2]:

# Fit a linear-kernel SVM on the data returned by clear() with its default
# arguments.  X, y and svm_clf are module-level names reused further down.
X, y = clear()
svm_clf = SVC(kernel='linear')
svm_clf.fit(X, y)

# In[3]:

# test feature transformation
# The corpus is read again before every measurement and the already fitted
# svm_clf is scored on three variants of it, appended to `scores` in order:
#   1. X exactly as returned by read_csv
#   2. PreprocessData.prepare_features(X)
#   3. PreprocessData.prepare_data(X, mode='save')
# NOTE(review): check_score presumably returns an accuracy value -- confirm
# against its definition.

scores = []

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(X, y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(check_score(PreprocessData.prepare_features(X), y, svm_clf))

X, y = ReadingFile.read_csv('TRAIN_CORPUS.csv')
scores.append(
    check_score(PreprocessData.prepare_data(X, mode='save'), y, svm_clf))

# Bare expression; has no effect when this file is run as a plain script.
scores

# In[6]:

# test feature importances