import numpy as np

# webCrawler and features are project-local helpers assumed to be defined
# elsewhere in this package.
def processdata(urllists, word_count_threshold, depth):
    # First pass: crawl every URL and collect its page text, recording how
    # many documents each URL contributes.
    content = []
    nums = [0]
    for url in urllists:
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)

    # Build the vocabulary from all crawled documents.
    instance = features(word_count_threshold)
    word_counts, wordtoix = instance.extractwords(content)
    N = len(word_counts)

    # Turn the per-URL document counts into cumulative row offsets.
    for i in range(1, len(nums)):
        nums[i] = nums[i - 1] + nums[i]

    # One row per document: N bag-of-words counts plus a label column holding
    # the (1-based) index of the source URL.
    cid = 0
    output = np.zeros((nums[-1], N + 1))
    for url in urllists:
        # Second pass: re-crawl each URL and fill its block of rows.
        crawler = webCrawler(url, depth)
        crawler.crawl()
        currlen = len(crawler.data)
        feats = instance.bagofwords(crawler.data, word_counts, wordtoix)
        print(feats.shape)
        b = np.zeros((currlen, N + 1))
        print(b[:, :-1].shape)
        b[:, 0:N] = feats
        b[:, N] = cid + 1
        output[nums[cid]:nums[cid + 1], :] = b
        cid += 1
    np.savetxt('test.out', output, delimiter=',')   # feature matrix with label column
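
# A minimal usage sketch for processdata, assuming it and its webCrawler /
# features dependencies are defined in the surrounding module; the threshold
# and depth values below are illustrative assumptions, not from the listing.
if __name__ == '__main__':
    urls = [
        "https://en.wikipedia.org/wiki/Sandra_Bullock",
        "https://en.wikipedia.org/wiki/Far_East_scarlet-like_fever",
    ]
    # Writes test.out: one row per crawled page, N word counts plus a label.
    processdata(urls, word_count_threshold=5, depth=1)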
def getdata(urllists, depth):
    # Crawl every URL and return the combined list of documents.
    content = []
    nums = [0]   # per-URL document counts (not returned)
    for url in urllists:
        #if url != "https://en.wikipedia.org/wiki/1990_RTHK_Top_10_Gold_Songs_Awards":
        #    continue
        crawler = webCrawler(url, depth)
        crawler.crawl()
        nums.append(len(crawler.data))
        content.extend(crawler.data)
    return content
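
# A minimal usage sketch for getdata, assuming the same webCrawler / features
# helpers; the word-count threshold and crawl depth are illustrative assumptions.
if __name__ == '__main__':
    pages = getdata(["https://en.wikipedia.org/wiki/Sandra_Bullock"], depth=1)
    vocab_builder = features(5)   # word_count_threshold = 5 (assumed)
    word_counts, wordtoix = vocab_builder.extractwords(pages)
    print("vocabulary size: %d" % len(word_counts))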
Example 3
import os
import json

import joblib
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

if __name__ == '__main__':
    urllists = []
    urllists.append("https://en.wikipedia.org/wiki/Sandra_Bullock")
    urllists.append("https://en.wikipedia.org/wiki/Far_East_scarlet-like_fever")
    filepath = os.path.dirname(os.path.realpath(__file__))
    dictname = 'dictionary.txt'
    dict2idx = 'dict2idx.txt'

    # Load the vocabulary (word counts and word-to-index map) saved at training time.
    with open(os.path.join(filepath, dictname), 'r') as fread:
        word_counts = json.load(fread)
    with open(os.path.join(filepath, dict2idx), 'r') as fread:
        wordtoix = json.load(fread)

    # Load the previously trained classifier.
    clf = joblib.load('model.pkl')

    word_count_threshold = 5   # not set in the original listing; assumed value
    for url in urllists:
        crawler = webCrawler(url, 1)
        crawler.crawl()
        instance = features(word_count_threshold)
        feats = instance.bagofwords(crawler.data, word_counts, wordtoix)

        X = feats
        print(np.sum(X))
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(X)
        X = tfidf.toarray()
        print(np.sum(X))
        yhat = clf.predict(X)
        print(yhat)

    print("finished page testing")