Exemple #1
0
from book import Book
from nltk.stem.porter import PorterStemmer
from nltk import pos_tag
from util import getArgMap
import sys,re,copy
import collections
from collections import *
from nltk.util import ngrams
from keyword_tool import *
import nltk
from loadFile import *
from nltk.tokenize import word_tokenize

# Shared stemmer instance for the module.
ps = PorterStemmer()

# Command-line options: '-b' gives the book name, '-s' the document suffix;
# both fall back to the empty string when the flag is absent.
argMap = getArgMap(sys.argv[1:])
bookname = argMap.get('-b', '')
doc_suffix = argMap.get('-s', '')

# Directory for the selected book: '../test/<bookname>/'.
path = '/'.join(('../test', bookname, ''))

# vocabulary: the original vocabulary
# stem2dic: defaultdict(list); each key is the stemmed form of a word and
#           its values are the words with that stem
# df: defaultdict(int)
# c: the content
def exactMatch_byWords(c,vocabulary,df,stem2dic,stopword=()):
	tf_temp=wordFreq(c,stopword)
	new_voc=stem2dic.keys()
	wl=set()
	tf=defaultdict(int)
	for w in tf_temp.keys():
		if w in stopword or len(w) < 3:
Exemple #2
0
def load_matrix(k):
    """Load the precomputed "sim matrix" for fold ``k`` from disk.

    The file ``../data/u<k+1>.usersim_method<DIST_FUNC>`` is read one line
    per row of the module-level matrix ``m``. Line ``i`` is split on
    whitespace and its first ``m.shape[0] - i - 1`` tokens are parsed as
    floats, i.e. only the entries for columns ``j > i`` are stored.

    Returns a list of ``m.shape[0]`` lists, where ``result[i][j - i - 1]``
    holds the value for the pair ``(i, j)`` with ``j > i``.

    Raises IndexError if a line has fewer tokens than expected and
    ValueError if a token cannot be parsed as a float.
    """
    n_rows = m.shape[0]
    sim_mat = [[] for _ in xrange(n_rows)]
    print('loading precompued sim matrix...')
    filename = '../data/u{}.usersim_method{}'.format(k + 1, DIST_FUNC)
    with open(filename) as f:
        for i in xrange(n_rows):
            tokens = f.readline().strip().split()
            # Row i keeps one value for every column to its right.
            remaining = n_rows - i - 1
            sim_mat[i].extend(
                float(tokens[offset]) for offset in xrange(remaining))
    print('loading done')
    return sim_mat

# parameters
# Every option is read from the command-line argument map and coerced to int;
# the second argument of each .get() is the value used when the flag is absent.
argMap = getArgMap(sys.argv[1:])
N = int(argMap.get('-n', 10))  # N-nearest neighbors (defaults to 10)
DIST_FUNC = int(argMap.get('-d', 0))  # 0: cos 1: pearson
AGGREGATION_METHOD = int(argMap.get('-a', 0))  # score aggregation method
# NOTE(review): presumably toggles the pre-computation step -- confirm.
PRECALCULATION = int(argMap.get('-p', 0))
# NOTE(review): presumably selects loading a stored matrix (see load_matrix)
# instead of recomputing it -- confirm against the main block.
USE_COMPUTED_MATRIX = int(argMap.get('-m', 0))
# NOTE(review): presumably enables saving the results -- confirm.
SAVE_RESULTS = int(argMap.get('-save', 0))

if __name__ == "__main__":
    mae_set = []
    rmse_set = []
    for i in xrange(5):
        m, col2rows, rows, cols = construct_ui_matrix(
            data_folder + "u{}.base".format(i + 1))
        # pre-computation
        mean_val = []