import collections
from math import log

from sklearn import preprocessing


def pointwise_test_features(test_data_file, is_pairwise=False):
    """Build tf-idf dot-product features for every (query, url) pair in the
    test file. Returns the feature matrix X, the list of query strings, and
    index_map, where index_map[query][url] = i means X[i] is the feature
    vector for that query/url pair."""
    (queries, features) = extractFeatures(test_data_file)

    # Build IDF dictionary
    idfDict = getIDFScores()

    queryStrings = []
    X = []
    index_map = {}
    for query in queries.keys():
        queryStrings.append(query)

        # Term-frequency vector over the lowercased query terms
        queryTerms = query.split()
        queryVector = collections.defaultdict(int)
        for queryTerm in queryTerms:
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF; terms missing from the
        # dictionary fall back to log(98998) (likely a corpus-size constant)
        for key in queryVector:
            if key in idfDict:
                queryVector[key] *= idfDict[key]
            else:
                queryVector[key] *= log(98998)

        results = queries[query]
        for d, document in enumerate(results):
            X_i = []
            rawCounts = getRawCounts(queries, features, query, document)

            # Smoothed body length (+500), currently unused by these features
            if 'body_length' in features[query][document]:
                normalizedBodyLength = features[query][document]['body_length'] + 500
            else:
                normalizedBodyLength = 500

            # One feature per document field: dot product of the tf-idf query
            # vector with that field's term counts
            for j in range(len(rawCounts)):
                currentCounts = rawCounts[j]
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                X_i.append(currentValue)
            X.append(X_i)

            # index_map[query][url] = i means X[i] is the feature vector of
            # query and url
            if query not in index_map:
                index_map[query] = {}
            index_map[query][document] = len(X) - 1

    if is_pairwise:
        X = preprocessing.scale(X)
    return (X, queryStrings, index_map)
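
# Hedged usage sketch, not part of the original module: rank each query's
# urls by the predictions of an already-fitted pointwise model. `model` is
# any estimator with a .predict() method, and the helper name is hypothetical.
def rank_with_pointwise_model(model, test_data_file):
    (X, queryStrings, index_map) = pointwise_test_features(test_data_file)
    scores = model.predict(X)  # one predicted relevance score per row of X
    rankings = {}
    for query, urls in index_map.items():
        # Sort urls by descending predicted score, using index_map to find
        # each url's row in X
        rankings[query] = sorted(urls, key=lambda url: scores[urls[url]], reverse=True)
    return rankings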
def pointwise_train_features(train_data_file, train_rel_file):
    """Build tf-idf dot-product features X and relevance labels Y for every
    (query, url) pair in the training file."""
    (queries, features) = extractFeatures(train_data_file)

    # Build IDF dictionary
    idfDict = getIDFScores(log=True)
    trainScores = getTrainScores(train_rel_file)

    X = []
    Y = []
    for query in queries.keys():
        # Term-frequency vector over the lowercased query terms
        queryTerms = query.split()
        queryVector = collections.defaultdict(int)
        for queryTerm in queryTerms:
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF, with a log(98998) fallback
        # for terms missing from the dictionary
        for key in queryVector:
            if key in idfDict:
                queryVector[key] *= idfDict[key]
            else:
                queryVector[key] *= log(98998)

        results = queries[query]
        for d, document in enumerate(results):
            X_i = []
            # extract raw counts and apply sublinear scaling
            rawCounts = getRawCounts(queries, features, query, document)

            # Smoothed body length (+500), currently unused by these features
            normalizedBodyLength = features[query][document]['body_length'] + 500

            # One feature per document field: dot product of the tf-idf query
            # vector with that field's term counts
            for j in range(len(rawCounts)):
                currentCounts = rawCounts[j]
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                X_i.append(currentValue)
            X.append(X_i)
            Y.append(trainScores[query][document])
    return (X, Y)
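
# Hedged usage sketch, not from the original driver code: fit a least-squares
# linear regression on the pointwise features and labels. The helper name is
# hypothetical.
from sklearn.linear_model import LinearRegression

def train_pointwise_model(train_data_file, train_rel_file):
    (X, Y) = pointwise_train_features(train_data_file, train_rel_file)
    model = LinearRegression()
    model.fit(X, Y)  # learns one weight per tf-idf field feature
    return model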
def pairwise_train_features(train_data_file, train_rel_file):
    """Build pairwise training data: for each pair of urls under the same
    query with different relevance scores, X holds the difference of their
    scaled feature vectors and Y holds +1/-1 for which url is more relevant."""
    (queries, features) = extractFeatures(train_data_file)

    # Build IDF dictionary
    idfDict = getIDFScores()
    trainScores = getTrainScores(train_rel_file)

    X = []
    Y = []
    # Associates each query/doc pair with an index into the scaled feature matrix
    featureIndex = {}
    featuresBeforeScaling = []
    for query in queries.keys():
        featureIndex[query] = {}

        # Term-frequency vector over the lowercased query terms
        queryTerms = query.split()
        queryVector = collections.defaultdict(int)
        for queryTerm in queryTerms:
            queryVector[queryTerm.lower()] += 1

        # Weight each term frequency by its IDF, with a log(98998) fallback
        # for terms missing from the dictionary
        for key in queryVector:
            if key in idfDict:
                queryVector[key] *= idfDict[key]
            else:
                queryVector[key] *= log(98998)

        results = queries[query]
        for d, document in enumerate(results):
            featuresBeforeScaling_i = []
            # extract raw counts and apply sublinear scaling
            rawCounts = getRawCounts(queries, features, query, document)

            # Smoothed body length (+500), currently unused by these features
            normalizedBodyLength = features[query][document]['body_length'] + 500

            for j in range(len(rawCounts)):
                currentCounts = rawCounts[j]
                currentValue = 0
                for term in queryVector:
                    currentValue += queryVector[term] * currentCounts[term]
                featuresBeforeScaling_i.append(currentValue)
            featuresBeforeScaling.append(featuresBeforeScaling_i)
            featureIndex[query][document] = len(featuresBeforeScaling) - 1

    # Standardize the per-document features before taking pairwise differences
    scaledFeatures = preprocessing.scale(featuresBeforeScaling)

    for query in queries.keys():
        results = queries[query]
        for d1, document1 in enumerate(results):
            X_d1 = scaledFeatures[featureIndex[query][document1]]
            for d2, document2 in enumerate(results[d1 + 1:]):
                X_d2 = scaledFeatures[featureIndex[query][document2]]
                d1Score = trainScores[query][document1]
                d2Score = trainScores[query][document2]
                if d1Score != d2Score:
                    # +1 if document1 is more relevant than document2, else -1
                    val = 1 if d1Score > d2Score else -1
                    X_i = [x1 - x2 for x1, x2 in zip(X_d1, X_d2)]
                    X.append(X_i)
                    Y.append(val)
    return (X, Y)
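
# Hedged usage sketch of a RankSVM-style pipeline, an assumption about how
# these +1/-1 pairwise labels would be consumed: train a linear SVM on the
# feature differences, then rank test documents by the signed margin
# w.x + b. Both helper names are hypothetical.
from sklearn import svm

def train_pairwise_model(train_data_file, train_rel_file):
    (X, Y) = pairwise_train_features(train_data_file, train_rel_file)
    model = svm.LinearSVC()
    model.fit(X, Y)  # separates "doc1 more relevant" pairs from the reverse
    return model

def rank_with_pairwise_model(model, test_data_file):
    # is_pairwise=True scales the test features, mirroring the training-side scaling
    (X, queryStrings, index_map) = pointwise_test_features(test_data_file, is_pairwise=True)
    scores = model.decision_function(X)  # larger margin means predicted more relevant
    rankings = {}
    for query, urls in index_map.items():
        rankings[query] = sorted(urls, key=lambda url: scores[urls[url]], reverse=True)
    return rankings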