Example #1
# parse a TREC-style topics file into (query number, token list) pairs;
# f is an open handle on the topics file (opened outside this excerpt)
queries = []
position = 0  # 0 = outside a <top> block, 1 = inside one
num = -1
tokens = []
for line in f:
	if position == 0:
		if '<top>' in line:
			position = 1
	else:
		if '</top>' in line:
			# end of topic: store the finished query and reset state
			queries.append([num, tokens])
			num = -1
			tokens = []
			position = 0
		elif '<num>' in line:
			# numeric_norm extracts the topic number from the <num> line
			num = tokenize.numeric_norm(line)[0][0]
		elif '<title>' in line:
			line = line.replace('Topic:', '')
			token_list, query_position = tokenize.process_line(line, 0)
			position = query_position
			tokens += token_list
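
For reference, the loop above expects TREC-style topic markup along these lines (the tag names and the stripped 'Topic:' prefix come from the code; the number and title text are made-up placeholders):

	<top>
	<num> Number: 101
	<title> Topic: sample query title
	</top>

Once the closing tag is seen, queries holds one [num, tokens] pair per topic, where each token is a (word, position, tf) triple from tokenize.process_line.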

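The processing pass below relies on helpers defined outside this excerpt (stop_words, find_phrases, get_idf, lexicons). As a rough sketch only: if lexicons[0] maps a term to its document frequency and the collection size is known, get_idf could follow the standard log(N/df) form and return 0 for unseen terms, which is consistent with the idf == 0 check below (TOTAL_DOCS and the lexicon layout are assumptions, not taken from the source):

	import math

	def get_idf(word, lexicon):
		# hypothetical sketch: lexicon assumed to map term -> document frequency,
		# TOTAL_DOCS assumed to hold the collection size
		df = lexicon.get(word, 0)
		if df == 0:
			return 0  # unseen term; the caller treats idf == 0 as a special case
		return math.log(float(TOTAL_DOCS) / df)
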
# process queries
# each token is extended from (word, position, tf) to (word, position, tf, stemmed_word, idf)
processed_queries = []
for query in queries:
	token_list = query[1]
	new_token_list = []
	phrases = find_phrases(token_list)  # adjacent-term phrases; helper defined elsewhere
	for token in token_list:
		if token[0] not in stop_words:
			# stem the surface form and look up its idf in the primary lexicon
			stemmed = tokenize.ps.stem(token[0]).encode('utf-8')
			idf = get_idf(token[0], lexicons[0])
			if idf == 0: