tokens = []
for line in f:
    if position == 0:
        # Outside a <top> block: scan for the start of the next topic.
        if '<top>' in line:
            position = 1
    else:
        if '</top>' in line:
            # End of topic: store the accumulated tokens and reset state.
            queries.append([num, tokens])
            num = -1
            tokens = []
            position = 0
        elif '<num>' in line:
            num = tokenize.numeric_norm(line)[0][0]
        elif '<title>' in line:
            line = line.replace('Topic:', '')
            token_list, query_position = tokenize.process_line(line, 0)
            position = query_position
            tokens += token_list

# process queries
# each token goes from (word, position, tf) to (word, position, tf, stemmed_word, idf)
processed_queries = []
for query in queries:
    token_list = query[1]
    new_token_list = []
    phrases = find_phrases(token_list)
    for token in token_list:
        # Skip stop words; stem the remaining terms and look up their idf.
        if token[0] not in stop_words:
            stemmed = tokenize.ps.stem(token[0]).encode('utf-8')
            idf = get_idf(token[0], lexicons[0])
            if idf == 0: