def ranked_retrieval(queries, collection_table, doc_nums, inverted_index, stop_words):
    ranked_scores = {}
    for query_index, query in enumerate(queries):
        query_tokens = normalise(remove_stop_words(tokenise(query), stop_words))

        # Convert the query into an OR boolean search and evaluate it
        boolean_vectors = []
        for token in query_tokens:
            boolean_vector = collection_table[token]
            boolean_vectors.append('np.array([{}])'.format(
                array_to_string(boolean_vector)))
        query_eval_string = ' | '.join(boolean_vectors)

        # Map the boolean result to a list of matching document ids
        query_documents = boolean_search(query_eval_string, doc_nums)

        # Score each matching document against the query
        query_scores = []
        for doc in query_documents:
            score = TFIDF(doc, query_tokens, len(doc_nums), inverted_index)
            query_scores.append((doc, score))

        # Sort the scores for each query in descending order
        query_scores = sorted(query_scores, key=lambda x: x[1], reverse=True)
        ranked_scores[query_index + 1] = query_scores
    return ranked_scores
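# A minimal sketch of the TFIDF helper assumed above. It presumes that
# inverted_index maps each term to {doc_id: [positions]}; both that layout
# and the (1 + log tf) * log(N / df) weighting are assumptions, not
# necessarily the original implementation.
import math

def TFIDF(doc, query_tokens, num_docs, inverted_index):
    score = 0.0
    for term in query_tokens:
        postings = inverted_index.get(term, {})
        if doc not in postings:
            continue
        tf = len(postings[doc])   # term frequency in this document
        df = len(postings)        # number of documents containing the term
        score += (1 + math.log10(tf)) * math.log10(num_docs / df)
    return score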
def computerelations(self, graph):
    for tweet in self:
        tokens = preprocess.tokenise(tweet.message)
        for token in tokens:
            # Does this token look like Twitter's @recipient syntax?
            if token and token[0] == '@':
                user = token[1:]
                self.addrelation(user)
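# A minimal sketch of the addrelation method called above, assuming the
# collection object keeps a {user: mention_count} dict; the attribute name
# `relations` and the counting behaviour are assumptions, not the original code.
def addrelation(self, user):
    if not hasattr(self, 'relations'):
        self.relations = {}
    self.relations[user] = self.relations.get(user, 0) + 1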
def get_word_indices_in_text(words, text):
    text_token = tokenise(text)
    word_indices = []
    for word in words:
        # index() raises ValueError if the word does not occur in the text
        word_index = text_token.index(word.lower())
        word_indices.append(word_index + 1)  # positions are 1-based
    return word_indices
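# Hypothetical usage, assuming tokenise lower-cases and splits on whitespace:
# get_word_indices_in_text(['quick', 'fox'], 'The quick brown fox')
# returns [2, 4]; a word missing from the text raises ValueError.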
def load_docs_from_trec(trec_file=TREC_SAMPLE_FILE):
    # Load the provided TREC sample XML and preprocess each document
    root = load_xml(trec_file, './DOC')
    doc_list = []
    token_doc_list = []
    tokenised_docs = {}
    doc_nums = []
    test_list = []
    for doc in root:
        doc_no = doc.find('DOCNO').text
        headline = doc.find('HEADLINE').text
        text = doc.find('TEXT').text
        headline_with_text = headline + ' ' + text
        doc_nums.append(doc_no)
        doc_list.append(headline_with_text)
        test_list.append(tokenise(headline_with_text))
        tokens = preprocess(headline_with_text)
        token_doc_list.append(tokens)
        tokenised_docs[doc_no] = tokens
    return doc_nums, token_doc_list, tokenised_docs
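# A minimal sketch of the load_xml helper assumed above, built on
# xml.etree.ElementTree; the (path, selector) signature is an assumption.
import xml.etree.ElementTree as ET

def load_xml(path, selector):
    # Parse the file and return all elements matching the selector,
    # e.g. './DOC' for the TREC sample collection.
    tree = ET.parse(path)
    return tree.getroot().findall(selector)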
def extract_features(filename):
    """Open and tokenise the contents of a file."""
    return tokenise(read_corpus_file(filename))
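# A minimal sketch of the read_corpus_file helper, assuming the corpus is a
# plain UTF-8 text file read in one go; encoding and signature are assumptions.
def read_corpus_file(filename):
    with open(filename, encoding='utf-8') as corpus_file:
        return corpus_file.read()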
def prepare_text(filename):
    return splitsentences(tokenise(readcorpusfile(filename)))
def preprocess(doc):
    # Tokenise, remove stop words, then stem
    return stemming(remove_stop_words(tokenise(doc), stop_words))
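# Minimal sketches of the helpers composed by preprocess. The regex tokeniser
# and NLTK's Porter stemmer are assumptions; the original code may use
# different implementations.
import re
from nltk.stem import PorterStemmer

def tokenise(text):
    return re.findall(r"[a-z0-9]+", text.lower())

def remove_stop_words(tokens, stop_words):
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]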