def tokenize_photo(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Photo, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for photo in docs_all:
        tokens = queryProcess.processLine(photo.photo_name)
        for token in tokens:
            # increment the term frequency of this token in this photo
            tokens_lst[token][photo.photo_id] = tokens_lst[token].get(photo.photo_id, 0) + 1
    return tokens_lst, num_docs
def tokenize_link(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Link, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for link in docs_all:
        # join the searchable text fields; filter(None, ...) guards against
        # missing (None or empty) values, which would break concatenation
        text = ' '.join(filter(None, [link.link_name, link.link_description, link.link_message]))
        tokens = queryProcess.processLine(text)
        for token in tokens:
            tokens_lst[token][link.link_id] = tokens_lst[token].get(link.link_id, 0) + 1
    return tokens_lst, num_docs
def tokenize_comment(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Comment, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for comment in docs_all:
        tokens = queryProcess.processLine(comment.comment_message)
        for token in tokens:
            tokens_lst[token][comment.comment_id] = tokens_lst[token].get(comment.comment_id, 0) + 1
    return tokens_lst, num_docs
def tokenize_status(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Status, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for status in docs_all:
        tokens = queryProcess.processLine(status.status_message)
        for token in tokens:
            tokens_lst[token][status.status_id] = tokens_lst[token].get(status.status_id, 0) + 1
    return tokens_lst, num_docs
def tokenize_post(fb_owner_id, selected_friends):
    docs_all = get_docs(fb_owner_id, selected_friends, Post, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for post in docs_all:
        # join the searchable text fields; filter(None, ...) guards against
        # missing (None or empty) values, which would break concatenation
        text = ' '.join(filter(None, [post.post_caption, post.post_description,
                                      post.post_message, post.post_story, post.post_name]))
        tokens = queryProcess.processLine(text)
        for token in tokens:
            tokens_lst[token][post.post_id] = tokens_lst[token].get(post.post_id, 0) + 1
    return tokens_lst, num_docs
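# The five tokenize_* functions above share the same counting loop. A minimal
# refactoring sketch (hypothetical helper, not part of the original module):
# each caller supplies the model class and an extract function that returns
# the (doc_id, text) pair for one record.
def tokenize_generic(fb_owner_id, selected_friends, model, extract):
    docs_all = get_docs(fb_owner_id, selected_friends, model, "owner_id")
    tokens_lst = defaultdict(dict)
    num_docs = docs_all.count()
    for doc in docs_all:
        doc_id, text = extract(doc)
        for token in queryProcess.processLine(text):
            tokens_lst[token][doc_id] = tokens_lst[token].get(doc_id, 0) + 1
    return tokens_lst, num_docs

# For example, tokenize_status could then be written as:
#   tokenize_generic(fb_owner_id, selected_friends, Status,
#                    lambda s: (s.status_id, s.status_message))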
def apply_search(owner_id, selected_friends, query, c_type):
    # tokens_lst maps each token to {document id: term frequency}.
    # For example, if token 'A' occurs once in documents 1 and 9 and twice
    # in document 4, its entry is {'A': {1: 1, 4: 2, 9: 1}}.
    tokens_lst, num_docs = get_tokens(owner_id, selected_friends, c_type)

    # eliminate stopwords and apply stemming
    tokens_lst = queryProcess.stemmer(tokens_lst, stopwords)

    # ===== Apply the weighting scheme ===== #
    # doc_freq_lst maps each token to its document frequency
    doc_freq_lst = dict()
    for token, doc_list in tokens_lst.items():
        doc_freq_lst[token] = len(doc_list)
    weight_index = index.calcWeight(tokens_lst, num_docs)
    doc_length = index.calcDocLen(weight_index)

    # =========== Process the query ============ #
    # the query is treated as a one-document collection
    query_doc_no = 1
    query_tokens_lst = defaultdict(dict)
    # tokenize the query
    tokens = queryProcess.processLine(query)
    for token in tokens:
        query_tokens_lst[token][query_doc_no] = query_tokens_lst[token].get(query_doc_no, 0) + 1
    # stem and eliminate stopwords
    query_tokens_lst = queryProcess.stemmer(query_tokens_lst, stopwords)

    # ========== Extract the query set ============ #
    # doc_set maps each candidate document to its similarity score with the
    # query; a candidate is any document containing at least one query term
    doc_set = dict()
    for term in query_tokens_lst:
        for doc_no in tokens_lst[term]:
            # initialize the similarity score to 0
            if doc_no not in doc_set:
                doc_set[doc_no] = 0

    # calculate query term weights and query length
    query_weight_index = index.calcQueryWeight(doc_freq_lst, query_tokens_lst, num_docs)
    query_length = index.calcDocLen(query_weight_index)

    # Cosine similarity: accumulate the inner product of document and query
    # weights, then normalize by the two vector lengths. The .get() fallbacks
    # guard against query terms that never appear in the indexed documents.
    for term in query_tokens_lst:
        for doc_no in doc_set:
            doc_set[doc_no] += weight_index.get(term, {}).get(doc_no, 0) * query_weight_index[term][query_doc_no]
    for doc_no in doc_set:
        doc_set[doc_no] = doc_set[doc_no] / (doc_length[doc_no] * query_length[query_doc_no])

    results = get_results(doc_set, c_type)
    return results
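# A minimal usage sketch (hypothetical ids and content-type tag; the valid
# c_type values depend on how get_tokens() and get_results() dispatch to the
# tokenize_* functions above):
#
#   results = apply_search(owner_id="1234567890",
#                          selected_friends=["111", "222"],
#                          query="hiking trip photos",
#                          c_type="photo")
#
# results is whatever get_results() builds from the {doc_id: score} map, so
# callers can rank the candidate documents by descending cosine similarity.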