def generate_snippet(self, doc, query):
    """Return *doc* with every word that also appears in *query* wrapped
    in double quotes, for display as a search-result snippet.

    :param doc: document text (whitespace-separated words)
    :param query: query string (whitespace-separated terms)
    :return: the document text with matching words quoted; every emitted
        word is followed by a single space (preserves the original
        output format, including the trailing space).
    """
    # NOTE(review): the original fetched FileAccess().get_stop_words()
    # but never used it — the dead call (and its file I/O) is removed.
    # Set membership is O(1) per word; the original list/index() dance
    # was O(n*m) and only quoted the FIRST occurrence of each term.
    query_terms = set(query.split())
    pieces = []
    for word in doc.split():
        if word in query_terms:
            pieces.append('"' + word + '" ')
        else:
            pieces.append(word + ' ')
    return ''.join(pieces)
def build_stopped_corpus(self):
    """Build the 'stopped_cacm' corpus by removing stop words from every
    cleaned ``*.html`` file found in 'clean_cacm' (both directories are
    resolved under the current working directory).

    Side effects: creates the directories if missing, writes one stopped
    file per input file, and — like the original — leaves the process
    chdir'ed into the clean_cacm directory.
    """
    cwd = os.getcwd()
    clean_cacm = os.path.join(cwd, 'clean_cacm')
    stopped_cacm = os.path.join(cwd, 'stopped_cacm')
    fa = FileAccess()

    if not os.path.exists(clean_cacm):
        # Nothing to process yet: create the folder, tell the user, bail.
        # (Fixed "PLease" typo in the user-facing message.)
        print("Clean corpus doesn't exist. It is created now. "
              "Please put cleaned files inside the corpus folder")
        os.makedirs(clean_cacm, 0o755)
        return
    if not os.path.exists(stopped_cacm):
        os.makedirs(stopped_cacm, 0o755)

    # set() makes per-word membership tests O(1) instead of scanning a list.
    stop_words = set(fa.get_stop_words())
    os.chdir(clean_cacm)
    for eachfile in glob.glob('*.html'):
        print(eachfile)
        # 'with' guarantees both handles are closed; the original leaked
        # the read handle entirely.
        with open(eachfile) as infile:
            words = infile.read().split()
        final_content = " ".join(x for x in words if x not in stop_words)
        with open(os.path.join(stopped_cacm, eachfile), 'w') as outfile:
            outfile.write(final_content)
def get_stopped_queries(self, query_dict):
    """Return a copy of *query_dict* with stop words removed from each query.

    :param query_dict: mapping of query id -> query string
    :return: new dict mapping the same ids to stop-word-filtered query
        strings (words re-joined with single spaces)
    """
    fa = FileAccess()
    # set() makes per-word membership tests O(1); the original also had a
    # pointless `query_dict = query_dict` self-assignment, now removed.
    stop_words = set(fa.get_stop_words())
    stopped_queries = {}
    for qid, query in query_dict.items():
        kept = [word for word in query.split() if word not in stop_words]
        stopped_queries[qid] = " ".join(kept)
    return stopped_queries