def search_basic(source, text, engine=None, total_result=100):
    """Run a plain BM25 search over ``source`` and return the top
    ``total_result`` documents."""
    if not engine:
        preprocess = source.preprocessor
        # Expand terms while building the document representation.
        preprocess.expand_terms = True
        index = preprocess.get_representation_docs()
        query_processor = QueryProcessor(preprocessor=preprocess)
        search_model = BM25Search()
        search_model.set_index(index)
        engine = Engine(source, query_processor, search_model,
                        free_search=True)
    query_result = engine.search(text)
    # Reset term expansion on the engine's preprocessor after searching.
    engine.preprocessor.expand_terms = False
    docs_list_retrieval = [
        source.read_doc(engine.index.get_doc_item(d).id)
        for d in query_result.docs_retrieval[:total_result]
    ]
    return docs_list_retrieval
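# A minimal usage sketch for ``search_basic`` (illustrative only: the
# ``ListSource`` arguments mirror the call in ``find_sort_concepts`` below,
# and the document list, language tag and ``info`` label are assumptions):
#
#     source = ListSource(list_docs=['first document text', 'second one'],
#                         list_query=[], language='english', info='demo')
#     top_docs = search_basic(source, 'document text', total_result=10)
#     for doc in top_docs:
#         print(doc.text)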
class AnalysisContextLocal(QueryProcessor):
    '''Local Context Analysis (LCA) query-expansion algorithm.'''

    def __init__(self, top_docs=None, window_size=300, n_top_ranked=10,
                 n_passages=50, m_concept=5, factor=0.0001, preprocessor=None):
        self.top_docs = top_docs
        self.window_size = window_size
        self.n_passages = n_passages
        self.m_concept = m_concept
        self.n_top_ranked = n_top_ranked
        self.factor = factor
        self.engine = None
        self.language = preprocessor.lang
        self.model = None
        self.passages = []
        super(AnalysisContextLocal, self).__init__(expanded=True,
                                                   reduce=True,
                                                   top_docs=top_docs,
                                                   limit=m_concept,
                                                   preprocessor=preprocessor)

    @property
    def info(self):
        return 'acl-exp'

    def find_passage(self, window_size, document):
        """Split a document into passages: fixed word windows when
        ``window_size`` is set, otherwise groups of up to six sentences."""
        if window_size:
            return [' '.join(ck)
                    for ck in chunks(document.text.split(), window_size)]
        sentences = [
            p for p in re.split(r"[.!?;]\s",
                                ' '.join(document.text.splitlines()))
            if len(p) > 10
        ]
        list_passages = []
        r = ''
        i = 0
        for p in sentences:
            if i > 5:
                # Close the current passage after six sentences.
                list_passages.append(r)
                r = ''
                i = 0
            r += ' ' + p
            i += 1
        if r:
            # Keep the trailing, partially filled passage.
            list_passages.append(r)
        return list_passages

    def make_concepts(self, text):
        """Extract candidate concepts: nouns longer than two characters."""
        assert isinstance(text, str)
        is_noun = lambda pos: pos[:2] == 'NN'
        self.engine.preprocessor.pos_tag = True
        return [
            word for (word, pos) in self.engine.preprocessor.tokenize(text)
            if is_noun(pos) and len(word) > 2
        ]

    def find_sort_concepts(self, passages, query):
        """Index the passages, rank them against the query with a vector
        model and collect concepts from the ``n_passages`` best ones."""
        millis = int(round(time.time() * 1000))
        source = ListSource(list_docs=passages, list_query=[],
                            language=self.language, info=millis)
        source.preprocessor.use_stop_words = True
        source.preprocessor.save = False
        source.preprocessor.reduce = True
        source.preprocessor.pos_tag = False
        source.preprocessor.expand_terms = True
        index = source.preprocessor.get_representation_docs(force=True,
                                                            do_idf=False,
                                                            type_tf=3,
                                                            norm=None)
        query_processor = QueryProcessor(expanded=False, reduce=True,
                                         preprocessor=source.preprocessor)
        search_model = VectorSearch()
        search_model.set_index(index)
        self.engine = Engine(source, query_processor, search_model,
                             free_search=True)
        query_result = self.engine.search(query)
        docs_list_retrieval = [
            source.read_doc(d)
            for d in query_result.docs_retrieval[:self.n_passages]
        ]
        concepts = []
        for d in docs_list_retrieval:
            concepts.extend(self.make_concepts(d.text))
        return list(set(concepts))

    def func_correlation(self, concept, ki):
        """Co-occurrence value of ``concept`` and ``ki`` in the passage
        co-occurrence matrix; 0.0 when either term is not indexed."""
        try:
            r = self.model[self.engine.index.get_feature_id(concept),
                           self.engine.index.get_feature_id(ki)]
            return r if r >= 0 else 0.0
        except Exception:
            # Term is not present in the passage index.
            return 0.0

    def IDF(self, term, N, Nc):
        """Scaled inverse document frequency of a term, capped at 1."""
        return min(1, np.log10(N / Nc) / 5)

    def exist_vocab(self, k):
        return k in self.engine.index.index

    def co_degree(self, c, ki, N, Nc):
        """Degree of co-occurrence between concept ``c`` and query term
        ``ki``, normalised by the number of top-ranked documents."""
        return (np.log10(self.func_correlation(c, ki) + 1)
                * self.IDF(c, N, Nc) / np.log10(self.n_top_ranked))

    def similarity(self, query, concept):
        """Similarity between the whole query and a candidate concept."""
        assert isinstance(query, Query)
        N = len(self.preprocessor.representation.documents)
        try:
            Nc = len(self.preprocessor.representation.index[concept])
        except KeyError:
            # Concept does not occur in the collection index.
            return 0.0
        simqc = []
        for ki in query.query_vector:
            try:
                Nk = len(self.preprocessor.representation.index[ki])
            except KeyError:
                # Query term does not occur in the collection index.
                continue
            simqc.append((self.factor
                          + self.co_degree(concept, ki, N, Nc))
                         ** self.IDF(ki, N, Nk))
        return np.prod(simqc)

    def representation(self):
        self.model = self.engine.index.comatrix

    def expand_query(self, query):
        if not (isinstance(query, Query) and len(query.query_vector) > 0):
            raise Exception('query must be a non-empty Query instance')
        # Retrieve the top-ranked documents for the original query.
        self.top_docs = search_basic(self.preprocessor.source, query.text,
                                     total_result=self.n_top_ranked)
        documents = self.top_docs
        # Split the top documents into passages.
        self.passages = []
        if documents:
            for doc in documents:
                self.passages.extend(
                    self.find_passage(self.window_size, doc))
        # Extract candidate concepts from the best-matching passages.
        top_concepts = self.find_sort_concepts(self.passages, query.text)
        self.representation()
        # Score each concept against the query, skipping terms the query
        # already contains.
        m_scores = {cpt: self.similarity(query, cpt)
                    for cpt in top_concepts}
        for w in query.query_vector:
            m_scores.pop(w, None)
        best_m = sorted(m_scores, key=m_scores.get,
                        reverse=True)[:self.m_concept]
        # Original terms keep weight 2.0; expansion terms decay from 1.0.
        query.query_score = [2.0 for _ in query.query_vector]
        for i, c in enumerate(best_m):
            query.query_vector.append(c)
            query.query_score.append(1 - (0.9 * i / self.m_concept))
        return query
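# A minimal sketch of plugging the expansion into an Engine, mirroring the
# wiring used in ``search_basic`` above (illustrative only; ``my_source``
# stands for any Source with a configured preprocessor):
#
#     acl = AnalysisContextLocal(preprocessor=my_source.preprocessor)
#     model = BM25Search()
#     model.set_index(my_source.preprocessor.get_representation_docs())
#     engine = Engine(my_source, acl, model, free_search=True)
#     result = engine.search('neural ranking')  # query is expanded first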
def get_context_data(self, **kwargs):
    context = super(SearchView, self).get_context_data(**kwargs)
    get = self.request.GET
    source_select = get.get('source_select') or None
    query_processor_select = get.get('query_processor') or None
    search_model_select = get.get('search_model') or None
    query_text = get.get('q') or None
    qid = int(get['qid']) if get.get('qid') else None
    drid = int(get['drid']) if get.get('drid') else None
    page = int(get['page']) if get.get('page') else 1
    free_search = get.get('free_search') == 'on'
    metrics_search = get.get('metrics_search') == 'on'
    try:
        if (source_select and query_processor_select
                and search_model_select and query_text):
            source_type = SourceType.objects.filter(slug=source_select,
                                                    enable=True).first()
            if source_type.slug == 'artigos':
                source = get_class(source_type.instance)(
                    path=os.path.join(BASE_DIR, source_type.path))
            else:
                source = SOURCES_LIST[source_select]
            q_pro_type = QueryProcessorType.objects.filter(
                slug=query_processor_select).first()
            query_processor = get_class(q_pro_type.instance)(
                preprocessor=source.preprocessor)
            search_type = SearchType.objects.filter(
                slug=search_model_select).first()
            search_model = get_class(search_type.instance)()
            engine = Engine(source, query_processor, search_model,
                            free_search=free_search)
            related_docs = None
            metrics = None
            if drid is not None:
                related_docs = [drid]
            now = datetime.now()
            query_result = engine.search(query_text, qid=qid,
                                         related_docs=related_docs)
            if metrics_search and not free_search:
                metrics = engine.calculate_metrics(
                    query_result, self.request.session.session_key)
            later = datetime.now()
            diff = later - now
            diff_in_seconds = diff.seconds + diff.microseconds / 1E6
            docs_list = query_result.docs_retrieval
            docs_list_relevant = query_result.docs_relevant
            context['total_docs'] = len(docs_list)
            paginator = Paginator(docs_list, 10)
            paginator2 = Paginator(docs_list_relevant, 10)
            try:
                docs = paginator.page(page)
            except PageNotAnInteger:
                page = 1
                docs = paginator.page(page)
            except EmptyPage:
                page = 1
                docs = paginator.page(paginator.num_pages)
            try:
                docs_rel = paginator2.page(page)
            except PageNotAnInteger:
                docs_rel = paginator2.page(1)
            except EmptyPage:
                docs_rel = []
            if docs:
                ids = [engine.index.get_doc_item(d).id for d in docs]
                docs.object_list = DocumentData.objects.filter(
                    idd__in=ids, source=source_type)
            if docs_rel:
                ids = [d for d in docs_rel]
                docs_rel.object_list = DocumentData.objects.filter(
                    idd__in=ids, source=source_type)
            context['documents'] = docs
            context['documents_relevant'] = docs_rel
            context['result_config'] = str(engine)
            context['query_text'] = query_text
            context['drid'] = drid
            context['qid'] = qid
            context['document_related'] = DocumentData.objects.filter(
                idd=drid, source=source_type).first()
            context['query_result'] = str(engine.query.dic_vector())
            context['source_result'] = source_type.name
            context['expansion_result'] = q_pro_type.name
            context['model_result'] = engine.search_model.info
            context['context_result'] = None
            context['page'] = page
            context['total_time'] = '%.2f' % diff_in_seconds
            context['metrics'] = metrics
            # Persist the request for later analysis.
            query_request = QueryRequest()
            query_request.text = query_text
            query_request.query_processor = q_pro_type
            query_request.search = search_type
            query_request.time_request = diff_in_seconds
            query_request.free_search = free_search
            query_request.source = source_type
            query_request.page = page
            query_request.save()
        else:
            messages.add_message(self.request, ERROR,
                                 _('Please fill out the information'))
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        messages.add_message(self.request, ERROR, str(e))
    # Remember the selected configuration in the session.
    self.request.session['source_select'] = source_select
    self.request.session['query_processor_select'] = query_processor_select
    self.request.session['search_model_select'] = search_model_select
    self.request.session['free_search'] = 'on' if free_search else ''
    self.request.session['metrics_search'] = 'on' if metrics_search else ''
    session2context(context, self.request.session)
    return context
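# ``get_class`` above resolves the dotted-path strings stored on
# SourceType/QueryProcessorType/SearchType rows into classes. A minimal
# sketch of such a helper, assuming 'package.module.ClassName' paths (the
# project's actual implementation may differ):
#
#     import importlib
#
#     def get_class(dotted_path):
#         module_path, _, class_name = dotted_path.rpartition('.')
#         return getattr(importlib.import_module(module_path), class_name)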
search_model.set_index(index)
print('Setting Engine')
engine = Engine(source, query_processor, search_model, free_search=False)
fo_rel = open(path_relevant_file % engine.slug(), 'wt')
fo_ret = open(path_retrieval_file % engine.slug(), 'wt')
z = 0
for q in list(source.read_querys()):
    print('Read query %s' % q.id)
    z += 1
    query_result = engine.search(q.text, qid=q.id)
    # TODO: reconcile document UID vs. ID here.
    docs_list_retrieval = [
        engine.index.get_doc_item(d).id
        for d in query_result.docs_retrieval
    ]
    docs_list_relevant = list(query_result.docs_relevant)
    docs_score_relevant = list(query_result.docs_scores)
    print('Write out...')
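# One possible write-out for the two files above (illustrative only: the
# record layout is an assumption, not necessarily the format the project's
# evaluation scripts expect):
#
#     for d in docs_list_retrieval:
#         fo_ret.write('%s %s\n' % (q.id, d))
#     for d, v in zip(docs_list_relevant, docs_score_relevant):
#         fo_rel.write('%s %s %s\n' % (q.id, d, v))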