def build_short_mode_text(method_info):
    """Build the " | "-joined short-mode text for one method record.

    The text is "class words | method words | return type | param words".
    Returns "" when the method name produces no NL words at all.
    """
    name_words = " ".join(ParserUtil.extractNLwords([method_info['methodName']]))
    if not name_words:
        return ""
    class_words = " ".join(ParserUtil.extractNLwords([method_info['className']]))
    return_words = method_info['returnType'].lower()
    param_words = " , ".join(
        " ".join(ParserUtil.extractNLwords([param_type]))
        for param_type in method_info['paramTypes'])
    return " | ".join([class_words, name_words, return_words, param_words])
def search_snippets(self, query, with_score=False, cur_snippet=None):
    """Search Elasticsearch and re-rank the hits with the BERT ranker.

    Args:
        query: ES query body. In non-short mode the raw query text is
            recovered from either a ``multi_match`` clause or the first
            ``bool``/``should`` entry.
        with_score: when True, return ``(source, score)`` pairs instead of
            bare ``_source`` dicts.
        cur_snippet: the query snippet dict; required in short mode, where
            its className/methodName build the ranking text.

    Returns:
        Hit ``_source`` dicts (optionally paired with their BERT score),
        sorted by descending score.

    Raises:
        Exception: if the query body matches neither known builder shape.
    """
    search_results = self.es.search(index=[self.index],
                                    doc_type=self.type,
                                    body=query)
    search_results = search_results['hits']['hits']
    if self.short_mode:
        # Short mode ranks on "className | methodName" word sequences only.
        query_snippet_text = " | ".join([
            " ".join(ParserUtil.extractNLwords([cur_snippet['className']])),
            " ".join(ParserUtil.extractNLwords([cur_snippet['methodName']]))
        ])
        candidate_texts = [
            " | ".join([
                " ".join(
                    ParserUtil.extractNLwords([res['_source']['className']])),
                " ".join(
                    ParserUtil.extractNLwords([res['_source']['methodName']]))
            ]) for res in search_results
        ]
    else:
        # Full mode ranks on the raw query text recovered from the ES body.
        if "multi_match" in query['query']:  # BasicQueryBuilder
            query_snippet_text = query['query']['multi_match']['query']
        elif "bool" in query['query']:  # CombineQueryBuilder
            query_snippet_text = query['query']['bool']['should'][0][
                'multi_match']['query']
        else:
            # Was a bare ``raise Exception()``; keep the type so existing
            # callers still catch it, but say what actually went wrong.
            raise Exception(
                "unrecognized query structure: expected 'multi_match' or "
                "'bool' under query['query']")
        candidate_texts = [
            " ".join(res['_source']['tokenSequence'])
            for res in search_results
        ]
    scores = self.bert_manager.rank(query_snippet_text, candidate_texts)
    sorted_scores = sorted(enumerate(scores), key=lambda d: d[1], reverse=True)
    if with_score:
        search_results = [(search_results[i]['_source'], score)
                          for i, score in sorted_scores]
    else:
        search_results = [
            search_results[i]['_source'] for i, score in sorted_scores
        ]
    return search_results
def process(method_info_dict, data, key1, key2, wf, short_mode):
    """Write short-mode text pairs (and, outside short mode, ratio-truncated
    full-token pairs) for one record as JSON lines to ``wf``.

    Skips records whose method ids are missing from ``method_info_dict``.
    """
    if data[key1] not in method_info_dict or data[key2] not in method_info_dict:
        return
    info_a = method_info_dict[data[key1]]
    info_b = method_info_dict[data[key2]]
    data['textA'] = build_short_mode_text(info_a)
    data['textB'] = build_short_mode_text(info_b)
    if data['textA'] and data['textB']:
        # the short mode is necessary
        wf.write('%s\n' % json.dumps(data, ensure_ascii=False))
    if short_mode:
        return
    seq1 = info_a['tokenSequence']
    seq2 = info_b['tokenSequence']
    for ratio in (1, 2, 3, 5, 10):
        # Truncate only side A; side B always uses the full token sequence.
        text_a = " ".join(ParserUtil.extractNLwords(seq1[:len(seq1) // ratio]))
        text_b = " ".join(ParserUtil.extractNLwords(seq2))
        bad_a = text_a == "" or text_a.endswith("| ")
        bad_b = text_b == "" or text_b.endswith("| ")
        if bad_a or bad_b:
            continue
        record = copy.deepcopy(data)
        record['textA'] = text_a
        record['textB'] = text_b
        record['ratio'] = ratio
        wf.write('%s\n' % json.dumps(record, ensure_ascii=False))
def diag_reset_adsl_line(self):
    """Request an ADSL line reset (DSL diagnostic code 4) and parse the response."""
    html = RequestsUtil.diagnostic_dsl_html(4)
    return ParserUtil.parse_diagnostic_responce(html)
def diag_get_dls_line_status(self):
    """Fetch the DSL line status (DSL diagnostic code 3) and parse the response."""
    html = RequestsUtil.diagnostic_dsl_html(3)
    return ParserUtil.parse_diagnostic_responce(html)
def diag_get_atm_loopback_test(self):
    """Run the ATM loopback test (DSL diagnostic code 2) and parse the response."""
    html = RequestsUtil.diagnostic_dsl_html(2)
    return ParserUtil.parse_diagnostic_responce(html)
def diag_get_atm_status(self):
    """Fetch the ATM status (DSL diagnostic code 1) and parse the response."""
    html = RequestsUtil.diagnostic_dsl_html(1)
    return ParserUtil.parse_diagnostic_responce(html)
def traceroute_ipv6(self, target):
    """Run an IPv6 traceroute to ``target`` (ping diagnostic code 3) and parse it."""
    html = RequestsUtil.diagnostic_ping_html(target, 3)
    return ParserUtil.parse_diagnostic_responce(html)
def ping(self, target):
    """Ping ``target`` (ping diagnostic code 1) and parse the response."""
    html = RequestsUtil.diagnostic_ping_html(target, 1)
    return ParserUtil.parse_diagnostic_responce(html)
def get_status(self, force=None):
    """Return the parsed status page, cached for CACHE_TIME seconds.

    A truthy ``force`` bypasses the cache and refetches.
    """
    # Short-circuit matters: status_time is only compared once a cached
    # status exists, so the first call never touches an unset timestamp.
    stale = force or not self.status or (self.status_time + CACHE_TIME) < time()
    if stale:
        self.status = ParserUtil.parse_status(RequestsUtil.get_status_html())
        self.status_time = time()
    return self.status
def get_connections(self, force=None):
    """Return the parsed connections page, cached for CACHE_TIME seconds.

    A truthy ``force`` bypasses the cache and refetches.
    """
    # Short-circuit matters: connections_time is only compared once a cached
    # result exists, so the first call never touches an unset timestamp.
    stale = (force or not self.connections
             or (self.connections_time + CACHE_TIME) < time())
    if stale:
        self.connections = ParserUtil.parse_connections(
            RequestsUtil.get_connections_html())
        self.connections_time = time()
    return self.connections
search_results = retriever.search_snippets(basic_query) search_results = deduplicate(snippet, search_results) if user_bert: if short_mode: ## short-bert mode query_snippet_text = build_short_mode_text(snippet) candidate_texts = [ build_short_mode_text(res) for res in search_results ] scores = short_bert_manager.rank(query_snippet_text, candidate_texts) else: ## full-bert mode query_snippet_text = " ".join( ParserUtil.extractNLwords(text_tokens)) candidate_texts = [ " ".join( ParserUtil.extractNLwords(res['tokenSequence'])) for res in search_results ] scores = full_bert_manager.rank(query_snippet_text, candidate_texts) sorted_scores = sorted([(i, score) for i, score in enumerate(scores)], key=lambda d: d[1], reverse=True) # 如果bert得分不高,就返回文本匹配的结果 tmp_indices = [] for i, score in sorted_scores[:max_size]: if score >= 0.4:
def search_codes(self): rawbody = cherrypy.request.body.read( int(cherrypy.request.headers['Content-Length'])) jsonbody = json.loads(rawbody) code_context_tokens = jsonbody['codeContextTokens'] snippet = jsonbody['snippet'] user_bert = jsonbody['useBert'] text_tokens = snippet['tokenSequence'] if self.do_extend: inferred_text_tokens = self.lm_infer.infer(code_context_tokens, text_tokens, self.extend_token_len) extend_query = self.extend_query_builder.build_query( text_tokens, inferred_text_tokens, self.max_size * 10) search_results = self.retriever.search_snippets(extend_query, with_score=True) else: basic_query = self.basic_query_builder.build_query( text_tokens, self.max_size * 10) search_results = self.retriever.search_snippets(basic_query, with_score=True) distinct_results = deduplicate(snippet, search_results, with_score=True) if user_bert and self.args.use_bert: if len(snippet['lineCodes']) <= 2: ## short-bert mode # query_snippet_text = build_short_mode_text(snippet) # candidate_texts = [build_short_mode_text(res) for res, _ in search_results] query_snippet_text = " | ".join([ " ".join(ParserUtil.extractNLwords([snippet['className'] ])), " ".join(ParserUtil.extractNLwords([snippet['methodName'] ])) ]) candidate_texts = [ " | ".join([ " ".join(ParserUtil.extractNLwords([res['className'] ])), " ".join(ParserUtil.extractNLwords([res['methodName'] ])) ]) for res, _ in distinct_results ] scores = self.short_bert_manager.rank(query_snippet_text, candidate_texts) else: ## full-bert mode query_snippet_text = " ".join( ParserUtil.extractNLwords(snippet['tokenSequence'])) candidate_texts = [ " ".join(ParserUtil.extractNLwords(res['tokenSequence'])) for res, _ in distinct_results ] scores = self.full_bert_manager.rank(query_snippet_text, candidate_texts) sorted_scores = sorted([(i, score) for i, score in enumerate(scores)], key=lambda d: d[1], reverse=True) tmp_indices = [] for i, score in sorted_scores[:self.max_size]: if score >= 0.0: tmp_indices.append(i) else: 
tmp_index_set = set(tmp_indices) for idx in range(min(self.max_size, len(sorted_scores))): if idx not in tmp_index_set: tmp_indices.append(idx) break distinct_results = [distinct_results[idx] for idx in tmp_indices] distinct_results = distinct_results[:self.max_size] distinct_results = [{ 'methodInfo': res[0], 'score': float(res[1]) } for res in distinct_results] response = json.dumps(distinct_results) print(" ".join(text_tokens)) print("res size:", len(distinct_results)) method_ids = [(i + 1, res['methodInfo']['methodId']) for i, res in enumerate(distinct_results)] print(method_ids) print('=' * 80) return response