def fuzzy_search(self, path_queries, path_jdk, respond_top_n, path_save):
    """Run progressively fuzzier regex searches for each pickled query.

    For every query, search with all words first, then keep dropping the
    highest-priority words (per the query's sort order) until at least
    `respond_top_n` responses are collected or no words remain.  Saves the
    accumulated responses and the regex commands used per query under
    `path_save` as respond<i>.pkl / cmd<i>.pkl.

    :param path_queries: pickle of [word_list, sort_list] pairs
        (presumably produced by query_parse_tree — TODO confirm)
    :param path_jdk: pickle of the JDK vocabulary passed to search_respond
    :param respond_top_n: minimum number of responses to collect per query
    :param path_save: output folder prefix for the per-query pickles
    """
    queries = cm.load_pkl(path_queries)
    jdk = cm.load_pkl(path_jdk)
    total = respond_top_n
    for i in range(len(queries)):
        query = queries[i]
        query_words = list(query[0])
        query_sorts = list(query[1])
        # Strictest pattern first: all words, in order, separated by wildcards.
        cmd = '.*' + '.*'.join(query_words) + '.*'
        data = []
        cmds = []
        source_hash = []
        respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
        data.extend(respond)
        cmds.extend(query_cmd)
        idx = 0
        # Relax the pattern by dropping ranked words until we have enough hits.
        while len(data) < total and len(query_words) - idx > 0:
            temp = []
            if idx == 0:
                s = [query_sorts[0]]
            else:
                s = query_sorts[:idx]
            # NOTE(review): idx == 0 and idx == 1 both drop query_sorts[:1],
            # so the second relaxation pass repeats the first — confirm
            # whether `query_sorts[:idx + 1]` was intended.
            for j in range(len(query_words)):
                if j not in s:
                    temp.append(query_words[j])
            cmd = '.*' + '.*'.join(temp) + '.*'  # wrap/join remaining words with .* wildcards
            respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
            data.extend(respond)
            cmds.extend(query_cmd)
            idx += 1
        cm.save_pkl(path_save + 'respond' + str(i) + '.pkl', data)
        cm.save_pkl(path_save + 'cmd' + str(i) + '.pkl', cmds)
        print(str(i) + '-' + str(len(queries)) + ' ' + str(len(data)))
def jdk(path_from, path_to):
    """Build a lowercase name -> cleaned-description vocabulary from a
    comma-separated JDK listing and pickle it to `path_to`.

    Each input line is split on ',': column 0 becomes the (lowercased) key,
    column 1 is filtered through cm.filter_digit_english, lowercased and
    stripped of spaces to form the value.
    """
    vocab = {}
    with open(path_from, 'r', encoding='utf-8') as infile:
        for raw in infile:
            parts = str(raw).split(',')
            cleaned = str(cm.filter_digit_english(parts[1])).lower().replace(' ', '')
            vocab[parts[0].lower()] = cleaned
    cm.save_pkl(path_to, vocab)
def analyze_parsed(path_parsed_vocab, path_parsed):
    """Reduce a fully-qualified-name vocabulary to stemmed simple names.

    Entries are visited in descending (count, name) order; for each, the
    text after the last '.' is lowercased and stemmed, and only the first
    (i.e. highest-count) value per stemmed key is kept.  Result is pickled
    to `path_parsed`.
    """
    raw = dict(cm.load_pkl(path_parsed_vocab))
    ordered = sorted(raw.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    stemmed = {}
    for name, count in ordered:
        simple = name[str(name).rfind('.') + 1:].lower()
        key = cm.get_stemmed(simple)
        if key not in stemmed:
            stemmed[key] = count
    cm.save_pkl(path_parsed, stemmed)
def query_parse(path_from, path_parsed_vocab, path_method_vocab, path_to):
    """POS-tag each query and score its tokens against the method/JDK vocabs.

    For every query line, strips Java boilerplate phrases, tokenizes and
    POS-tags it, then emits one [stemmed_value, pos_tag, para, impact]
    vector per token whose tag is in `type_all`:
      para 0 / impact 0  — token unknown to both vocabularies
      para 1             — token stem (or its best-known synonym) in vword
      para 2             — noun whose stem is in vjdk (overrides para 1)
    The list of vectors per query is pickled to `path_to`.
    """
    queries = cm.load_txt(path_from)
    vjdk = dict(cm.load_pkl(path_parsed_vocab))
    vword = dict(cm.load_pkl(path_method_vocab))
    stemmer = PorterStemmer()
    # Boilerplate phrases removed from queries before tokenizing.
    str_replace = [
        'in java', 'using java', 'java', 'how to', 'how do', 'what is'
    ]
    data_queries = list()
    p = 0  # running token count; kept from the original although unused downstream
    for i in range(len(queries)):
        print(str(i))
        query = queries[i]
        for str_re in str_replace:
            query = query.replace(str_re, '')
        data = []
        tokens = cm.get_tokens(query)
        p += len(tokens)
        tokens = nltk.pos_tag(tokens)
        for token in tokens:
            tvalue = token[0]
            ttype = token[1]
            if ttype in type_all:
                para = 0
                impact = 0
                stem = stemmer.stem(tvalue)
                if stem in vword:
                    para = 1
                    impact = vword[stem]
                else:
                    # Unknown stem: try synonyms, scoring each by its stemmed
                    # frequency in vword.
                    freq = []
                    syns = cm.get_synonyms(stem)
                    for syn in syns:
                        # BUGFIX: use a separate variable instead of clobbering
                        # `stem` — the original left `stem` as the last
                        # synonym's stem, corrupting the vjdk check below.
                        syn_stem = cm.get_stemmed(syn)
                        freq.append(vword.get(syn_stem, 0))
                    if freq:
                        idx_max_freq = freq.index(max(freq))
                        tvalue = syns[idx_max_freq]
                        para = 1
                        # BUGFIX: the original did vword[tvalue] with the raw
                        # synonym, but vword is keyed by stems — KeyError risk.
                        # The chosen synonym's score is exactly freq's maximum.
                        impact = freq[idx_max_freq]
                        # NOTE(review): if max(freq) == 0 no synonym is really
                        # known, yet para is still set to 1 — confirm intended.
                if ttype in type_nn and stem in vjdk:
                    para = 2
                    impact = vjdk[stem]
                tvalue = cm.get_stemmed(tvalue)
                vector = [tvalue, ttype, para, impact]
                data.append(vector)
        data_queries.append(data)
    cm.save_pkl(path_to, data_queries)
def analyze_method(path_parsed_vocab, path_method):
    """Aggregate a word vocabulary by stemmed, lowercased key.

    Counts of entries that collapse onto the same stem are summed; the
    merged vocabulary is pickled to `path_method`.
    """
    raw = dict(cm.load_pkl(path_parsed_vocab))
    ordered = sorted(raw.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    totals = {}
    for name, count in ordered:
        key = cm.get_stemmed(name).lower()
        totals[key] = totals.get(key, 0) + count
    cm.save_pkl(path_method, totals)
def stat_parsed(path_jdk, file_from, folder_to, total_num=-1):
    """Count API occurrences in pickled API sequences, split JDK vs other.

    Reads methname.pkl / apiseq.pkl from `file_from`, strips call/index/
    generic suffixes from each ';'-separated API token, and tallies it into
    the JDK vocabulary if it appears in the `path_jdk` mapping, otherwise
    into the generic API vocabulary.  Also counts methods whose name tokens
    occur in their own API sequence (`relate`).  Writes
    parsed_vocab_api.pkl / parsed_vocab_jdk.pkl under `folder_to`.

    :param total_num: number of entries to process; -1 means all.
    """
    base = folder_to
    all_api = 0
    all_jdk = 0
    relate = 0
    vocab_api = collections.defaultdict(int)
    vocab_jdk = collections.defaultdict(int)
    jdk = dict(cm.load_pkl(path_jdk))
    methods = cm.load_pkl(file_from + 'methname.pkl')
    lines = cm.load_pkl(file_from + 'apiseq.pkl')
    assert len(lines) == len(methods)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        method_tokens = methods[i]
        apiseq = lines[i].lower()
        # BUGFIX: was `apiseq is not '[]'` — identity comparison against a
        # string literal is implementation-defined; compare by value.
        if apiseq != '[]':
            for token in method_tokens:
                if apiseq.find(token) >= 0:
                    relate += 1
                    break
            flag_api = 0
            flag_jdk = 0
            apis = apiseq.split(';')
            if len(apis) > 0:
                for api in apis:
                    if '.' in api:
                        flag_api = 1
                    # Strip call/index/generic suffixes back to the bare name.
                    if api.endswith(')'):
                        api = api[0:api.find('(')]
                    if api.endswith(']'):
                        api = api[0:api.find('[')]
                    if api.endswith('>'):
                        api = api[0:api.find('<')]
                    if api in jdk:
                        flag_jdk = 1
                        vocab_jdk[api] += 1
                    else:
                        vocab_api[api] += 1
            all_api += flag_api
            all_jdk += flag_jdk
        # Progress trace.
        print(
            str(i) + '-' + str(total) + ' ' + '-' + str(all_api) + '-' +
            str(relate) + ' ' + str(len(vocab_api.keys())) + '-' +
            str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)
def format_data(self, path_pased_repos, path_formatted_repos, total_files, repo_split_size):
    """Bundle per-file parsed-repo text columns into bulk-index documents.

    For each index i with an existing comment file, loads the nine parallel
    text columns, zips them row-wise into {"_index", "_type", "_source"}
    dicts, and pickles the accumulated list in chunks of `repo_split_size`
    records as body<index>.pkl under `path_formatted_repos` (plus a final
    partial chunk).
    """
    fields = ['comment', 'javadoc', 'method', 'modifier', 'package',
              'parameter', 'parsed', 'return', 'source']
    body = []
    count = 0
    index = 0
    root = path_pased_repos
    for i in range(total_files):
        print(str(i))
        if not os.path.exists(root + 'comment/comment' + str(i) + '.txt'):
            continue
        # Each column lives in its own subfolder: <field>/<field><i>.txt
        columns = {f: cm.load_txt(root + f + '/' + f + str(i) + '.txt')
                   for f in fields}
        for j in range(len(columns['comment'])):
            body.append({
                "_index": self.index_name,
                "_type": self.doc_type,
                "_source": {f: columns[f][j] for f in fields}
            })
            count += 1
            if count == repo_split_size:
                cm.save_pkl(
                    path_formatted_repos + 'body' + str(index) + '.pkl', body)
                index += 1
                count = 0
                body = []
    cm.save_pkl(path_formatted_repos + 'body' + str(index) + '.pkl', body)
def query_parse_tree(path_from, path_to):
    """Sort each query's words into a relaxation order for fuzzy search.

    Each item is a [value, pos_tag, para, impact] vector (see query_parse).
    Items are bucketed by tag class (connectives, verbs/nouns, other) and by
    para == 1, each bucket sorted ascending by impact, then concatenated so
    the least important words come first.  Each query is replaced by
    [word_list, sorted_index_list] and the result pickled to `path_to`.
    """
    lines = cm.load_pkl(path_from)
    # sorting words
    for i in range(len(lines)):
        items = lines[i]
        mid_list1, mid_list2 = [], []
        word_list1, word_list2 = [], []
        other_list1, other_list2 = [], []
        for j in range(len(items)):
            item = items[j]
            # BUGFIX: the original used `item[2] is 1` — identity comparison
            # with an int literal relies on CPython's small-int cache and
            # raises SyntaxWarning; compare by value instead.
            if item[1] in type_cc + type_to + type_in:
                bucket = mid_list1 if item[2] == 1 else mid_list2
            elif item[1] in type_vb + type_nn:
                bucket = word_list1 if item[2] == 1 else word_list2
            else:
                bucket = other_list1 if item[2] == 1 else other_list2
            bucket.append([j, item[3]])
        for bucket in (mid_list1, mid_list2, word_list1, word_list2,
                       other_list1, other_list2):
            bucket.sort(key=operator.itemgetter(1))
        # Least important first: connectives, then other tags, then verbs/nouns.
        sort_list = (mid_list1 + mid_list2 + other_list1 + other_list2 +
                     word_list1 + word_list2)
        sort_list = [pair[0] for pair in sort_list]
        query_list = [item[0] for item in items]
        lines[i] = [query_list, sort_list]
    cm.save_pkl(path_to, lines)
def stat_method(file_from, folder_to, total_num=-1):
    """Collect word, sentence-pattern and per-POS vocabularies over pickled
    tokenized method names, then persist each vocabulary under `folder_to`.

    NOTE(review): vtype_cc is populated but never saved or printed —
    confirm that omission is intentional.

    :param total_num: number of entries to process; -1 means all.
    """
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc, vtype_cd = dict(), dict()
    vtype_in, vtype_to = dict(), dict()
    vtype_jj, vtype_nn = dict(), dict()
    vtype_rb, vtype_vb = dict(), dict()
    vtype_ot = dict()
    # (tag set, vocabulary, short label) — checked in the original elif order.
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    methods = cm.load_pkl(file_from)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        print(str(i) + '-' + str(total))
        tokens = methods[i]
        update_vocab_by_tokens(vword, tokens)
        sent = []
        for word, tag in nltk.pos_tag(tokens):
            for tag_set, vocab, label in buckets:
                if tag in tag_set:
                    update_vocab_by_token(vocab, word)
                    sent.append(label)
                    break
            else:
                # Unrecognized tag: record word-tag pair and keep the raw tag.
                update_vocab_by_token(vtype_ot, str(word) + '-' + str(tag))
                sent.append(tag)
        update_vocab_by_token(vsent, '-'.join(sent))
        print(
            str(i) + '-' + str(total) + ': ' + str(len(vword.keys())) + '-' +
            str(len(vsent.keys())) + ' ' + str(len(vtype_cd.keys())) + '-' +
            str(len(vtype_in.keys())) + '-' + str(len(vtype_to.keys())) + '-' +
            str(len(vtype_jj.keys())) + '-' + str(len(vtype_nn.keys())) + '-' +
            str(len(vtype_rb.keys())) + '-' + str(len(vtype_vb.keys())) + '-' +
            str(len(vtype_ot.keys())) + '-')
    cm.save_pkl(base + 'method_vword.pkl', vword)
    cm.save_pkl(base + 'method_vsent.pkl', vsent)
    cm.save_pkl(base + 'method_vtype_cd.pkl', vtype_cd)
    cm.save_pkl(base + 'method_vtype_in.pkl', vtype_in)
    cm.save_pkl(base + 'method_vtype_to.pkl', vtype_to)
    cm.save_pkl(base + 'method_vtype_jj.pkl', vtype_jj)
    cm.save_pkl(base + 'method_vtype_nn.pkl', vtype_nn)
    cm.save_pkl(base + 'method_vtype_rb.pkl', vtype_rb)
    cm.save_pkl(base + 'method_vtype_vb.pkl', vtype_vb)
    cm.save_pkl(base + 'method_vtype_ot.pkl', vtype_ot)
def stat_method(folder_from, folder_to, total_files):
    """Collect word, sentence-pattern and per-POS vocabularies over the
    per-file method<i>.txt listings, then persist each under `folder_to`.

    NOTE(review): this redefines the earlier stat_method in this module, so
    only this definition survives import — confirm that is intended.
    NOTE(review): vtype_cc is populated but never saved or printed —
    confirm that omission is intentional.
    """
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc, vtype_cd = dict(), dict()
    vtype_in, vtype_to = dict(), dict()
    vtype_jj, vtype_nn = dict(), dict()
    vtype_rb, vtype_vb = dict(), dict()
    vtype_ot = dict()
    # (tag set, vocabulary, short label) — checked in the original elif order.
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    for i in range(0, total_files):
        print(str(i) + '-' + str(total_files))
        path = folder_from + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            lines = cm.load_txt(path)
            for line in lines:
                tokens = cm.camel_split_for_tokens(line)
                update_vocab_by_tokens(vword, tokens)
                sent = []
                for word, tag in nltk.pos_tag(tokens):
                    for tag_set, vocab, label in buckets:
                        if tag in tag_set:
                            update_vocab_by_token(vocab, word)
                            sent.append(label)
                            break
                    else:
                        # Unrecognized tag: record word-tag pair, keep raw tag.
                        update_vocab_by_token(
                            vtype_ot, str(word) + '-' + str(tag))
                        sent.append(tag)
                update_vocab_by_token(vsent, '-'.join(sent))
                print(
                    str(i) + '-' + str(total_files) + ': ' +
                    str(len(vword.keys())) + '-' + str(len(vsent.keys())) +
                    ' ' + str(len(vtype_cd.keys())) + '-' +
                    str(len(vtype_in.keys())) + '-' +
                    str(len(vtype_to.keys())) + '-' +
                    str(len(vtype_jj.keys())) + '-' +
                    str(len(vtype_nn.keys())) + '-' +
                    str(len(vtype_rb.keys())) + '-' +
                    str(len(vtype_vb.keys())) + '-' +
                    str(len(vtype_ot.keys())) + '-')
    cm.save_pkl(base + 'method_vword.pkl', vword)
    cm.save_pkl(base + 'method_vsent.pkl', vsent)
    cm.save_pkl(base + 'method_vtype_cd.pkl', vtype_cd)
    cm.save_pkl(base + 'method_vtype_in.pkl', vtype_in)
    cm.save_pkl(base + 'method_vtype_to.pkl', vtype_to)
    cm.save_pkl(base + 'method_vtype_jj.pkl', vtype_jj)
    cm.save_pkl(base + 'method_vtype_nn.pkl', vtype_nn)
    cm.save_pkl(base + 'method_vtype_rb.pkl', vtype_rb)
    cm.save_pkl(base + 'method_vtype_vb.pkl', vtype_vb)
    cm.save_pkl(base + 'method_vtype_ot.pkl', vtype_ot)
def stat_parameter_return(path_jdk, folder_method, folder_parameter, folder_return, folder_to, total_files):
    """Count parameter/return types (split JDK vs other) and parameter names.

    Walks the parallel method<i>.txt / parameter<i>.txt / return<i>.txt
    files; parameters are expected as "type,name" pairs separated by ';'
    (array parameters containing "[]" are skipped — TODO confirm that is
    intentional).  Types found in the `path_jdk` mapping count toward the
    JDK vocabulary, others toward the API vocabulary.  Also counts methods
    whose camel-split name tokens appear in their own param/return text.
    Writes para_vocab_api.pkl / para_vocab_jdk.pkl / para_vocab_name.pkl
    under `folder_to`.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    vocab_name = dict()
    # BUGFIX: renamed from `all` — the original shadowed the builtin.
    total_seen = 0
    con = 0  # methods whose name tokens appear in their param/return text
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            lines_method = cm.load_txt(path)
            lines_parameter = cm.load_txt(folder_parameter + 'parameter' + str(i) + '.txt')
            lines_return = cm.load_txt(folder_return + 'return' + str(i) + '.txt')
            for j in range(len(lines_method)):
                # Progress trace (identical output apart from renamed counter).
                print(
                    str(i) + '-' + str(total_files) + ' -' + str(j) + ' ' +
                    str(len(lines_method)) + ' ' + str(con) + ' - ' +
                    str(total_seen) + ' ' + str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())) + '-' +
                    str(len(vocab_name.keys())))
                total_seen += 1
                line_method = lines_method[j]
                tokens = cm.get_tokens(cm.camel_split(line_method))
                line_paras = lines_parameter[j].replace('\n', '')
                para_types = []
                para_names = []
                line_return = lines_return[j].replace('\n', '')
                para_types.append(line_return)
                line = line_paras + ' ' + line_return
                for token in tokens:
                    if line.find(token) >= 0:
                        con += 1
                        break
                if '[]' not in line_paras:
                    if ';' in line_paras:
                        for line_para in line_paras.split(';'):
                            paras = line_para.split(',')
                            if len(paras) == 2:
                                para_types.append(paras[0])
                                para_names.append(paras[1])
                    else:
                        paras = line_paras.split(',')
                        if len(paras) == 2:
                            para_types.append(paras[0])
                            para_names.append(paras[1])
                # BUGFIX: loop variable renamed from `type` — shadowed builtin.
                for ptype in para_types:
                    if ptype in jdk:
                        vocab_jdk[ptype] = vocab_jdk.get(ptype, 0) + 1
                    else:
                        vocab_api[ptype] = vocab_api.get(ptype, 0) + 1
                for name in para_names:
                    vocab_name[name] = vocab_name.get(name, 0) + 1
    cm.save_pkl(base + 'para_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'para_vocab_jdk.pkl', vocab_jdk)
    cm.save_pkl(base + 'para_vocab_name.pkl', vocab_name)
def stat_parsed(path_jdk, folder_from_method, folder_to_parsed, folder_to, total_files):
    """Count API tokens from per-file parsed<i>.txt listings, JDK vs other.

    For each method file that exists, strips call/index/generic suffixes
    from each ','-separated API token and tallies it into the JDK or API
    vocabulary depending on membership in the `path_jdk` mapping; also
    counts methods whose camel-split name tokens appear in their own parsed
    line.  Writes parsed_vocab_api.pkl / parsed_vocab_jdk.pkl under
    `folder_to`.

    NOTE(review): this redefines the earlier stat_parsed in this module, so
    only this definition survives import — confirm that is intended.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    # BUGFIX: renamed from `all` — the original shadowed the builtin.
    total_seen = 0
    all_api = 0
    all_jdk = 0
    relate = 0
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_from_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            methods = cm.load_txt(path)
            lines = cm.load_txt(folder_to_parsed + 'parsed' + str(i) + '.txt')
            for j in range(len(methods)):
                total_seen += 1
                line = lines[j].replace('\n', '')
                # BUGFIX: was `line is not '[]'` — identity comparison against
                # a string literal is implementation-defined; compare by value.
                if line != '[]':
                    tokens = cm.get_tokens(cm.camel_split(methods[j]))
                    for token in tokens:
                        if line.find(token) >= 0:
                            relate += 1
                            break
                    flag_api = 0
                    flag_jdk = 0
                    apis = line.split(',')
                    if len(apis) > 0:
                        for api in apis:
                            if '.' in api:
                                flag_api = 1
                            # Strip call/index/generic suffixes to the bare name.
                            if api.endswith(')'):
                                api = api[0:api.find('(')]
                            if api.endswith(']'):
                                api = api[0:api.find('[')]
                            if api.endswith('>'):
                                api = api[0:api.find('<')]
                            if api in jdk:
                                flag_jdk = 1
                                vocab_jdk[api] = vocab_jdk.get(api, 0) + 1
                            else:
                                vocab_api[api] = vocab_api.get(api, 0) + 1
                    all_api += flag_api
                    all_jdk += flag_jdk
                # BUGFIX: the progress line hard-coded '41025' as the file
                # count; print the actual total_files argument instead.
                print(
                    str(i) + '-' + str(total_files) + ' ' + str(j) + '-' +
                    str(len(methods)) + ' ' + str(total_seen) + '-' +
                    str(all_api) + '-' + str(all_jdk) + '-' + str(relate) +
                    ' ' + str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)