# Example #1
    def fuzzy_search(self, path_queries, path_jdk, respond_top_n, path_save):
        """Run regex fuzzy searches for every parsed query, progressively
        dropping words (lowest-priority first, per the query's sort order)
        until at least ``respond_top_n`` responses are collected, then
        pickle the responses and the issued search commands per query."""
        queries = cm.load_pkl(path_queries)
        jdk = cm.load_pkl(path_jdk)
        for qi, query in enumerate(queries):
            words = list(query[0])
            sorts = list(query[1])

            hits = []
            issued = []
            seen_hash = []
            # Initial attempt uses every query word in order.
            pattern = '.*' + '.*'.join(words) + '.*'
            respond, query_cmd = self.search_respond(pattern, seen_hash, jdk)
            hits.extend(respond)
            issued.extend(query_cmd)

            dropped = 0
            while len(hits) < respond_top_n and dropped < len(words):
                # Indices of words to omit; the first relaxation round also
                # drops exactly one word (same as round one of the slice).
                omit = sorts[:dropped] if dropped else [sorts[0]]
                kept = [w for k, w in enumerate(words) if k not in omit]
                pattern = '.*' + '.*'.join(kept) + '.*'
                respond, query_cmd = self.search_respond(pattern, seen_hash, jdk)
                hits.extend(respond)
                issued.extend(query_cmd)
                dropped += 1

            cm.save_pkl(path_save + 'respond' + str(qi) + '.pkl', hits)
            cm.save_pkl(path_save + 'cmd' + str(qi) + '.pkl', issued)
            print(str(qi) + '-' + str(len(queries)) + ' ' + str(len(hits)))
def query_parse(path_from, path_parsed_vocab, path_method_vocab, path_to):
    """Parse raw text queries into per-token feature vectors.

    For each query: strip boilerplate phrases, tokenize, POS-tag, and for
    every token whose tag is in ``type_all`` emit a vector
    ``[token, pos_tag, para, impact]`` where ``para`` is 1 if the stemmed
    token (or its best synonym) is in the method vocabulary, 2 if a noun
    found in the parsed/JDK vocabulary, else 0, and ``impact`` is the
    matching vocabulary's frequency score.  The result is pickled to
    ``path_to``.
    """
    queries = cm.load_txt(path_from)
    vjdk = dict(cm.load_pkl(path_parsed_vocab))
    vword = dict(cm.load_pkl(path_method_vocab))
    stemmer = PorterStemmer()
    # Boilerplate phrases removed from queries before tokenization.
    str_replace = [
        'in java', 'using java', 'java', 'how to', 'how do', 'what is'
    ]

    data_queries = list()
    p = 0  # running token count (diagnostic only)
    for i in range(len(queries)):
        print(str(i))
        query = queries[i]
        for str_re in str_replace:
            query = query.replace(str_re, '')
        data = []
        tokens = cm.get_tokens(query)
        p += len(tokens)
        tokens = nltk.pos_tag(tokens)

        for token in tokens:
            tvalue = token[0]
            ttype = token[1]
            if ttype in type_all:
                para = 0
                impact = 0
                stem = stemmer.stem(tvalue)
                if stem in vword:
                    para = 1
                    impact = vword[stem]
                else:
                    # Fall back to the highest-scoring synonym present in the
                    # method vocabulary.  Bug fix: the old code indexed
                    # ``syns`` with a position taken from the *filtered*
                    # ``freq`` list, which misaligned whenever a synonym was
                    # absent from vword; it also looked up the unstemmed
                    # synonym in vword (possible KeyError) and clobbered
                    # ``stem``, corrupting the vjdk check below.
                    best_syn = None
                    best_stem = None
                    best_score = -1
                    for syn in cm.get_synonyms(stem):
                        syn_stem = cm.get_stemmed(syn)
                        if syn_stem in vword and vword[syn_stem] > best_score:
                            best_syn = syn
                            best_stem = syn_stem
                            best_score = vword[syn_stem]
                    if best_syn is not None:
                        tvalue = best_syn
                        para = 1
                        impact = vword[best_stem]
                if ttype in type_nn and stem in vjdk:
                    para = 2
                    impact = vjdk[stem]
                tvalue = cm.get_stemmed(tvalue)

                vector = [tvalue, ttype, para, impact]
                data.append(vector)
        data_queries.append(data)
    cm.save_pkl(path_to, data_queries)
# Example #3
def stat_parsed(path_jdk, file_from, folder_to, total_num=-1):
    """Count API usage in parsed API sequences.

    Tallies, per method: whether its API sequence mentions any of its name
    tokens (``relate``), whether it calls any dotted API (``all_api``), and
    whether any of those APIs belong to the JDK (``all_jdk``).  Frequency
    vocabularies for JDK and non-JDK APIs are pickled under ``folder_to``.
    ``total_num`` limits how many methods are processed (-1 = all).

    Bug fix: the original tested ``apiseq is not '[]'`` — an identity
    comparison against a string literal whose outcome is implementation-
    dependent; replaced with ``!=``.
    """
    base = folder_to
    all_api = 0
    all_jdk = 0
    relate = 0
    vocab_api = collections.defaultdict(int)
    vocab_jdk = collections.defaultdict(int)

    jdk = dict(cm.load_pkl(path_jdk))
    methods = cm.load_pkl(file_from + 'methname.pkl')
    lines = cm.load_pkl(file_from + 'apiseq.pkl')

    assert len(lines) == len(methods)
    total = len(methods) if total_num == -1 else total_num

    for i in range(0, total):
        method_tokens = methods[i]
        apiseq = lines[i].lower()

        if apiseq != '[]':
            # A method "relates" if any of its name tokens occurs in the
            # API sequence.
            for token in method_tokens:
                if apiseq.find(token) >= 0:
                    relate += 1
                    break

            flag_api = 0
            flag_jdk = 0
            for api in apiseq.split(';'):
                if '.' in api:
                    flag_api = 1
                    # Strip call arguments / generics / array suffixes.
                    if api.endswith(')'):
                        api = api[0:api.find('(')]
                    if api.endswith(']'):
                        api = api[0:api.find('[')]
                    if api.endswith('>'):
                        api = api[0:api.find('<')]

                    if api in jdk:
                        flag_jdk = 1
                        vocab_jdk[api] += 1
                    else:
                        vocab_api[api] += 1
            all_api += flag_api
            all_jdk += flag_jdk

        print(
            str(i) + '-' + str(total) + ' ' + '-' + str(all_api) + '-' +
            str(relate) + ' ' + str(len(vocab_api.keys())) + '-' +
            str(len(vocab_jdk.keys())))

    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)
def reranking(path_parsed_queries, path_queries, path_jdk, path_fuzzy_search, path_rerank):
    """Rerank fuzzy-search responses per query and write a results file.

    Two scoring passes: first all responses are scored by method-name match
    (``matcher_name``), then only the top 100 are rescored by parsed-API
    match (``matcher_api``).  The top 10 sources per query are appended to
    the output text file at ``path_rerank``.
    """
    queries = cm.load_pkl(path_parsed_queries)
    jdk = cm.load_pkl(path_jdk)
    # Reduce each parsed query to its bare word list (drop POS/impact info).
    for i in range(len(queries)):
        query = queries[i]
        words = []
        for word in query:
            words.append(word[0])
        queries[i] = words

    queries_txt = cm.load_txt(path_queries)
    lines = []

    # NOTE(review): hard-coded query count — the old comment says 50 and the
    # progress messages below say both "-50" and "-99"; confirm this matches
    # the number of respond<N>.pkl files produced by the fuzzy search step.
    for i in range(99): # 50
        respond = cm.load_pkl(path_fuzzy_search + 'respond' + str(i) + '.pkl')
        query_cmd = cm.load_pkl(path_fuzzy_search + 'cmd' + str(i) + '.pkl')
        query = queries[i]
        query_txt = queries_txt[i]

        # Pass 1: score every response by method-name similarity.
        scores = list()
        for j in range(len(respond)):
            print(str(i) + '-50, iter-1, ' + str(j) + '-' + str(len(respond)))
            res = respond[j]['_source']
            line = res['method']
            cmd = query_cmd[j]
            scores.append([j, matcher_name(query, line, cmd)])
        scores.sort(key=operator.itemgetter(1), reverse=True)

        # Keep only the 100 best name matches for the second pass.
        scores = scores[:100]

        # Pass 2: add a parsed-API score, then sort by (name, api) score.
        for j in range(len(scores)):
            print(str(i + 1) + '-99, iter-2, ' + str(j) + '-' + str(len(scores)))
            idx = scores[j][0]
            res = respond[idx]['_source']
            line = res['parsed']
            scores[j].append(matcher_api(query, line, jdk))
        scores.sort(key=operator.itemgetter(1, 2), reverse=True)

        # Emit the query text followed by up to 10 best source snippets.
        if '\n' not in query_txt:
            query_txt += '\n'
        lines.append(query_txt)
        results = min(len(scores), 10)
        if len(scores) > 0:
            for j in range(results):
                idx = scores[j][0]
                lines.append(respond[idx]['_source']['source'])
        lines.append('\n')

    cm.save_txt(path_rerank, lines)
# Example #5
 def fill_simple_data(self, path):
     """Bulk-index raw code, API sequences and method names into ES."""
     raw_code = cm.load_txt(path + 'rawcode.txt')
     api_seqs = cm.load_pkl(path + 'apiseq.pkl')
     names = cm.load_pkl(path + 'raw_methname.pkl')
     # The three sources must be aligned row-for-row.
     assert len(raw_code) == len(api_seqs) and len(raw_code) == len(names)
     actions = [{
         "_index": self.index_name,
         "_type": self.doc_type,
         "_source": {
             "method": names[k],
             "parsed": api_seqs[k],
             "source": raw_code[k]
         }
     } for k in range(len(raw_code))]
     helpers.bulk(self.es, actions, request_timeout=1000)
# Example #6
def analyze_parsed(path_parsed_vocab, path_parsed):
    """Collapse the parsed-API vocabulary to stemmed simple names.

    Entries are visited highest-count first, and the first (i.e. highest)
    count wins for each collapsed key."""
    ranked = sorted(
        dict(cm.load_pkl(path_parsed_vocab)).items(),
        key=lambda kv: (kv[1], kv[0]), reverse=True)
    collapsed = dict()
    for name, count in ranked:
        # Take the text after the last '.', lower-case and stem it.
        simple = cm.get_stemmed(name[str(name).rfind('.') + 1:].lower())
        if simple not in collapsed:
            collapsed[simple] = count
    cm.save_pkl(path_parsed, collapsed)
# Example #7
def analyze_method(path_parsed_vocab, path_method):
    """Aggregate vocabulary counts by stemmed, lower-cased token."""
    ranked = sorted(
        dict(cm.load_pkl(path_parsed_vocab)).items(),
        key=lambda kv: (kv[1], kv[0]), reverse=True)
    merged = dict()
    for token, count in ranked:
        key = cm.get_stemmed(token).lower()
        merged[key] = merged.get(key, 0) + count
    cm.save_pkl(path_method, merged)
def query_parse_tree(path_from, path_to):
    """Reorder each parsed query's words by grammatical role and impact.

    Tokens are bucketed into connectives (cc/to/in), content words (vb/nn)
    and everything else; each bucket is split by whether the token matched
    a vocabulary (``item[2] == 1``) and sorted ascending by impact
    (``item[3]``).  Each query is replaced by
    ``[word_list, sorted_index_list]`` and the result pickled to path_to.

    Bug fix: the original compared ``item[2] is 1`` — an identity check
    that only works by accident of CPython's small-int cache; replaced
    with ``== 1``.
    """
    lines = cm.load_pkl(path_from)

    for i in range(len(lines)):
        items = lines[i]

        # Each bucket is a (matched, unmatched) pair of [index, impact] lists.
        mids = ([], [])
        words = ([], [])
        others = ([], [])
        for j, item in enumerate(items):
            slot = 0 if item[2] == 1 else 1
            entry = [j, item[3]]
            if item[1] in type_cc + type_to + type_in:
                mids[slot].append(entry)
            elif item[1] in type_vb + type_nn:
                words[slot].append(entry)
            else:
                others[slot].append(entry)

        # Sort every sub-bucket ascending by impact score.
        for bucket in (mids, words, others):
            for sub in bucket:
                sub.sort(key=operator.itemgetter(1))

        ordered = (mids[0] + mids[1] + others[0] + others[1] +
                   words[0] + words[1])
        sort_list = [pair[0] for pair in ordered]

        query_list = [item[0] for item in items]

        lines[i] = [query_list, sort_list]
    cm.save_pkl(path_to, lines)
# Example #9
def stat_method(file_from, folder_to, total_num=-1):
    """Collect per-POS-type vocabularies and sentence-pattern statistics
    over the tokenized method names in ``file_from`` and pickle them under
    ``folder_to``.  ``total_num`` limits the number of methods (-1 = all)."""
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc = dict()
    vtype_cd = dict()
    vtype_in = dict()
    vtype_to = dict()
    vtype_jj = dict()
    vtype_nn = dict()
    vtype_rb = dict()
    vtype_vb = dict()
    vtype_ot = dict()
    # Ordered dispatch: first matching tag set wins (same precedence as the
    # original if/elif chain).
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    methods = cm.load_pkl(file_from)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        print(f"{i}-{total}")
        tokens = methods[i]
        update_vocab_by_tokens(vword, tokens)
        sent = []

        for word, tag in nltk.pos_tag(tokens):
            for tag_set, vocab, label in buckets:
                if tag in tag_set:
                    update_vocab_by_token(vocab, word)
                    sent.append(label)
                    break
            else:
                update_vocab_by_token(vtype_ot, str(word) + '-' + str(tag))
                sent.append(tag)

        update_vocab_by_token(vsent, '-'.join(sent))

        print(f"{i}-{total}: {len(vword)}-{len(vsent)} "
              f"{len(vtype_cd)}-{len(vtype_in)}-{len(vtype_to)}-"
              f"{len(vtype_jj)}-{len(vtype_nn)}-{len(vtype_rb)}-"
              f"{len(vtype_vb)}-{len(vtype_ot)}-")
    # NOTE(review): vtype_cc is collected but never persisted — this
    # mirrors the original behavior; confirm whether it should be saved.
    for suffix, vocab in (
            ('vword', vword), ('vsent', vsent), ('vtype_cd', vtype_cd),
            ('vtype_in', vtype_in), ('vtype_to', vtype_to),
            ('vtype_jj', vtype_jj), ('vtype_nn', vtype_nn),
            ('vtype_rb', vtype_rb), ('vtype_vb', vtype_vb),
            ('vtype_ot', vtype_ot)):
        cm.save_pkl(base + 'method_' + suffix + '.pkl', vocab)
# Example #10
 def fill_data(self, path_formatted_repos):
     """Bulk-load the ten pre-formatted body pickles into Elasticsearch."""
     for part in range(10):
         print(str(part) + '-9')
         actions = cm.load_pkl(path_formatted_repos + 'body' + str(part) + '.pkl')
         helpers.bulk(self.es, actions, request_timeout=1000)
def stat_parameter_return(path_jdk, folder_method, folder_parameter,
                          folder_return, folder_to, total_files):
    """Build parameter/return-type vocabularies (split JDK vs non-JDK) and
    a parameter-name vocabulary from the per-file method, parameter and
    return dumps, pickling the three vocabularies under ``folder_to``."""
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    vocab_name = dict()
    total_seen = 0  # methods processed so far
    overlap = 0     # methods whose name shares a token with its signature
    jdk = dict(cm.load_pkl(path_jdk))

    def _collect(chunk, types, names):
        # A "type,name" pair -> append to the running type/name lists.
        parts = chunk.split(',')
        if len(parts) == 2:
            types.append(parts[0])
            names.append(parts[1])

    for i in range(0, total_files):
        path = folder_method + 'method' + str(i) + '.txt'
        if not os.path.exists(path):
            continue
        lines_method = cm.load_txt(folder_method + 'method' + str(i) +
                                   '.txt')
        lines_parameter = cm.load_txt(folder_parameter + 'parameter' +
                                      str(i) + '.txt')
        lines_return = cm.load_txt(folder_return + 'return' + str(i) +
                                   '.txt')
        for j in range(len(lines_method)):
            print(
                str(i) + '-' + str(total_files) + ' -' + str(j) + ' ' +
                str(len(lines_method)) + ' ' + str(overlap) + ' - ' +
                str(total_seen) + ' ' + str(len(vocab_api.keys())) + '-' +
                str(len(vocab_jdk.keys())) + '-' +
                str(len(vocab_name.keys())))
            total_seen += 1
            tokens = cm.get_tokens(cm.camel_split(lines_method[j]))

            line_paras = lines_parameter[j].replace('\n', '')
            line_return = lines_return[j].replace('\n', '')
            para_types = [line_return]
            para_names = []

            # Does any name token occur anywhere in the signature text?
            signature = line_paras + ' ' + line_return
            if any(token in signature for token in tokens):
                overlap += 1

            if '[]' not in line_paras:
                if ';' in line_paras:
                    for chunk in line_paras.split(';'):
                        _collect(chunk, para_types, para_names)
                else:
                    _collect(line_paras, para_types, para_names)

            for ptype in para_types:
                target = vocab_jdk if ptype in jdk.keys() else vocab_api
                target[ptype] = target.get(ptype, 0) + 1

            for pname in para_names:
                vocab_name[pname] = vocab_name.get(pname, 0) + 1

    cm.save_pkl(base + 'para_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'para_vocab_jdk.pkl', vocab_jdk)
    cm.save_pkl(base + 'para_vocab_name.pkl', vocab_name)
def stat_parsed(path_jdk, folder_from_method, folder_to_parsed, folder_to,
                total_files):
    """Count API usage in the per-file parsed sequences.

    For every method in every existing method<N>.txt: tallies whether the
    parsed line mentions a name token (``relate``), whether it calls any
    dotted API (``all_api``) and whether any such API is a JDK one
    (``all_jdk``), building JDK / non-JDK frequency vocabularies that are
    pickled under ``folder_to``.

    Bug fix: the original tested ``line is not '[]'`` — an identity
    comparison against a string literal whose outcome is implementation-
    dependent; replaced with ``!=``.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    all = 0
    all_api = 0
    all_jdk = 0
    relate = 0
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_from_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            methods = cm.load_txt(folder_from_method + 'method' + str(i) +
                                  '.txt')
            lines = cm.load_txt(folder_to_parsed + 'parsed' + str(i) + '.txt')
            for j in range(len(methods)):
                all += 1

                line = lines[j].replace('\n', '')
                if line != '[]':
                    # A method "relates" if any of its (camel-split) name
                    # tokens occurs in the parsed line.
                    tokens = cm.get_tokens(cm.camel_split(methods[j]))
                    for token in tokens:
                        if line.find(token) >= 0:
                            relate += 1
                            break

                    flag_api = 0
                    flag_jdk = 0
                    for api in line.split(','):
                        if '.' in api:
                            flag_api = 1

                            # Strip call arguments / generics / array suffixes.
                            if api.endswith(')'):
                                api = api[0:api.find('(')]
                            if api.endswith(']'):
                                api = api[0:api.find('[')]
                            if api.endswith('>'):
                                api = api[0:api.find('<')]

                            if api in jdk:
                                flag_jdk = 1
                                vocab_jdk[api] = vocab_jdk.get(api, 0) + 1
                            else:
                                vocab_api[api] = vocab_api.get(api, 0) + 1
                    all_api += flag_api
                    all_jdk += flag_jdk

                print(
                    str(i) + '-41025 ' + str(j) + '-' + str(len(methods)) +
                    ' ' + str(all) + '-' + str(all_api) + '-' + str(all_jdk) +
                    '-' + str(relate) + ' ' + str(len(vocab_api.keys())) +
                    '-' + str(len(vocab_jdk.keys())))

    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)