def fuzzy_search(self, path_queries, path_jdk, respond_top_n, path_save):
    """Run progressively fuzzier regex searches for each pickled query.

    For every query, search with all words first, then keep dropping the
    highest-priority words (per the query's sort order) until at least
    `respond_top_n` responses are collected or no words remain.  Saves the
    accumulated responses and the regex commands used per query under
    `path_save` as respond<i>.pkl / cmd<i>.pkl.

    :param path_queries: pickle of [word_list, sort_list] pairs
        (presumably produced by query_parse_tree — TODO confirm)
    :param path_jdk: pickle of the JDK vocabulary passed to search_respond
    :param respond_top_n: minimum number of responses to collect per query
    :param path_save: output folder prefix for the per-query pickles
    """
    queries = cm.load_pkl(path_queries)
    jdk = cm.load_pkl(path_jdk)
    total = respond_top_n
    for i in range(len(queries)):
        query = queries[i]
        query_words = list(query[0])
        query_sorts = list(query[1])
        # Strictest pattern first: all words, in order, separated by wildcards.
        cmd = '.*' + '.*'.join(query_words) + '.*'
        data = []
        cmds = []
        source_hash = []
        respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
        data.extend(respond)
        cmds.extend(query_cmd)
        idx = 0
        # Relax the pattern by dropping ranked words until we have enough hits.
        while len(data) < total and len(query_words) - idx > 0:
            temp = []
            if idx == 0:
                s = [query_sorts[0]]
            else:
                s = query_sorts[:idx]
            # NOTE(review): idx == 0 and idx == 1 both drop query_sorts[:1],
            # so the second relaxation pass repeats the first — confirm
            # whether `query_sorts[:idx + 1]` was intended.
            for j in range(len(query_words)):
                if j not in s:
                    temp.append(query_words[j])
            cmd = '.*' + '.*'.join(temp) + '.*'  # wrap/join remaining words with .* wildcards
            respond, query_cmd = self.search_respond(cmd, source_hash, jdk)
            data.extend(respond)
            cmds.extend(query_cmd)
            idx += 1
        cm.save_pkl(path_save + 'respond' + str(i) + '.pkl', data)
        cm.save_pkl(path_save + 'cmd' + str(i) + '.pkl', cmds)
        print(str(i) + '-' + str(len(queries)) + ' ' + str(len(data)))
def jdk(path_from, path_to):
    """Build a lowercase name -> cleaned-description vocabulary from a
    comma-separated JDK listing and pickle it to `path_to`.

    Each input line is split on ',': column 0 becomes the (lowercased) key,
    column 1 is filtered through cm.filter_digit_english, lowercased and
    stripped of spaces to form the value.
    """
    vocab = {}
    with open(path_from, 'r', encoding='utf-8') as infile:
        for raw in infile:
            parts = str(raw).split(',')
            cleaned = str(cm.filter_digit_english(parts[1])).lower().replace(' ', '')
            vocab[parts[0].lower()] = cleaned
    cm.save_pkl(path_to, vocab)
def analyze_parsed(path_parsed_vocab, path_parsed):
    """Reduce a fully-qualified-name vocabulary to stemmed simple names.

    Entries are visited in descending (count, name) order; for each, the
    text after the last '.' is lowercased and stemmed, and only the first
    (i.e. highest-count) value per stemmed key is kept.  Result is pickled
    to `path_parsed`.
    """
    raw = dict(cm.load_pkl(path_parsed_vocab))
    ordered = sorted(raw.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    stemmed = {}
    for name, count in ordered:
        simple = name[str(name).rfind('.') + 1:].lower()
        key = cm.get_stemmed(simple)
        if key not in stemmed:
            stemmed[key] = count
    cm.save_pkl(path_parsed, stemmed)
def query_parse(path_from, path_parsed_vocab, path_method_vocab, path_to):
    """POS-tag each query and score its tokens against the method/JDK vocabs.

    For every query line, strips Java boilerplate phrases, tokenizes and
    POS-tags it, then emits one [stemmed_value, pos_tag, para, impact]
    vector per token whose tag is in `type_all`:
      para 0 / impact 0  — token unknown to both vocabularies
      para 1             — token stem (or its best-known synonym) in vword
      para 2             — noun whose stem is in vjdk (overrides para 1)
    The list of vectors per query is pickled to `path_to`.
    """
    queries = cm.load_txt(path_from)
    vjdk = dict(cm.load_pkl(path_parsed_vocab))
    vword = dict(cm.load_pkl(path_method_vocab))
    stemmer = PorterStemmer()
    # Boilerplate phrases removed from queries before tokenizing.
    str_replace = [
        'in java', 'using java', 'java', 'how to', 'how do', 'what is'
    ]
    data_queries = list()
    p = 0  # running token count; kept from the original although unused downstream
    for i in range(len(queries)):
        print(str(i))
        query = queries[i]
        for str_re in str_replace:
            query = query.replace(str_re, '')
        data = []
        tokens = cm.get_tokens(query)
        p += len(tokens)
        tokens = nltk.pos_tag(tokens)
        for token in tokens:
            tvalue = token[0]
            ttype = token[1]
            if ttype in type_all:
                para = 0
                impact = 0
                stem = stemmer.stem(tvalue)
                if stem in vword:
                    para = 1
                    impact = vword[stem]
                else:
                    # Unknown stem: try synonyms, scoring each by its stemmed
                    # frequency in vword.
                    freq = []
                    syns = cm.get_synonyms(stem)
                    for syn in syns:
                        # BUGFIX: use a separate variable instead of clobbering
                        # `stem` — the original left `stem` as the last
                        # synonym's stem, corrupting the vjdk check below.
                        syn_stem = cm.get_stemmed(syn)
                        freq.append(vword.get(syn_stem, 0))
                    if freq:
                        idx_max_freq = freq.index(max(freq))
                        tvalue = syns[idx_max_freq]
                        para = 1
                        # BUGFIX: the original did vword[tvalue] with the raw
                        # synonym, but vword is keyed by stems — KeyError risk.
                        # The chosen synonym's score is exactly freq's maximum.
                        impact = freq[idx_max_freq]
                        # NOTE(review): if max(freq) == 0 no synonym is really
                        # known, yet para is still set to 1 — confirm intended.
                if ttype in type_nn and stem in vjdk:
                    para = 2
                    impact = vjdk[stem]
                tvalue = cm.get_stemmed(tvalue)
                vector = [tvalue, ttype, para, impact]
                data.append(vector)
        data_queries.append(data)
    cm.save_pkl(path_to, data_queries)
def analyze_method(path_parsed_vocab, path_method):
    """Aggregate a word vocabulary by stemmed, lowercased key.

    Counts of entries that collapse onto the same stem are summed; the
    merged vocabulary is pickled to `path_method`.
    """
    raw = dict(cm.load_pkl(path_parsed_vocab))
    ordered = sorted(raw.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    totals = {}
    for name, count in ordered:
        key = cm.get_stemmed(name).lower()
        totals[key] = totals.get(key, 0) + count
    cm.save_pkl(path_method, totals)
def stat_parsed(path_jdk, file_from, folder_to, total_num=-1):
    """Count API occurrences in pickled API sequences, split JDK vs other.

    Reads methname.pkl / apiseq.pkl from `file_from`, strips call/index/
    generic suffixes from each ';'-separated API token, and tallies it into
    the JDK vocabulary if it appears in the `path_jdk` mapping, otherwise
    into the generic API vocabulary.  Also counts methods whose name tokens
    occur in their own API sequence (`relate`).  Writes
    parsed_vocab_api.pkl / parsed_vocab_jdk.pkl under `folder_to`.

    :param total_num: number of entries to process; -1 means all.
    """
    base = folder_to
    all_api = 0
    all_jdk = 0
    relate = 0
    vocab_api = collections.defaultdict(int)
    vocab_jdk = collections.defaultdict(int)
    jdk = dict(cm.load_pkl(path_jdk))
    methods = cm.load_pkl(file_from + 'methname.pkl')
    lines = cm.load_pkl(file_from + 'apiseq.pkl')
    assert len(lines) == len(methods)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        method_tokens = methods[i]
        apiseq = lines[i].lower()
        # BUGFIX: was `apiseq is not '[]'` — identity comparison against a
        # string literal is implementation-defined; compare by value.
        if apiseq != '[]':
            for token in method_tokens:
                if apiseq.find(token) >= 0:
                    relate += 1
                    break
            flag_api = 0
            flag_jdk = 0
            apis = apiseq.split(';')
            if len(apis) > 0:
                for api in apis:
                    if '.' in api:
                        flag_api = 1
                    # Strip call/index/generic suffixes back to the bare name.
                    if api.endswith(')'):
                        api = api[0:api.find('(')]
                    if api.endswith(']'):
                        api = api[0:api.find('[')]
                    if api.endswith('>'):
                        api = api[0:api.find('<')]
                    if api in jdk:
                        flag_jdk = 1
                        vocab_jdk[api] += 1
                    else:
                        vocab_api[api] += 1
            all_api += flag_api
            all_jdk += flag_jdk
        # Progress trace.
        print(
            str(i) + '-' + str(total) + ' ' + '-' + str(all_api) + '-' +
            str(relate) + ' ' + str(len(vocab_api.keys())) + '-' +
            str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)
def format_data(self, path_pased_repos, path_formatted_repos, total_files, repo_split_size):
    """Bundle per-file parsed-repo text columns into bulk-index documents.

    For each index i with an existing comment file, loads the nine parallel
    text columns, zips them row-wise into {"_index", "_type", "_source"}
    dicts, and pickles the accumulated list in chunks of `repo_split_size`
    records as body<index>.pkl under `path_formatted_repos` (plus a final
    partial chunk).
    """
    fields = ['comment', 'javadoc', 'method', 'modifier', 'package',
              'parameter', 'parsed', 'return', 'source']
    body = []
    count = 0
    index = 0
    root = path_pased_repos
    for i in range(total_files):
        print(str(i))
        if not os.path.exists(root + 'comment/comment' + str(i) + '.txt'):
            continue
        # Each column lives in its own subfolder: <field>/<field><i>.txt
        columns = {f: cm.load_txt(root + f + '/' + f + str(i) + '.txt')
                   for f in fields}
        for j in range(len(columns['comment'])):
            body.append({
                "_index": self.index_name,
                "_type": self.doc_type,
                "_source": {f: columns[f][j] for f in fields}
            })
            count += 1
            if count == repo_split_size:
                cm.save_pkl(
                    path_formatted_repos + 'body' + str(index) + '.pkl', body)
                index += 1
                count = 0
                body = []
    cm.save_pkl(path_formatted_repos + 'body' + str(index) + '.pkl', body)
def query_parse_tree(path_from, path_to):
    """Sort each query's words into a relaxation order for fuzzy search.

    Each item is a [value, pos_tag, para, impact] vector (see query_parse).
    Items are bucketed by tag class (connectives, verbs/nouns, other) and by
    para == 1, each bucket sorted ascending by impact, then concatenated so
    the least important words come first.  Each query is replaced by
    [word_list, sorted_index_list] and the result pickled to `path_to`.
    """
    lines = cm.load_pkl(path_from)
    # sorting words
    for i in range(len(lines)):
        items = lines[i]
        mid_list1, mid_list2 = [], []
        word_list1, word_list2 = [], []
        other_list1, other_list2 = [], []
        for j in range(len(items)):
            item = items[j]
            # BUGFIX: the original used `item[2] is 1` — identity comparison
            # with an int literal relies on CPython's small-int cache and
            # raises SyntaxWarning; compare by value instead.
            if item[1] in type_cc + type_to + type_in:
                bucket = mid_list1 if item[2] == 1 else mid_list2
            elif item[1] in type_vb + type_nn:
                bucket = word_list1 if item[2] == 1 else word_list2
            else:
                bucket = other_list1 if item[2] == 1 else other_list2
            bucket.append([j, item[3]])
        for bucket in (mid_list1, mid_list2, word_list1, word_list2,
                       other_list1, other_list2):
            bucket.sort(key=operator.itemgetter(1))
        # Least important first: connectives, then other tags, then verbs/nouns.
        sort_list = (mid_list1 + mid_list2 + other_list1 + other_list2 +
                     word_list1 + word_list2)
        sort_list = [pair[0] for pair in sort_list]
        query_list = [item[0] for item in items]
        lines[i] = [query_list, sort_list]
    cm.save_pkl(path_to, lines)
def stat_method(file_from, folder_to, total_num=-1):
    """Collect word, sentence-pattern and per-POS vocabularies over pickled
    tokenized method names, then persist each vocabulary under `folder_to`.

    NOTE(review): vtype_cc is populated but never saved or printed —
    confirm that omission is intentional.

    :param total_num: number of entries to process; -1 means all.
    """
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc, vtype_cd = dict(), dict()
    vtype_in, vtype_to = dict(), dict()
    vtype_jj, vtype_nn = dict(), dict()
    vtype_rb, vtype_vb = dict(), dict()
    vtype_ot = dict()
    # (tag set, vocabulary, short label) — checked in the original elif order.
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    methods = cm.load_pkl(file_from)
    total = len(methods) if total_num == -1 else total_num
    for i in range(0, total):
        print(str(i) + '-' + str(total))
        tokens = methods[i]
        update_vocab_by_tokens(vword, tokens)
        sent = []
        for word, tag in nltk.pos_tag(tokens):
            for tag_set, vocab, label in buckets:
                if tag in tag_set:
                    update_vocab_by_token(vocab, word)
                    sent.append(label)
                    break
            else:
                # Unrecognized tag: record word-tag pair and keep the raw tag.
                update_vocab_by_token(vtype_ot, str(word) + '-' + str(tag))
                sent.append(tag)
        update_vocab_by_token(vsent, '-'.join(sent))
        print(
            str(i) + '-' + str(total) + ': ' + str(len(vword.keys())) + '-' +
            str(len(vsent.keys())) + ' ' + str(len(vtype_cd.keys())) + '-' +
            str(len(vtype_in.keys())) + '-' + str(len(vtype_to.keys())) + '-' +
            str(len(vtype_jj.keys())) + '-' + str(len(vtype_nn.keys())) + '-' +
            str(len(vtype_rb.keys())) + '-' + str(len(vtype_vb.keys())) + '-' +
            str(len(vtype_ot.keys())) + '-')
    cm.save_pkl(base + 'method_vword.pkl', vword)
    cm.save_pkl(base + 'method_vsent.pkl', vsent)
    cm.save_pkl(base + 'method_vtype_cd.pkl', vtype_cd)
    cm.save_pkl(base + 'method_vtype_in.pkl', vtype_in)
    cm.save_pkl(base + 'method_vtype_to.pkl', vtype_to)
    cm.save_pkl(base + 'method_vtype_jj.pkl', vtype_jj)
    cm.save_pkl(base + 'method_vtype_nn.pkl', vtype_nn)
    cm.save_pkl(base + 'method_vtype_rb.pkl', vtype_rb)
    cm.save_pkl(base + 'method_vtype_vb.pkl', vtype_vb)
    cm.save_pkl(base + 'method_vtype_ot.pkl', vtype_ot)
def stat_method(folder_from, folder_to, total_files):
    """Collect word, sentence-pattern and per-POS vocabularies over the
    per-file method<i>.txt listings, then persist each under `folder_to`.

    NOTE(review): this redefines the earlier stat_method in this module, so
    only this definition survives import — confirm that is intended.
    NOTE(review): vtype_cc is populated but never saved or printed —
    confirm that omission is intentional.
    """
    base = folder_to
    vword = dict()
    vsent = dict()
    vtype_cc, vtype_cd = dict(), dict()
    vtype_in, vtype_to = dict(), dict()
    vtype_jj, vtype_nn = dict(), dict()
    vtype_rb, vtype_vb = dict(), dict()
    vtype_ot = dict()
    # (tag set, vocabulary, short label) — checked in the original elif order.
    buckets = [
        (type_cc, vtype_cc, 'cc'),
        (type_cd, vtype_cd, 'cd'),
        (type_jj, vtype_jj, 'jj'),
        (type_nn, vtype_nn, 'nn'),
        (type_rb, vtype_rb, 'rb'),
        (type_vb, vtype_vb, 'vb'),
        (type_in, vtype_in, 'in'),
        (type_to, vtype_to, 'to'),
    ]
    for i in range(0, total_files):
        print(str(i) + '-' + str(total_files))
        path = folder_from + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            lines = cm.load_txt(path)
            for line in lines:
                tokens = cm.camel_split_for_tokens(line)
                update_vocab_by_tokens(vword, tokens)
                sent = []
                for word, tag in nltk.pos_tag(tokens):
                    for tag_set, vocab, label in buckets:
                        if tag in tag_set:
                            update_vocab_by_token(vocab, word)
                            sent.append(label)
                            break
                    else:
                        # Unrecognized tag: record word-tag pair, keep raw tag.
                        update_vocab_by_token(
                            vtype_ot, str(word) + '-' + str(tag))
                        sent.append(tag)
                update_vocab_by_token(vsent, '-'.join(sent))
                print(
                    str(i) + '-' + str(total_files) + ': ' +
                    str(len(vword.keys())) + '-' + str(len(vsent.keys())) +
                    ' ' + str(len(vtype_cd.keys())) + '-' +
                    str(len(vtype_in.keys())) + '-' +
                    str(len(vtype_to.keys())) + '-' +
                    str(len(vtype_jj.keys())) + '-' +
                    str(len(vtype_nn.keys())) + '-' +
                    str(len(vtype_rb.keys())) + '-' +
                    str(len(vtype_vb.keys())) + '-' +
                    str(len(vtype_ot.keys())) + '-')
    cm.save_pkl(base + 'method_vword.pkl', vword)
    cm.save_pkl(base + 'method_vsent.pkl', vsent)
    cm.save_pkl(base + 'method_vtype_cd.pkl', vtype_cd)
    cm.save_pkl(base + 'method_vtype_in.pkl', vtype_in)
    cm.save_pkl(base + 'method_vtype_to.pkl', vtype_to)
    cm.save_pkl(base + 'method_vtype_jj.pkl', vtype_jj)
    cm.save_pkl(base + 'method_vtype_nn.pkl', vtype_nn)
    cm.save_pkl(base + 'method_vtype_rb.pkl', vtype_rb)
    cm.save_pkl(base + 'method_vtype_vb.pkl', vtype_vb)
    cm.save_pkl(base + 'method_vtype_ot.pkl', vtype_ot)
def stat_parameter_return(path_jdk, folder_method, folder_parameter, folder_return, folder_to, total_files):
    """Count parameter/return types (split JDK vs other) and parameter names.

    Walks the parallel method<i>.txt / parameter<i>.txt / return<i>.txt
    files; parameters are expected as "type,name" pairs separated by ';'
    (array parameters containing "[]" are skipped — TODO confirm that is
    intentional).  Types found in the `path_jdk` mapping count toward the
    JDK vocabulary, others toward the API vocabulary.  Also counts methods
    whose camel-split name tokens appear in their own param/return text.
    Writes para_vocab_api.pkl / para_vocab_jdk.pkl / para_vocab_name.pkl
    under `folder_to`.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    vocab_name = dict()
    # BUGFIX: renamed from `all` — the original shadowed the builtin.
    total_seen = 0
    con = 0  # methods whose name tokens appear in their param/return text
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            lines_method = cm.load_txt(path)
            lines_parameter = cm.load_txt(folder_parameter + 'parameter' + str(i) + '.txt')
            lines_return = cm.load_txt(folder_return + 'return' + str(i) + '.txt')
            for j in range(len(lines_method)):
                # Progress trace (identical output apart from renamed counter).
                print(
                    str(i) + '-' + str(total_files) + ' -' + str(j) + ' ' +
                    str(len(lines_method)) + ' ' + str(con) + ' - ' +
                    str(total_seen) + ' ' + str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())) + '-' +
                    str(len(vocab_name.keys())))
                total_seen += 1
                line_method = lines_method[j]
                tokens = cm.get_tokens(cm.camel_split(line_method))
                line_paras = lines_parameter[j].replace('\n', '')
                para_types = []
                para_names = []
                line_return = lines_return[j].replace('\n', '')
                para_types.append(line_return)
                line = line_paras + ' ' + line_return
                for token in tokens:
                    if line.find(token) >= 0:
                        con += 1
                        break
                if '[]' not in line_paras:
                    if ';' in line_paras:
                        for line_para in line_paras.split(';'):
                            paras = line_para.split(',')
                            if len(paras) == 2:
                                para_types.append(paras[0])
                                para_names.append(paras[1])
                    else:
                        paras = line_paras.split(',')
                        if len(paras) == 2:
                            para_types.append(paras[0])
                            para_names.append(paras[1])
                # BUGFIX: loop variable renamed from `type` — shadowed builtin.
                for ptype in para_types:
                    if ptype in jdk:
                        vocab_jdk[ptype] = vocab_jdk.get(ptype, 0) + 1
                    else:
                        vocab_api[ptype] = vocab_api.get(ptype, 0) + 1
                for name in para_names:
                    vocab_name[name] = vocab_name.get(name, 0) + 1
    cm.save_pkl(base + 'para_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'para_vocab_jdk.pkl', vocab_jdk)
    cm.save_pkl(base + 'para_vocab_name.pkl', vocab_name)
def stat_parsed(path_jdk, folder_from_method, folder_to_parsed, folder_to, total_files):
    """Count API tokens from per-file parsed<i>.txt listings, JDK vs other.

    For each method file that exists, strips call/index/generic suffixes
    from each ','-separated API token and tallies it into the JDK or API
    vocabulary depending on membership in the `path_jdk` mapping; also
    counts methods whose camel-split name tokens appear in their own parsed
    line.  Writes parsed_vocab_api.pkl / parsed_vocab_jdk.pkl under
    `folder_to`.

    NOTE(review): this redefines the earlier stat_parsed in this module, so
    only this definition survives import — confirm that is intended.
    """
    base = folder_to
    vocab_api = dict()
    vocab_jdk = dict()
    # BUGFIX: renamed from `all` — the original shadowed the builtin.
    total_seen = 0
    all_api = 0
    all_jdk = 0
    relate = 0
    jdk = dict(cm.load_pkl(path_jdk))
    for i in range(0, total_files):
        path = folder_from_method + 'method' + str(i) + '.txt'
        if os.path.exists(path):
            methods = cm.load_txt(path)
            lines = cm.load_txt(folder_to_parsed + 'parsed' + str(i) + '.txt')
            for j in range(len(methods)):
                total_seen += 1
                line = lines[j].replace('\n', '')
                # BUGFIX: was `line is not '[]'` — identity comparison against
                # a string literal is implementation-defined; compare by value.
                if line != '[]':
                    tokens = cm.get_tokens(cm.camel_split(methods[j]))
                    for token in tokens:
                        if line.find(token) >= 0:
                            relate += 1
                            break
                    flag_api = 0
                    flag_jdk = 0
                    apis = line.split(',')
                    if len(apis) > 0:
                        for api in apis:
                            if '.' in api:
                                flag_api = 1
                            # Strip call/index/generic suffixes to the bare name.
                            if api.endswith(')'):
                                api = api[0:api.find('(')]
                            if api.endswith(']'):
                                api = api[0:api.find('[')]
                            if api.endswith('>'):
                                api = api[0:api.find('<')]
                            if api in jdk:
                                flag_jdk = 1
                                vocab_jdk[api] = vocab_jdk.get(api, 0) + 1
                            else:
                                vocab_api[api] = vocab_api.get(api, 0) + 1
                    all_api += flag_api
                    all_jdk += flag_jdk
                # BUGFIX: the progress line hard-coded '41025' as the file
                # count; print the actual total_files argument instead.
                print(
                    str(i) + '-' + str(total_files) + ' ' + str(j) + '-' +
                    str(len(methods)) + ' ' + str(total_seen) + '-' +
                    str(all_api) + '-' + str(all_jdk) + '-' + str(relate) +
                    ' ' + str(len(vocab_api.keys())) + '-' +
                    str(len(vocab_jdk.keys())))
    cm.save_pkl(base + 'parsed_vocab_api.pkl', vocab_api)
    cm.save_pkl(base + 'parsed_vocab_jdk.pkl', vocab_jdk)