Example 1
import itertools
import json

import nltk

from util import language_tool


# filteringText, token_filtering, word_split_by_char and stem_process are
# assumed to be helpers defined elsewhere in the same module.
def get_code_tokens_from_file(filelist_json,
                              outfile_prefix,
                              category=None,
                              bigram_flag=True):
    data = {}
    data['file'] = []
    for f_json in filelist_json:

        file = f_json["name"]
        # ignore non-code files
        if language_tool.is_text(file):
            continue
        # print(file)
        # ignore file if the change is too big
        # if f_json['LOC']['add'] > 10000 or (f_json['LOC']['add'] < 5 and len(f_json['add_code']) > 10000):
        if (f_json['LOC']['add'] < 5 and len(f_json['add_code']) > 10000):
            print('code change in current file is too long, skip')
            continue

        # take the change text under the requested key (e.g. 'add_code') and clean it
        text = f_json[category]
        text = filteringText(text, outfile_prefix)

        tokens = nltk.word_tokenize(text)

        filtered_tokens = token_filtering(tokens, file)

        # generate an array of bigrams; default to empty lists so the check
        # below does not raise a NameError when bigram_flag is False
        bigram_tokens = []
        stem_bigrams_tokens = []
        if bigram_flag:
            bigram_tokens = list(
                itertools.chain(*[
                    word_split_by_char(token, 'bigrams')
                    for token in filtered_tokens
                ]))
            bigram_tokens = token_filtering(bigram_tokens, file)
            stem_bigrams_tokens = stem_process(bigram_tokens)

        tokens = list(
            itertools.chain(
                *[word_split_by_char(token) for token in filtered_tokens]))
        stem_tokens = stem_process(tokens)

        if bigram_tokens:
            data['file'].append({
                'filename': file,
                # 'bigram_tokens': '\t'.join(bigram_tokens),
                'stem_bigram_tokens': '\t'.join(stem_bigrams_tokens),
                # 'tokens': '\t'.join(tokens),
                'stem_tokens': '\t'.join(stem_tokens)
            })
        else:
            data['file'].append({
                'filename': file,
                # 'tokens': '\t'.join(tokens),
                'stem_tokens': '\t'.join(stem_tokens)
            })
    with open(outfile_prefix + "/" + category + ".json", 'w+') as outfile:
        json.dump(data, outfile)
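A minimal usage sketch; the filelist_json schema below is an assumption inferred from the fields the function reads, and the module's helper functions must be available:

filelist_json = [{
    "name": "src/app.py",
    "LOC": {"add": 12},
    "add_code": "def handler(event):\n    return process(event)\n",
}]
get_code_tokens_from_file(filelist_json, "/tmp/out", category="add_code")
# writes /tmp/out/add_code.json with per-file stemmed unigram and bigram tokens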
Example 2
from util import language_tool


# localfile is assumed to be another helper module from the same repo
# (it provides write_to_file).
def filterNonCodeFiles(file_list, outfile_prefix):
    # keep only code files; once more than 500 code files have been collected,
    # record the change as too big and return an empty list
    newFileList = []
    count = 0
    for f in file_list:
        if count > 500:
            localfile.write_to_file(outfile_prefix + "/toobig.txt", '500file')
            return []
        if not language_tool.is_text(f['name']):
            newFileList.append(f)
            count += 1
    return newFileList
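A hedged usage sketch, assuming language_tool.is_text treats markdown as text and Python sources as code:

files = [{"name": "README.md"}, {"name": "src/app.py"}]
print(filterNonCodeFiles(files, "/tmp/out"))
# expected: [{'name': 'src/app.py'}]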
Example 3
import json

from util import language_tool


def getCodeLocation(filelist_json, outfile_prefix):
    location_set = {}
    for f_json in filelist_json:
        file = f_json["name"]
        location_set[file] = []
        # ignore non-code files
        if language_tool.is_text(file):
            continue
        # record each added hunk, given as (start_line, line_count),
        # as [start_line, start_line + line_count]
        loc_list = f_json['location']['add']
        for x in loc_list:
            location_set[file].append([int(x[0]), int(x[0]) + int(x[1])])
    with open(outfile_prefix + "/code_loc.json", 'w+') as outfile:
        json.dump(location_set, outfile)
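A short sketch of the assumed input; each entry under ['location']['add'] is taken to be a (start_line, line_count) pair:

filelist_json = [{"name": "src/app.py", "location": {"add": [["10", "3"]]}}]
getCodeLocation(filelist_json, "/tmp/out")
# /tmp/out/code_loc.json -> {"src/app.py": [[10, 13]]}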
Example 4
import re

from util import language_tool


def token_filtering(tokens, file=None):
    # drop tokens that contain no identifier-like characters at all
    tokens = list(filter(lambda x: re.search(r"[0-9A-Za-z_./\\\-]", x),
                         tokens))
    newtokens = []
    for t in tokens:
        # strip leading/trailing separators and remove digits
        t = re.sub(r"^[_\\/\-.]+", "", t)
        t = re.sub(r"[_\\/\-.]+$", "", t)
        t = re.sub(r"\d+", "", t)
        if len(t) > 1:
            newtokens.append(t)
    newtokens = list(filter(lambda x: len(x) >= 2, newtokens))
    # for code files, drop programming-language reserved words
    if (file is not None) and (not language_tool.is_text(file)):
        newtokens = list(
            filter(lambda x: x not in language_tool.PL_reserved_words,
                   newtokens))
    # lowercase and remove general stopwords
    newtokens = [x.lower() for x in newtokens]
    tokens = list(
        filter(lambda x: x not in language_tool.general_stopwords, newtokens))
    return tokens
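A minimal usage sketch, assuming nltk (with its punkt tokenizer data) is installed and that 'def' and 'return' appear in language_tool.PL_reserved_words:

import nltk
tokens = nltk.word_tokenize("def compute_sum(a, b): return a + b")
print(token_filtering(tokens, file="calc.py"))
# punctuation, one-character names, and reserved words are dropped,
# leaving something like ['compute_sum']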