Example #1
0
def extract_token_maps(path_to_token_frequency, sw_percent, token_count):
    """Populate the global ``sw_set`` and ``token_set`` from a frequency file.

    Reads ``path_to_token_frequency`` line by line; each line is split by the
    project helper ``split_line_to_components`` into (token, freq, ids).
    Tokens whose document frequency ratio (``freq / APK_COUNT``) is at least
    ``sw_percent`` are recorded as stop words; the next ``token_count`` tokens
    are kept; after that the scan stops.

    NOTE(review): the early ``break`` assumes the file is sorted by
    descending frequency, so no later line could still be a stop word --
    confirm against the producer of this file.

    :param path_to_token_frequency: path to a CSV of (token, freq, ids) lines
    :param sw_percent: ratio threshold at/above which a token is a stop word
    :param token_count: maximum number of non-stop-word tokens to collect
    """
    i = 0
    with open(path_to_token_frequency) as f:
        for line in f:
            token, freq, ids = split_line_to_components(line)
            percent = float(freq) / APK_COUNT
            if percent >= sw_percent:
                sw_set.add(token)
            elif i < token_count:
                # was ``i <= token_count``, which collected token_count + 1
                # tokens; ``<`` keeps exactly token_count of them
                i += 1
                token_set.add(token)
            else:
                break
Example #2
0
def getWeirdFiles(path):
    """Print, for the first five lines of *path*, the APK ids missing a token.

    Each line is split by the project helper ``split_line_to_components``
    into (key, freq, ids); the ids present are subtracted from the full id
    universe ``0 .. APK_COUNT - 1`` and the remainder is printed together
    with the token key.

    NOTE(review): if ``ids`` comes back as a plain string, ``set(ids)``
    builds a set of characters, not of ids -- confirm the helper's return
    type.

    :param path: path to a (token, freq, ids) CSV file
    """
    # Universe of all APK ids; set(range(...)) replaces the manual fill loop.
    all_ids = set(range(APK_COUNT))
    with open(path) as f:
        for line_no, line in enumerate(f, start=1):
            key, freq, ids = split_line_to_components(line)
            missing = all_ids.difference(set(ids))
            # Parenthesized single-argument print is valid and identical on
            # both Python 2 and Python 3 (was a Py2-only print statement).
            print(missing)
            print(key)
            if line_no == 5:
                break
Example #3
0
def run_everything(ngram_size, token_stopword_percentage, token_count,
                   ngram_stopword_percentages, ngram_token_counts, save_path):
    """End-to-end n-gram pipeline: extract token maps, visit source folders,
    merge/sort the resulting chunks, then build matrices for every
    (stop-word percentage, token count) combination.

    Side effects: resets the module globals ``chunkNum``, ``apkCount`` and
    ``startFromCount``; creates directories under ``save_path``; writes and
    deletes several intermediate files there.

    :param ngram_size: n-gram length passed to ``ngram_getter``/``visit_folders``
    :param token_stopword_percentage: stop-word threshold for the token pass
    :param token_count: token budget for the token pass (also names the dir)
    :param ngram_stopword_percentages: iterable of thresholds for the n-gram pass
    :param ngram_token_counts: iterable of token budgets for the n-gram pass
    :param save_path: base output directory (Windows-style paths throughout)
    """
    global chunkNum, apkCount, idMap, startFromCount
    chunkNum = 0
    startFromCount = 0
    apkCount = 0
    # Per-run output directory encodes the token-pass parameters.
    save_path = save_path + "\\token_%s_%s" % (token_count,
                                               token_stopword_percentage)
    if not exists(save_path):
        makedirs(save_path)
    # Fills the global sw_set / token_set from a hard-coded frequency file.
    extract_token_maps("D:\\Source\\Results\\tokens\\batch.csv",
                       token_stopword_percentage, token_count)
    all_tokens_filename = "ngramResultMap.part"
    all_tokens_path = save_path + "\\" + all_tokens_filename
    id_map_path = save_path + "\\ngramIdMap2.txt"
    sorted_tokens = save_path + "\\sortedTokens.csv"
    # NOTE(review): idMap (the open handle) is a global, but visit_folders is
    # given id_map_path, not the handle -- presumably it reopens/appends;
    # confirm whether the handle itself is used anywhere.
    with open(id_map_path, 'w') as idMap:
        visit_folders(VISIT_PATH,
                      all_tokens_path,
                      id_map_path,
                      CHUNK_SIZE,
                      "A",
                      "Zz",
                      ngram_size,
                      MODE_ACE,
                      only_check_specific=True,
                      specific_file=MERGED_JAVA_USABLE)
    # Merge the per-chunk partial result files into a single "...part0" file.
    chunk_merger(save_path, all_tokens_filename,
                 int(ceil(float(APK_COUNT) / CHUNK_SIZE)))
    # Sort by descending frequency (component [1] of each line).
    batch_sort(save_path + "\\" + all_tokens_filename + "0",
               sorted_tokens,
               key=lambda x: -1 * int(split_line_to_components(x)[1]))
    # The unsorted merged file is no longer needed.
    remove(save_path + "\\" + all_tokens_filename + "0")
    write_sample(sorted_tokens, save_path + "\\ngramSample.csv", 30000)
    get_ngrams_from_file = ngram_getter(ngram_size, MODE_ACE)
    # Build the full matrix once over the top 50000 sorted tokens...
    make_table(0, 50000, sorted_tokens, save_path + "\\matrixWhole.csv",
               MERGED_JAVA_USABLE, id_map_path, get_ngrams_from_file)
    # ...then derive one filtered table per parameter combination from it.
    for sw_percent in ngram_stopword_percentages:
        path = "%s\\sw_percent_%s" % (save_path, sw_percent)
        if not exists(path):
            makedirs(path)
        for token_count in ngram_token_counts:
            print "%s_%s" % (sw_percent, token_count)
            make_table_from_file(save_path + "\\matrixWhole.csv", sw_percent,
                                 token_count, path)
Example #4
0
def initialize_dataset(save_path, ngram_size):
    """Build the initial n-gram dataset for one n-gram size.

    Near-duplicate of ``run_everything`` with fixed parameters: stop-word
    threshold 1 and a 20000-token budget, writing under
    ``save_path\\<n>ngrams``.  Side effects: resets the module globals
    ``chunkNum``, ``apkCount`` and ``startFromCount``; creates directories;
    writes and deletes intermediate files.

    :param save_path: base output directory (Windows-style paths throughout)
    :param ngram_size: n-gram length passed to ``ngram_getter``/``visit_folders``
    """
    global chunkNum, apkCount, idMap, startFromCount
    apkCount = 0

    startFromCount = apkCount
    # NOTE(review): with apkCount = 0 this is always 0; under Python 2, ``/``
    # on ints is floor division -- presumably that is the intent if apkCount
    # were ever nonzero here. Confirm before porting to Python 3.
    chunkNum = apkCount / CHUNK_SIZE

    save_path = save_path + "\\%sngrams" % ngram_size
    if not exists(save_path):
        makedirs(save_path)
    # Fills the global sw_set / token_set from a hard-coded frequency file.
    extract_token_maps("D:\\Source\\Results\\tokens\\batch.csv", 1, 20000)
    print len(sw_set)
    print len(token_set)
    all_tokens_filename = "ngramResultMap.part"
    all_tokens_path = save_path + "\\" + all_tokens_filename
    id_map_path = save_path + "\\ngramIdMap.txt"
    sorted_tokens = save_path + "\\sortedTokens.csv"
    # NOTE(review): as in run_everything, the open handle idMap is unused
    # here; visit_folders receives the path instead -- confirm.
    with open(id_map_path, 'w') as idMap:
        visit_folders(VISIT_PATH,
                      all_tokens_path,
                      id_map_path,
                      CHUNK_SIZE,
                      "A",
                      "Zz",
                      ngram_size,
                      MODE_ACE,
                      only_check_specific=True,
                      specific_file=MERGED_JAVA_USABLE)
    # Merge per-chunk partial files, sort by descending frequency, then
    # drop the unsorted intermediate.
    chunk_merger(save_path, all_tokens_filename,
                 int(ceil(float(APK_COUNT) / CHUNK_SIZE)))
    batch_sort(save_path + "\\" + all_tokens_filename + "0",
               sorted_tokens,
               key=lambda x: -1 * int(split_line_to_components(x)[1]))
    remove(save_path + "\\" + all_tokens_filename + "0")
    get_ngrams_from_file = ngram_getter(ngram_size, MODE_ACE)
    make_table(0, 20000, sorted_tokens, save_path + "\\matrixWhole.csv",
               MERGED_JAVA_USABLE, id_map_path, get_ngrams_from_file)