from math import ceil
from os import makedirs, remove
from os.path import exists

# Module-level names used below and defined elsewhere in this module:
# APK_COUNT, CHUNK_SIZE, VISIT_PATH, MODE_ACE, MERGED_JAVA_USABLE,
# sw_set, token_set, chunkNum, apkCount, idMap, startFromCount,
# split_line_to_components, visit_folders, chunk_merger, batch_sort,
# write_sample, ngram_getter, make_table, make_table_from_file.


def extract_token_maps(path_to_token_frequency, sw_percent, token_count):
    # Partition tokens from the frequency file: tokens that appear in at
    # least sw_percent of all APKs become stopwords; the next token_count
    # most frequent tokens become the regular token set.
    i = 0
    with open(path_to_token_frequency) as f:
        for line in f:
            token, freq, ids = split_line_to_components(line)
            percent = float(freq) / APK_COUNT
            if percent >= sw_percent:
                sw_set.add(token)
            elif i < token_count:  # keep at most token_count regular tokens
                i += 1
                token_set.add(token)
            else:
                break

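# `split_line_to_components` is defined elsewhere in this module. The callers
# above assume each line carries a token, its frequency, and a list of APK
# ids. A minimal sketch under that assumption -- the delimiters here are a
# guess, not the module's actual format:
def _split_line_to_components_sketch(line):
    token, freq, ids = line.rstrip("\n").split(",", 2)
    return token, freq, ids.split(";")
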
def getWeirdFiles(path):
    # For the first five tokens in the file, print the APK ids that do NOT
    # contain the token (the full id universe minus the token's id list).
    s = set(range(APK_COUNT))
    j = 0
    with open(path) as f:
        for line in f:
            j += 1
            key, freq, ids = split_line_to_components(line)
            idSet = set(int(x) for x in ids)  # ids parse as strings; compare as ints against the id universe
            oddOnes = s.difference(idSet)
            print oddOnes
            print key
            if j == 5:
                break

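# Toy illustration of the set difference above: with ids 0..4 and a token
# seen in APKs {0, 2, 3}, the "weird" files missing it are {1, 4}:
#   >>> set(range(5)).difference({0, 2, 3})
#   set([1, 4])
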
def run_everything(ngram_size, token_stopword_percentage, token_count,
                   ngram_stopword_percentages, ngram_token_counts, save_path):
    # Full pipeline: extract token maps, walk the APK folders to collect
    # n-grams, merge and sort the chunked results, then emit one feature
    # matrix per (stopword percentage, token count) combination.
    global chunkNum, apkCount, idMap, startFromCount
    chunkNum = 0
    startFromCount = 0
    apkCount = 0
    save_path = save_path + "\\token_%s_%s" % (token_count, token_stopword_percentage)
    if not exists(save_path):
        makedirs(save_path)
    extract_token_maps("D:\\Source\\Results\\tokens\\batch.csv",
                       token_stopword_percentage, token_count)
    all_tokens_filename = "ngramResultMap.part"
    all_tokens_path = save_path + "\\" + all_tokens_filename
    id_map_path = save_path + "\\ngramIdMap2.txt"
    sorted_tokens = save_path + "\\sortedTokens.csv"
    with open(id_map_path, 'w') as idMap:
        visit_folders(VISIT_PATH, all_tokens_path, id_map_path, CHUNK_SIZE,
                      "A", "Zz", ngram_size, MODE_ACE,
                      only_check_specific=True, specific_file=MERGED_JAVA_USABLE)
    chunk_merger(save_path, all_tokens_filename,
                 int(ceil(float(APK_COUNT) / CHUNK_SIZE)))
    # Sort tokens by descending frequency, then drop the unsorted chunk file.
    batch_sort(save_path + "\\" + all_tokens_filename + "0", sorted_tokens,
               key=lambda x: -1 * int(split_line_to_components(x)[1]))
    remove(save_path + "\\" + all_tokens_filename + "0")
    write_sample(sorted_tokens, save_path + "\\ngramSample.csv", 30000)
    get_ngrams_from_file = ngram_getter(ngram_size, MODE_ACE)
    make_table(0, 50000, sorted_tokens, save_path + "\\matrixWhole.csv",
               MERGED_JAVA_USABLE, id_map_path, get_ngrams_from_file)
    for sw_percent in ngram_stopword_percentages:
        path = "%s\\sw_percent_%s" % (save_path, sw_percent)
        if not exists(path):
            makedirs(path)
        for ngram_token_count in ngram_token_counts:  # distinct name, so the token_count parameter is not shadowed
            print "%s_%s" % (sw_percent, ngram_token_count)
            make_table_from_file(save_path + "\\matrixWhole.csv",
                                 sw_percent, ngram_token_count, path)

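# `write_sample` is defined elsewhere in this module; a hypothetical sketch
# consistent with how it is called above (copy the first `count` lines of the
# sorted token file into a sample file):
def _write_sample_sketch(src_path, dst_path, count):
    with open(src_path) as src, open(dst_path, 'w') as dst:
        for i, line in enumerate(src):
            if i == count:
                break
            dst.write(line)
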
def initialize_dataset(save_path, ngram_size):
    # One-off setup run: extract token maps with a 100% stopword threshold
    # and a 20000-token cap, collect n-grams, and build the base matrix.
    global chunkNum, apkCount, idMap, startFromCount
    apkCount = 0
    startFromCount = apkCount
    chunkNum = apkCount / CHUNK_SIZE  # Python 2 integer division; 0 here, since apkCount is 0
    save_path = save_path + "\\%sngrams" % ngram_size
    if not exists(save_path):
        makedirs(save_path)
    extract_token_maps("D:\\Source\\Results\\tokens\\batch.csv", 1, 20000)
    print len(sw_set)
    print len(token_set)
    all_tokens_filename = "ngramResultMap.part"
    all_tokens_path = save_path + "\\" + all_tokens_filename
    id_map_path = save_path + "\\ngramIdMap.txt"
    sorted_tokens = save_path + "\\sortedTokens.csv"
    with open(id_map_path, 'w') as idMap:
        visit_folders(VISIT_PATH, all_tokens_path, id_map_path, CHUNK_SIZE,
                      "A", "Zz", ngram_size, MODE_ACE,
                      only_check_specific=True, specific_file=MERGED_JAVA_USABLE)
    chunk_merger(save_path, all_tokens_filename,
                 int(ceil(float(APK_COUNT) / CHUNK_SIZE)))
    # Sort tokens by descending frequency, then drop the unsorted chunk file.
    batch_sort(save_path + "\\" + all_tokens_filename + "0", sorted_tokens,
               key=lambda x: -1 * int(split_line_to_components(x)[1]))
    remove(save_path + "\\" + all_tokens_filename + "0")
    get_ngrams_from_file = ngram_getter(ngram_size, MODE_ACE)
    make_table(0, 20000, sorted_tokens, save_path + "\\matrixWhole.csv",
               MERGED_JAVA_USABLE, id_map_path, get_ngrams_from_file)
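

if __name__ == "__main__":
    # Illustrative driver; every value below is an assumption for the sake of
    # example, not a setting from the original experiments.
    run_everything(ngram_size=3, token_stopword_percentage=0.5,
                   token_count=20000,
                   ngram_stopword_percentages=[0.3, 0.5],
                   ngram_token_counts=[5000, 10000],
                   save_path="D:\\Source\\Results\\ngrams")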