from collections import defaultdict


def bigram_model(item_word, item_sort, cat_count):
    # GLOBAL_BIGRAM_QUERY, HOT_SIZE, BIGRAM_HOT, magic_num and new_train_file
    # are module-level globals defined elsewhere in the repo.
    # For every (category, sku), keep the query words seen often enough to be
    # usable as bigram components.
    freq_sku_words = defaultdict(lambda: defaultdict(set))
    for cat in item_word:
        for sku in item_word[cat]:
            hots = item_word[cat][sku].items()
            freq_sku_words[cat][sku] = set(
                [i[0] for i in hots if i[1] >= GLOBAL_BIGRAM_QUERY])
    # Union of the frequent words over all skus of a category.
    freq_words = dict()
    for cat in freq_sku_words:
        freq_words[cat] = set()
        for sku in freq_sku_words[cat]:
            freq_words[cat] = freq_words[cat].union(freq_sku_words[cat][sku])
    # Count bigram features, but only for the popular skus of each category.
    f_in = readfile(new_train_file)
    bigram_item_word = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    index = 0
    for (__user, sku, category, raw_query, ___click_time) in f_in:
        index += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            bigram = bigram_word(raw_query, freq_words, category)
            for w in bigram:
                bigram_item_word[category][sku][w] += magic_num
                cat_count[category][BIGRAM_HOT] += magic_num
    return bigram_item_word, cat_count, freq_words
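# bigram_word is defined elsewhere in the repo; as a purely assumed stand-in
# (not the author's implementation), it presumably pairs adjacent query words
# that are both frequent for the category into joined bigram features:
def bigram_word_sketch(raw_query, freq_words, category):
    words = get_words(raw_query)
    keep = freq_words.get(category, set())
    return [w1 + '_' + w2
            for w1, w2 in zip(words, words[1:])
            if w1 in keep and w2 in keep]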
def count_items():
    # Count clicks per (category, sku), both overall and per time block.
    f_in = readfile(new_train_file)
    item_count = defaultdict(lambda: defaultdict(int))
    time_item_count = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    index = 0
    for (__user, sku, category, __query, click_time) in f_in:
        time_block = min(int(click_time) / block, MAX_BLOCK)
        index += 1
        item_count[category][sku] += magic_num
        time_item_count[time_block][category][sku] += magic_num
    # Rank the skus of each category by click count.
    item_sort = dict()
    for category in item_count:
        item_sort[category] = sorted(item_count[category].items(),
                                     key=lambda x: x[1], reverse=True)
    smooth_time_item_count = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    for time_block in time_item_count:
        for cat in time_item_count[time_block]:
            for sku in time_item_count[time_block][cat]:
                smooth_time_item_count[time_block][cat][sku] = \
                    item_count[cat][sku] * 3.0 / block_size
    # Smooth each time block with its neighbours; note that this second pass
    # overwrites the values assigned in the loop above.
    for time_block in time_item_count:
        for cat in time_item_count[time_block]:
            for sku in time_item_count[time_block][cat]:
                smooth_time_item_count[time_block][cat][sku] = \
                    time_item_count[time_block][cat][sku]
                if time_block == 0 or time_block == MAX_BLOCK:
                    smooth_time_item_count[time_block][cat][sku] += \
                        time_item_count[time_block][cat][sku]
                if time_block >= 1:
                    smooth_time_item_count[time_block][cat][sku] += \
                        time_item_count[time_block - 1][cat][sku]
                if time_block < MAX_BLOCK:
                    smooth_time_item_count[time_block][cat][sku] += \
                        time_item_count[time_block + 1][cat][sku]
    return item_count, item_sort, smooth_time_item_count
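# A minimal, self-contained illustration (toy numbers, not from the data) of
# the neighbour smoothing used in count_items: every time block adds the
# counts of its adjacent blocks, and the edge blocks count themselves twice to
# make up for the missing neighbour.
def _smooth_demo(counts):
    max_block = len(counts) - 1
    smoothed = []
    for t, c in enumerate(counts):
        s = c
        if t == 0 or t == max_block:
            s += c                  # edge block: add its own count again
        if t >= 1:
            s += counts[t - 1]      # previous block
        if t < max_block:
            s += counts[t + 1]      # next block
        smoothed.append(s)
    return smoothed

# _smooth_demo([4, 2, 6]) == [10, 12, 14]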
def make_predictions(st_line, ed_line, predict_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word,
     bigram_item_word, time_cat_item_dict, cat_word, freq_words) = models[0]
    f_in = readfile(new_test_file)
    f_out = writefile(predict_file)
    line_index = 0
    for (user, category, raw_query, click_time) in f_in:
        line_index += 1
        # Only handle the [st_line, ed_line] slice of the test file.
        if line_index < st_line:
            continue
        if line_index > ed_line:
            break
        if line_index % STEP_SIZE == 0:
            print '%s--%d' % (pname, line_index / STEP_SIZE)
        time_block = min(int(click_time) / block, MAX_BLOCK)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # Category never seen in training: nothing sensible to predict.
            f_out.writerow(["0"])
            continue
        try:
            bigram = bigram_word(raw_query, freq_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Queries that are frequent enough and can generate bigram
                # features are predicted by the boosting model.
                rank = [[sku,
                         boosting_bayes(bigram, words, category, sku, alpha,
                                        beta, item_word, bigram_item_word,
                                        item_count, cat_count,
                                        time_cat_item_dict, time_block)]
                        for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Frequent enough but without bigram features: naive Bayes
                # with time information.
                rank = [[sku,
                         bayes_query_predict(words, category, sku, alpha,
                                             beta, item_word, item_count,
                                             cat_count, time_cat_item_dict,
                                             time_block)]
                        for sku in hots]
            else:
                # Otherwise fall back to plain naive Bayes.
                rank = [[sku,
                         naive_bayes_query_prediction(words, category, sku,
                                                      alpha, beta, item_word,
                                                      item_count, cat_count)]
                        for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rank_predictions(guesses, user, raw_query)
            f_out.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # If scoring fails, fall back to the five most popular skus.
            f_out.writerow([" ".join(hots[0:5])])
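# A sketch of how make_predictions might be sharded across workers (the use of
# multiprocessing, the chunk size and the file/worker names are assumptions;
# only the (st_line, ed_line, predict_file, pname, models) signature comes
# from the function above).
from multiprocessing import Process


def run_prediction_workers(models, total_lines, n_workers=4):
    chunk = total_lines // n_workers + 1
    jobs = []
    for k in range(n_workers):
        st, ed = k * chunk + 1, (k + 1) * chunk
        p = Process(target=make_predictions,
                    args=(st, ed, 'predict_%d.csv' % k, 'worker-%d' % k,
                          models))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()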
def unigram_model(item_sort, cat_count):
    f_in = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    index = 0
    for (__user, sku, category, raw_query, ___click_time) in f_in:
        index += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
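# A sketch (not part of the original code) of how the training functions above
# plausibly feed make_predictions. build_models is a hypothetical helper:
# cat_count, alpha and beta are produced elsewhere in the repo, and passing the
# smoothed time counts as time_cat_item_dict is an assumption. The tuple order
# simply mirrors the unpacking at the top of make_predictions.
def build_models(cat_count, alpha, beta):
    item_count, item_sort, time_cat_item_dict = count_items()
    item_word, cat_word = unigram_model(item_sort, cat_count)
    bigram_item_word, cat_count, freq_words = bigram_model(
        item_word, item_sort, cat_count)
    return [(cat_count, item_count, item_sort, alpha, beta, item_word,
             bigram_item_word, time_cat_item_dict, cat_word, freq_words)]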
from nltk.stem import WordNetLemmatizer


def fix_query(in_file, out_file, file_type):
    # Normalise raw queries with clean_query and map click times with
    # get_new_time before writing the cleaned file.
    local_cache = dict()
    lemmatizer = WordNetLemmatizer()
    reader = readfile(in_file)
    with open(out_file, 'w') as writer:
        writer.write('data:\n')
        if file_type == 'train':
            for (user, sku, category, raw_query, click_time,
                 __query_time) in reader:
                new_query = clean_query(raw_query, lemmatizer, local_cache)
                new_click_time = get_new_time(click_time)
                outline = ','.join(
                    [user, sku, category, new_query, new_click_time])
                writer.write(outline + '\n')
        elif file_type == 'test':
            for (user, category, raw_query, click_time,
                 __query_time) in reader:
                new_query = clean_query(raw_query, lemmatizer, local_cache)
                new_click_time = get_new_time(click_time)
                outline = ','.join(
                    [user, category, new_query, new_click_time])
                writer.write(outline + '\n')
        else:
            raise Exception('Query Correction Failed!')
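# Hypothetical invocation of fix_query: the raw input file names below are
# placeholders; only the 'train' / 'test' mode strings and the
# new_train_file / new_test_file globals come from the surrounding code.
if __name__ == '__main__':
    fix_query('train_raw.csv', new_train_file, 'train')
    fix_query('test_raw.csv', new_test_file, 'test')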
    # Using xz leads to this anyway, but it's worth reminding the reader.
    # To permute in 2D, use the --permute flag.
    return use, res


if __name__ == "__main__":
    from docopt import docopt
    from misc import readfile, mkvprint, dump_pickle
    opts = docopt(__doc__, help=True)
    dims, res = handle_dims(opts)
    vprint = mkvprint
    var = opts['<var>']
    readvars = list(var)
    if readvars:
        readvars += dims
    if opts['--gen-samples']:
        xs = tuple([d[l] for l in dims])
        i = simple_nearest_indices(xs, res)
        dump_pickle(opts["<output>"], (i, xs))
        exit(1)
    if opts['--sample']:
        i, xs = readfile(opts['--sample'], dumpfull=True)
    else:
        xs = tuple([d[l] for l in dims])
        i = simple_nearest_indices(xs, res)
    did = {v: d[v][i] for v in var}  # Has _D_ been _I_nterpolate_D_? Yes it DID.
    did.update({l: x for l, x in zip(dims, xs)})  # get it?
    # alright I'll stop
    dump_pickle(opts['<output>'], did)
def load_data():
    """
    Loads all data from the ResourceFiles directory.

    Parameters
    ----------
    None - input data is constant.

    Returns
    -------
    Bunch class containing ResourceFiles.

    Example
    -------
    >>> from ppi.misc import load_data
    >>> data = load_data()
    >>> data.humanppi[0:5]
    [(Protein: 0, Protein: 6476), (Protein: 1, Protein: 604),
     (Protein: 1, Protein: 3466), (Protein: 1, Protein: 5215),
     (Protein: 1, Protein: 7154)]
    >>> data.functions[0:5]
    [(Protein: 0, Function: F0003723), (Protein: 0, Function: F0035097),
     (Protein: 0, Function: F0016568), (Protein: 0, Function: F0051568),
     (Protein: 0, Function: F0016740)]
    >>> data.cancer[0:5]
    [241, 249, 255, 266, 287]
    >>> data.test1[0:5]
    [(Protein: 0, 'nonCancer'), (Protein: 1, 'nonCancer'),
     (Protein: 1208, 'cancer'), (Protein: 2431, 'cancer'),
     (Protein: 2, 'nonCancer')]
    """
    cancer_txt = readfile('ResourceFiles/Cancer.txt')
    cancer = [line.strip() for line in cancer_txt]
    cancer = [int(line[5:]) for line in cancer]

    humanppi_txt = readfile('ResourceFiles/humanPPI.txt')
    humanppi = [line.strip().split(',') for line in humanppi_txt]
    humanppi = [tuple([int(elem[5:]) for elem in line]) for line in humanppi]
    temp = []
    for p1, p2 in humanppi:
        if p1 in cancer:
            a = CancerProtein(p1)
        else:
            a = Protein(p1)
        if p2 in cancer:
            b = CancerProtein(p2)
        else:
            b = Protein(p2)
        temp.append((a, b))
    del humanppi
    humanppi = temp

    functions_txt = readfile('ResourceFiles/Functions.txt')
    functions = [line.strip().split(',') for line in functions_txt]
    functions = [tuple([int(line[0][5:]), "F" + line[1][5:]])
                 for line in functions]
    f = []
    for p, fn in functions:
        if p in cancer:
            a = CancerProtein(p)
        else:
            a = Protein(p)
        b = Function(fn)
        f.append((a, b))
    del functions
    functions = f

    test1_txt = readfile('ResourceFiles/Test1.txt')
    test1 = [line.strip().split(',') for line in test1_txt]
    test1 = [tuple([int(line[0][5:]), line[1]]) for line in test1]
    test1 = [(Protein(p), answer) for p, answer in test1]

    test2_txt = readfile('ResourceFiles/Test2.txt')
    test2 = [line.strip().split(',') for line in test2_txt]
    test2 = [int(line[0][5:]) for line in test2]
    test2 = [Protein(p) for p in test2]

    data = Bunch(humanppi=humanppi,
                 functions=functions,
                 cancer=cancer,
                 test1=test1,
                 test2=test2)
    return data