Code example #1
def bigram_model(item_word, item_sort, cat_count):
    # For each (category, sku) pair, keep the query words whose count
    # reaches GLOBAL_BIGRAM_QUERY.
    freq_sku_words = defaultdict(lambda: defaultdict(set))
    for cat in item_word:
        for sku in item_word[cat]:
            hots = item_word[cat][sku].items()
            freq_sku_words[cat][sku] = {w for w, n in hots
                                        if n >= GLOBAL_BIGRAM_QUERY}

    # Per-category union of the frequent words across all of its SKUs.
    freq_words = dict()
    for cat in freq_sku_words:
        freq_words[cat] = set()
        for sku in freq_sku_words[cat]:
            freq_words[cat].update(freq_sku_words[cat][sku])

    # Second pass over the training data: count bigram features, but only
    # for SKUs that rank among the category's most popular items.
    f_in = readfile(new_train_file)
    bigram_item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    index = 0
    for (__user, sku, category, raw_query, ___click_time) in f_in:
        index += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        if sku in popular:
            bigram = bigram_word(raw_query, freq_words, category)
            for w in bigram:
                bigram_item_word[category][sku][w] += magic_num
                cat_count[category][BIGRAM_HOT] += magic_num
    return bigram_item_word, cat_count, freq_words
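
The helper bigram_word is not part of this listing. A minimal sketch of one plausible implementation, assuming it pairs adjacent query words and keeps only bigrams built from the category's frequent unigrams:

# Hypothetical sketch of bigram_word; not the project's actual code.
# Assumes get_words() tokenizes the query and freq_words maps each
# category to its set of frequent unigrams.
def bigram_word(raw_query, freq_words, category):
    words = get_words(raw_query)
    frequent = freq_words.get(category, set())
    return [a + '_' + b
            for a, b in zip(words, words[1:])
            if a in frequent and b in frequent]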
Code example #2
def count_items():
    f_in = readfile(new_train_file)
    # item_count[category][sku]: overall count;
    # time_item_count[block][category][sku]: count within each time block.
    item_count = defaultdict(lambda: defaultdict(int))
    time_item_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    index = 0
    for (__user, sku, category, __query, click_time) in f_in:
        # Bucket the click timestamp into a coarse time block, capped at MAX_BLOCK.
        time_block = min(int(click_time) / block, MAX_BLOCK)
        index += 1
        item_count[category][sku] += magic_num
        time_item_count[time_block][category][sku] += magic_num
    item_sort = dict()
    for category in item_count:
        item_sort[category] = sorted(item_count[category].items(), key=lambda x: x[1], reverse=True)
    smooth_time_item_count = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    # Baseline: the item's overall count scaled by 3.0 / block_size (the loop
    # below overwrites these values with neighbour-smoothed per-block counts).
    for time_block in time_item_count:
        for cat in time_item_count[time_block]:
            for sku in time_item_count[time_block][cat]:
                smooth_time_item_count[time_block][cat][sku] = item_count[cat][sku] * 3.0 / block_size
    # Smooth each block's count with its neighbouring blocks; the first and
    # last blocks count themselves twice to make up for the missing neighbour.
    for time_block in time_item_count:
        for cat in time_item_count[time_block]:
            for sku in time_item_count[time_block][cat]:
                smooth_time_item_count[time_block][cat][sku] = time_item_count[time_block][cat][sku]
                if time_block == 0 or time_block == MAX_BLOCK:
                    smooth_time_item_count[time_block][cat][sku] += time_item_count[time_block][cat][sku]
                if time_block >= 1:
                    smooth_time_item_count[time_block][cat][sku] += time_item_count[time_block - 1][cat][sku]
                if time_block < MAX_BLOCK:
                    smooth_time_item_count[time_block][cat][sku] += time_item_count[time_block + 1][cat][sku]
    return item_count, item_sort, smooth_time_item_count
Code example #3
def make_predictions(st_line, ed_line, predict_file, pname, models):
    (cat_count, item_count, item_sort, alpha, beta, item_word, bigram_item_word,
     time_cat_item_dict, cat_word, freq_words) = models[0]
    f_in = readfile(new_test_file)
    f_out = writefile(predict_file)
    line_index = 0
    for (user, category, raw_query, click_time) in f_in:
        line_index += 1
        if line_index < st_line:
            continue
        if line_index > ed_line:
            break
        if line_index % STEP_SIZE == 0:
            print '%s--%d' % (pname, line_index / STEP_SIZE)
        time_block = min(int(click_time) / block, MAX_BLOCK)
        try:
            bound = cat_count[category][PREDICT_HOT_SIZE]
            hots = [x[0] for x in item_sort[category][0:bound]]
        except:
            # Category not seen in training: no hot list, emit a dummy guess.
            f_out.writerow(["0"])
            continue
        try:
            bigram = bigram_word(raw_query, freq_words, category)
            words = get_words(raw_query)
            query_size = sum([cat_word[category][w] for w in words])
            if query_size >= 100 and len(bigram) > 0:
                # Queries that are frequent enough and yield bigram features
                # are scored with the boosting model.
                rank = [[
                    sku,
                    boosting_bayes(bigram, words, category, sku, alpha, beta,
                                   item_word, bigram_item_word, item_count,
                                   cat_count, time_cat_item_dict, time_block)
                ] for sku in hots]
            elif query_size >= 100 and len(bigram) == 0:
                # Frequent enough but no bigram features: fall back to naive
                # Bayes with time information.
                rank = [[
                    sku,
                    bayes_query_predict(words, category, sku, alpha, beta,
                                        item_word, item_count, cat_count,
                                        time_cat_item_dict, time_block)
                ] for sku in hots]
            else:
                # Otherwise use plain naive Bayes.
                rank = [[
                    sku,
                    naive_bayes_query_prediction(words, category, sku, alpha,
                                                 beta, item_word, item_count,
                                                 cat_count)
                ] for sku in hots]
            rank = sorted(rank, key=lambda x: x[1], reverse=True)
            guesses = [i[0] for i in rank[0:5]]
            guesses = rank_predictions(guesses, user, raw_query)

            f_out.writerow([" ".join(guesses)])
        except (TypeError, KeyError):
            # Fall back to the category's five most popular SKUs.
            f_out.writerow([" ".join(hots[0:5])])
Code example #4
def unigram_model(item_sort, cat_count):
    f_in = readfile(new_train_file)
    item_word = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    cat_word = defaultdict(lambda: defaultdict(int))
    index = 0
    for (__user, sku, category, raw_query, ___click_time) in f_in:
        index += 1
        bound = cat_count[category][HOT_SIZE]
        popular = [i[0] for i in item_sort[category][0:bound]]
        # Count query words only for SKUs among the category's most popular.
        if sku in popular:
            words = get_words(raw_query)
            for w in words:
                item_word[category][sku][w] += magic_num
                cat_word[category][w] += magic_num
    return item_word, cat_word
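
The training functions above and make_predictions plug together through the models tuple. A minimal sketch of the wiring (not from the original repository), assuming cat_count, alpha and beta are produced elsewhere in the project and that the smoothed counts from count_items serve as time_cat_item_dict:

# Hypothetical glue code; the tuple order must match the unpacking at the
# top of make_predictions.  cat_count, alpha and beta are assumed to be
# computed by code not shown in this listing.
item_count, item_sort, time_cat_item_dict = count_items()
item_word, cat_word = unigram_model(item_sort, cat_count)
bigram_item_word, cat_count, freq_words = bigram_model(item_word, item_sort, cat_count)
models = [(cat_count, item_count, item_sort, alpha, beta, item_word,
           bigram_item_word, time_cat_item_dict, cat_word, freq_words)]
make_predictions(1, 1000000, 'predict_part_0.csv', 'worker-0', models)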
Code example #5
def fix_query(in_file, out_file, file_type):
    # Cache passed to clean_query (presumably to avoid re-cleaning repeated queries).
    local_cache = dict()
    lemmatizer = WordNetLemmatizer()
    reader = readfile(in_file)
    with open(out_file, 'w') as writer:
        writer.write('data:\n')
        if file_type == 'train':
            for (user, sku, category, raw_query, click_time, __query_time) in reader:
                new_query = clean_query(raw_query, lemmatizer, local_cache)
                new_click_time = get_new_time(click_time) 
                outline = ','.join([user, sku, category, new_query, new_click_time])
                writer.write(outline + '\n')
        elif file_type == 'test':
            for (user, category, raw_query, click_time, __query_time) in reader:
                new_query = clean_query(raw_query, lemmatizer, local_cache)
                new_click_time = get_new_time(click_time)
                outline = ','.join([user, category, new_query, new_click_time])
                writer.write(outline + '\n')
        else:
            raise ValueError("file_type must be 'train' or 'test', got %r" % file_type)
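
A hypothetical invocation, assuming comma-separated input files with the column layouts unpacked above; the real file names live elsewhere in the project:

# Placeholder file names, not the project's actual paths.
fix_query('train_raw.csv', 'train_clean.csv', 'train')
fix_query('test_raw.csv', 'test_clean.csv', 'test')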
Code example #6
File: nearest.py  Project: noobermin/lspreader
    # Using xz leads to this anyway, but it's worth reminding the reader.
    # To permute in 2D, use the --permute flag.
    return use,res;
if __name__ == "__main__":
    from docopt import docopt;
    from misc import readfile, mkvprint, dump_pickle;
    opts=docopt(__doc__,help=True);
    dims,res = handle_dims(opts);
    vprint = mkvprint;
    var = opts['<var>'];
    readvars = list(var);
    if readvars:
        readvars+=dims;
    if opts['--gen-samples']:
        xs = tuple([d[l] for l in dims]);
        i = simple_nearest_indices(xs,res);
        dump_pickle(opts["<output>"],(i,xs));
        exit(1);
    if opts['--sample']:
        i,xs = readfile(opts['--sample'], dumpfull=True);
    else:
        xs = tuple([d[l] for l in dims]);
        i = simple_nearest_indices(xs,res);
    did = {v:d[v][i] for v in var};
    #Has _D_ been _I_nterpolate_D_?  Yes it DID.
    did.update({l:x for l,x in zip(dims,xs)});
    #get it?
    #alright I'll stop
    dump_pickle(opts['<output>'], did);

Code example #7
def load_data():
    """ 
    Loads all data from the ResourceFiles directory. 

    Parameters
    ----------
    None - input data is constant.

    Returns
    -------
    Bunch instance with attributes humanppi, functions, cancer, test1 and test2.

    Example
    -------
    >>> from ppi.misc import load_data
    >>> data = load_data()
    >>> data.humanppi[0:5]
    [(Protein: 0, Protein: 6476), (Protein: 1, Protein: 604), 
     (Protein: 1, Protein: 3466), (Protein: 1, Protein: 5215), 
     (Protein: 1, Protein: 7154)]
    >>> data.functions[0:5]
    [(Protein: 0, Function: F0003723), (Protein: 0, Function: F0035097),
     (Protein: 0, Function: F0016568), (Protein: 0, Function: F0051568),
     (Protein: 0, Function: F0016740)]
    >>> data.cancer[0:5]
    [241, 249, 255, 266, 287]
    >>> data.test1[0:5]
    [(Protein: 0, 'nonCancer'), (Protein: 1, 'nonCancer'), 
     (Protein: 1208, 'cancer'), (Protein: 2431, 'cancer'), 
     (Protein: 2, 'nonCancer')]
    
    """
    cancer_txt = readfile('ResourceFiles/Cancer.txt')
    cancer = [line.strip() for line in cancer_txt]
    cancer = [int(line[5:]) for line in cancer]

    humanppi_txt = readfile('ResourceFiles/humanPPI.txt')
    humanppi = [line.strip().split(',') for line in humanppi_txt]
    humanppi = [tuple(int(elem[5:]) for elem in line) for line in humanppi]
    temp = []
    for p1, p2 in humanppi:
        if p1 in cancer:
            a = CancerProtein(p1)
        else:
            a = Protein(p1)

        if p2 in cancer:
            b = CancerProtein(p2)
        else:
            b = Protein(p2)
        temp.append((a, b))
    del humanppi
    humanppi = temp
        
    functions_txt = readfile('ResourceFiles/Functions.txt')
    functions = [line.strip().split(',') for line in functions_txt]
    functions = [(int(line[0][5:]), "F" + line[1][5:]) for line in functions]

    f = []
    for p, fn in functions:
        if p in cancer:
            a = CancerProtein(p)
        else:
            a = Protein(p)
        b = Function(fn)
        f.append((a, b))
    del functions
    functions = f

    test1_txt = readfile('ResourceFiles/Test1.txt')
    test1 = [line.strip().split(',') for line in test1_txt]
    test1 = [(int(line[0][5:]), line[1]) for line in test1]
    test1 = [(Protein(p), answer) for p, answer in test1]

    test2_txt = readfile('ResourceFiles/Test2.txt')
    test2 = [line.strip().split(',') for line in test2_txt]
    test2 = [int(line[0][5:]) for line in test2]
    test2 = [Protein(p) for p in test2]

    data = Bunch(humanppi=humanppi,
                 functions=functions,
                 cancer=cancer,
                 test1=test1,
                 test2=test2)
    return data
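
Bunch is not defined in this excerpt. A minimal sketch of the usual attribute-container pattern it presumably follows (the project's real class may differ):

# Hypothetical stand-in for Bunch: a dict whose keys are also attributes.
class Bunch(dict):
    def __init__(self, **kwargs):
        dict.__init__(self, kwargs)
        self.__dict__ = self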