Exemple #1
0
def pcc_item_rating_pred(path, rating, method, k):
    """Predict ratings from the k most similar items on a normalized matrix.

    The rating matrix is normalized row by row, an item-to-item similarity
    matrix is built from it, and for every (item, user) record the score is
    derived from the user's normalized ratings of the item's nearest
    neighbours.  Predictions are handed to ``write`` and the elapsed time and
    the RMSE against ``golden()`` are printed.

    Args:
        path: Forwarded to ``extract_data``.  Each record it yields is indexed
            as ``record[0]`` (item id) and ``record[1]`` (user id).
        rating: ``'mean'`` averages the neighbours' ratings; ``'weighted'``
            weights them by their similarity to the query item.
        method: ``'dot'`` or ``'cos'``; selects ``dot_sim`` or ``cos_sim``.
        k: Number of neighbours to use.

    Raises:
        ValueError: If ``method`` or ``rating`` is not one of the supported
            values.  Previously an unknown ``method`` surfaced as an index
            error on an empty list and an unknown ``rating`` silently produced
            all-zero predictions.
    """
    # Fail fast, before any data is loaded or any file is written.
    if method not in ('dot', 'cos'):
        raise ValueError(
            "method must be 'dot' or 'cos', got {!r}".format(method))
    if rating not in ('mean', 'weighted'):
        raise ValueError(
            "rating must be 'mean' or 'weighted', got {!r}".format(rating))

    start = time.time()
    name = 'pcc_item'
    data = extract_data(path)
    # NOTE(review): the argument 3 presumably matches the "+ 3" added back to
    # every score below (ratings shifted around 3) -- confirm in get_matrix.
    mtx = get_matrix(3).toarray()

    # Columns without a single non-zero entry are filled with a tiny value
    # (the original author's guard against dividing by zero).
    zero = np.where(~mtx.any(axis=0))[0]
    mtx[:, [zero]] = 0.00001

    # Normalize: subtract each row's sum, divide by the number of rows, then
    # divide each row by its L2 norm.
    # NOTE(review): a Pearson-style centering would normally subtract the row
    # *mean*; here the row *sum* is subtracted -- left unchanged, confirm.
    pcc = (mtx.T - np.sum(mtx, axis=1)) / len(mtx)
    pcc /= np.linalg.norm(mtx, axis=1).T
    mtx = pcc.T

    if method == 'dot':
        item_mtx = dot_sim(mtx, name)
    else:
        # Every row is multiplied by its own L2 norm before cos_sim.
        # NOTE(review): whether cos_sim expects this scaling is not visible
        # from here.
        inputs = (mtx.T * np.linalg.norm(mtx, axis=1)).T
        item_mtx = cos_sim(inputs, name)

    # KNN
    result = []
    for record in data:
        item_id = record[0]
        user_id = record[1]
        similarities = item_mtx[item_id]
        # Take k + 1 candidates in descending similarity so that one can be
        # discarded: the query item itself when present, else the weakest.
        knn = np.argsort(similarities, kind='heapsort')[::-1][0:k + 1]
        if item_id in knn:
            knn = knn[knn != item_id]
        else:
            knn = knn[:-1]

        user_ratings = mtx[:, user_id]
        neighbour_ratings = np.take(user_ratings, knn.tolist())
        if rating == 'mean':
            score = np.sum(neighbour_ratings) / float(k) + 3
        else:
            knn_sim = similarities[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:  # prevent zero-divide
                weight = knn_sim / sim_total
                score = np.sum(np.multiply(neighbour_ratings, weight)) + 3
            else:
                # No usable weights: fall back to the user's ratings summed
                # and divided by the count of non-zero entries.
                score = np.sum(user_ratings) / np.size(
                    np.nonzero(user_ratings)) + 3
        result.append(score)

    write(result, name, rating, method, k)
    print('item_rating_pred {} {} {} time : {}'.format(method, rating, k,
                                                       time.time() - start))
    gold = golden()
    # Convert explicitly: subtracting from a plain list only works when
    # golden() happens to return an ndarray.
    errors = np.asarray(result) - np.asarray(gold)
    print("RMSE :", np.sqrt(np.mean(np.square(errors))))
def user_rating_pred(path, rating, method, k):
    """Predict ratings from the k most similar users.

    A user-to-user similarity matrix is built from the rating matrix, and for
    every (item, user) record the score is derived from the ratings that the
    user's nearest neighbours gave to that item.  Predictions are handed to
    ``write`` and the elapsed time and the RMSE against ``golden()`` are
    printed.

    Args:
        path: Forwarded to ``extract_data``.  Each record it yields is indexed
            as ``record[0]`` (item id) and ``record[1]`` (user id).
        rating: ``'mean'`` averages the neighbours' ratings; ``'weighted'``
            weights them by their similarity to the query user.
        method: ``'dot'`` or ``'cos'``; selects ``dot_sim`` or ``cos_sim``.
        k: Number of neighbours to use.

    Raises:
        ValueError: If ``method`` or ``rating`` is not one of the supported
            values.  Previously an unknown ``method`` surfaced as an index
            error on an empty list and an unknown ``rating`` silently produced
            all-zero predictions.
    """
    # Fail fast, before any data is loaded or any file is written.
    if method not in ('dot', 'cos'):
        raise ValueError(
            "method must be 'dot' or 'cos', got {!r}".format(method))
    if rating not in ('mean', 'weighted'):
        raise ValueError(
            "rating must be 'mean' or 'weighted', got {!r}".format(rating))

    start = time.time()
    name = 'user'
    data = extract_data(path)
    # NOTE(review): the argument 3 presumably matches the "+ 3" added back to
    # every score below (ratings shifted around 3) -- confirm in get_matrix.
    mtx = get_matrix(3).toarray()

    # Columns without a single non-zero entry are filled with a tiny value
    # (the original author's guard against dividing by zero).
    zero = np.where(~mtx.any(axis=0))[0]
    mtx[:, [zero]] = 0.00001

    if method == 'dot':
        user_mtx = dot_sim(mtx, name)
    else:
        # Every column is multiplied by its own L2 norm before cos_sim.
        # NOTE(review): the original comment calls this "normalize", but it
        # multiplies rather than divides; whether cos_sim compensates is not
        # visible from here.
        inputs = np.linalg.norm(mtx, axis=0) * mtx
        user_mtx = cos_sim(inputs, name)

    result = []
    for record in data:
        mv_id = record[0]
        user_id = record[1]
        similarities = user_mtx[user_id]
        # Take k + 1 candidates in descending similarity so that one can be
        # discarded: the query user itself when present, else the weakest.
        knn = np.argsort(similarities, kind='heapsort')[::-1][0:k + 1]
        if user_id in knn:
            knn = knn[knn != user_id]
        else:
            knn = knn[:-1]

        item_ratings = mtx[mv_id, :]
        neighbour_ratings = np.take(item_ratings, knn.tolist())
        if rating == 'mean':
            score = (np.sum(neighbour_ratings) / float(k)) + 3
        else:
            knn_sim = similarities[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:  # prevent zero-divide
                weight = knn_sim / sim_total
                score = np.sum(np.multiply(neighbour_ratings, weight)) + 3
            else:
                # No usable weights: fall back to the item's ratings summed
                # and divided by the count of non-zero entries.
                score = np.sum(item_ratings) / np.size(
                    np.nonzero(item_ratings)) + 3
        result.append(score)

    write(result, name, rating, method, k)
    print('user_rating_pred {} {} {} time : {}'.format(method, rating, k,
                                                       time.time() - start))
    gold = golden()
    # Convert explicitly: subtracting from a plain list only works when
    # golden() happens to return an ndarray.
    errors = np.asarray(result) - np.asarray(gold)
    print("RMSE :", np.sqrt(np.mean(np.square(errors))))
Exemple #3
0
def _build_option_parser():
    """Create the optparse parser that declares every command-line flag."""
    from optparse import OptionParser

    usage = "usage: %prog [options] -i [INPUT_MAF_FILE_FOLDER] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=2,
        default=None,
        help=
        "Enter mutation maf files path,if cancer vs normal the second arg is None."
    )
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder.")
    # optional flags; numeric ones declare a type so that a malformed value is
    # rejected while parsing instead of after the long-running stages
    parser.add_option("-f",
                      "--fdr",
                      dest="fdr",
                      nargs=1,
                      type="float",
                      default=0.05,
                      help="FDR cut off")
    parser.add_option(
        "-s",
        "--step",
        dest="step",
        nargs=1,
        type="int",
        default=1,
        help=
        "The maximum number of interval genes allowed between Sub-pathway genes."
    )
    parser.add_option("-c",
                      "--minsize",
                      dest="minsize",
                      nargs=1,
                      type="int",
                      default=3,
                      help="Sub-pathway minimum number of nodes.")
    parser.add_option(
        "-p",
        "--pathway",
        dest="pathway",
        nargs=1,
        default=None,
        help="KEGG human normal pathway filename(absolute path).")
    parser.add_option(
        "-g",
        "--gene",
        dest="gene",
        nargs=1,
        default=None,
        help=
        "gene information file, gene id and gene symbol,default is NCBI human_gene_info."
    )
    parser.add_option("-n",
                      "--nperm",
                      dest="nperm",
                      nargs=1,
                      type="int",
                      default=1000,
                      help="random times")
    parser.add_option(
        "-m",
        "--symbol",
        dest="symbol",
        nargs=1,
        type="int",
        default=0,
        help="input 1:mutation maf file geneid is 0 but have symbol.")
    parser.add_option(
        "-a",
        "--sub",
        dest="sub",
        nargs=1,
        default=None,
        help=
        "Determine whether subpath extraction is performed separately.File absolute path."
    )
    return parser


def main():
    """Command-line entry point for the pathway / sub-pathway analysis.

    Parses the flags, loads pathway, mutation and gene information, then runs
    one of two pipelines depending on how many mutation inputs were given:

    * one input: ``myalgorithm.RSMP`` followed by ``SubPathway.HighCoverSub``;
    * otherwise: ``myalgorithm.TSDP`` followed by ``SubPathway.DistinctSub``.

    When ``--sub`` is given, the first stage is skipped and the significant
    pathways are read from that Excel file instead.  Results are written as
    ``.xlsx`` files into the output folder.

    Exits via ``SystemExit`` when ``-i`` or ``-o`` is missing, when a numeric
    flag cannot be parsed, or when both ``-i`` values are ``'None'``.
    """
    import sys

    parser = _build_option_parser()
    # RETRIEVING FLAGS
    options, _ = parser.parse_args()

    if not options.input or not options.out:
        parser.print_help()
        # sys.exit instead of the site-provided exit(); status is unchanged.
        sys.exit()

    # The literal string 'None' is the placeholder for an absent MAF path.
    maf_paths = [p for p in options.input if p != 'None']
    if not maf_paths:
        # Without this check the run would fail much later with an IndexError.
        parser.error("-i needs at least one mutation maf path that is not 'None'")

    # NOTE(review): per the original comment this creates the output folder if
    # it does not exist -- confirm in myutils.folderparser.
    outfolder = myutils.folderparser(options.out)

    # get pathway information
    if not options.pathway:
        pathway_info = myselect.select_normal_pathway_gene()
    else:
        pathway_info = myutils.getpathway(options.pathway)

    # get sample mutation information
    mut = [
        myselect.select_mutation_gene(maf_path, options.symbol)
        for maf_path in maf_paths
    ]

    # get gene information
    if not options.gene:
        gene_info = myutils.get_gene_info()
    else:
        gene_info = myutils.get_gene_info(options.gene)

    # run
    gid = myutils.getgid()
    hsa = myselect.select_human_pathway().set_index('pathway name')
    begin = """
    **********************************************************************
    *                               BEGIN                                *
    **********************************************************************
    """
    end = """
    **********************************************************************
    *                                END                                 *
    **********************************************************************
    """
    sig_pathway_file = os.path.join(outfolder, 'sig_pathway.xlsx')

    if len(mut) == 1:
        # ---------------------------------------------------------------
        # high-coverage pathways
        # ---------------------------------------------------------------
        if not options.sub:
            stage_start = time.time()
            print('-' * 10, 'Non-Random Mutation-High-Cover Pathway', '-' * 10)
            print(begin)
            sig_pathway = myalgorithm.RSMP(mut[0], pathway_info, gid,
                                           options.nperm, options.fdr)
            myutils.write(sig_pathway, sig_pathway_file)
            print('Spend %.2f minute!' % ((time.time() - stage_start) / 60))
            print(end)
        else:
            sig_pathway = pd.read_excel(options.sub)
        # ---------------------------------------------------------------
        # common sub-pathways
        # ---------------------------------------------------------------
        stage_start = time.time()
        print('-' * 10, 'Mutation-High-Cover Sub-Pathway', '-' * 10)
        print(begin)
        Sub_Pathway = SubPathway.HighCoverSub(mut[0], gene_info, sig_pathway,
                                              hsa, outfolder, options.step,
                                              options.minsize)
        myutils.write(Sub_Pathway, os.path.join(outfolder, 'Sub_Pathway.xlsx'))
        print('Spend %.2f minute' % ((time.time() - stage_start) / 60))
        print(end)
    else:
        # ---------------------------------------------------------------
        # subtype-specific pathways
        # ---------------------------------------------------------------
        if not options.sub:
            stage_start = time.time()
            print('-' * 15, 'Subtype Specificity Pathway', '-' * 15)
            print(begin)
            sig_pathway = myalgorithm.TSDP(mut[0],
                                           mut[1],
                                           pathway_info,
                                           cut_off=options.fdr)
            myutils.write(sig_pathway, sig_pathway_file)
            print('Spend %.2f minute!' % ((time.time() - stage_start) / 60))
            print(end)
        else:
            sig_pathway = pd.read_excel(options.sub)
        # ---------------------------------------------------------------
        # subtype-specific sub-pathways
        # ---------------------------------------------------------------
        stage_start = time.time()
        print('-' * 15, 'Subtype Specificity Sub-Pathway', '-' * 15)
        print(begin)
        Specific_Sub_Pathway = SubPathway.DistinctSub(mut, gene_info,
                                                      sig_pathway, hsa,
                                                      outfolder, options.step,
                                                      options.minsize)
        myutils.write(Specific_Sub_Pathway,
                      os.path.join(outfolder, 'Specific_Sub_Pathway.xlsx'))
        print('Spend %.2f minute' % ((time.time() - stage_start) / 60))
        print(end)