Example #1
def main(args):
    word_embeddings = p.load(open(args.word_embeddings, 'rb'))
    word_embeddings = np.array(word_embeddings)
    word2index = p.load(open(args.vocab, 'rb'))

    data = read_data(args.contexts, args.questions, args.ids, args.labels,
                     args.max_post_len, args.max_ques_len)

    N = len(data)
    train_data = data[:int(0.8 * N)]
    test_data = data[int(0.8 * N):]

    print('No. of train_data %d' % len(train_data))
    print('No. of test_data %d' % len(test_data))

    ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs = \
        preprocess_data(train_data, word2index, args.max_post_len, args.max_ques_len)

    q_train_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs

    ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs = \
        preprocess_data(test_data, word2index, args.max_post_len, args.max_ques_len)

    q_test_data = ids_seqs, post_seqs, post_lens, ques_seqs, ques_lens, lab_seqs

    run_classifier(q_train_data,
                   q_test_data,
                   word_embeddings,
                   args,
                   n_layers=2)
Example #3
def eval():
    #
    test_dir = './eval/'

    checkpoint_dir = './logs'
    y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y')
    z = tf.placeholder(tf.float32, [None, 100], name='z')

    G = generator(z, y)
    data_x, data_y = read_data()
    sample_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100))
    sample_labels = data_y[120:184]

    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)

    #saver
    saver = tf.train.Saver(tf.all_variables())

    os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.InteractiveSession(config=config)

    if ckpt and ckpt.model_checkpoint_path:
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name))

    test_sess = sess.run(G, feed_dict={z: sample_z, y: sample_labels})

    save_image(test_sess, [8, 8], test_dir + 'test_%d.png' % 500)

    sess.close()
Example #4
def review_list_view(request):
    business_url = ''

    if request.method == "GET":
        business_url = request.GET['url']

        [review, user, business] = read_data()

        print(type(review))
        print(type(user))
        print(type(business))

        return JsonResponse({
            'review': list(review),
            'user': list(user),
            'business': list(business),
        })

        # NOTE: the early return above exits the view, so the code below is unreachable,
        # and 'status' is not defined in this function.
        context = {
            'business': {
                'url': business_url,
            },
            'result': [],
            'unrecommend': []
        }

        # reviews
        if status == 'finished':
            return render(request, "review_list.html", context)

        return render(request, "review_list.html", context)
Example #5
def main():
    # 1.import data
    dataset = 'WBCD.csv'
    Data = read_data(dataset)

    # 2.Fill with median
    Data = fillMed(Data)

    # analyze by describing the data
    #print(Data.columns.values)

    # preview the data
    #print(Data.head())
    #print(Data.tail())

    #print(Data.describe())

    #print(Data.describe(include=['O']))

    for i in range(1, (Data.shape[1] - 1)):
        #print(Data[[names[i], names[-1]]].groupby([names[i]], \
        #	as_index=False).mean().sort_values(by=names[-1], ascending=False))
        g = sns.FacetGrid(Data, col=names[-1])
        g.map(plt.hist, names[i], bins=20)
        if SAVE == True:
            plt.savefig(directory + '/' + names[i] + '_' + names[-1] + '.png')
        plt.close()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(Data.corr(), vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(0, Data.shape[1], 1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(names, fontsize=6)
    ax.set_yticklabels(names)
    if SAVE == True:
        plt.savefig(directory + '/' + 'CorrMat.png')
    plt.close()

    corrArray = Data.corr().values[-1, :]  # .values is a property, not a method
    corrFrame = pd.DataFrame(data=corrArray,
                             index=names,
                             columns=['Correlation'])
    corrFrame_sorted = corrFrame.sort_values(['Correlation'], ascending=False)
    #print(corrFrame_sorted)

    indexBar = range(len(names) - 2)
    plt.bar(indexBar, (corrFrame_sorted.values)[1:-1], align='center')
    plt.xlabel('Features', fontsize=16)
    plt.ylabel('Correlation of tumor being malignant', fontsize=16)
    plt.xticks(indexBar, (corrFrame_sorted.index.values)[1:-1],
               fontsize=6,
               rotation=0)
    plt.ylim((0, 1))
    if SAVE == True:
        plt.savefig(directory + '/' + 'CorrBar.png')
    plt.close()
Example #6
def hoeffding(erisk, delta):
    # Get the whole dataset to find m
    (xvals, yvals) = read_data(1)
    m = xvals.shape[0]
    # Calculate the Hoeffding's Confidence Interval with empirical risk of algorithms
    lower = erisk - sqrt(log(2/delta)/(2*m))
    upper = erisk + sqrt(log(2/delta)/(2*m))

    return (lower,upper)
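The interval above is erisk ± sqrt(ln(2/delta)/(2m)); read_data(1) is called only to recover the sample size m. A minimal self-contained version with m passed in directly (function name and defaults are illustrative, not from the original code):

from math import log, sqrt

def hoeffding_interval(erisk, m, delta=0.05):
    # With probability at least 1 - delta, the true risk lies within
    # erisk +/- sqrt(ln(2/delta) / (2*m)).
    half_width = sqrt(log(2 / delta) / (2 * m))
    return erisk - half_width, erisk + half_width

# e.g. an empirical risk of 0.12 measured on 1000 samples:
print(hoeffding_interval(0.12, m=1000, delta=0.05))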
Example #7
def logit(train_x, train_y, test_x, test_y):
    print("\nLogistic Regression Outputs")
    print("========================================================")
    # Create the logistic regression classifier
    clf = LogisticRegression(class_weight={-1: 1.5, 1: 1})

    # Fitting train data to train the model
    model = clf.fit(train_x, train_y)

    # Use test data to test the accuracy of our model
    score = model.predict(test_x)
    testerror = 1 - model.score(test_x, test_y)
    trainerror = 1 - model.score(train_x, train_y)
    print('Training Error: %2.3f' % trainerror)
    print('Testing Error: %2.3f' % testerror)
    # Create a table to visualize our accuracy and scores
    print("\nScore matrix: ")
    print(confusion_matrix(test_y, score))

    # Create report to show precision recall f-score and support
    print("Classification Report:")
    print(classification_report(test_y, score))

    # Try k-fold cross validation on 10 folds
    (xvals, yvals) = read_data(1)

    score_accuracy = cross_validate(clf,
                                    xvals,
                                    yvals,
                                    cv=10,
                                    scoring='accuracy',
                                    return_train_score=True)
    #score_auc = cross_val_score(clf,xvals,yvals,cv=10,scoring='roc_auc')
    print('K-fold cross validation results:')
    print("Average train accuracy is %2.3f" %
          score_accuracy['train_score'].mean())
    print("Average test accuracy is %2.3f \n " %
          score_accuracy['test_score'].mean())

    # Hoeffding's
    CI_L, CI_U = hoeffding(testerror, 0.05)
    print("Hoeffding's Confidence interval for Logistic Regression is:")
    print(CI_L, CI_U)

    # For k-folds
    kerisk = 1 - score_accuracy['test_score'].mean()
    kCI_L, kCI_U = hoeffding(kerisk, 0.05)
    print("\nHoeffding's Confidence interval for LR after k-folds is:")
    print(kCI_L, kCI_U)

    return (CI_L, CI_U)
Example #8
def test_model(sess, model, label):
    if label == 't':
        user_ratings = sess.run(model.all_ratings_t)
        DIR = '../dataset/' + DATASET_T + '/'
    else:
        user_ratings = sess.run(model.all_ratings_s)
        DIR = '../dataset/' + DATASET_S + '/'
    train_path = DIR + 'train_data.json'
    test_path = DIR + 'test_data.json'
    validation_path = DIR + 'validation_data.json'
    ## load data
    train_data = read_data(train_path, 1)[0]
    test_or_vali_path = validation_path if TEST_VALIDATION == 'Validation' else test_path
    test_data = read_data(test_or_vali_path, 1)[0]
    result = []
    for u in range(len(user_ratings)):
        if len(test_data[u]) > 0:
            result.append(
                test_one_user([user_ratings[u], train_data[u], test_data[u]]))
    result = np.array(result)
    F1, NDCG = np.mean(np.array(result), axis=0)
    return F1, NDCG
Example #9
def train(path, path_out, lr, epochs):
    """
    Create and train a bidirectional LSTM RNN.
        :param path: path to files
        :param lr: learning rate
        :param epochs: number of epochs
    """
    # create dictionaries
    w2i, i2w, t2i, i2t, l2i, i2l = create_dictionaries(path)
                        
    # create model
    model = LSTM_RNN(w2i, i2w, t2i, i2t, l2i, i2l)
    
    # make data ready for usage
    train_input, golden_scores, golden_labels, data_dim = read_data(path, w2i, i2w, t2i, i2t, l2i, i2l)
    
    # define loss function: cross entropy loss
    cross_entropy_loss = nn.CrossEntropyLoss()

    # define optimize method: stochastic gradient descent 
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):

        # sentence and target are matrix
        for j in range(data_dim):
            
            # zero gradient buffers
            optimizer.zero_grad()

            # clear hidden
            model.hidden = model.init_hidden()

            # find output of network
            scores, labels = model(train_input[j])
            
            # calculate the loss
            score_loss = cross_entropy_loss(torch.transpose(scores, 0, 1), golden_scores[j])
            label_loss = cross_entropy_loss(labels, golden_labels[j])
            total_loss = score_loss + label_loss
            
            # backpropagate the error
            total_loss.backward()

            # update the weights
            optimizer.step()
            
    torch.save(model, path_out)
    
    return model
Example #10
    def __init__(self):
        self.input_choses = {
            "1": self.plot_band,
            "2": self.plot_dos,
            "3": self.run,
        }

        self.framework_choises = {
            "1": self.vasp,
            "2": self.qespresso,
            "0": self.quit,
            "4": self.input_data,
            "5": self.automatic
        }
        self.test_data = read_data()
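The two dicts above implement a menu-dispatch pattern: a user's choice string is looked up and the bound method is called. A minimal standalone sketch of the same pattern (class and method names are illustrative, not from the original):

class Menu:
    def __init__(self):
        # map menu choices to bound methods
        self.input_choices = {
            "1": self.plot_band,
            "2": self.plot_dos,
        }

    def plot_band(self):
        print("plotting band structure")

    def plot_dos(self):
        print("plotting density of states")

    def dispatch(self, choice):
        # unknown choices fall back to a no-op message
        action = self.input_choices.get(choice, lambda: print("unknown choice"))
        action()

Menu().dispatch("1")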
Example #11
def prepare_data(post_data_tsv, qa_data_tsv, train_ids_file, test_ids_file, sim_ques_filename):
    p_input_data, q_data, train_triples, test_triples = read_data(post_data_tsv, qa_data_tsv, train_ids_file, test_ids_file, sim_ques_filename)

    triples = train_triples + test_triples
    print("Indexing words...")
    for triple in triples:
        p_input_data.index_words(triple[0])
        for q in triple[1]:
            q_data.index_words(q)
        q_data.index_words(triple[2])

    print('Indexed %d words in post input, %d words in ques' % (p_input_data.n_words, q_data.n_words))

    p_input_data.trim_using_tfidf()
    q_data.trim(MIN_COUNT)

    return p_input_data, q_data, train_triples, test_triples
Example #12
def main(args):
    print('Enter main')
    word_embeddings = p.load(open(args.word_embeddings, 'rb'))
    print('Loaded emb of size %d' % len(word_embeddings))
    word_embeddings = np.array(word_embeddings)
    word2index = p.load(open(args.vocab, 'rb'))
    index2word = reverse_dict(word2index)
    index2kwd, kwd2index, index2cnt = read_kwd_vocab(args.kwd_vocab)
    test_data = read_data(args.test_context,
                          args.test_question,
                          args.test_ids,
                          args.max_post_len,
                          args.max_ques_len,
                          mode='test')

    print('No. of test_data %d' % len(test_data))
    if args.eval_kwd:
        run_eval_kwd(test_data, word_embeddings, word2index, index2word,
                     kwd2index, index2kwd, args)
    else:
        run_model(test_data, word_embeddings, word2index, index2word,
                  kwd2index, index2kwd, args)
Example #13
def eval():
    # directory for the generated test images
    test_dir = '/home/jackie/workspace/gan/dcgan/eval/'
    # load the model from here
    checkpoint_dir = '/home/jackie/workspace/gan/dcgan/logs/'

    y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y')
    z = tf.placeholder(tf.float32, [None, 100], name='z')
    
    G = generator(z, y)    
    data_x, data_y = read_data()
    sample_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100))
    sample_labels = data_y[120: 184]
    
    # reading the checkpoint requires a sess and a saver
    print("Reading checkpoints...")
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    
    # saver
    saver = tf.train.Saver(tf.all_variables())
    
    # sess
    os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.InteractiveSession(config=config)
    
    # restore variables from the saved model
    if ckpt and ckpt.model_checkpoint_path:
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)        
        saver.restore(sess, os.path.join(checkpoint_dir, ckpt_name))
    
    # run the generator with the restored variables
    test_sess = sess.run(G, feed_dict={z: sample_z, y: sample_labels})
    
    # save the generated test images to the target folder
    save_images(test_sess, [8, 8], test_dir + 'test_%d.png' % 500)
    
    sess.close()
Example #14
def build_alf_model(filename, tag='', pool_type='multiprocessing'):
    """
    - based on alf.f90
    - `https://github.com/cconroy20/alf/blob/master/src/alf.f90`
    Master program to fit the absorption line spectrum, or indices,
    #  of a quiescent (>1 Gyr) stellar population
    # Some important points to keep in mind:
    # 1. The prior bounds on the parameters are specified in set_pinit_priors.
    #    Always make sure that the output parameters are not hitting a prior.
    # 2. Make sure that the chain is converged in all relevant parameters
    #    by plotting the chain trace (parameter vs. chain step).
    # 3. Do not use this code blindly.  Fitting spectra is a
    #    subtle art and the code can easily fool you if you don't know
    #    what you're doing.  Make sure you understand *why* the code is
    #    settling on a particular parameter value.
    # 4. Wavelength-dependent instrumental broadening is included but
    #    will not be accurate in the limit of modest-large redshift b/c
    #    this is implemented in the model restframe at code setup time
    # 5. The code can fit for the atmospheric transmission function but
    #    this will only work if the input data are in the original
    #    observed frame; i.e., not de-redshifted.
    # 6. I've found that Nwalkers=1024 and Nburn=~10,000 seems to
    #    generically yield well-converged solutions, but you should test
    #    this yourself by fitting mock data generated with write_a_model
    # To Do: let the Fe-peak elements track Fe in simple mode
    """
    ALFPY_HOME = os.environ['ALFPY_HOME']
    for ifolder in [
            'alfvar_models', 'results_emcee', 'results_dynesty', 'subjobs'
    ]:
        if not os.path.exists(ALFPY_HOME + ifolder):
            os.makedirs(ALFPY_HOME + ifolder)

    pickle_model_name = '{0}alfvar_models/alfvar_model_{1}_{2}.p'.format(
        ALFPY_HOME, filename, tag)
    print('We will create one and pickle dump it to \n' + pickle_model_name)
    alfvar = ALFVAR()

    global use_keys
    use_keys = [k for k, (v1, v2) in tofit_params.items() if v1 == True]

    #---------------------------------------------------------------!
    #---------------------------Setup-------------------------------!
    #---------------------------------------------------------------!
    # ---- flag specifying if fitting indices or spectra
    alfvar.fit_indices = 0  #flag specifying if fitting indices or spectra

    # ---- flag determining the level of complexity
    # ---- 0=full, 1=simple, 2=super-simple.  See sfvars for details
    alfvar.fit_type = 0  # do not change; use use_keys to specify parameters

    # ---- fit h3 and h4 parameters
    alfvar.fit_hermite = 0

    # ---- type of IMF to fit
    # ---- 0=1PL, 1=2PL, 2=1PL+cutoff, 3=2PL+cutoff, 4=non-parametric IMF
    alfvar.imf_type = 3

    # ---- are the data in the original observed frame?
    alfvar.observed_frame = 0
    alfvar.mwimf = 0  #force a MW (Kroupa) IMF

    if alfvar.mwimf:
        alfvar.imf_type = 1

    # ---- fit two-age SFH or not?  (only considered if fit_type=0)
    alfvar.fit_two_ages = 1

    # ---- IMF slope within the non-parametric IMF bins
    # ---- 0 = flat, 1 = Kroupa, 2 = Salpeter
    alfvar.nonpimf_alpha = 2

    # ---- turn on/off the use of an external tabulated M/L prior
    alfvar.extmlpr = 0

    # ---- set initial params, step sizes, and prior ranges
    _, prlo, prhi = set_pinit_priors(alfvar.imf_type)

    # ---- change the prior limits to kill off these parameters
    prhi.logm7g = -5.0
    prhi.teff = 2.0
    prlo.teff = -2.0

    # ---- mass of the young component should always be sub-dominant
    prhi.logfy = -0.5

    # ---------------------------------------------------------------!
    # --------------Do not change things below this line-------------!
    # ---------------unless you know what you are doing--------------!
    # ---------------------------------------------------------------!

    # ---- regularize non-parametric IMF (always do this)
    alfvar.nonpimf_regularize = 1

    # ---- dont fit transmission function in cases where the input
    # ---- spectrum has already been de-redshifted to ~0.0
    if alfvar.observed_frame == 0 or alfvar.fit_indices == 1:
        alfvar.fit_trans = 0
        prhi.logtrans = -5.0
        prhi.logsky = -5.0
    else:
        alfvar.fit_trans = 1

    # ---- extra smoothing to the transmission spectrum.
    # ---- if the input data has been smoothed by a gaussian
    # ---- in velocity space, set the parameter below to that extra smoothing
    alfvar.smooth_trans = 0.0

    if (alfvar.ssp_type == 'cvd'):
        # ---- always limit the [Z/H] range for CvD since
        # ---- these models are actually only at Zsol
        prhi.zh = 0.01
        prlo.zh = -0.01
        if (alfvar.imf_type > 1):
            print('ALF ERROR, ssp_type=cvd but imf>1')

    if alfvar.fit_type in [1, 2]:
        alfvar.mwimf = 1

    #---------------------------------------------------------------!

    if filename is None:
        print('ALF ERROR: You need to specify an input file')
        teminput = input("Name of the input file: ")
        if len(teminput.split(' ')) == 1:
            filename = teminput
        elif len(teminput.split(' ')) > 1:
            filename = teminput[0]
            tag = teminput[1]

    # ---- write some important variables to screen
    print(" ************************************")
    if alfvar.fit_indices == 1:
        print(" ***********Index Fitter*************")
    else:
        print(" **********Spectral Fitter***********")
    print(" ************************************")
    print("   ssp_type  =", alfvar.ssp_type)
    print("   fit_type  =", alfvar.fit_type)
    print("   imf_type  =", alfvar.imf_type)
    print(" fit_hermite =", alfvar.fit_hermite)
    print("fit_two_ages =", alfvar.fit_two_ages)
    if alfvar.imf_type == 4:
        print("   nonpimf   =", alfvar.nonpimf_alpha)
    print("  obs_frame  =", alfvar.observed_frame)
    print("      mwimf  =", alfvar.mwimf)
    print("  age-dep Rf =", alfvar.use_age_dep_resp_fcns)
    print("    Z-dep Rf =", alfvar.use_z_dep_resp_fcns)
    #print("  Ncores     = ",  ntasks)
    print("  filename   = ", filename, ' ', tag)
    print(" ************************************")
    #print('\n\nStart Time ',datetime.now())

    #---------------------------------------------------------------!

    # ---- read in the data and wavelength boundaries
    alfvar.filename = filename
    alfvar.tag = tag

    if alfvar.fit_indices == 0:
        alfvar = read_data(alfvar)
        # ---- read in the SSPs and bandpass filters
        # ------- setting up model array with given imf_type ---- #

        if pool_type == 'multiprocessing':
            from multiprocessing import Pool as to_use_pool
        else:
            from schwimmbad import MPIPool as to_use_pool

        pool = to_use_pool()
        if pool_type == 'mpi':
            print('pool size', pool.size)
            if not pool.is_master():
                pool.wait()
                sys.exit(0)

        print('\nsetting up model array with given imf_type and input data\n')
        tstart = time.time()
        alfvar = setup(alfvar, onlybasic=False, pool=pool)
        ndur = time.time() - tstart
        print('\n Total time for setup {:.2f}min'.format(ndur / 60))

        ## ---- This part requires alfvar.sspgrid.lam ---- ##
        lam = np.copy(alfvar.sspgrid.lam)
        # ---- interpolate the sky emission model onto the observed wavelength grid
        # ---- moved to read_data
        if alfvar.observed_frame == 1:
            alfvar.data.sky = linterp(alfvar.lsky, alfvar.fsky,
                                      alfvar.data.lam)
            alfvar.data.sky[alfvar.data.sky < 0] = 0.
        else:
            alfvar.data.sky[:] = tiny_number
        alfvar.data.sky[:] = tiny_number  # ?? why?

        # ---- we only compute things up to 500A beyond the input fit region
        alfvar.nl_fit = min(max(locate(lam, alfvar.l2[-1] + 500.0), 0),
                            alfvar.nl - 1)
        ## ---- define the log wavelength grid used in velbroad.f90
        alfvar.dlstep = (np.log(alfvar.sspgrid.lam[alfvar.nl_fit]) -
                         np.log(alfvar.sspgrid.lam[0])) / (alfvar.nl_fit + 1)

        for i in range(alfvar.nl_fit):
            alfvar.lnlam[i] = i * alfvar.dlstep + np.log(alfvar.sspgrid.lam[0])

    # ---- convert the structures into their equivalent arrays
    prloarr = str2arr(switch=1, instr=prlo)
    prhiarr = str2arr(switch=1, instr=prhi)

    # ---- this is the master process
    # ---- estimate velz ---- #
    print("  Fitting ", alfvar.nlint, " wavelength intervals")
    nlint = alfvar.nlint
    l1, l2 = alfvar.l1, alfvar.l2
    print('wavelength boundaries: ', l1, l2)
    if l2[-1] > np.nanmax(lam) or l1[0] < np.nanmin(lam):
        print('ERROR: wavelength boundaries exceed model wavelength grid')
        print(l2[nlint - 1], lam[alfvar.nl - 1], l1[0], lam[0])

    global global_alfvar, global_prloarr, global_prhiarr
    global_alfvar = copy.deepcopy(alfvar)
    global_prloarr = copy.deepcopy(prloarr)
    global_prhiarr = copy.deepcopy(prhiarr)
    # -------- optimize the first four parameters -------- #
    len_optimize = 4
    prior_bounds = list(
        zip(global_prloarr[:len_optimize], global_prhiarr[:len_optimize]))

    if not alfvar.observed_frame:  # '~' on a Python int is bitwise NOT and is always truthy here
        prior_bounds[0] = (-200, 200)
    optimize_res = differential_evolution(func_2min,
                                          bounds=prior_bounds,
                                          disp=True,
                                          polish=False,
                                          updating='deferred',
                                          workers=1)
    print('optimized parameters', optimize_res)

    # -------- getting priors for the sampler -------- #
    global global_all_prior  # ---- note it's for all parameters
    all_key_list = list(tofit_params.keys())
    # ---------------- update priors ----------------- #
    prrange = [10, 10, 0.1, 0.1]
    global_all_prior = [ClippedNormal(np.array(optimize_res.x)[i], prrange[i],
                                      global_prloarr[i],
                                      global_prhiarr[i]) for i in range(len_optimize)] + \
                       [TopHat(global_prloarr[i+len_optimize],
                               global_prhiarr[i+len_optimize]) for i in range(len(all_key_list)-len_optimize)]

    pickle.dump([alfvar, prloarr, prhiarr, global_all_prior, optimize_res.x],
                open(pickle_model_name, "wb"))
    pool.close()
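For orientation, the scipy differential_evolution calling pattern used above, shown on a self-contained toy objective (the function and bounds below are stand-ins, not alf's func_2min or its priors):

import numpy as np
from scipy.optimize import differential_evolution

def sphere(x):
    # toy objective standing in for func_2min
    return float(np.sum(np.asarray(x) ** 2))

# one wide bound (like the widened velz prior) plus three narrower ones
bounds = [(-200, 200), (-10, 10), (-10, 10), (-10, 10)]
res = differential_evolution(sphere, bounds=bounds, disp=False,
                             polish=False, updating='deferred', workers=1)
print(res.x, res.fun)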
Example #15
def train_model(para, path_excel):
    [_,MODEL,LR,LAMDA,LAYER,EMB_DIM,FREQUENCY_USER, FREQUENCY_ITEM,
     BATCH_SIZE, SAMPLE_RATE,IF_PRETRAIN,N_EPOCH,_,TOP_K,OPTIMIZATION] = para
    ## Paths of data
    train_path = DIR+'train_data.json'
    transformation_bases_path = DIR+'hypergraph_embeddings.json'                  # transformation bases for graph convolution
    pre_train_feature_path = DIR+'pre_train_feature'+str(EMB_DIM)+'.json'         # to pretrain latent factors for user-item interaction

    ## Load data
    # load training data
    [train_data, train_data_interaction, user_num, item_num] = read_data(train_path)
    # load pre-trained embeddings for all deep models
    try:
        pre_train_feature = read_bases(pre_train_feature_path, EMB_DIM, EMB_DIM)
    except:
        print('There is no pre-trained feature found!!')
        pre_train_feature = [0, 0]
        IF_PRETRAIN = 0
        
    # load pre-trained transform bases for LCFN
    if MODEL == 'LCFN': transformation_bases = read_bases(transformation_bases_path, FREQUENCY_USER, FREQUENCY_ITEM)

    ## Define the model
    if MODEL == 'BPR':
        model = model_BPR(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                          optimization=OPTIMIZATION)
    if MODEL == 'NCF':
        model = model_NCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                          optimization=OPTIMIZATION, pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'GCMC':
        model = model_GCMC(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, 
                           emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, 
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'NGCF':
        model = model_NGCF(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, 
                           emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION,
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'SCF':
        model = model_SCF(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, 
                          emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, 
                          pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'CGMC':
        model = model_CGMC(layer=LAYER, graph=train_data_interaction, n_users=user_num, n_items=item_num, 
                           emb_dim=EMB_DIM, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, 
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'LCFN':
        model = model_LCFN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, 
                           graph_embeddings=transformation_bases, lr=LR, lamda=LAMDA, optimization=OPTIMIZATION, 
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    ## Split the training samples into batches
    batches = list(range(0, len(train_data_interaction), BATCH_SIZE))
    batches.append(len(train_data_interaction))
    ## Training iteratively
    F1_max = 0
    F1_df = pd.DataFrame(columns=TOP_K)
    NDCG_df = pd.DataFrame(columns=TOP_K)
    for epoch in range(N_EPOCH):
        t1 = time.clock()
        for batch_num in range(len(batches)-1):
            train_batch_data = []
            for sample in range(batches[batch_num], batches[batch_num+1]):
                (user, pos_item) = train_data_interaction[sample]
                sample_num = 0
                while sample_num < SAMPLE_RATE:
                    neg_item = int(random.uniform(0, item_num))
                    if not (neg_item in train_data[user]):
                        sample_num += 1
                        train_batch_data.append([user, pos_item, neg_item])
            train_batch_data = np.array(train_batch_data)
            _, loss = sess.run([model.updates, model.loss],
                               feed_dict={model.users: train_batch_data[:,0],
                                          model.pos_items: train_batch_data[:,1],
                                          model.neg_items: train_batch_data[:,2]})

        # test the model each epoch
        F1, NDCG = test_model(sess, model)
        t2 = time.clock()
        F1_max = max(F1_max, F1[0])
        # print performance
        print_value([epoch + 1, loss, F1_max, F1, NDCG])
        # save performance
        F1_df.loc[epoch + 1] = F1
        NDCG_df.loc[epoch + 1] = NDCG
        save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)
        if not loss < 10**10:
            break
        
    del model, loss, _, sess
    gc.collect()
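The inner while-loop above draws SAMPLE_RATE negative items per observed (user, positive item) pair by rejecting items the user has already interacted with. A small standalone sketch of that sampling step (names are illustrative):

import random

def sample_negatives(user_items, item_num, sample_rate):
    # user_items: set of item ids the user has interacted with
    negatives = []
    while len(negatives) < sample_rate:
        candidate = random.randrange(item_num)
        if candidate not in user_items:
            negatives.append(candidate)
    return negatives

print(sample_negatives({1, 5, 7}, item_num=100, sample_rate=4))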
Example #16
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.metrics.functional import f1
from pytorch_lightning.metrics.functional.classification import accuracy, multiclass_roc, auc, confusion_matrix

from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertModel, DistilBertTokenizer, DistilBertModel
from transformers import AdamW, get_linear_schedule_with_warmup

import matplotlib.pyplot as plt
import seaborn as sn

import pandas as pd
import torch.nn as nn

from read_data import *

_, _, _, slope_df = read_data('boundary',
                              dir_path='data/',
                              existing_company_only=False,
                              sample=None)
_, _, _, test_slope_df = read_test_data('boundary',
                                        dir_path='data/',
                                        existing_company_only=False,
                                        sample=None)
slope_df = pd.concat([slope_df, test_slope_df])

company_embedding_df = pd.read_csv('data/company_embedding_centered.csv')


class BertPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
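For orientation, the Hugging Face BertPooler that this __init__ matches pools by running the first ([CLS]) token's hidden state through the dense layer and a tanh; a self-contained reference sketch (class name and sizes here are illustrative):

import torch
import torch.nn as nn

class ReferenceBertPooler(nn.Module):
    # mirrors transformers' BertPooler: dense + tanh on the first ([CLS]) token
    def __init__(self, hidden_size):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        first_token_tensor = hidden_states[:, 0]
        return self.activation(self.dense(first_token_tensor))

pooled = ReferenceBertPooler(8)(torch.randn(2, 4, 8))
print(pooled.shape)  # torch.Size([2, 8])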
Example #17
def evaluate_model(model,
                   tokenizer,
                   trainer,
                   existing_company_only,
                   dir_path='data/',
                   batch_size=16,
                   num_workers=4):

    (train_ids, X_train,
     y_train), (val_ids, X_val,
                y_val), (test_ids, X_test, y_test), _ = read_data(
                    'correct',
                    dir_path=dir_path,
                    existing_company_only=existing_company_only,
                    sample=None)
    correct_test_dataloader = tokenize_and_dataloader(X_test,
                                                      y_test,
                                                      tokenizer,
                                                      test_ids,
                                                      batch_size,
                                                      num_workers,
                                                      random=False)

    (train_ids, X_train,
     y_train), (val_ids, X_val,
                y_val), (test_ids, X_test, y_test), _ = read_data(
                    'inverse',
                    dir_path=dir_path,
                    existing_company_only=existing_company_only,
                    sample=None)
    inverse_test_dataloader = tokenize_and_dataloader(X_test,
                                                      y_test,
                                                      tokenizer,
                                                      test_ids,
                                                      batch_size,
                                                      num_workers,
                                                      random=False)

    (train_ids, X_train,
     y_train), (val_ids, X_val,
                y_val), (test_ids, X_test, y_test), _ = read_data(
                    'boundary',
                    dir_path=dir_path,
                    existing_company_only=existing_company_only,
                    sample=None)
    boundary_test_dataloader = tokenize_and_dataloader(X_test,
                                                       y_test,
                                                       tokenizer,
                                                       test_ids,
                                                       batch_size,
                                                       num_workers,
                                                       random=False)

    model.incorrect_type = 'correct'
    trainer.test(model, correct_test_dataloader)

    model.incorrect_type = 'inverse'
    trainer.test(model, inverse_test_dataloader)

    model.incorrect_type = 'boundary'
    trainer.test(model, boundary_test_dataloader)
Example #18
                                     mean_y=mean_y_irr,
                                     std_y=std_y_irr)
        file.write(f"Validation Set Error Irr-from-3: {error} \n")
        error = test_model_from_path(Xs,
                                     y_irr,
                                     "models/test3/Irr-from-3/",
                                     reg=1,
                                     mean_y=mean_y_irr,
                                     std_y=std_y_irr,
                                     Time=list(range(N)))
        file.write(f"Total Set Error Irr-from-3: {error} \n")


if __name__ == "__main__":

    G1_AS_good, G1_AS_positive_offset, G1_AS_negative_offset, G1_DES_good, G1_DES_positive_offset, G1_DES_negative_offset, G2_good, G2_positive_offset, G2_negative_offset = read_data(
    )

    data_list = [
        G1_AS_good[list(G1_AS_good.keys())[-1]],
        G1_AS_positive_offset[list(G1_AS_positive_offset.keys())[-1]],
        G1_AS_negative_offset[list(G1_AS_negative_offset.keys())[-1]],
        G1_DES_good[list(G1_DES_good.keys())[-1]],
        G1_DES_positive_offset[list(G1_DES_positive_offset.keys())[-1]],
        G1_DES_negative_offset[list(G1_DES_negative_offset.keys())[-1]],
        G2_good[list(G2_good.keys())[-1]],
        G2_positive_offset[list(G2_positive_offset.keys())[-1]],
        G2_negative_offset[list(G2_negative_offset.keys())[-1]]
    ]

    #study_plot(*merge_data(*data_list))
Example #19
def train_model(para, path_excel):
    [
        DATASET_T, DATASET_S, MODEL, LR_REC, LR_DOM_pos, LR_DOM_neg, LAMDA,
        LR_REC_s, LAMDA_s, LAYER, EMB_DIM, BATCH_SIZE, SAMPLE_RATE, N_EPOCH, _,
        TOP_K, OPTIMIZATION, IF_PRETRAIN
    ] = para

    ## paths of data
    train_path_t = '../dataset/' + DATASET_T + '/train_data.json'
    train_path_s = '../dataset/' + DATASET_S + '/train_data.json'
    pretrain_path_t = '../dataset/' + DATASET_T + '/latent_embeddings.json'
    pretrain_path_s = '../dataset/' + DATASET_S + '/latent_embeddings.json'
    review_path_t = '../dataset/' + DATASET_T + '/review_embeddings.json'
    review_path_s = '../dataset/' + DATASET_S + '/review_embeddings.json'

    ## load train data
    [train_data_t, train_data_interaction_t, user_num_t,
     item_num_t] = read_data(train_path_t, BATCH_SIZE)
    [train_data_s, train_data_interaction_s, user_num_s,
     item_num_s] = read_data(train_path_s, BATCH_SIZE)

    pretrain_s = read_bases(pretrain_path_s)
    review_s = read_bases(review_path_s)
    try:
        pretrain_t = read_bases(pretrain_path_t)
    except:
        print('\n There is no pre-trained feature found !! \n')
        pretrain_t = [0, 0]
        IF_PRETRAIN = 0
    review_t = read_bases(review_path_t)

    ## define the model
    model = model_TDAR(layer=LAYER,
                       n_users_t=user_num_t,
                       n_items_t=item_num_t,
                       n_users_s=user_num_s,
                       n_items_s=item_num_s,
                       emb_dim=EMB_DIM,
                       lr_rec=LR_REC,
                       lr_dom_pos=LR_DOM_pos,
                       lr_dom_neg=LR_DOM_neg,
                       lamda=LAMDA,
                       lr_rec_s=LR_REC_s,
                       lamda_s=LAMDA_s,
                       optimization=OPTIMIZATION,
                       pretrain_t=pretrain_t,
                       pretrain_s=pretrain_s,
                       review_embeddings_t=review_t,
                       review_embeddings_s=review_s,
                       if_pretrain=IF_PRETRAIN)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    ## split the training samples into batches
    batches = list(range(0, len(train_data_interaction_t), BATCH_SIZE))
    batches.append(len(train_data_interaction_t))
    ## training iteratively
    F1_max_t = 0
    F1_max_s = 0
    pre_i_max = 0
    pre_u_max = 0
    F1_df = pd.DataFrame(columns=TOP_K)
    NDCG_df = pd.DataFrame(columns=TOP_K)
    loss_rec_s = 0
    loss_rec_t = 0
    for epoch in range(N_EPOCH):
        rd.shuffle(train_data_interaction_t)
        rd.shuffle(train_data_interaction_s)
        for batch_num in range(len(batches) - 1):
            train_batch_data_t = []
            train_batch_data_s = []
            for sample_t in range(batches[batch_num], batches[batch_num + 1]):
                (user_t, pos_item_t) = train_data_interaction_t[sample_t]
                sample_s = random.randint(0, len(train_data_interaction_s) - 1)
                (user_s, pos_item_s) = train_data_interaction_s[sample_s]
                train_batch_data_t.append([user_t, pos_item_t, 1, 1])
                train_batch_data_s.append([user_s, pos_item_s, 1, 0])
                # add negatives to the target domain
                sample_num = 0
                while sample_num < SAMPLE_RATE[0]:
                    neg_item_t = int(random.uniform(0, item_num_t))
                    if not (neg_item_t in train_data_t[user_t]):
                        sample_num += 1
                        train_batch_data_t.append([user_t, neg_item_t, 0, 1])
                # add negatives to the source domain
                sample_num = 0
                while sample_num < SAMPLE_RATE[1]:
                    neg_item_s = int(random.uniform(0, item_num_s))
                    if not (neg_item_s in train_data_s[user_s]):
                        sample_num += 1
                        train_batch_data_s.append([user_s, neg_item_s, 0, 0])
            train_batch_data_t = np.array(train_batch_data_t)
            train_batch_data_s = np.array(train_batch_data_s)
            try:
                [
                    update1_t, update1_s, update2_u, update2_i, update3_u,
                    update3_i, loss_rec_t, loss_rec_s, loss_domain_u,
                    loss_domain_i
                ] = sess.run(
                    [
                        model.update1_t, model.update1_s, model.update2_u,
                        model.update2_i, model.update3_u, model.update3_i,
                        model.loss_rec_t, model.loss_rec_s,
                        model.loss_domain_u, model.loss_domain_i
                    ],
                    feed_dict={
                        model.users_t: train_batch_data_t[:, 0],
                        model.items_t: train_batch_data_t[:, 1],
                        model.rec_label_t: train_batch_data_t[:, 2],
                        model.domain_label_t: train_batch_data_t[:, 3],
                        model.users_s: train_batch_data_s[:, 0],
                        model.items_s: train_batch_data_s[:, 1],
                        model.rec_label_s: train_batch_data_s[:, 2],
                        model.domain_label_s: train_batch_data_s[:, 3]
                    })
            except:
                update, loss_rec = sess.run(
                    [model.updates, model.loss],
                    feed_dict={
                        model.users: train_batch_data_t[:, 0],
                        model.items: train_batch_data_t[:, 1],
                        model.labels: train_batch_data_t[:, 2]
                    })

        F1_t, NDCG_t = test_model(sess, model, 't')
        F1_max_t = max(F1_max_t, F1_t[0])
        F1_s, NDCG_s = test_model(sess, model, 's')
        F1_max_s = max(F1_max_s, F1_s[0])
        pre_u, pre_i = test_domain(sess, model)

        ## print performance
        print_value([
            epoch + 1, loss_rec_t, loss_rec_s, loss_domain_u, loss_domain_i,
            F1_max_t, F1_max_s, pre_u, pre_i, F1_t, NDCG_t, F1_s, NDCG_s
        ])

        ## save performance
        F1_df.loc[epoch + 1] = F1_t
        NDCG_df.loc[epoch + 1] = NDCG_t
        save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']],
                   path_excel,
                   first_sheet=False)
        if not (loss_rec_s + loss_rec_t) < 10**10:
            break
Example #20
            minEntries.append(entry)
        elif d == minDist:
            minEntries.append(entry)
    total = 0
    print "nearest neighbors : ", minEntries
    for entry in minEntries:
        total += mood_rating(entry.get_mood())
    return mood(ceil(total / len(minEntries)))
    


def dist(list1, list2):
    dist = 0
    total = 0
    same = 0
    for item in list1:
        if item not in list2:
            dist += 1
        else:
            same += 1
        total += 1
    dist += (len(list2) - same)
    total += len(list2)
    return dist/float(total)

entries = read_data(sys.argv[1])
inputs = read_data(sys.argv[2])
for item in inputs:
    print(item)
    print(nearest_neighbor(entries, item))
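For duplicate-free lists, dist() above equals one minus the Dice coefficient: the elements outside the intersection divided by the combined size. A small check of that equivalence (illustrative only):

def set_dist(a, b):
    a, b = set(a), set(b)
    # |A| + |B| - 2|A∩B| elements differ, out of |A| + |B| total
    return (len(a) + len(b) - 2 * len(a & b)) / float(len(a) + len(b))

print(set_dist(['happy', 'tired'], ['happy', 'calm']))  # 0.5, matching dist() above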
Example #21
            prev_cost = _cost
            nth += 1
    return weights


def train(X_train, Y_train):
    print('Start training model...')
    W = gradient_descent_SGD(X_train, Y_train, learning_rate, max_epochs,
                             cost_threshold)
    print('Train finished!')
    return W


if __name__ == "__main__":
    # read data
    X, Y = read_data('data.csv')

    # split data for training and testing 80:20
    # first insert 1 in every row for intercept b
    X.insert(loc=len(X.columns), column='intercept', value=1)
    print("splitting dataset into train and test sets...")

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=42)

    # regularization parameter
    C = 10000
    learning_rate = 0.000001
    max_epochs = 5000
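gradient_descent_SGD itself is not shown here; one common shape for such a routine, given the setup with a regularization parameter C, is SGD on the regularized hinge loss. A purely illustrative sketch under that assumption (not the original implementation):

import numpy as np

def hinge_sgd(X, Y, learning_rate=1e-6, max_epochs=5000, C=10000):
    # X: (n, d) array with an intercept column appended, Y: labels in {-1, +1}
    w = np.zeros(X.shape[1])
    for epoch in range(max_epochs):
        for xi, yi in zip(np.asarray(X, dtype=float), np.asarray(Y, dtype=float)):
            if yi * np.dot(w, xi) < 1:      # margin violated: regularizer + hinge gradient
                w -= learning_rate * (w - C * yi * xi)
            else:                           # only the regularizer contributes
                w -= learning_rate * w
    return w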
Example #22
def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             dataset='mnist.pkl.gz', batch_size=20, n_hidden=500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz


   """
    datasets, user_ids = load_data("test_feature1")

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size + 1
    
    params, neural_num = read_param(sys.argv[1])
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=neural_num[0],
        n_hidden=neural_num[1:-1],
        n_out=neural_num[-1],
        params=params
    )

    # start-snippet-4
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.Y_pred(),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        on_unused_input='ignore'
    )
    ###############
    #  TEST MODEL #
    ###############
    print('... testing')

    # early-stopping parameters
    patience = 100000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    
    # test it on the test set
    y_pred = []
    for i in range(n_test_batches):
        y_pred += list(test_model(i))
    #y_pred = [test_model(i) for i in range(n_test_batches)]
    print(len(y_pred))
    ##test
    wb_Embedfile = "test_feature"#"../jd_wfeature"
    jd_wb_linkfile = "network"#"../../net_work10w"
    test_user =user_ids# get_user(wb_Embedfile, 0)
    user_pro = read_data("../../user_product_list10w", test_user)
    print len(user_pro)
    test_nega = readpro_nega("test_nega.txt")
    cate_users = get_cate_users(jd_wb_linkfile)
    test_itemPre(test_user, test_nega, cate_users, user_pro, y_pred)
Example #23
    We embedded the POS information to improve the model and provide some punish options



'''

from read_data import *
import collections as cc
from HMM import *
import numpy as np
import time
from add_pos import *
import matplotlib.pyplot as plt
train_arg_sent,train_tri_sent,test_arg_sent,\
test_tri_sent,train_trigger,train_argument,set_arg,set_tri, \
argument_tuple, trigger_tuple,vocab_arg,vocab_tri=read_data()

param_arg = {
    'type': 'arg',
    'conf_prob': cc.Counter(argument_tuple),
    'arg': np.array(list(set_arg)),
    'vocab': np.array(list(vocab_arg)),
    'Naive_bayes': 0,
    'k': 10**5,
    'O_punish': 0.5,
    'diag_punish': 0.5,
    'conf_mat': np.zeros([(len(vocab_arg)), (len(set_arg))]),
    'k_conf_mat': 10**-5,
    'lambda': 5,
    'lambda2': 2
}
Example #24
def train_model(para, path_excel, if_save_emb):
    [
        _,
        MODEL,
        LR,
        LAMDA,
        EMB_DIM,
        BATCH_SIZE,
        SAMPLE_RATE,
        N_EPOCH,
        _,
        _,
    ] = para
    ## paths of data
    train_path = DIR + 'train_data.json'
    save_text_embeddings_path = DIR + 'review_embeddings.json'
    save_latent_embeddings_path = DIR + 'latent_embeddings.json'
    text_embeddings_path = DIR + 'text.json'
    user_review_path = DIR + 'user_text.json'
    item_review_path = DIR + 'item_text.json'
    ## load train data
    [train_data, train_data_interaction, user_num,
     item_num] = read_data(train_path, BATCH_SIZE)
    if MODEL == 'TMN':
        text_matrix = load_features(text_embeddings_path)
        text_matrix = text_matrix.astype(np.float32)
        user_review = load_data(user_review_path)
        item_review = load_data(item_review_path)
        user_word_num = 0
        for review in user_review:
            user_word_num = max(len(review), user_word_num)
        item_word_num = 0
        for review in item_review:
            item_word_num = max(len(review), item_word_num)
        user_word_num = min(user_word_num, 200)
        item_word_num = min(item_word_num, 200)
        user_review_feature = np.ones((user_num, user_word_num))
        item_review_feature = np.ones((item_num, item_word_num))
        for user in range(user_num):
            user_review_feature[user] = assignment(user_review_feature[user],
                                                   user_review[user])
        for item in range(item_num):
            item_review_feature[item] = assignment(item_review_feature[item],
                                                   item_review[item])

    ## define the model
    if MODEL == 'MF':
        model = model_MF(n_users=user_num,
                         n_items=item_num,
                         emb_dim=EMB_DIM,
                         lr=LR,
                         lamda=LAMDA)
    if MODEL == 'TMN':
        model = model_TMN(n_users=user_num,
                          n_items=item_num,
                          emb_dim=EMB_DIM,
                          lr=LR,
                          lamda=LAMDA,
                          text_embeddings=text_matrix,
                          user_word_num=user_word_num,
                          item_word_num=item_word_num)
    if MODEL == 'TMF':
        review_embeddings = read_features(save_text_embeddings_path)
        model = model_TMF(n_users=user_num,
                          n_items=item_num,
                          emb_dim=EMB_DIM,
                          lr=LR,
                          lamda=LAMDA,
                          review_embeddings=review_embeddings)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    ## split the training samples into batches
    batches = list(range(0, len(train_data_interaction), BATCH_SIZE))
    batches.append(len(train_data_interaction))

    ## training iteratively
    F1_max = -10
    F1_df = pd.DataFrame(columns=TOP_K)
    NDCG_df = pd.DataFrame(columns=TOP_K)
    for epoch in range(N_EPOCH):
        for batch_num in range(len(batches) - 1):
            train_batch_data = []
            if MODEL == 'TMN':
                user_review_batch = np.ones(
                    ((1 + SAMPLE_RATE) *
                     (batches[batch_num + 1] - batches[batch_num]),
                     user_word_num))
                item_review_batch = np.ones(
                    ((1 + SAMPLE_RATE) *
                     (batches[batch_num + 1] - batches[batch_num]),
                     item_word_num))
            num = 0
            for sample in range(batches[batch_num], batches[batch_num + 1]):
                user, pos_item = train_data_interaction[sample]
                sample_num = 0
                train_batch_data.append([user, pos_item, 1])
                if MODEL == 'TMN':
                    user_review_batch[num] = user_review_feature[user]
                    item_review_batch[num] = item_review_feature[pos_item]
                num += 1
                while sample_num < SAMPLE_RATE:
                    neg_item = int(random.uniform(0, item_num))
                    if not (neg_item in train_data[user]):
                        sample_num += 1
                        train_batch_data.append([user, neg_item, 0])
                        if MODEL == 'TMN':
                            user_review_batch[num] = user_review_feature[user]
                            item_review_batch[num] = item_review_feature[
                                neg_item]
                        num += 1
            train_batch_data = np.array(train_batch_data)
            try:
                _, loss = sess.run(
                    [model.updates, model.loss],
                    feed_dict={
                        model.users: train_batch_data[:, 0],
                        model.items: train_batch_data[:, 1],
                        model.label: train_batch_data[:, 2],
                        model.user_word: user_review_batch,
                        model.item_word: item_review_batch
                    })
            except:
                _, loss = sess.run(
                    [model.updates, model.loss],
                    feed_dict={
                        model.users: train_batch_data[:, 0],
                        model.items: train_batch_data[:, 1],
                        model.label: train_batch_data[:, 2]
                    })
        if MODEL == 'TMN':
            F1, NDCG = test_model(sess, model, user_review_feature,
                                  item_review_feature)
        else:
            F1, NDCG = test_model(sess, model, 0, 0)
        if F1_max < F1[0]:
            F1_max = F1[0]
            if if_save_emb == 1:
                try:
                    user_text_embedding = np.zeros(
                        (user_num, np.shape(text_matrix)[1]))
                    item_text_embedding = np.zeros(
                        (item_num, np.shape(text_matrix)[1]))
                    user_batch_list = list(range(0, user_num, 500))
                    user_batch_list.append(user_num)
                    item_batch_list = list(range(0, item_num, 500))
                    item_batch_list.append(item_num)
                    for u in range(len(user_batch_list) - 1):
                        u1, u2 = user_batch_list[u], user_batch_list[u + 1]
                        user_batch = np.array(range(u1, u2))
                        user_review_batch = user_review_feature[u1:u2]
                        u_text_embedding = sess.run(
                            [model.u_text_embeddings],
                            feed_dict={
                                model.users: user_batch,
                                model.user_word: user_review_batch
                            })
                        user_text_embedding[u1:u2] = u_text_embedding[0]
                    for i in range(len(item_batch_list) - 1):
                        i1, i2 = item_batch_list[i], item_batch_list[i + 1]
                        item_batch = np.array(range(i1, i2))
                        item_review_batch = item_review_feature[i1:i2]
                        i_text_embedding = sess.run(
                            [model.i_text_embeddings],
                            feed_dict={
                                model.items: item_batch,
                                model.item_word: item_review_batch
                            })
                        item_text_embedding[i1:i2] = i_text_embedding[0]
                except:
                    user_embedding, item_embedding = sess.run(
                        [model.user_embeddings, model.item_embeddings])
        ## print performance
        print_value([epoch + 1, loss, F1_max, F1, NDCG])
        F1_df.loc[epoch + 1] = F1
        NDCG_df.loc[epoch + 1] = NDCG
        save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']],
                   path_excel,
                   first_sheet=False)
        if not loss < 10**10:
            break
    if if_save_emb == 1:
        if MODEL == 'TMN':
            save_embeddings(
                [user_text_embedding.tolist(),
                 item_text_embedding.tolist()], save_text_embeddings_path)
        if MODEL == 'TMF':
            save_embeddings([user_embedding.tolist(),
                             item_embedding.tolist()],
                            save_latant_embeddings_path)
        try:
            del u_text_embedding, i_text_embedding, user_text_embedding, item_text_embedding
        except:
            del user_embedding, item_embedding
    del model, loss, _, sess
    gc.collect()
Example #25
0
# and its values are dicts of stems. We don't need both this and Signatures!
#print config_lxa["affix_type"], 51
Lexicon = CLexicon()
Lexicon.infolder = config_lxa["complete_infilename"]
Lexicon.outfolder = config_lxa["outfolder"]
Lexicon.graphicsfolder = config_lxa["graphicsfolder"]
if config_lxa["affix_type"] == "prefix":
    Lexicon.FindSuffixesFlag = False
else:
    Lexicon.FindSuffixesFlag = True
# --------------------------------------------------------------------##
#        read wordlist (dx1)
# --------------------------------------------------------------------##

infile = open(config_lxa["complete_infilename"])
read_data(config_lxa["datatype"], infile, Lexicon,
          config_lxa["BreakAtHyphensFlag"], config_lxa["word_count_limit"])
print("\n1. Finished reading word list. Word count:",
      len(Lexicon.Word_counts_dict), "\n")

# --------------------------------------------------------------------##
#        Initialize some output files
# --------------------------------------------------------------------##

if not os.path.isdir(config_lxa["outfolder"]):
    try:
        os.mkdir(config_lxa["outfolder"])
    except OSError:
        print("Creation of the directory %s failed." % config_lxa["outfolder"])
    else:
        print("Successfully created the directory %s." % config_lxa["outfolder"])
Example #26
0
# Module 2: Interpolate C-Content & d13C Ratios on cm scale
#    Assuming: C(z) = a*np.exp(-b*x) + c  ***


from read_data import *
import matplotlib.pyplot as plt
import numpy as np
import csv
  
#read_data(Folder_File = 'Raw_Data/forest.csv')

plot_title = "primary forest" 

depth, C_content, C_se, d13C, d13Cse = read_data(Folder_File = 'Raw_Data/forest.csv')


from scipy.optimize import curve_fit
def func(x, a, b, c):
    return a*np.exp(-b*x) + c


def interpolation():
    global plot_title
    x = depth
    yC = C_content

    popt, pcov = curve_fit(func, x, yC)
    print()
    print("interpolation module ran successfully")
    print("further calculations assume:")
    print()
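# A possible continuation sketch (not from the original source): the function above is
# truncated, but with popt = (a, b, c) from curve_fit the fitted exponential can be
# evaluated on a 1 cm depth grid to interpolate C-content. The 1 cm spacing and the
# helper name interpolate_on_cm_grid are assumptions for illustration.
def interpolate_on_cm_grid(depth, values):
    depth = np.asarray(depth, dtype=float)
    popt, _ = curve_fit(func, depth, values)           # reuse func(x, a, b, c) defined above
    z_cm = np.arange(depth.min(), depth.max() + 1.0)   # 1 cm steps (assumption)
    return z_cm, func(z_cm, *popt)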
Example #27
0
from read_data import *
from preprocessing import *
from run_gams_model import *
import numpy as np
import pandas as pd

data_raw,features,features_all=read_data()
features_selected=features_all.drop(features_all.index[[37,50,51,72,73]])

# run for CV complication
proc_data=gen_proc_data(data_raw,features_selected,data_raw['cv_comp_new'])
models_cvcomp=get_all_gams_model(proc_data,features_selected)
best_model_cvcomp=get_best_model(models_cvcomp)
r_df_cvcomp = com.convert_to_r_dataframe(proc_data)
pred_score_cvcomp= statsf.predict(best_model_cvcomp,r_df_cvcomp, type="response")
pred_obser_cvcomp=proc_data['outcome']
best_model_metric=calculate_metric(pred_obser_cvcomp,pred_score_cvcomp)
cutoff1_cvcomp=best_model_metric['thres'].iloc[best_model_metric['yod_index'].idxmax()-1]
cutoff2_cvcomp=cal_cutoff2(best_model_metric)
predicted_values_cvcomp =pd.DataFrame({'prediction':pred_score_cvcomp}, index=range(0,len(pred_score_cvcomp)))
predicted_values_cvcomp.ix[predicted_values_cvcomp.prediction <= cutoff1_cvcomp, 'category'] = 'low'
predicted_values_cvcomp.ix[predicted_values_cvcomp.prediction >cutoff2_cvcomp, 'category'] = 'high'
predicted_values_cvcomp.ix[(predicted_values_cvcomp.prediction <=cutoff2_cvcomp) & (predicted_values_cvcomp.prediction>cutoff1_cvcomp), 'category'] = 'moderate'
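# Hedged side note (not in the original script): the same low/moderate/high binning written
# with DataFrame.loc, since .ix has been removed from recent pandas releases. The helper
# name bin_predictions is an assumption for illustration.
def bin_predictions(pred_score, cutoff1, cutoff2):
    out = pd.DataFrame({'prediction': pred_score})
    out.loc[out.prediction <= cutoff1, 'category'] = 'low'
    out.loc[out.prediction > cutoff2, 'category'] = 'high'
    out.loc[(out.prediction > cutoff1) & (out.prediction <= cutoff2), 'category'] = 'moderate'
    return out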


# run for MV complication

proc_data=gen_proc_data(data_raw,features_selected,data_raw['MV_comp'])
models_mvcomp=get_all_gams_model(proc_data,features_selected)
best_model_mvcomp=get_best_model(models_mvcomp)
r_df_mvcomp = com.convert_to_r_dataframe(proc_data)
Example #28
0
def train():

    global_step = tf.Variable(0, name='global_step', trainable= False)
    train_dir = './logs'

    # z is the random noise, y is the conditioning label
    y = tf.placeholder(tf.float32, [BATCH_SIZE, 10], name='y')
    images = tf.placeholder(tf.float32, [64,28,28,1], name='real_images')
    z = tf.placeholder(tf.float32, [None, 100], name='z')

    # the generator produces the fake images G
    with tf.variable_scope("for_reuse_scope"):
        G = generator(z, y)
        # feed the real images into the discriminator
        D, D_logits = discriminator(images, y)

        samples = sampler(z,y)
        D_, D_logits_ = discriminator(G, y, reuse=True)

    d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(D), logits=D_logits))
    d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(D_), logits=D_logits_))
    d_loss = d_loss_real + d_loss_fake
    g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(D_), logits=D_logits_))

    z_sum = tf.summary.histogram("z",z)
    d_sum = tf.summary.histogram("d",D)
    d__sum = tf.summary.histogram("d_", D_)
    G_sum = tf.summary.image("G",G)

    d_loss_real_sum = tf.summary.scalar("d_loss_real", d_loss_real)
    d_loss_fake_sum = tf.summary.scalar("d_loss_fake", d_loss_fake)
    d_loss_sum = tf.summary.scalar("d_loss", d_loss)
    g_loss_sum = tf.summary.scalar("g_loss", g_loss)

    # merge the generator-side and discriminator-side summaries
    g_sum = tf.summary.merge([z_sum, d__sum, G_sum, d_loss_fake_sum, g_loss_sum])
    d_sum = tf.summary.merge([z_sum, d_sum, d_loss_real_sum, d_loss_sum])

    # variables the generator and discriminator update, passed to tf.train.Optimizer via var_list
    t_vars = tf.trainable_variables()
    d_vars = [var for var in t_vars if 'd_' in var.name]
    g_vars = [var for var in t_vars if 'g_' in var.name]

    saver = tf.train.Saver()

    # both networks are optimized with Adam

    d_optim = tf.train.AdamOptimizer(0.0002, beta1= 0.5).minimize(d_loss, var_list=d_vars, global_step=global_step)
    g_optim = tf.train.AdamOptimizer(0.0002, beta1= 0.5).minimize(g_loss, var_list=g_vars, global_step=global_step)

    os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction=0.2
    sess = tf.InteractiveSession(config=config)
    init = tf.initialize_all_variables()
    writer = tf.summary.FileWriter(train_dir, sess.graph)

    data_x, data_y = read_data()
    sample_z = np.random.uniform(-1, 1,size=(BATCH_SIZE, 100))
    sample_labels = data_y[0:64]
    sess.run(init)

    for epoch in range(25):
        batch_idxs = 1093
        for idx in range(batch_idxs):
            batch_images = data_x[idx * 64 : (idx + 1) *64]
            batch_labels = data_y[idx * 64 : (idx + 1) * 64]
            batch_z = np.random.uniform(-1, 1, size=(BATCH_SIZE, 100))  # sample uniformly from (-1, 1)

            # update D's parameters
            _, summary_str = sess.run([d_optim, d_sum], feed_dict={images:batch_images, z:batch_z, y:batch_labels})

            writer.add_summary(summary_str, idx+1)

            # update G's parameters
            _, summary_str = sess.run([g_optim, g_sum], feed_dict={z:batch_z, y:batch_labels})
            writer.add_summary(summary_str, idx+1)
            # update G a second time to keep training stable
            _, summary_str = sess.run([g_optim, g_sum], feed_dict={z:batch_z, y:batch_labels})

            writer.add_summary(summary_str, idx+1)

            # compute the current losses for logging
            errD_fake = d_loss_fake.eval({z:batch_z, y:batch_labels})  # Tensor.eval() runs the graph and returns the value
            errD_real = d_loss_real.eval({images:batch_images, y:batch_labels})
            errG = g_loss.eval({z:batch_z, y:batch_labels})

            if idx %20 ==0:
                print("Epoch : [%2d] [%4d/%4d] d_loss: %.8f, g_loss:%.8f"\
                      % (epoch, idx, batch_idxs, errD_fake+errD_real, errG))

            if idx %100 ==1:
                sample = sess.run(samples, feed_dict={z:sample_z, y:sample_labels})
                samples_path = os.getcwd()+'/samples/'
                save_image(sample, [8,8], samples_path + 'test_%d_epoch_%d.png' %(epoch, idx))
                print('samples saved')

            if idx % 500 ==2:
                checkpoint_path = os.path.join(train_dir, 'DCGAN_model.ckpt')
                saver.save(sess, checkpoint_path, global_step=idx+1)
    sess.close()
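# Hedged sketch (not the project's actual helper): one way a save_image(images, [rows, cols], path)
# utility like the one called above could tile a batch of [N, 28, 28, 1] samples into a single
# image grid and write it to disk. Everything below is illustrative, not the original code.
def save_image_sketch(images, size, path):
    import numpy as np
    import matplotlib.pyplot as plt
    rows, cols = size
    h, w = images.shape[1], images.shape[2]
    grid = np.zeros((rows * h, cols * w))
    for idx in range(min(len(images), rows * cols)):
        r, c = divmod(idx, cols)
        grid[r * h:(r + 1) * h, c * w:(c + 1) * w] = images[idx, :, :, 0]
    plt.imsave(path, grid, cmap='gray')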
Example #29
0
# 1- https://www.kaggle.com/charma69/titanic-data-science-solutions/edit

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score

from matplotlib import pyplot as plt
from read_data import *
from sklearn import metrics

# 1.import data
dataset = 'WBCD.csv'
Data = read_data(dataset)

# 2.Fill with median
Data = fillMed(Data)

#3.data preprocessing
# select features and Normalization
x = Data.loc[:, [
    'ClumpTkns', 'UnofCSize', 'UnofCShape', 'MargAdh', 'SngEpiCSize',
    'BareNuc', 'BlandCrmtn', 'NrmlNuc', 'Mitoses'
]]
y = Data['Malignant']
# NOTE: applying this transform gave ~1% lower accuracy
min_max_scaler = preprocessing.MaxAbsScaler()
x = min_max_scaler.fit_transform(x)
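# Hedged continuation sketch (the original snippet is truncated here): the imports above
# suggest cross-validating a Perceptron on the scaled features; the cv=10 fold count and
# hyperparameters are assumptions for illustration.
clf = Perceptron(max_iter=1000, tol=1e-3, random_state=0)
scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
pred = cross_val_predict(clf, x, y, cv=10)
print('mean CV accuracy: %.3f' % scores.mean())
print('accuracy: %.3f, ROC AUC: %.3f' % (accuracy_score(y, pred), roc_auc_score(y, pred)))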
Example #30
0
def main(args):
    word_embeddings = p.load(open(args.word_embeddings, 'rb'))
    word_embeddings = np.array(word_embeddings)
    word2index = p.load(open(args.vocab, 'rb'))

    index2kwd, kwd2index, index2cnt = read_kwd_vocab(args.kwd_vocab)
    if hparams.BALANCE_KWD_CLASS:
        # adjust weight for different kwd class based on median freqency
        index2cnt = np.array(index2cnt)
        base_freq = np.median(index2cnt)
        kwd_weight = np.sqrt(base_freq / index2cnt)
        kwd_weight = torch.FloatTensor(kwd_weight)
        if hparams.USE_CUDA:
            kwd_weight = kwd_weight.cuda()
    else:
        kwd_weight = None
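    # Illustration of the weighting above (not from the original code): with keyword
    # frequencies [100, 400, 900] the median is 400, so kwd_weight = sqrt(400 / counts)
    # comes out as roughly [2.0, 1.0, 0.67] -- rare keywords are up-weighted.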

    subset_count = args.subset_count if args.subset_count > 0 else None
    train_data = read_data(args.train_context, args.train_question,
                           args.train_ids, args.max_post_len,
                           args.max_ques_len, subset_count)
    test_data = read_data(args.tune_context, args.tune_question, args.tune_ids,
                          args.max_post_len, args.max_ques_len, subset_count)

    if args.kwd_data_dir:  # load pre-extracted kwd, save time in training
        print(f"load kwds from {args.kwd_data_dir}")
        train_kwds = read_kwds(os.path.join(args.kwd_data_dir, "train.kwds"),
                               kwd2index, subset_count)
        test_kwds = read_kwds(os.path.join(args.kwd_data_dir, "tune.kwds"),
                              kwd2index, subset_count)
        assert len(train_kwds) == len(train_data), print(
            len(train_kwds), len(train_data))
        assert len(test_kwds) == len(test_data)
    else:
        train_kwds, test_kwds = None, None

    print('No. of train_data %d' % len(train_data))
    print('No. of test_data %d' % len(test_data))

    print("Preprocessing train")
    q_train_data = preprocess_data(train_data,
                                   word2index,
                                   kwd2index,
                                   args.max_post_len,
                                   args.max_ques_len,
                                   args.kwd_data_dir,
                                   extract_kwd=False)
    q_train_data = [np.array(x) for x in q_train_data]
    print("Preprocessing val")
    q_test_data = preprocess_data(test_data,
                                  word2index,
                                  kwd2index,
                                  args.max_post_len,
                                  args.max_ques_len,
                                  args.kwd_data_dir,
                                  extract_kwd=False)
    q_test_data = [np.array(x) for x in q_test_data]

    if args.pretrain_ques:
        run_seq2seq(q_train_data, q_test_data, word2index, word_embeddings,
                    hparams.MAX_QUES_LEN, kwd_weight,
                    not args.freeze_kwd_model, train_kwds, test_kwds,
                    kwd2index, args.kwd_model_dir, args.save_dir,
                    args.load_models_dir)
    elif args.pretrain_kwd:
        run_kwd(q_train_data, q_test_data, index2kwd, word_embeddings,
                kwd_weight, train_kwds, test_kwds, kwd2index, args.save_dir)
    else:
        print('Please specify model to pretrain')
        return
Example #31
0
    >>> A = np.array([(2,4,1,3),(-1,-2,1,0),(0,0,2,2),(3,6,2,5)])
    >>> independent_columns(A)
    array([0, 2])
    """
    Q, R = linalg.qr(A)
    independent = np.where(np.abs(R.diagonal()) > tol)[0]
    #print independent
    return independent
    #return A[:, independent]


if __name__ == "__main__":
    data = read_data(_TRAINING_FILE_NAME_)
    #print data.describe()
    train_labels = data.label
    train_labels = train_labels.reshape(train_labels.size, 1)

    train_data = data.drop("label", 1)
    train_data = train_data.drop(BLACKLIST, axis=1)

    ft = open(_FIELDS_FILE_, "r")
    categ = []  # list of categorical variables for transform
    non_categ = []  # list of numeric variables
    for line in ft.readlines():
        splits = line.split()
        if splits[1] == "numeric":
            non_categ.append(splits[0])
Example #32
0
from matplotlib import pyplot
from read_data import *
from Scratch import *
from numpy import *

scratchList = read_data(
    "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\cer biala\\wynik-ce-bi.txt"
)
scratchList1 = read_data(
    "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\cer czarna\\wynik-ce-cz.txt"
)
scratchList2 = read_data(
    "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\alu cylinder\\wynik-al-cy.txt"
)
scratchList3 = read_data(
    "C:\\Users\\Michał\\Desktop\\Aleksander\\Scratch\\alu gwint\\wynik-al-gw.txt"
)

# sampleList = [scratchList, scratchList1]
# sampleList = [scratchList2, scratchList3]
sampleList = [scratchList, scratchList1, scratchList2, scratchList3]
labelList = [
    'white coating', 'black coating', '5056 aluminium', '7075 aluminium'
]
colorList = ['red', 'black', 'blue', 'green']
avgMaxDepth = []
maxDepth = []

#for each sample
for scratchList in sampleList:
    sum1 = 0
Example #33
0
from __future__ import print_function
import numpy as np
import sys
sys.path.append('../geom')
from point import *
from fitsemivariance import *
from semivariance import *
from covariance import *
from read_data import *
from prepare_interpolation_data import *
from okriging import *

Z = read_data('../data/necoldem250.dat')

hh = 50
lags = np.arange(0, 3000, hh)
test_results = []
N = len(Z)
mask = [True for i in range(N)]
numNeighbors = 10

for i in range(N):
    mask[i] = False
    x = Point(Z[i][0], Z[i][1])
    P = [Z[j] for j in range(N) if mask[j] == True]
    P1 = prepare_interpolation_data(x, P, numNeighbors)[0]
    P1 = np.array(P1)
    gamma = semivar(P1, lags, hh)
    if len(gamma) == 0:
        continue
    semivariogram = fitsemivariogram(P1, gamma, spherical)
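# For reference, a textbook spherical semivariogram model of the kind fitsemivariogram() fits
# above -- a generic sketch for illustration, not necessarily the module's own implementation.
# h: lag distance, a: range, c0: nugget, c: partial sill.
def spherical_model(h, a, c0, c):
    h = np.asarray(h, dtype=float)
    gamma = c0 + c * (1.5 * h / a - 0.5 * (h / a) ** 3)
    return np.where(h <= a, gamma, c0 + c)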
Example #34
0
from matplotlib import pyplot
from read_data import *
from Scratch import *
from numpy import *

scratchList = read_data("C:\\Users\\ja\\Desktop\\Aleksander\\Scratch\\cer biala\\wynik-ce-bi.txt")
scratchList1 = read_data("I:\\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\1\\krzywe.txt")
scratchList2 = read_data("I:\\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\2\\krzywe.txt")
scratchList3 = read_data("I:\\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\3\\krzywe.txt")
scratchList4 = read_data("I:\\OneDrive\\doktorat\\praca doktorska\\badania\\scratch\\4\\krzywe.txt")

sampleList = [scratchList,scratchList1,scratchList2,scratchList3,scratchList4]
avgMaxDepth = []
maxDepth = []


#for each sample
for scratchList in sampleList:
    sum1 = 0
    maxList = []
    #for each scratch
    for scratchObject in scratchList:
        #adding base line
        scratchObject.addBaseline()
        
        #moving average for depth for topography 2 (after scratch)
        scratchObject.topo2.depth = convolve(scratchObject.topo2.depth, ones(30,)/30, mode='full')
        
        #computing max dephts
        sum1 = sum1 + scratchObject.maxDepthOfTopo2()
        maxList.append(scratchObject.maxDepthOfTopo2())
from read_data import *
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

model_name = "FeedForward"

data = read_data('test')
X_test = data.to_numpy()
model = load_model('Models/' + model_name + '.hdf5')
predictions = model.predict(X_test)
predictions = np.around(predictions).flatten()
predictions = predictions.astype(int)

d = {'PassengerId': range(892, 1310), 'Survived': predictions}
df = pd.DataFrame(data=d)
df.to_csv('Results/' + model_name + '.csv', index=False)
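# Hedged sketch (not the saved model itself): the imports above suggest the 'FeedForward'
# model was a small fully connected network built with the functional API; the layer sizes,
# dropout rate and learning rate below are assumptions for illustration only.
def build_feedforward(n_features):
    inputs = Input(shape=(n_features,))
    h = Dense(64, activation='relu')(inputs)
    h = Dropout(0.3)(h)
    h = Dense(32, activation='relu')(h)
    outputs = Dense(1, activation='sigmoid')(h)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model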
def get_cleaned_data(path):
    df = read_data(path)
    df['review_cleaned'] = df['review'].apply(lambda x: clean_text(x))
    return df
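# Hedged sketch: clean_text() is not shown in this snippet; a minimal stand-in of the usual
# kind (lowercasing, dropping HTML tags and non-letters) might look like the following --
# an assumption for illustration, not the project's actual implementation.
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>', ' ', text)       # strip HTML tags
    text = re.sub(r'[^a-z\s]', ' ', text)      # keep letters and whitespace only
    return re.sub(r'\s+', ' ', text).strip()   # collapse repeated whitespace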