コード例 #1
0
    def __init__(self):
        """Create the feature extractors and cache train/test feature matrices.

        The training split is pushed through the ``to_vec`` / ``to_tfidf`` /
        ``to_SVD`` / ``to_NMF`` helpers (presumably these fit the matching
        transformer stored on ``self`` -- confirm in the helpers).  The test
        split is only ever *transformed* with the already-created objects so
        that both splits share one feature space and one scale.
        """
        new_line(50)
        print('started to build all training and testing data...')
        self.tfidf_transformer = TfidfTransformer()
        self.vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words, min_df=5, tokenizer=stemTokenizer)
        self.svd = TruncatedSVD(n_components=50, random_state=42)
        self.nmf = NMF(n_components=50, random_state=42)
        self.mm = MinMaxScaler()

        # build training data
        self.train_data = fetch_data(categories, 'train')
        self.train_labels = build_labels(self.train_data)
        self.vectors = self.to_vec(self.train_data.data)
        self.tfidf = self.to_tfidf(self.vectors)
        self.tfidf_SVD = self.to_SVD(self.tfidf)
        self.tfidf_NMF = self.to_NMF(self.tfidf)
        # The scaler is fitted here, on the training SVD features only.
        self.tfidf_mm = self.mm.fit_transform(self.tfidf_SVD)

        # build testing data
        self.test_data = fetch_data(categories, 'test')
        self.test_labels = build_labels(self.test_data)
        self.test_vectors = self.vectorizer.transform(self.test_data.data)
        self.test_tfidf = self.tfidf_transformer.transform(self.test_vectors)
        self.test_tfidf_SVD = self.svd.transform(self.test_tfidf)
        self.test_tfidf_NMF = self.nmf.transform(self.test_tfidf)
        # Reuse the scaler fitted on the training data; refitting it on the
        # test split would map train and test onto different scales.
        self.test_tfidf_mm = self.mm.transform(self.test_tfidf_SVD)
        print('finished building all training and testing data...')
        new_line(50)
        print(' ')
コード例 #2
0
def test(sess, model, test_url, batch_size):
    """Run ``model`` over the data at ``test_url`` and print summary metrics.

    Args:
        sess: session object whose ``run`` evaluates the model tensors.
        model: object exposing ``x``, ``mask``, ``objective`` and ``kld``.
        test_url: location handed to ``utils.data_set``.
        batch_size: number of documents per batch.

    Prints the corpus-level perplexity, the per-document perplexity and the
    KLD averaged over batches; nothing is returned.
    """
    test_set, test_count, _ = utils.data_set(test_url)
    batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    total_loss = 0.0
    total_kld = 0.0
    total_doc_ppx = 0.0
    n_words = 0
    n_docs = 0
    for batch_ids in batches:
        data_batch, count_batch, mask = utils.fetch_data(
            test_set, test_count, batch_ids, FLAGS.vocab_size)
        loss, kld = sess.run(
            [model.objective, model.kld],
            {model.x.name: data_batch, model.mask.name: mask})

        mask_total = np.sum(mask)
        total_loss += np.sum(loss)
        total_kld += np.sum(kld) / mask_total
        n_words += np.sum(count_batch)
        # A tiny offset keeps the per-document division finite when a count
        # is zero.
        total_doc_ppx += np.sum(np.divide(loss, np.add(count_batch, 1e-12)))
        n_docs += mask_total

    corpus_ppx = np.exp(total_loss / n_words)
    per_doc_ppx = np.exp(total_doc_ppx / n_docs)
    mean_kld = total_kld / len(batches)
    print('| Epoch test: {:d} |'.format(1),
          '| Perplexity: {:.9f}'.format(corpus_ppx),
          '| Per doc ppx: {:.5f}'.format(per_doc_ppx),
          '| KLD: {:.5}'.format(mean_kld))
コード例 #3
0
ファイル: app.py プロジェクト: tjanko13/stoltz_scottman
def forecast_plot(tickername, steps):
    """Return an HTML line chart of historical closes followed by a forecast.

    Args:
        tickername: ticker symbol passed to ``fetch_data`` and ``build_model``.
        steps: number of future periods to forecast; anything ``int()`` accepts.

    Returns:
        The figure rendered with ``fig.to_html()``.
    """
    n_steps = int(steps)
    data = fetch_data(tickername).reset_index()
    data['Type'] = "HISTORICAL"
    model = build_model(tickername)
    fcast = model.forecast(n_steps)
    # Forecast step k belongs to the k-th period *after* the last observation.
    # Build one extra date and drop the first so the forecast does not reuse
    # (and overlap) the last historical date.
    last_date = data['Date'].iloc[-1]
    new_series = pd.date_range(last_date, periods=n_steps + 1)[1:]
    # NOTE(review): fcast[0] is presumably the array of point forecasts
    # (tuple-style return value) -- confirm against build_model.
    fcast_df = pd.DataFrame({'Date': new_series,
                             'Close': fcast[0],
                             'Type': "FORECAST"})
    final_df = pd.concat([data[['Date', 'Close', 'Type']], fcast_df])
    fig = px.line(final_df, x='Date', y='Close', color='Type')
    return fig.to_html()
コード例 #4
0
def question_i():
    """Plot accuracy of L1- and L2-penalised logistic regression versus C.

    The eight newsgroup labels are collapsed to a binary target, then one
    liblinear logistic regression per penalty is evaluated through
    ``utils.classify`` for C = 10**-3 ... 10**3.  The two accuracy curves are
    shown in stacked subplots.
    """
    categories = [
        "comp.graphics", "comp.os.ms-windows.misc", "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware", "rec.autos", "rec.motorcycles",
        "rec.sport.baseball", "rec.sport.hockey"
    ]

    train, test = utils.fetch_data(categories)
    # Binary target: label indices 0-3 become 1, everything else 0
    # (presumably 0-3 are the comp.* groups -- depends on utils.fetch_data).
    train.target = [int(0 <= x < 4) for x in train.target]
    test.target = [int(0 <= x < 4) for x in test.target]

    params = list(range(-3, 4))
    l1_accuracies = []
    l2_accuracies = []

    for param in params:
        l1_classifier = LogisticRegression(penalty='l1',
                                           C=10**param,
                                           solver='liblinear')
        logging.info("Regularization Parameter set to {0}".format(param))
        l1_accuracies.append(
            utils.classify(l1_classifier,
                           "Logistic Regression l1",
                           train,
                           test,
                           cv=False,
                           mean=True))
        l2_classifier = LogisticRegression(penalty='l2',
                                           C=10**param,
                                           solver='liblinear')
        l2_accuracies.append(
            utils.classify(l2_classifier,
                           "Logistic Regression l2",
                           train,
                           test,
                           cv=False,
                           mean=True))

    # One tick per evaluated parameter: positions and labels must have the
    # same length (there are seven values of C, not six).
    tick_positions = range(len(params))
    tick_labels = [10**param for param in params]

    plt.figure(1)
    plt.subplot(211)
    plt.plot(l1_accuracies)
    plt.xticks(tick_positions, tick_labels)
    plt.title("Accuracy of L1 Logistic Regression vs regularization parameter")

    plt.subplot(212)
    plt.plot(l2_accuracies)
    plt.xticks(tick_positions, tick_labels)
    plt.title("Accuracy of L2 Logistic Regression vs regularization parameter")
    plt.show()
コード例 #5
0
    def j(self):
        """Run the multiclass experiments of question 'j' on NMF features.

        Training documents of ``cat_4`` go through ``to_vec`` -> ``to_tfidf``
        -> ``to_NMF``; test documents are transformed with the vectorizer,
        TF-IDF transformer and NMF stored on ``self``.  Naive Bayes,
        one-vs-one and one-vs-rest linear SVMs are then passed to
        ``multi_classify`` in that order.
        """
        print_question('j')

        # Training features.
        train_set = fetch_data(cat_4, 'train')
        train_counts = self.to_vec(train_set.data)
        train_nmf = self.to_NMF(self.to_tfidf(train_counts))

        # Test features, using the transformers held on self.
        test_set = fetch_data(cat_4, 'test')
        test_counts = self.vectorizer.transform(test_set.data)
        test_nmf = self.nmf.transform(
            self.tfidf_transformer.transform(test_counts))

        # Both multiclass wrappers share the same linear SVM configuration.
        linear_svc = svm.LinearSVC(C=1, random_state=42)
        experiments = [
            (MultinomialNB(), 'naive bayes'),
            (OneVsOneClassifier(linear_svc), 'one vs one'),
            (OneVsRestClassifier(linear_svc), 'one vs rest'),
        ]

        for classifier, label in experiments:
            self.multi_classify(classifier, train_nmf, test_nmf,
                                train_set.target, test_set.target, label)
コード例 #6
0
ファイル: app.py プロジェクト: moonquay/stoltz_scottman
def forecast_plot(tickername, steps):
    """Return an HTML line chart of historical closes followed by a forecast.

    Args:
        tickername: ticker symbol passed to ``fetch_data`` and ``build_model``.
        steps: number of future periods to forecast; anything ``int()`` accepts.

    Returns:
        The figure rendered with ``fig.to_html()``.
    """
    n_steps = int(steps)
    data = fetch_data(tickername).reset_index()
    data['Type'] = 'HISTORICAL'
    model = build_model(tickername)
    fcast = model.forecast(n_steps)
    # Forecast step k belongs to the k-th period *after* the last observation.
    # Build one extra date and drop the first so the forecast does not reuse
    # (and overlap) the last historical date.
    last_date = data['Date'].iloc[-1]
    new_series = pd.date_range(last_date, periods=n_steps + 1)[1:]
    # NOTE(review): fcast[0] is presumably the array of point forecasts
    # (tuple-style return value) -- confirm against build_model.
    fcast_df = pd.DataFrame({
        'Date': new_series,
        'Close': fcast[0],
        'Type': 'FORECAST'
    })
    final_df = pd.concat([data[['Date', 'Close', 'Type']], fcast_df])
    fig = px.line(final_df, x='Date', y='Close', color='Type')
    return fig.to_html()
コード例 #7
0
    def __init__(self):
        """Set up the text feature extractors and the 2-D clustering inputs.

        Stores the vectorizer, TF-IDF transformer and a 1000-component SVD,
        loads the training split, derives its TF-IDF matrix through
        ``to_vec`` / ``to_tfidf`` and finally projects that matrix to two
        components with both truncated SVD and NMF.
        """
        # Feature extractors kept on the instance.
        self.vectorizer = CountVectorizer(analyzer='word', stop_words=stop_words, min_df=3, tokenizer=tokenizer)
        self.tfidf_transformer = TfidfTransformer()
        self.svd = TruncatedSVD(n_components=1000, random_state=0)

        # Training data and its TF-IDF representation.
        training_split = fetch_data(categories, 'train')
        self.train_data = training_split
        self.train_labels = build_labels(training_split)
        self.vectors = self.to_vec(training_split.data)
        self.tfidf = self.to_tfidf(self.vectors)

        # Two-component projections used for clustering.
        lsi_2d = TruncatedSVD(n_components=2, random_state=0)
        self.lsip2 = lsi_2d
        self.lsi_data = lsi_2d.fit_transform(self.tfidf)

        nmf_2d = NMF(n_components=2, init='random', random_state=0)
        self.nmfp2 = nmf_2d
        self.nmf_data = nmf_2d.fit_transform(self.tfidf)
コード例 #8
0
def train(nvdm, train_url, optimizer, batch_size=64, training_epochs=1000,
          vocab_size=2000):
    """Train ``nvdm`` and print the mean batch loss after every epoch.

    Args:
        nvdm: callable model; ``nvdm(data_batch, mask)`` must return a scalar
            loss tensor supporting ``backward()`` and ``item()``.
        train_url: location handed to ``utils.data_set``.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.
        batch_size: number of documents per batch.
        training_epochs: number of passes over the training set.
        vocab_size: vocabulary size forwarded to ``utils.fetch_data``
            (previously hard-coded to 2000, which remains the default).
    """
    train_set, train_count = utils.data_set(train_url)
    for epoch in range(training_epochs):
        # Batches are rebuilt every epoch.
        train_batches = utils.create_batches(len(train_set), batch_size)
        loss_sum = 0.0
        for idx_batch in train_batches:
            data_batch, count_batch, mask = utils.fetch_data(
                train_set, train_count, idx_batch, vocab_size)
            data_batch = torch.FloatTensor(data_batch)
            mask = torch.FloatTensor(mask)
            loss = nvdm(data_batch, mask)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

        print(loss_sum / len(train_batches))
コード例 #9
0
def question_j():
    """Multiclass classification of four newsgroups on 50-component LSI features.

    TF-IDF matrices come from ``utils.model_data``; a truncated SVD is fitted
    on the training matrix and applied to the test matrix.  Gaussian naive
    Bayes and linear-kernel SVM are each run one-vs-one and one-vs-rest, and
    ``utils.calculate_stats`` is called on every set of test predictions.
    """
    logging.info("<Question J> Multiclass Classification")
    category = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    train, test = utils.fetch_data(category)

    logging.info("Creating TFxIDF Vector Representations")
    train_idf = utils.model_data(train)
    test_idf = utils.model_data(test)

    logging.info("Performing LSI on TFxIDF Matrices")
    # Fit the SVD on the training matrix only and project the test matrix
    # with the same components; refitting on the test data would place it in
    # a different latent space from the one the classifiers are trained on.
    # NOTE(review): this requires utils.model_data to give train and test the
    # same feature columns -- confirm.
    svd = TruncatedSVD(n_components=50)
    train_lsi = svd.fit_transform(train_idf)
    test_lsi = svd.transform(test_idf)

    logging.info("TFxIDF Matrices Transformed")

    logging.info("Size of Transformed Training Dataset: {0}".format(
        train_lsi.shape))
    logging.info("Size of Transformed Testing Dataset: {0}".format(
        test_lsi.shape))

    classifiers = [
        ('OneVsOneClassifier Naive Bayes', OneVsOneClassifier(GaussianNB())),
        ('OneVsOneClassifier SVM',
         OneVsOneClassifier(svm.SVC(kernel='linear'))),
        ('OneVsRestClassifier Naive Bayes', OneVsRestClassifier(GaussianNB())),
        ('OneVsRestClassifier SVM',
         OneVsRestClassifier(svm.SVC(kernel='linear'))),
    ]

    # perform classification
    for clf_n, clf in classifiers:
        logging.info("Training {0} Classifier ".format(clf_n))
        clf.fit(train_lsi, train.target)
        logging.info("Testing {0} Classifier".format(clf_n))
        test_predicted = clf.predict(test_lsi)
        utils.calculate_stats(test.target, test_predicted)
コード例 #10
0
def question_c():
    """Print per-category term statistics for the TFxICF analysis.

    All training documents of each category in
    ``train_full_set.target_names`` are merged into one string, the merged
    strings are vectorised with ``utils.remove_stop_words`` and three things
    are printed: the matrix shape, the largest stored value of every row and
    the column-wise sum of the matrix.
    """
    logging.info(
        "<Question C> Getting the significance and TFxICF representation")
    all_categories = train_full_set.target_names

    logging.info(
        "Store data from all docs of a certain category as entries in all_data_category"
    )
    all_docs_per_category = []
    for cat in all_categories:
        train_category = utils.fetch_data([cat])[0]
        # Every document is prefixed by a single space; join avoids the
        # quadratic cost of repeated string concatenation.
        all_docs_per_category.append(
            ''.join(' ' + doc for doc in train_category.data))

    logging.info("Now build frequency tables for each class")

    vectorized_newsgroups_train = utils.remove_stop_words(
        all_docs_per_category)

    print(vectorized_newsgroups_train.shape)

    # Largest stored value in each row (one row per category).
    max_term_freq_per_category = [
        max(vectorized_newsgroups_train[i].data)
        for i in range(vectorized_newsgroups_train.shape[0])
    ]

    # NOTE(review): despite the name this is the column-wise *sum* of the
    # matrix, not the number of categories containing each term -- confirm
    # which one is intended.
    category_count_per_term = vectorized_newsgroups_train.sum(axis=0)

    print(max_term_freq_per_category)
    print(category_count_per_term)
コード例 #11
0
    def c(self):
        """Print ten top-weighted terms for each of the first four ``allCat`` entries.

        The training documents of every category in ``allCat`` are merged
        into a single string, run through ``to_vec`` and ``to_tfidf``, and for
        rows 0-3 of the result the index of the current maximum is looked up
        ten times, zeroing the winner after each round.
        """
        print_question('c')

        # One merged pseudo-document per category; each post is prefixed by a
        # single space.
        merged_docs = []
        for category in allCat:
            posts = fetch_data([category], 'train').data
            merged_docs.append("".join(" " + post for post in posts))

        count_matrix = self.to_vec(merged_docs)
        tficf_train = self.to_tfidf(count_matrix)
        scratch = tficf_train.copy()  # entries are zeroed below
        vocabulary = self.vectorizer.get_feature_names()

        for row_idx in range(4):
            top_terms = []
            for _ in range(10):
                # Re-read the row every round so the zeroed winner is seen.
                best_idx = np.argmax(scratch[row_idx])
                top_terms.append(vocabulary[best_idx])
                scratch[row_idx, best_idx] = 0
            print('%s %s' % (allCat[row_idx], top_terms))
コード例 #12
0
def getPdfCOVID19():
    """Store one ``pdfData`` record per data row and report their ids.

    The first element returned by ``fetch_data()`` is treated as a table:
    row 0 supplies the column names, row 1 is skipped and every later row is
    turned into a ``{column: value}`` dict that is saved as a ``pdfData``
    record.

    Returns:
        ``{"status": "submitted", "uniqueId": [...]}`` with the ``unique_id``
        of every stored record (an empty list when there are no data rows).

    Raises:
        IndexError: if a data row has fewer cells than there are columns.
    """
    # Get data from Google Spreadsheets
    raw_data = fetch_data()[0]
    # Create a JSON from data.
    column_names = raw_data[0]
    final_data = [
        {col: row[idx] for idx, col in enumerate(column_names)}
        for row in raw_data[2:]
    ]
    mapping_id = "1_iE1D8Pvsq7SQMMjHWhOhidGvkXENiluq01RXvb3n5g"
    doc_template_id = "1g7EvvBPsMi2kXyg0am-iRZ72DJNZNtyUrRwKieXhWn0"
    application_id = "Covid19"
    username = "******"
    unique_ids = []
    # Bound before the loop so the return below cannot hit an unbound local
    # when the sheet contains no data rows.
    status = 'submitted'
    for data in final_data:
        json_data = pdfData(reqd_data=data,
                            var_mapping_id=mapping_id,
                            doc_template_id=doc_template_id,
                            form_id=None,
                            form_name=None,
                            instance_id=uuid.uuid4(),
                            user_name=username,
                            application_id=application_id,
                            form_submission_date=None)
        db.session.add(json_data)  # Adds the new record to the session
        # Push the object to the database so that it gets assigned a unique id
        db.session.flush()
        unique_ids.append(json_data.unique_id)
        db.session.commit()  # Commits all changes
    return {"status": status, "uniqueId": unique_ids}
コード例 #13
0
ファイル: nvdm.py プロジェクト: shshnk94/nvdm
def evaluate(model, training_data, training_count, session, step, train_loss=None, epoch=None, summaries=None, writer=None, saver=None):
  """Score the model on held-out data and append the scores to a CSV file.

  The first document halves (``*_h1.feat``) give the topic proportions
  ``theta``; the second halves (``*_h2.feat``) are the data whose perplexity
  is computed.  Test files are used when ``step == 'test'``, validation files
  otherwise.

  Args:
    model: object exposing the ``x``, ``mask`` and ``doc_vec`` tensors.
    training_data: training documents, used for coherence and topic usage.
    training_count: per-document counts matching ``training_data``.
    session: TensorFlow session used to run tensors and read variables.
    step: ``'val'`` saves a checkpoint; any other value (e.g. ``'test'``)
      prints the most used topics and writes ``topics.txt``.
    train_loss, summaries, writer: accepted for interface compatibility;
      not used by the current implementation.
    epoch: zero-based epoch index, required when ``step == 'val'``.
    saver: TensorFlow saver, required when ``step == 'val'``.

  Side effects:
    Appends ``perplexity,coherence,diversity`` to ``<ckpt>/<step>_scores.csv``
    (``ckpt`` is a module-level name defined elsewhere in the file).
  """
  is_test = step == 'test'

  # Get theta for the H1.
  data_url = os.path.join(FLAGS.data_dir, 'test_h1.feat' if is_test else 'valid_h1.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)

  theta = []
  for idx_batch in data_batches:
    data_batch, count_batch, mask = utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)
    input_feed = {model.x.name: data_batch, model.mask.name: mask}
    logit_theta = session.run(model.doc_vec, input_feed)
    theta.append(softmax(logit_theta, axis=1))
  theta = np.concatenate(theta, axis=0)

  # Topic-word distribution from the decoder projection variables.
  weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Matrix:0')[0].eval(session)
  bias = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder/projection/Bias:0')[0].eval(session)
  beta = softmax(weights + bias, axis=1)

  # H2 to calculate perplexity.
  data_url = os.path.join(FLAGS.data_dir, 'test_h2.feat' if is_test else 'valid_h2.feat')
  dataset, dataset_count = utils.data_set(data_url)
  data_batches = utils.create_batches(len(dataset), FLAGS.batch_size, shuffle=False)

  test_data = [utils.fetch_data(dataset, dataset_count, idx_batch, FLAGS.vocab_size)[0] for idx_batch in data_batches]
  test_data = np.concatenate(test_data, axis=0)

  perplexity = get_perplexity(test_data, theta, beta)
  # Coherence and diversity are only computed for the final test run.
  coherence = get_topic_coherence(beta, training_data, 'nvdm') if is_test else np.nan
  diversity = get_topic_diversity(beta, 'nvdm') if is_test else np.nan

  if step == 'val':
    save_path = saver.save(session, os.path.join(ckpt, 'model.ckpt'))
    # Report the path returned by the saver rather than the directory.
    print("Model saved in path: %s" % save_path)
    print('| Epoch dev: {:d} |'.format(epoch+1))

  else:
    ## get most used topics
    cnt = 0
    thetaWeightedAvg = np.zeros((1, FLAGS.n_topic))
    data_batches = utils.create_batches(len(training_data), FLAGS.batch_size, shuffle=False)

    for idx_batch in data_batches:
      batch, count_batch, mask = utils.fetch_data(training_data, training_count, idx_batch, FLAGS.vocab_size)
      sums = batch.sum(axis=1)
      cnt += sums.sum(axis=0)

      input_feed = {model.x.name: batch, model.mask.name: mask}
      logit_theta = session.run(model.doc_vec, input_feed)
      batch_theta = softmax(logit_theta, axis=1)
      # Weight every document's topic proportions by its row sum.
      weighed_theta = (batch_theta.T * sums).T
      thetaWeightedAvg += weighed_theta.sum(axis=0)

    thetaWeightedAvg = thetaWeightedAvg.squeeze() / cnt
    print('\nThe 10 most used topics are {}'.format(thetaWeightedAvg.argsort()[::-1][:10]))

    with open(FLAGS.data_dir + '/vocab.new', 'rb') as f:
      vocab = pkl.load(f)

    print('\n')

    with open(ckpt + '/topics.txt', 'w') as f:
      for k in range(FLAGS.n_topic):
        gamma = beta[k]
        # Take exactly n_words entries, highest weight first.  The previous
        # slice [-n_words+1:] returned one word too few (and the whole
        # vocabulary for n_words == 1).
        top_words = list(gamma.argsort()[-FLAGS.n_words:][::-1])
        topic_words = [vocab[a] for a in top_words]
        f.write(str(k) + ' ' + str(topic_words) + '\n')
        print('Topic {}: {}'.format(k, topic_words))

  with open(ckpt + '/' + step + '_scores.csv', 'a') as handle:
    handle.write(str(perplexity) + ',' + str(coherence) + ',' + str(diversity) + '\n')
コード例 #14
0
def get_weighted_scores(assignment,
                        sess=None,
                        ta_mean=TA_MEAN,
                        ta_stdev=TA_STDEV,
                        lambda_=TA_LAMBDA):
    ''' Compute one rescaled crowd-sourced score per graded student.

    Each peer score is normalised in three ways (against the reviewer's own
    scores, against all student scores, and via Box-Cox transforms), the
    results are averaged per submission and the per-submission averages go
    through a final Box-Cox transform.

    Args:
        assignment: assignment identifier passed to ``fetch_data``.
        sess: optional session object passed to ``fetch_data``.
        ta_mean: target mean of the rescaled scores.
        ta_stdev: target standard deviation of the rescaled scores.
        lambda_: lambda used for the inverse Box-Cox transform.

    Returns:
        Dict mapping the lower-cased "first last" student name to the final
        score rounded to two decimals.
    '''
    @ensure_matrix
    def normalize_transform(data):
        # Standardise, then rescale to the TA mean / stdev.
        mean = data.mean()
        std = data.std()
        return ((data - mean) / (std if std else 1)) * ta_stdev + ta_mean

    @ensure_matrix
    def bc_transform(data):
        # Box-Cox to normal, inverse Box-Cox with the TA lambda, then rescale.
        normal, _ = boxcox(data)
        transform = inv_boxcox(normal, lambda_)
        return normalize_transform(transform)

    def transform_group(score_dict):
        ''' Flatten dict of scores, transform, then regroup '''
        indexed = [[i, uid, score] for uid in score_dict
                   for i, score in enumerate(score_dict[uid])]
        idxs, scores = [ix[:2] for ix in indexed], [ix[2] for ix in indexed]
        if not idxs:
            print(
                'No student feedback has been given yet - cannot calculate weighted scores.'
            )
            return {}

        transformed = [i + [k] for i, k in zip(idxs, bc_transform(scores))]
        # groupby only merges adjacent items; entries of one UID are adjacent
        # because `indexed` was built UID by UID.
        transformed = {
            k: [v[2] for v in sorted(list(grp), key=lambda y: y[0])
                ]  # Sort by index and keep score
            for k, grp in groupby(transformed, lambda x: x[1])
        }  # Group by UID
        return transformed

    data = fetch_data(assignment, sess)

    # Pooled statistics over every truthy value whose key contains 'score'.
    st_score = [d[k] for d in data for k in d if d[k] and 'score' in k]
    st_mean = np.mean(st_score)
    st_stdev = np.std(st_score)
    # Same guard as for the per-student stdev below: identical scores would
    # otherwise cause a division by zero in the "together" normalisation.
    if st_stdev == 0:
        st_stdev = 1

    # Collate the scores each student gave
    st_score = dd(list)
    for d in data:
        for i in range(1, 5):
            uid = d['student_display_id_%i' % i]
            score = d['student_score_%i' % i]
            if uid and score:
                st_score[uid].append(score)

    # Get kurtosis/skewness transformed scores
    transformed = transform_group(st_score)

    # Calculate mean & stdev for each student
    stats = {}
    for s in st_score:
        if st_score[s]:
            bc = bc_transform(st_score[s])
            mean = np.mean(st_score[s])
            std = np.std(st_score[s])
        else:
            bc = [None] * len(st_score[s])
            mean = ta_mean
            std = 1

        # Lookup tables keyed by the raw score value.
        bc_ind = dict(zip(st_score[s], bc))
        bc_tog = dict(zip(st_score[s], transformed[s]))
        stats[s] = (mean, std, bc_ind, bc_tog)

    # Create new scores based on normalized student scores
    averaged = {}
    for d in data:
        scores = []
        name = d['First Name (Student)'].strip(
        ) + ' ' + d['Last Name (Student)'].strip()
        for i in range(1, 5):

            uid = d['student_display_id_%i' % i]
            score = d['student_score_%i' % i]
            if uid and score:
                mean, std, bc_ind, bc_tog = stats[uid]
                if std == 0: std = 1

                ind = ((score - mean) / std) * ta_stdev + ta_mean
                tog = ((score - st_mean) / st_stdev) * ta_stdev + ta_mean
                avg = (ind + tog) / 2.

                # Fall back to the plain average when the individual Box-Cox
                # value is falsy.
                bc_ind = bc_ind[score] if bc_ind[score] else avg
                bc_tog = bc_tog[score]

                final = np.mean([avg, bc_ind, bc_tog])
                scores.append(final)
        averaged[name.lower().strip()] = np.mean(scores) if scores else ta_mean

    # Final Box-Cox transform
    names = sorted(averaged.keys())
    transform = bc_transform([averaged[k] for k in names])
    averaged = {k: round(transform[i], 2) for i, k in enumerate(names)}

    return averaged
コード例 #15
0
def analyze_spreadsheet(assignment):
    ''' Analyze an assignment's scores after TA grading is completed.

    Prints distribution statistics for student and TA scores, derives several
    normalised variants of the peer scores, plots them against the TA scores,
    prints crowd-vs-TA differences per TA and overall, and finally writes
    ``assignments/<Assignment>/dataset.csv``.

    Args:
        assignment: (partial) assignment name; matched case-insensitively
            against the folders in ``./assignments/``.

    Returns:
        None.  Returns early when no TA score is present.
    '''
    from glob import glob
    from scipy.stats import ks_2samp, norm
    import matplotlib.pyplot as plt

    for folder in glob('./assignments/*/'):
        if assignment.lower() in folder.lower():
            folder = folder.replace('\\', '/')
            # NOTE(review): for './assignments/<name>/' index 1 of the split
            # is 'assignments', not '<name>' -- confirm whether index 2 was
            # intended.
            assignment = folder.split('/')[1]
            break

    data = fetch_data(assignment, overwrite=OVERWRITE)
    exclude = []
    # Keep only rows that have a TA score and whose TA is not excluded.
    data = [
        d for d in data
        if d['TA Name (First and Last)'] not in exclude and d['TA Score']
    ]
    TAs = sorted(set([d['TA Name (First and Last)'] for d in data]))
    print('Number of grading TAs: %i' % len(TAs))

    st_scores = [d[k] for d in data for k in d if d[k] and 'score' in k]
    print('\nStudent Statistics:')
    st_mean, st_stdev, st_skew, st_kurt = distribution_stats(st_scores)

    ta_scores = [d['TA Score'] for d in data]
    if not any(ta_scores):
        print('Remaining analysis can\'t be completed without TA scores.')
        return

    print('\nTA Statistics:')
    ta_mean, ta_stdev, ta_skew, ta_kurt = distribution_stats(ta_scores)

    # Flatten all peer scores (row index, score); falsy scores that are not
    # None are replaced by the TA mean.
    indexed_scores = [(i, d[k] if d[k] else ta_mean)
                      for i, d in enumerate(data)
                      for k in ['student_score_%i' % j for j in range(1, 5)]
                      if k in d and d[k] is not None]
    idxs, scores = [ix[0] for ix in indexed_scores
                    ], [ix[1] for ix in indexed_scores]
    ks_transform = ks_align(scores, ta_scores)
    ks_transform = zip(idxs, ks_transform)

    # Regroup the aligned scores into one list per consecutive row index.
    idx = 0
    curr = []
    transformed = []
    for i, score in ks_transform:
        if i == idx:
            curr.append(score)
        else:
            transformed.append(curr)
            curr = [score]
            idx = i
    if curr:
        transformed.append(curr)

    # Collate the scores each student gave
    st_score = dd(list)
    st_weight = dd(list)
    idx = inc = 0
    for d in data:
        scores = get_student_scores(d)
        for i in range(1, 5):
            uid = d['student_display_id_%i' % i]
            score = d['student_score_%i' % i]
            avg = np.mean(scores) if scores else ta_mean

            # A score of exactly 0 is replaced by the row average.
            if score == 0:
                d['student_score_%i' % i] = score = avg
            if uid and score:
                # Add student weights - signed difference from the mean, on average
                st_score[uid].append(score)
                st_weight[uid].append((avg - score))
                d['ks_Score_%i' % i] = transformed[idx][i - 1]
                if not i - 1: inc = 1
        idx += inc
        inc = 0

    # Make a copy for future comparison
    raw_data = [dict(d) for d in data]

    # Calculate mean & stdev for each student
    stats = {}
    weights = {}
    for s in st_score:
        if st_score[s]:
            ks = ks_align(st_score[s], ta_scores)
            mean = np.mean(st_score[s])
            std = np.std(st_score[s])
        else:
            ks = [None] * len(st_score[s])
            mean = ta_mean
            std = 1
        stats[s] = (mean, std, dict(zip(st_score[s], ks)))
        weights[s] = np.mean(st_weight[s])

    # Create new scores based on normalized student scores
    individual = []  # Normalize each student by using their three grades
    together = []  # Normalize scores based on overall student mean & stdev
    averaged = []  # Normalize based on average of individual and together
    for d in raw_data:
        individual.append(dict(d))
        together.append(dict(d))
        averaged.append(dict(d))

        for i in range(1, 5):

            uid = d['student_display_id_%i' % i]
            score = d['student_score_%i' % i]
            if uid and score:
                d['student_weight_%i' %
                  i] = weights[d['student_display_id_%i' % i]]

                mean, std, normed = stats[uid]
                if std == 0: std = 1

                ind = ((score - mean) / std) * ta_stdev + ta_mean
                tog = ((score - st_mean) / st_stdev) * ta_stdev + ta_mean
                avg = (ind + tog) / 2.

                ks_norm = normed[score]
                ks_norm2 = d['ks_Score_%i' % i]
                avg = np.mean([avg, ks_norm2, ks_norm if ks_norm else avg])

                individual[-1]['student_score_%i' % i] = ind
                together[-1]['student_score_%i' % i] = tog
                averaged[-1]['student_score_%i' % i] = avg

    D = averaged
    ta_scores = [
        d for d in D
        if any(k for k in ['student_score_%i' % j for j in range(1, 5)]
               if k in d and d[k] is not None)
    ]
    scores = [(i,
               np.mean([
                   d[k] for k in ['student_score_%i' % j for j in range(1, 5)]
                   if d[k]
               ])) for i, d in enumerate(ta_scores)]

    ta_scores = [(i, d['TA Score']) for i, d in enumerate(ta_scores)]
    ta_scores = sorted(ta_scores, key=lambda x: x[1])

    idxs = [t for t, _ in scores]
    # Align the averaged scores to the TA distribution, then rescale them to
    # the TA mean / stdev.
    scores = ks_align([t for _, t in scores], [t for _, t in ta_scores])
    scores = np.array(scores)
    scores = ((scores - scores.mean()) / scores.std()) * ta_stdev + ta_mean

    # Write the rescaled value back into every non-None score slot of a row.
    i = inc = 0
    for a in averaged:
        for k in ['student_score_%i' % j for j in range(1, 5)]:
            if k in a and a[k] is not None:
                a[k] = scores[i]
                inc = 1
        i += inc
        inc = 0

    D = averaged
    ta_scores = [
        d for d in D
        if any(k for k in ['student_score_%i' % j for j in range(1, 5)]
               if k in d and d[k] is not None)
    ]
    scores = dict([(i,
                    np.mean([
                        d[k]
                        for k in ['student_score_%i' % j for j in range(1, 5)]
                        if d[k]
                    ])) for i, d in enumerate(ta_scores)])

    # NOTE(review): this result is overwritten a few lines below before it is
    # plotted -- confirm which of the two score sets is meant to be shown.
    st_scores = get_weighted_scores(assignment)
    st_scores = [
        st_scores[(d['First Name (Student)'].strip() + ' ' +
                   d['Last Name (Student)'].strip()).lower()]
        for d in ta_scores
    ]

    ta_scores = [(i, d['TA Score']) for i, d in enumerate(ta_scores)]
    ta_scores = sorted(ta_scores, key=lambda x: x[1])

    # Order the crowd scores like the TA scores (ascending TA score).
    st_scores = [scores[j] for j, _ in ta_scores]
    plt.plot([t for _, t in ta_scores], label='TA Scores')
    plt.plot(st_scores, alpha=.5, label='Weighted Scores')

    x = range(len(st_scores))
    coef = np.polyfit(x, st_scores, 3)
    p = np.poly1d(coef)

    yhat = p(x)
    ybar = np.sum(st_scores) / float(len(st_scores))
    ssreg = np.sum((yhat - ybar)**2)
    sstot = np.sum((st_scores - ybar)**2)
    # Parenthesised: '%' binds tighter than '/', so the unparenthesised form
    # tried to divide the formatted string by sstot.
    print('R^2:%s' % (ssreg / sstot))
    ks = ks_2samp(st_scores, [t for _, t in ta_scores])
    s = ks.statistic
    pv = ks.pvalue

    x = np.linspace(0, len(st_scores), 100)
    plt.plot(x,
             p(x),
             label='Cubic fit (R^2=%.2f, KS-statistic=%.2f pval=%.2f)' %
             (ssreg / sstot, s, pv))
    coef = np.polyfit(range(len(st_scores)), st_scores, 1)
    p = np.poly1d(coef)

    plt.plot(x, p(x), 'k--', label='Linear fit')
    plt.legend(loc='lower center', fancybox=True, prop={'size': 9})
    plt.title(assignment.title())
    plt.show()

    all_data = [('Raw Score', raw_data), ('Averaged', averaged),
                ('-Individual', individual), ('-Together', together)]

    # Sanity check to ensure scores were properly normalized
    for name, data in all_data[1:]:
        data = [d[k] for d in data for k in d if d[k] and 'score' in k]
        mean = np.mean(data)
        stdev = np.std(data)
        assert(abs(mean - ta_mean) < 1 and abs(stdev - ta_stdev) < 1), \
                'Scores improperly normalized: %.2f & %.2f for %s' %(mean, stdev, name)

    # TA score minus the mean of the truthy peer scores (0 when there are none).
    safe_diff = lambda score, data: 0 if not data else score - np.mean(data)
    difference = lambda data: [
        safe_diff(d['TA Score'], [
            d['student_score_%i' % i] for i in range(1, 5)
            if d['student_score_%i' % i]
        ]) for d in data
    ]

    print('-----------------------------\n\nCrowd / TA score differences:')

    # Calculate the difference of crowd-sourced score from individual TA score
    for ta in TAs:
        ta_data = [
            d['TA Score'] for d in raw_data
            if d['TA Name (First and Last)'] == ta
        ]

        print('\n-----', ta, '-----')
        print('Overall mean & stdev:\t\t\t %.2f & %.2f' %
              (np.mean(ta_data), np.std(ta_data)))
        for label, data in all_data:
            ta_data = [d for d in data if d['TA Name (First and Last)'] == ta]
            diff = difference(ta_data)
            mu, sig = np.mean(np.abs(diff)), np.std(np.abs(diff))
            print(label, 'difference mean & stdev:\t %.2f & %.2f' % (mu, sig))

            # Plot best fit line for averaged
            if label == 'Averaged':
                hist, n = np.histogram(diff, 50)
                # scipy's normal pdf replaces matplotlib.mlab.normpdf, which
                # is no longer available in current matplotlib releases.
                plt.plot(n, norm.pdf(n, mu, sig), alpha=.7, label=ta)

    print('\n=========== Overall =============')

    # Calculate the overall difference of crowd-sourced score from all TA scores
    for label, data in all_data:
        diff = difference(data)
        mu, sig = np.mean(np.abs(diff)), np.std(np.abs(diff))
        print(label, 'difference mean & stdev:\t %.2f & %.2f' % (mu, sig))

        # Plot best fit line for averaged
        if label == 'Averaged':
            hist, n = np.histogram(diff, 50)
            plt.plot(n, norm.pdf(n, mu, sig), '--')
    plt.legend(loc='best', prop={'size': 7})
    plt.show()

    # Join all data into one cohesive set
    dataset = []
    for i, d in enumerate(raw_data):

        # All sets need same amount of features
        if not d['student_score_3']:
            continue

        features = []
        for j in range(1, 4):
            features.append(d['student_score_%i' % j])
            features.append(len(d['student_comment_%i' % j]))
            features.append(d['student_weight_%i' % j])

            for _, data in all_data[1:]:
                features.append(data[i]['student_score_%i' % j])
        features.append(d['TA Score'])
        dataset.append(features)

    with open('assignments/' + assignment.title() + '/dataset.csv', 'w+') as f:
        for d in dataset:
            f.write(','.join([str(v) for v in d]) + '\n')
コード例 #16
0
ファイル: main.py プロジェクト: abpalaciot/cb_apiv4
def organizations(request):
    """Fetch recently updated Organization entities and store them in MongoDB.

    Queries the ``searches/organizations`` endpoint for entities whose
    ``updated_at`` is greater than or equal to yesterday's date, stamps every
    non-empty entity with today's date under ``insert_date`` and inserts each
    page into the ``organization_entities`` collection.  The next page is
    requested by putting the ``uuid`` of the last entity of the current page
    into the query as ``after_id``.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        The ``jsonify`` response holding the entity name and the number of
        fetched records, or a plain error string when ``utils.fetch_data``
        returns ``None`` as the total count.
    """
    print("\n-------Getting Organizations entities-------\n")
    end_point = 'searches/organizations'

    yesterday_date = utils.get_yesterday_date()
    today_date = utils.get_today_date()
    collection_name = 'organization_entities'

    query = {
        "field_ids": [
            "acquirer_identifier", "aliases", "categories", "category_groups",
            "closed_on", "company_type", "contact_email", "created_at",
            "delisted_on", "demo_days", "description", "diversity_spotlights",
            "entity_def_id", "equity_funding_total", "exited_on", "facebook",
            "facet_ids", "founded_on", "founder_identifiers", "funding_stage",
            "funding_total", "funds_total", "hub_tags", "identifier",
            "image_id", "image_url", "investor_identifiers", "investor_stage",
            "investor_type", "ipo_status", "last_equity_funding_total",
            "last_equity_funding_type", "last_funding_at",
            "last_funding_total", "last_funding_type", "layout_id",
            "legal_name", "linkedin", "listed_stock_symbol",
            "location_group_identifiers", "location_identifiers", "name",
            "num_acquisitions", "num_alumni", "num_articles",
            "num_current_advisor_positions", "num_current_positions",
            "num_diversity_spotlight_investments", "num_employees_enum",
            "num_enrollments", "num_event_appearances", "num_exits",
            "num_exits_ipo", "num_founder_alumni", "num_founders",
            "num_funding_rounds", "num_funds", "num_investments",
            "num_investors", "num_lead_investments", "num_lead_investors",
            "num_past_positions", "num_portfolio_organizations",
            "num_sub_organizations", "operating_status", "override_layout_id",
            "owner_identifier", "permalink", "permalink_aliases",
            "phone_number", "program_application_deadline", "program_duration",
            "program_type", "rank_delta_d30", "rank_delta_d7",
            "rank_delta_d90", "rank_org", "rank_principal", "revenue_range",
            "school_method", "school_program", "school_type",
            "short_description", "status", "stock_exchange_symbol",
            "stock_symbol", "twitter", "updated_at", "uuid", "valuation",
            "valuation_date", "website", "website_url", "went_public_on"
        ],
        "order": [{
            "field_id": "rank_org",
            "sort": "asc"
        }],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(yesterday_date)]
            },
        ],
        "limit":
        1000
    }

    total_count, entities = utils.fetch_data(query, end_point)
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."

    print("total count: ", total_count)

    # get the organization collection
    org_col = utils.get_mongodb_collection(collection_name)

    fetch_records_count = 0

    while fetch_records_count < total_count:
        # The first page arrived together with the total count above.
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(query, end_point)

        if not entities:
            # ``entities`` can be None as well as empty, so len() must not be
            # applied to it directly.
            print("no entities left i.e., entities = %s. moving on." %
                  len(entities or []))
            break

        for e in entities:
            if e:
                e['insert_date'] = today_date
            else:
                print("Entity is empty: ", e)

        inserted = org_col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ",
              fetch_records_count)

        print("------------------------")

        # The uuid of the last record is the cursor for the next page.
        after_id = entities[-1].get('uuid', None)
        if not after_id:
            # Without a new cursor the next request would repeat this page
            # and its records would be inserted a second time.
            print("No uuid on the last entity; stopping pagination.")
            break
        print("Get next batch after id: ", after_id)
        query['after_id'] = after_id
        # Empty the processed batch before the next page is requested.
        entities.clear()

    msg = {
        'entity': 'Organization',
        'total_record_updated': fetch_records_count
    }
    return jsonify(msg)
コード例 #17
0
ファイル: main.py プロジェクト: abpalaciot/cb_apiv4
def press_references(request):
    """Fetch recently updated Press Reference entities and store them in MongoDB.

    Queries the ``searches/press_references`` endpoint for entities whose
    ``updated_at`` is greater than or equal to yesterday's date, stamps every
    non-empty entity with today's date under ``insert_date`` and inserts each
    page into the ``press_reference_entities`` collection.  The next page is
    requested by putting the ``uuid`` of the last entity of the current page
    into the query as ``after_id``.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        The ``jsonify`` response holding the entity name and the number of
        fetched records, or a plain error string when ``utils.fetch_data``
        returns ``None`` as the total count.
    """

    print("\n-------Getting Press References entities-------\n")
    collection_name = 'press_reference_entities'
    end_point = 'searches/press_references'
    today_date = utils.get_today_date()
    yesterday_date = utils.get_yesterday_date()

    query = {
        "field_ids": [
            "activity_entities", "author", "created_at", "entity_def_id",
            "identifier", "posted_on", "publisher", "thumbnail_url", "title",
            "updated_at", "url", "uuid"
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(yesterday_date)]
            },
        ],
        "order": [{
            "field_id": "updated_at",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit":
        1000,
    }

    total_count, entities = utils.fetch_data(query, end_point)
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."

    print("total count: ", total_count)

    # get the press_references collection
    col = utils.get_mongodb_collection(collection_name)

    fetch_records_count = 0

    # storing into the database and pagination
    while fetch_records_count < total_count:
        # The first page arrived together with the total count above.
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(query, end_point)

        if not entities:
            # ``entities`` can be None as well as empty, so len() must not be
            # applied to it directly.
            print("no entities left i.e., entities = %s. moving on." %
                  len(entities or []))
            break

        for e in entities:
            if e:
                e['insert_date'] = today_date
            else:
                print("Entity is empty: ", e)

        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ",
              fetch_records_count)

        print("------------------------")

        # The uuid of the last record is the cursor for the next page.
        after_id = entities[-1].get('uuid', None)
        if not after_id:
            # Without a new cursor the next request would repeat this page
            # and its records would be inserted a second time.
            print("No uuid on the last entity; stopping pagination.")
            break
        print("Get next batch after id: ", after_id)
        query['after_id'] = after_id
        # Empty the processed batch before the next page is requested.
        entities.clear()

    msg = {
        'entity': 'press_references',
        'total_record_updated': fetch_records_count
    }
    return jsonify(msg)
コード例 #18
0
ファイル: main.py プロジェクト: abpalaciot/cb_apiv4
def acquisitions(request):
    """Fetch recently updated Acquisition entities and store them in MongoDB.

    Queries the ``searches/acquisitions`` endpoint for entities whose
    ``updated_at`` is greater than or equal to yesterday's date, stamps every
    non-empty entity with today's date under ``insert_date`` and inserts each
    page into the ``acquisitions_entities`` collection.  The next page is
    requested by putting the ``uuid`` of the last entity of the current page
    into the query as ``after_id``.

    Args:
        request: Incoming request object; it is not read by this function.

    Returns:
        The ``jsonify`` response holding the entity name and the number of
        fetched records, or a plain error string when ``utils.fetch_data``
        returns ``None`` as the total count.
    """

    print("\n-------Getting Acquisitions entities-------\n")
    collection_name = 'acquisitions_entities'
    end_point = 'searches/acquisitions'
    today_date = utils.get_today_date()
    yesterday_date = utils.get_yesterday_date()

    query = {
        "field_ids": [
            "acquiree_categories", "acquiree_funding_total",
            "acquiree_identifier", "acquiree_last_funding_type",
            "acquiree_locations", "acquiree_num_funding_rounds",
            "acquiree_revenue_range", "acquiree_short_description",
            "acquirer_categories", "acquirer_funding_stage",
            "acquirer_funding_total", "acquirer_identifier",
            "acquirer_locations", "acquirer_num_funding_rounds",
            "acquirer_revenue_range", "acquirer_short_description",
            "acquisition_type", "announced_on", "completed_on", "created_at",
            "disposition_of_acquired", "entity_def_id", "identifier",
            "permalink", "price", "rank_acquisition", "short_description",
            "status", "terms", "updated_at", "uuid"
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(yesterday_date)]
            },
        ],
        "order": [{
            "field_id": "updated_at",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit":
        1000,
    }

    total_count, entities = utils.fetch_data(query, end_point)
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."

    print("total count: ", total_count)

    # get the acquisitions collection
    col = utils.get_mongodb_collection(collection_name)

    fetch_records_count = 0

    # storing into the database and pagination
    while fetch_records_count < total_count:
        # The first page arrived together with the total count above.
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(query, end_point)

        if not entities:
            # ``entities`` can be None as well as empty, so len() must not be
            # applied to it directly.
            print("no entities left i.e., entities = %s. moving on." %
                  len(entities or []))
            break

        for e in entities:
            if e:
                e['insert_date'] = today_date
            else:
                print("Entity is empty: ", e)

        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ",
              fetch_records_count)

        print("------------------------")

        # The uuid of the last record is the cursor for the next page.
        after_id = entities[-1].get('uuid', None)
        if not after_id:
            # Without a new cursor the next request would repeat this page
            # and its records would be inserted a second time.
            print("No uuid on the last entity; stopping pagination.")
            break
        print("Get next batch after id: ", after_id)
        query['after_id'] = after_id
        # Empty the processed batch before the next page is requested.
        entities.clear()

    msg = {
        'entity': 'acquisitions',
        'total_record_updated': fetch_records_count
    }
    return jsonify(msg)
コード例 #19
0
ファイル: app.py プロジェクト: tjanko13/stoltz_scottman
def plot(tickername):
    """Return an HTML line chart of ``Close`` over ``Date`` for a ticker.

    The data comes from ``fetch_data(tickername)``; its index is reset
    before plotting, and the resulting figure is converted with ``to_html``.
    """
    frame = fetch_data(tickername)
    frame = frame.reset_index()
    chart = px.line(frame, x='Date', y='Close')
    html = chart.to_html()
    return html
コード例 #20
0
def _nvctm_evaluate(sess, model, data_set, data_count, batches):
    """Evaluate ``model`` on ``batches`` without running an optimizer op.

    Returns:
        Tuple ``(perplexity, kld, variance)``: ``exp`` of the summed
        objective divided by the total word count, followed by the per-batch
        KLD and variance terms averaged over ``len(batches)``.
    """
    loss_sum = 0.0
    kld_sum = 0.0
    var_sum = 0.0
    word_count = 0
    for idx_batch in batches:
        data_batch, count_batch, mask = utils.fetch_data(
            data_set, data_count, idx_batch, FLAGS.vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask}
        loss, kld, v = sess.run(
            [model.objective, model.kld, model.variance], input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld) / np.sum(mask)
        var_sum += np.sum(v) / np.sum(mask)
        word_count += np.sum(count_batch)
    perplexity = np.exp(loss_sum / word_count)
    # Average over the batches that were actually evaluated.
    return perplexity, kld_sum / len(batches), var_sum / len(batches)


def train(sess,
          model,
          train_url,
          test_url,
          dev_url,
          model_url,
          batch_size,
          saver,
          training_epochs=400,
          alternate_epochs=1):
    """Train the nvctm model, alternating decoder and encoder updates.

    Every epoch runs ``alternate_epochs`` passes with ``model.optim_dec``
    followed by ``alternate_epochs`` passes with ``model.optim_enc`` over
    freshly shuffled training batches, then reports perplexity, stddev and
    KLD on the dev set and, when ``FLAGS.test`` is set, on the test set.

    Side effects: ``./nvctm_train_theta`` is rewritten after every training
    pass (it only holds data once the final encoder pass has run),
    ``./nvctm_train_beta`` is written and topic coherence printed every 50
    epochs, and the session is saved to ``model_url`` at the end.

    Args:
        sess: Session object whose ``run`` executes the model's ops.
        model: Model exposing the ops and placeholders used below.
        train_url: Location passed to ``utils.data_set`` for training data.
        test_url: Location passed to ``utils.data_set`` for test data.
        dev_url: Location passed to ``utils.data_set`` for dev data.
        model_url: Target passed to ``saver.save``.
        batch_size: Batch size passed to ``utils.create_batches``.
        saver: Object whose ``save(sess, model_url)`` is called at the end.
        training_epochs: Number of epochs.
        alternate_epochs: Passes per optimizer within one epoch.
    """
    train_set, train_count = utils.data_set(train_url)
    dev_set, dev_count = utils.data_set(dev_url)
    test_set, test_count = utils.data_set(test_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    train_theta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        is_last_epoch = epoch == training_epochs - 1
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                # True on the final encoder pass of the current epoch.
                is_final_pass = switch == 1 and i == alternate_epochs - 1
                loss_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                res_sum = 0
                log_sum = 0
                mean_sum = 0
                var_sum = 0

                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)
                    input_feed = {
                        model.x.name: data_batch,
                        model.mask.name: mask
                    }
                    # The fetch list is unchanged; values that are not
                    # reported are bound to underscore-prefixed names.
                    _, (loss, kld, _mean, _u_mean, _enc, rec_loss, log_s,
                        mean_s, _vk_show, theta, beta, _lp, v) = sess.run(
                            (optim, [
                                model.objective, model.kld, model.mean,
                                model.U, model.vk, model.recons_loss,
                                model.log_squre, model.mean_squre,
                                model.vk_show, model.theta, model.beta,
                                model.log_prob, model.variance
                            ]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    res_sum += np.sum(rec_loss)
                    log_sum += np.sum(log_s)
                    mean_sum += np.sum(mean_s)
                    var_sum += np.sum(v) / np.sum(mask)

                    # Collect theta only during the very last training pass.
                    if is_last_epoch and is_final_pass:
                        train_theta.extend(theta)

                print_ppx = np.exp(loss_sum / word_count)
                print_kld = kld_sum / len(train_batches)
                print_res = res_sum / len(train_batches)
                print_log = log_sum / len(train_batches)
                print_mean = mean_sum / len(train_batches)
                print_var = var_sum / len(train_batches)

                print(
                    '| Epoch train: {:d} |'.format(epoch + 1),
                    print_mode,
                    '{:d}'.format(i),
                    '| Corpus ppx: {:.5f}'.format(
                        print_ppx),  # perplexity per word
                    '| KLD: {:.5}'.format(print_kld),
                    '| stddev {:.5}'.format(print_var),
                    '| res_loss: {:5}'.format(print_res),
                    '| log_loss: {:5}'.format(print_log),
                    '| mean_loss: {:5}'.format(print_mean))

                # The with-statement closes the file; no explicit close().
                with codecs.open('./nvctm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)

                if (epoch + 1) % 50 == 0 and is_final_pass:
                    # ``beta`` is the value fetched for the last batch.
                    with codecs.open('./nvctm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    npmi.print_coherence('nvctm',
                                         FLAGS.data_dir + '/train.feat',
                                         FLAGS.vocab_size)

        # dev
        dev_ppx, dev_kld, dev_var = _nvctm_evaluate(
            sess, model, dev_set, dev_count, dev_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(dev_ppx),
              '| stddev {:.5}'.format(dev_var),
              '| KLD: {:.5}'.format(dev_kld))

        # test
        if FLAGS.test:
            test_ppx, test_kld, test_var = _nvctm_evaluate(
                sess, model, test_set, test_count, test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(test_ppx),
                  '| stddev {:.5}'.format(test_var),
                  '| KLD: {:.5}\n'.format(test_kld))
    npmi.print_coherence('nvctm', FLAGS.data_dir + '/train.feat',
                         FLAGS.vocab_size)
    saver.save(sess, model_url)
コード例 #21
0
#     )

# Sidebar: title, logo and the inputs that drive the chart.
st.sidebar.title("Simple Pattern Finder")
st.sidebar.image("img/pattern.png")

symbol = st.sidebar.text_input(label='Symbol', value='SPY')

# The start date defaults to 50 days before today.
today = date.today()
delta = timedelta(days=50)
start = today - delta

start_date = st.sidebar.date_input(label='From :', value=start)
end_date = st.sidebar.date_input(label='To :')

ticker_info = yf.Ticker(symbol)
df = utils.fetch_data(symbol, start_date=start_date, end_date=end_date)

try:
    st.title(ticker_info.info['shortName'])
except Exception:
    # Any failure to read the short name is reported as missing data.
    # ``Exception`` is caught rather than everything, so KeyboardInterrupt
    # and SystemExit are no longer swallowed here.
    st.error('No data found, symbol may be delisted')
    st.stop()

# Candlestick chart built from the Open/High/Low/Close columns of ``df``.
fig = go.Figure(data=[
    go.Candlestick(x=df.index,
                   open=df['Open'],
                   high=df['High'],
                   low=df['Low'],
                   close=df['Close'])
])
コード例 #22
0
def main(inp, method, training_size, epoch):
    """Train ``GNNpred`` on sampled data five times and save the metrics.

    The sampling function and its data source are looked up in
    ``sampling_methods[method]``.  For each of the five runs a new model is
    trained for ``int(epoch)`` epochs on a fresh sample and evaluated on the
    validation and test loaders after every epoch.  RMSE lists, the training
    losses and the validation/training accuracies are written with
    ``torch.save`` under ``path_results`` and ``path_saved_acc``.

    Args:
        inp: Not read by this function.
        method: Key into ``sampling_methods``; also the run-name prefix.
        training_size: Percentage handed (divided by 100) to the sampler.
        epoch: Number of training epochs per run.

    Returns:
        Tuple ``(loss, val_rmse, test_rmse, test_mae, n_params)`` taken from
        the last epoch of the last run.
    """
    sample_func, data = sampling_methods[method]

    # A '.pth' suffix means the entry still has to be loaded with torch.
    if data[-4:] == '.pth':
        data = torch.load(data)

    device = torch.device('cpu')
    batch_size = 128
    budget = epoch

    config = {
        'ndim': 250,
        'sdim': 56,
        'num_gnn_layers': 2,
        'g_aggr': 'gsum',
        'num_acc_layers': 4,
        'lr': 0.00001,
    }

    tic = time()
    test_dataset = fetch_data('data/test_data_20.pth')
    elapsed = round(time() - tic, 2)
    logging.info('Loaded test graphs in {} sec.'.format(elapsed))

    tic = time()
    val_dataset = fetch_data('data/validation_data_10.pth')
    elapsed = round(time() - tic, 2)
    logging.info('Loaded validation graphs model in {} sec.'.format(elapsed))

    test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)
    val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

    criterion = nn.MSELoss()

    ratio = np.round(training_size / 100, 2)
    run_name = '{}{}'.format(method, training_size)

    rmse_list = []
    all_loss = []
    best_rmse_list = []
    for run_idx in range(5):
        run_no = run_idx + 1

        logger.info('sampling')
        sampled_dataset = sample_func(ratio, data)
        train_loader = DataLoader(sampled_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)
        logger.info('start run {}_{} with {}% ({} graphs) '.format(
            run_name, run_no, training_size, len(sampled_dataset)))

        model = GNNpred(config['ndim'], config['sdim']).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])

        best_val = np.inf
        best_test = np.inf
        for ep in range(int(budget)):
            model.train()
            batch_losses = torch.Tensor().to(device)
            for graph_batch in train_loader:
                graph_batch = graph_batch.to(device)
                optimizer.zero_grad()
                output = model(graph_batch.edge_index,
                               graph_batch.node_atts,
                               graph_batch.batch.to(device))
                batch_loss = criterion(output.view(-1), graph_batch.acc)
                batch_losses = torch.cat(
                    [batch_losses, batch_loss.view(-1)])
                batch_loss.backward()
                optimizer.step()
            # Square root of the mean of the per-batch MSE values.
            loss = torch.sqrt(torch.mean(batch_losses)).item()
            all_loss.append(loss)

            logger.info('epoch {}:\tloss = {}'.format(ep, my_round(loss, 4)))

            val_rmse, _, val_acc = evaluate(model, val_loader, device)
            logger.info('epoch {}:\tval_rmse = {}'.format(
                ep, my_round(val_rmse, 4)))

            test_rmse, test_mae, _ = evaluate(model, test_loader, device)
            logger.info('epoch {}:\ttest_rmse = {}'.format(
                ep, my_round(test_rmse, 4)))
            logger.info('epoch {}:\ttest_mae = {}'.format(
                ep, my_round(test_mae, 4)))

            # Remember the test RMSE seen at the best validation RMSE.
            if val_rmse < best_val:
                best_val, best_test = val_rmse, test_rmse

            rmse_list.append(best_val)

        best_rmse_list.append(best_val)
        logger.info('step {}:\tbest_test = {}'.format(
            run_no, my_round(best_test, 4)))

        all_rmse_path = path_results + '/{}_all_rmse.pth'.format(run_name)
        torch.save(rmse_list, all_rmse_path)
        logger.info('Saved all validation rmse to {}'.format(path_results))

        best_rmse_path = path_results + '/{}_best_rmse.pth'.format(run_name)
        torch.save(best_rmse_list, best_rmse_path)
        logger.info('Saved best validation rmse of each run to {}'.format(
            path_results))

        loss_path = path_results + '/{}_loss.pth'.format(run_name)
        torch.save(all_loss, loss_path)
        logger.info('Saved trainings loss to {}'.format(path_results))

        val_acc_path = path_saved_acc + '/{}_val_acc.pth'.format(run_name)
        torch.save(val_acc, val_acc_path)
        logger.info(
            'Saved true and predicted accuracy of validation set to {}'.
            format(path_saved_acc))

        _, _, train_acc = evaluate(model, train_loader, device)
        train_acc_path = path_saved_acc + '/{}_train_acc.pth'.format(run_name)
        torch.save(train_acc, train_acc_path)
        logger.info(
            'Saved true and predicted accuracy of training set to {}'.
            format(path_saved_acc))

        logger.info('epoch {}:\ttest_rmse = {}'.format(
            ep, my_round(test_rmse, 4)))

    return loss, val_rmse, test_rmse, test_mae, model.number_of_parameters()
コード例 #23
0
ファイル: gsm.py プロジェクト: linkstrife/CR-GSM-NVCTM
def train(sess, model, train_url, test_url, dev_url, batch_size, training_epochs=1000, alternate_epochs=1):
    """Train the gsm model, alternating decoder and encoder updates.

    Every epoch runs ``alternate_epochs`` passes with ``model.optimize_dec``
    followed by ``alternate_epochs`` passes with ``model.optimize_enc`` over
    freshly shuffled training batches, then reports perplexity, KLD and
    stddev on the dev set and, when ``FLAGS.test`` is set, on the test set.

    Side effects: ``./gsm_train_theta`` is rewritten after every training
    pass, ``./gsm_train_beta`` is written and topic coherence printed every
    50 epochs, and after training ``./test_theta``, ``./test_beta``,
    ``./kld.txt`` and ``./var.txt`` are written.

    Args:
        sess: Session object whose ``run`` executes the model's ops.
        model: Model exposing the ops and placeholders used below.
        train_url: Location passed to ``utils.data_set`` for training data.
        test_url: Location passed to ``utils.data_set`` for test data.
        dev_url: Location passed to ``utils.data_set`` for dev data.
        batch_size: Batch size passed to ``utils.create_batches``.
        training_epochs: Number of epochs.
        alternate_epochs: Passes per optimizer within one epoch.
    """
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    dev_set, dev_count = utils.data_set(dev_url)

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

    kld_list = []
    var_list = []
    train_theta = []
    test_theta = []
    test_beta = []
    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
        is_last_epoch = epoch == training_epochs - 1
        # -------------------------------
        # train
        for switch in range(0, 2):
            if switch == 0:
                optimize = model.optimize_dec
                print_mode = 'updating decoder'
            elif switch == 1:
                optimize = model.optimize_enc
                print_mode = 'updating encoder'
            for i in range(alternate_epochs):
                # True on the final encoder pass of the current epoch.
                is_final_pass = switch == 1 and i == alternate_epochs - 1
                loss_sum = 0.0
                ppx_sum = 0.0
                kld_sum = 0.0
                word_count = 0
                doc_count = 0
                var_sum = 0
                for idx_batch in train_batches:
                    data_batch, count_batch, mask = utils.fetch_data(
                        train_set, train_count, idx_batch, FLAGS.vocab_size)

                    input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: True, model.gamma.name: epoch/training_epochs}
                    _, (loss, kld, v, theta, beta) =\
                        sess.run((optimize, [model.reconstruction_loss, model.kld, model.variance, model.topic_dist, model.beta]), input_feed)
                    loss_sum += np.sum(loss)
                    kld_sum += np.sum(kld) / np.sum(mask)
                    var_sum += np.sum(v) / np.sum(mask)
                    word_count += np.sum(count_batch)
                    # to avoid nan error
                    count_batch = np.add(count_batch, 1e-12)
                    # per document loss
                    ppx_sum += np.sum(np.divide(loss, count_batch))
                    doc_count += np.sum(mask)

                    # Collect theta only during the very last training pass.
                    if is_last_epoch and is_final_pass:
                        train_theta.extend(theta)

                print_ppx = np.exp(loss_sum / word_count)
                print_ppx_perdoc = np.exp(ppx_sum / doc_count)
                print_kld = kld_sum / len(train_batches)
                print_var = var_sum / len(train_batches)
                kld_list.append(print_kld)
                var_list.append(print_var)
                print('| Epoch train: {:d}'.format(epoch + 1),
                      print_mode, '{:d}'.format(i + 1),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
                      '| KLD: {:.5}'.format(print_kld),
                      '| stddev {:.5}'.format(print_var))

                # The with-statement closes the file; no explicit close().
                with codecs.open('./gsm_train_theta', 'wb') as fp:
                    pickle.dump(np.array(train_theta), fp)

                if (epoch + 1) % 50 == 0 and is_final_pass:
                    # ``beta`` is the value fetched for the last batch.
                    with codecs.open('./gsm_train_beta', 'wb') as fp:
                        pickle.dump(beta, fp)
                    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

        # -------------------------------
        # dev
        loss_sum = 0.0
        kld_sum = 0.0
        ppx_sum = 0.0
        word_count = 0
        doc_count = 0
        var_sum = 0
        for idx_batch in dev_batches:
            data_batch, count_batch, mask = utils.fetch_data(dev_set, dev_count, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
            loss, kld, v = sess.run([model.objective, model.kld, model.variance], input_feed)
            loss_sum += np.sum(loss)
            kld_sum += np.sum(kld) / np.sum(mask)
            word_count += np.sum(count_batch)
            count_batch = np.add(count_batch, 1e-12)
            ppx_sum += np.sum(np.divide(loss, count_batch))
            var_sum += np.sum(v) / np.sum(mask)
            doc_count += np.sum(mask)
        print_ppx = np.exp(loss_sum / word_count)
        print_ppx_perdoc = np.exp(ppx_sum / doc_count)
        print_kld = kld_sum / len(dev_batches)
        # Average over the dev batches the sum was accumulated on.
        print_var = var_sum / len(dev_batches)
        print('\n| Epoch dev: {:d}'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld),
              '| stddev: {:.5}'.format(print_var))

        # test
        if FLAGS.test:
            loss_sum = 0.0
            kld_sum = 0.0
            ppx_sum = 0.0
            word_count = 0
            doc_count = 0
            var_sum = 0
            # NOTE(review): test_theta/test_beta grow on every epoch, not
            # only the last one - confirm this is intended.
            for idx, idx_batch in enumerate(test_batches):
                data_batch, count_batch, mask = utils.fetch_data(
                    test_set, test_count, idx_batch, FLAGS.vocab_size)
                input_feed = {model.x.name: data_batch, model.mask.name: mask, model.is_training.name: False, model.gamma.name: 0}
                loss, kld, theta, beta, v = sess.run([model.objective, model.kld, model.topic_dist, model.beta, model.variance], input_feed)
                loss_sum += np.sum(loss)
                kld_sum += np.sum(kld) / np.sum(mask)
                # Accumulate the test variance so the line printed below
                # reports the test set instead of the dev value.
                var_sum += np.sum(v) / np.sum(mask)
                word_count += np.sum(count_batch)
                count_batch = np.add(count_batch, 1e-12)
                ppx_sum += np.sum(np.divide(loss, count_batch))
                doc_count += np.sum(mask)
                test_theta.extend(theta)
                if idx == len(test_batches) - 1:
                    test_beta.extend(beta)
            print_ppx = np.exp(loss_sum / word_count)
            print_ppx_perdoc = np.exp(ppx_sum / doc_count)
            print_kld = kld_sum / len(test_batches)
            print_var = var_sum / len(test_batches)
            print('| Epoch test: {:d}'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld),
                  '| stddev: {:.5}\n'.format(print_var))

    npmi.print_coherence('gsm', FLAGS.data_dir + '/train.feat', FLAGS.vocab_size)

    with codecs.open('./test_theta', 'wb') as fp:
        pickle.dump(test_theta, fp)

    with codecs.open('./test_beta', 'wb') as fp:
        pickle.dump(test_beta, fp)

    # One comma-separated line per file, without a trailing separator.
    with codecs.open('./kld.txt', 'w', 'utf-8') as fp:
        fp.write(', '.join(str(kld) for kld in kld_list))
    with codecs.open('./var.txt', 'w', 'utf-8') as fp:
        fp.write(', '.join(str(var) for var in var_list))
コード例 #24
0
        [estimator_model.evaluate],
        input_dependencies=[splitter_cv_external, transform_data_external],
        name='estimator_model_external',
        flatten_inputs=[True, False],
        parallel=parameters['parallel'])

    # Creating tree structure (for output/input flow)
    splitter_cv_external.set_children_tasks([compressor_external])
    compressor_external.set_children_tasks([transform_data_external])
    transform_data_external.set_children_tasks([estimator_model_external])
    logs.validate()

    try:
        logs.info("Fetching and preprocessing input data...")
        stimuli_representations_paths, fMRI_paths = fetch_data(
            parameters['path_to_fmridata'], input_path, subject,
            parameters['language'], parameters['models'])
        stimuli_representations = transformer.process_representations(
            stimuli_representations_paths, parameters['models'])
        fMRI_data = transformer.process_fmri_data(
            fMRI_paths, masker, parameters['add_noise_to_constant'])
        logs.validate()

        logs.info("Executing pipeline...", end='\n')
        pipeline = Pipeline()
        pipeline.fit(
            splitter_cv_external,
            logs)  # retrieve the flow from children and input_dependencies
        maps = pipeline.compute(stimuli_representations,
                                fMRI_data,
                                output_path,
コード例 #25
0
def train(sess, model, 
          train_url, 
          test_url, 
          batch_size, 
          vocab_size,
          training_epochs=200, 
          alternate_epochs=1,#10
          lexicon=None,
          result_file='test.txt',
          B=1,
          warm_up_period=100):
  """Train the model until the dev-set analytical perplexity stops improving.

  The first 10% of the training documents are held out as a development set.
  After every epoch the model is scored on that split; whenever the analytical
  perplexity improves, a checkpoint is written to
  'models/improved_model_bernoulli'. After 30 consecutive epochs without
  improvement the stored checkpoint is restored and the loop ends. When
  FLAGS.test is set, the test set is scored after every epoch as well, and
  topic coherence is printed once on the final epoch.

  Args:
    sess: TensorFlow session used to run the model.
    model: model object; this function reads its placeholders (x, mask,
      keep_prob, warm_up, min_alpha, B), its optimizers and its loss tensors.
    train_url: location of the training data, passed to utils.data_set.
    test_url: location of the test data, passed to utils.data_set.
    batch_size: number of documents per batch.
    vocab_size: vocabulary size passed to utils.fetch_data.
    training_epochs: only used to scale the (currently disabled) min_alpha
      schedule; the loop itself runs until early stopping triggers.
    alternate_epochs: number of passes over the training batches per epoch.
    lexicon: word list handed to utils.print_top_words and
      utils.topic_coherence; defaults to an empty list.
    result_file: unused by this function.
    B: value fed to model.B.
    warm_up_period: the value fed to model.warm_up grows by 1/warm_up_period
      per epoch until it reaches 1.
  """
  # Avoid a shared mutable default argument.
  lexicon = [] if lexicon is None else lexicon

  train_set, train_count = utils.data_set(train_url)
  test_set, test_count = utils.data_set(test_url)
  # hold-out development dataset: first 10% of the training documents
  train_size=len(train_set)
  validation_size=int(train_size*0.1)
  dev_set = train_set[:validation_size]
  dev_count = train_count[:validation_size]
  train_set = train_set[validation_size:]
  train_count = train_count[validation_size:]
  print('sizes',train_size,validation_size,len(dev_set),len(train_set))
  optimize_jointly = True
  dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
  test_batches = utils.create_batches(len(test_set), batch_size, shuffle=False)

  warm_up = 0
  start_min_alpha = 0.00001
  min_alpha = start_min_alpha
  warm_up_alpha=False
  curr_B=B

  # Build the Saver once; constructing a new one for every save/restore adds
  # another set of save/restore ops to the graph each time.
  saver = tf.train.Saver()
  checkpoint_path = 'models/improved_model_bernoulli'

  #for early stopping
  best_print_ana_ppx=1e10
  early_stopping_iters=30
  no_improvement_iters=0
  stopped=False
  epoch=-1
  while not stopped:
    epoch+=1
    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)
    if warm_up<1.:
      warm_up += 1./warm_up_period
    else:
      warm_up=1.
   
    # train
    if optimize_jointly:
      optim = model.optim_all
      print_mode = 'updating encoder and decoder'
    # NOTE: `switch` is not defined in this function. The two branches below
    # are unreachable while optimize_jointly is True; enabling them requires
    # restoring an outer `for switch in range(0, 2)` loop.
    elif switch == 0:
      optim = model.optim_dec
      print_mode = 'updating decoder'
    else:
      optim = model.optim_enc
      print_mode = 'updating encoder'
    for i in range(alternate_epochs):
      loss_sum = 0.0
      ana_loss_sum = 0.0
      ppx_sum = 0.0
      kld_sum = 0.0
      ana_kld_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum=0.0
      for idx_batch in train_batches:
        data_batch, count_batch, mask = utils.fetch_data(
        train_set, train_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 0.75,model.warm_up.name: warm_up,model.min_alpha.name:min_alpha,model.B.name: curr_B}
        _, (loss,recon, kld,ana_loss,ana_kld) = sess.run((optim, 
                                    [model.true_objective, model.recons_loss, model.kld,model.analytical_objective,model.analytical_kld]),
                                    input_feed)
        loss_sum += np.sum(loss)
        ana_loss_sum += np.sum(ana_loss)
        kld_sum += np.sum(kld) / np.sum(mask) 
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        # to avoid nan error
        count_batch = np.add(count_batch, 1e-12)
        # per document loss
        ppx_sum += np.sum(np.divide(loss, count_batch)) 
        doc_count += np.sum(mask)
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(train_batches)
      # The first decoder variable is printed as the topic-word matrix.
      dec_vars = utils.variable_parser(tf.trainable_variables(), 'decoder')
      phi = dec_vars[0]
      phi = sess.run(phi)
      utils.print_top_words(phi, lexicon,result_file=None)
      print_ppx = np.exp(loss_sum / word_count)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(train_batches)
      print_ana_kld = ana_kld_sum/len(train_batches)
      

      print('| Epoch train: {:d} |'.format(epoch+1), 
               print_mode, '{:d}'.format(i),
               '| Corpus ppx: {:.5f}'.format(print_ppx),  # perplexity for all docs
               '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),  # perplexity for per doc
               '| KLD: {:.5}'.format(print_kld),
               '| Loss: {:.5}'.format(print_loss),
               '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld))
    # Optional min_alpha schedule; disabled because warm_up_alpha is False.
    if warm_up_alpha:
      if min_alpha>0.0001:
        min_alpha-=(start_min_alpha-0.0001)/training_epochs
    #-------------------------------
    # dev
    loss_sum = 0.0
    kld_sum = 0.0
    ppx_sum = 0.0
    word_count = 0
    doc_count = 0
    recon_sum=0.0
    print_ana_ppx = 0.0
    ana_loss_sum = 0.0
    for idx_batch in dev_batches:
      data_batch, count_batch, mask = utils.fetch_data(
          dev_set, dev_count, idx_batch, vocab_size)
      # Evaluation feeds keep_prob=1.0 and warm_up=1.0.
      input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}
      # On the dev split the reported KLD is the analytical one.
      loss,recon, kld,ana_loss = sess.run([model.objective, model.recons_loss, model.analytical_kld,model.analytical_objective],
                           input_feed)
      loss_sum += np.sum(loss)
      ana_loss_sum += np.sum(ana_loss)
      kld_sum += np.sum(kld) / np.sum(mask)  
      word_count += np.sum(count_batch)
      count_batch = np.add(count_batch, 1e-12)
      ppx_sum += np.sum(np.divide(loss, count_batch))
      doc_count += np.sum(mask) 
      recon_sum+=np.sum(recon)
    print_ana_ppx = np.exp(ana_loss_sum / word_count)
    print_ppx = np.exp(loss_sum / word_count)
    print_ppx_perdoc = np.exp(ppx_sum / doc_count)
    print_kld = kld_sum/len(dev_batches)
    print_loss = recon_sum/len(dev_batches)
    if print_ana_ppx<best_print_ana_ppx:
      no_improvement_iters=0
      best_print_ana_ppx=print_ana_ppx
      # dev perplexity improved -> save the improved model
      saver.save(sess, checkpoint_path) 
    else:
      no_improvement_iters+=1
      print('no_improvement_iters',no_improvement_iters,'best ppx',best_print_ana_ppx)
      if no_improvement_iters>=early_stopping_iters:
          # no improvement for early_stopping_iters epochs: stop training
          stopped=True
          print('stop training after',epoch,'iterations,no_improvement_iters',no_improvement_iters)
          # reload the best stored model before the final test pass
          print('load stored model')
          saver.restore(sess,checkpoint_path)
          
    print('| Epoch dev: {:d} |'.format(epoch+1), 
           '| Perplexity: {:.9f}'.format(print_ppx),
           '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
           '| KLD: {:.5}'.format(print_kld)  ,
           '| Loss: {:.5}'.format(print_loss))  
    #-------------------------------
    # test
    if FLAGS.test:
      if stopped:
        # only do it once in the end; phi is the matrix fetched in the last
        # training pass of this epoch
        coherence=utils.topic_coherence(test_set,phi, lexicon)
        print('topic coherence',str(coherence))
      loss_sum = 0.0
      kld_sum = 0.0
      ppx_sum = 0.0
      word_count = 0
      doc_count = 0
      recon_sum = 0.0
      ana_loss_sum = 0.0
      ana_kld_sum = 0.0
      for idx_batch in test_batches:
        data_batch, count_batch, mask = utils.fetch_data(
          test_set, test_count, idx_batch, vocab_size)
        input_feed = {model.x.name: data_batch, model.mask.name: mask,model.keep_prob.name: 1.0,model.warm_up.name: 1.0,model.min_alpha.name:min_alpha,model.B.name: B}
        loss, recon,kld,ana_loss,ana_kld = sess.run([model.objective, model.recons_loss,model.kld,model.analytical_objective,model.analytical_kld],
                             input_feed)
        loss_sum += np.sum(loss)
        kld_sum += np.sum(kld)/np.sum(mask) 
        ana_loss_sum += np.sum(ana_loss)
        ana_kld_sum += np.sum(ana_kld) / np.sum(mask)
        word_count += np.sum(count_batch)
        count_batch = np.add(count_batch, 1e-12)
        ppx_sum += np.sum(np.divide(loss, count_batch))
        doc_count += np.sum(mask) 
        recon_sum+=np.sum(recon)
      print_loss = recon_sum/len(test_batches)
      print_ppx = np.exp(loss_sum / word_count)
      print_ppx_perdoc = np.exp(ppx_sum / doc_count)
      print_kld = kld_sum/len(test_batches)
      print_ana_ppx = np.exp(ana_loss_sum / word_count)
      # Average over the test batches (was divided by len(train_batches)).
      print_ana_kld = ana_kld_sum/len(test_batches)
      print('| Epoch test: {:d} |'.format(epoch+1), 
             '| Perplexity: {:.9f}'.format(print_ppx),
             '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
             '| KLD: {:.5}'.format(print_kld),
             '| Loss: {:.5}'.format(print_loss),
             '| ppx anal.: {:.5f}'.format(print_ana_ppx),
               '|KLD anal.: {:.5f}'.format(print_ana_kld)) 
コード例 #26
0
ファイル: app.py プロジェクト: tjanko13/stoltz_scottman
def data(tickername):
    """Fetch the data for *tickername* and return it rendered by to_html().

    The fetched object's index is reset before rendering.
    """
    fetched = fetch_data(tickername)
    flattened = fetched.reset_index()
    return flattened.to_html()
コード例 #27
0
ファイル: aggregator.py プロジェクト: flaxter/CityRank
def aggregation_as_json(request, selected_sources, source_weights, debugging = False, formfields = None, current_item_rank = None, include_all_sources = True, quick = True, compare1 = None, compare2 = None):
	if debugging:
		print "FOOBAR!!! aggregation_as_json!"
		print "selected_sources", selected_sources

	schema = [('Item', 'string'), ('Rank', 'number'), ('Latitude', 'number'), ('Longitude', 'number')]
	order = ['Item', 'Rank', 'Latitude', 'Longitude']

	if include_all_sources: 
		for source in selected_sources:
			name = 'rank' + str(source.id)
			schema.append((name, 'string'))
			order.append(name)

	schema.append(('index', 'number'))
	order.append('index')

	selected_sources_count = len(selected_sources)
	print "selected_sources_count = ", selected_sources_count

	if selected_sources_count == 0:
		table = DataTableYUI(schema, None)
		json_table = table.ToJSonYUI(columns_order=order, order_by=(order[1], "desc"), include_index=True)
		return json_table, None

	print "starting utils.fetch_Data!!!!!!!!!!!!!!!!!!!!!!!!!"
	start = time()
	table, rows, items, display, middle = utils.fetch_data(selected_sources)
	totTime = time() - start
	print "utils.fetch_data TOTAL TIME:", totTime

	using_old_ranking = False

	numitems = len(items)
	if 'old_ranking_values' in request.session and len(request.session['old_ranking_values']) == numitems and 'old_ranking' in request.session and len(request.session['old_ranking']) == numitems:
		schema.insert(2, ('Previous', 'string'))
		order.insert(2, 'Previous')
		using_old_ranking = True
	
	# nothing to aggregate, just zero or one data source
		
	pairs = numitems * (numitems - 1) / 2
	weights = [source_weights[source] for source in selected_sources]

	start = time()
	print "starting utils.process!!!!!!!!!!!!!!!!!!!!!!!!!"
	p0, Ybar, num_comparisons = utils.process(table, rows, selected_sources_count, weights, pairs)
	totTime = time() - start
	print "utils.process TOTAL TIME:", totTime
	final_wt = [scipy.sqrt(n) for n in num_comparisons]

	if debugging:
		print "To be passed into the C++ code"
		print "p = ", p0
		print "Ybar = ", Ybar
		print "wt = ", final_wt
		print "numitems = ", numitems
		print "numpairs = ", len(num_comparisons)

	if using_old_ranking and False:
		p0 = request.session['old_ranking_values']

	def ij_to_index(i, j):
		return i * (i - 1) / 2 + j

	compare1 = 10
	compare2 = 5
	# compare items indexed compare1 and compare2
	if debugging and compare1 and compare2 and compare2 < compare1:
		index = ij_to_index(compare1, compare2)
		#print "Comparing", items[compare1], items[compare2], " - ", Ybar[index]

	before = zip(p0, items)
	before.sort(key=lambda x: x[0], reverse=True)	

	start = time()
	print "starting!"
	p1 = [0.0] * numitems
	leastsq.optimize(numitems, pairs, p0, Ybar, final_wt, p1)
		
	totTime = time() - start
	print "\n\n### TOTAL TIME2:", totTime

	if compare1 and compare2:
		pass #print "Re-comparing", items[compare1], items[compare2], " - ", p1[compare1] - p1[compare2]

	pairwise = after = None
	if debugging: 
		pairwise = [[None] * numitems for i in range(numitems)]
		after = [[None] * numitems for i in range(numitems)]

		n = 0
		for i in range(numitems):
			after[i][i] = pairwise[i][i] = [0, '']
		for i in range(1, numitems):
			for j in range(i):
				pref1 = pref2 = ''
				if(Ybar[n] < 0): # negative value indicates i is preferred to j, set class for the td element so it is colored accordingly
					pref1 = 'row'
					pref2 = 'column'
				elif Ybar[n] > 0:
					pref1 = 'column'
					pref2 = 'row'

				pairwise[i][j] = [Ybar[n], pref1]
				pairwise[j][i] = [-1 * Ybar[n], pref2]

				if(p1[j] - p1[i] < 0): 
					pref1 = 'row'
					pref2 = 'column'
				else:
					pref1 = 'column'
					pref2 = 'row'

				after[i][j] = [p1[j] - p1[i], pref1]
				after[j][i] = [p1[i] - p1[j], pref2]
				n += 1

	assert len(p1) == numitems

	min_value = min(p1)
	values_range = max(p1) - min_value
	scalar = 100.0 / values_range

	# since there's a fair amount of error in our optimization algorithm
	# round to the nearest decimal so as not to have fake rankings appear
	p1 = [round((x - min_value) * scalar, 1) for x in p1]
	if debugging:
		print "Check against C++: ", p1

#	p1 = [round(x, 1) for x in p1]

	ranked = zip(p1, items)
	ranked.sort(key=lambda x: x[0], reverse=True)	

	request.session['current_ranking'] = list(map(None, *ranked)[1])

	i = -1
	if current_item_rank: # find out item's rank
		i = items.index(current_item_rank[0])	
		current_item = items[i] # "<span class=\"current_item\">%s</span>" % items[i]
#		items[i] = current_item

	if current_item_rank: # find out item's rank
		current_item_rank[1] = ranked.index((p1[i], current_item))

	if include_all_sources:
		for i in range(len(table)): # INEFFICIENT!!!
			for j in range(len(table[i])):
				if table[i][j]:
					if True: #table[i][j].scaled_value > 0:
						table[i][j] = "<span class=\"table-rank\">%d</span><span class=\"table-suffix\">%s</span> <span class=\"table-score\">%.02f</span>" % (table[i][j].rank, utils.rank_suffix(table[i][j].rank), table[i][j].scaled_value)
					else:
						table[i][j] = "<span class=\"table-rank\">%d</span><span class=\"table-suffix\">%s</span> <span class=\"table-score\">%.01f</span>" % (table[i][j].rank, utils.rank_suffix(table[i][j].rank), table[i][j].scaled_value)
				else:
					table[i][j] = None #(1000, '')


		latitude = [item.lat for item in items]
		longitude = [item.long for item in items]

		if using_old_ranking:
			ranked3 = zip(request.session['old_ranking'], p1) # OMG a terrible hack!
			ranked3.sort(key=lambda x: x[1], reverse=False)
			old_ranking = [r[0] for r in ranked3]
			item_names = [item.name for item in items]

			ranked2 = zip(item_names, p1, old_ranking, latitude, longitude)
		else:
			ranked2 = zip(items, p1, latitude, longitude)

		ranked2 = [a + tuple(d) for a, d in zip(ranked2, table)]
	else:
		ranked2 = zip(items, p1)

	ranked2.sort(key=lambda x: x[1], reverse=False)	
	
	request.session['old_ranking_values'] = p1
	request.session['old_ranking'] = [r[0] for r in ranked2]

	table = DataTableYUI(schema, ranked2)
	json_table = table.ToJSonYUI(columns_order=order, order_by=(order[1], "desc"), include_index=True)

#	for r, b in zip(ranked[:15], before[:15]):
#		print b, r

	# map(None, a, b) is a confusing way of zip(a,b), but only if len(a) == len(b)
	# otherwise zip truncates one of the lists (http://docs.python.org/library/functions.html#zip)
	# whereas map puts None's in instead

	return json_table, items, ranked, display, pairwise, after
コード例 #28
0
ファイル: nvdm.py プロジェクト: shshnk94/nvdm
def train(sess, model, train_url, batch_size, training_epochs=1000, alternate_epochs=10):
  """Run alternating decoder/encoder training and evaluate after every epoch.

  Each epoch reshuffles the training batches, makes `alternate_epochs` passes
  with the decoder optimizer followed by `alternate_epochs` passes with the
  encoder optimizer, then calls evaluate() and prints how the process'
  resident memory changed.
  """
  train_set, train_count = utils.data_set(train_url)

  # Summaries and the summary writer are switched off; evaluate() gets None.
  summaries = None
  writer = None
  saver = tf.train.Saver()

  # Freeze the graph before the training loop starts.
  sess.graph.finalize()

  cumulative_mem = 0
  previous_mem = 0

  for epoch in range(training_epochs):
    train_batches = utils.create_batches(len(train_set), batch_size, shuffle=True)

    for switch in range(0, 2):
      # switch 0 -> decoder step, switch 1 -> encoder step
      optim, print_mode = (
          (model.optim_dec, 'updating decoder') if switch == 0
          else (model.optim_enc, 'updating encoder'))

      for i in range(alternate_epochs):
        loss_total = 0.0
        ppx_total = 0.0
        kld_total = 0.0
        n_words = 0
        n_docs = 0

        for idx_batch in train_batches:
          data_batch, count_batch, mask = utils.fetch_data(
              train_set, train_count, idx_batch, FLAGS.vocab_size)
          input_feed = {model.x.name: data_batch, model.mask.name: mask}
          fetches = (optim, [model.objective, model.kld])
          _, (loss, kld) = sess.run(fetches, input_feed)

          loss_total += np.sum(loss)
          kld_total += np.sum(kld) / np.sum(mask)
          n_words += np.sum(count_batch)
          # tiny offset keeps the per-document division free of nan
          count_batch = np.add(count_batch, 1e-12)
          per_doc_loss = np.divide(loss, count_batch)
          ppx_total += np.sum(per_doc_loss)
          n_docs += np.sum(mask)

        corpus_ppx = np.exp(loss_total / n_words)
        per_doc_ppx = np.exp(ppx_total / n_docs)
        mean_kld = kld_total / len(train_batches)
        print('| Epoch train: {:d} |'.format(epoch+1),
              print_mode, '{:d}'.format(i),
              '| Corpus ppx: {:.5f}'.format(corpus_ppx),
              '| Per doc ppx: {:.5f}'.format(per_doc_ppx),
              '| KLD: {:.5}'.format(mean_kld))

    # Uses the accumulators left over from the last training pass.
    evaluate(model, train_set, train_count, sess, 'val',
             (loss_total + kld_total), epoch, summaries, writer, saver)

    # Resident set size in MB and its change since the previous epoch.
    current_mem = process.memory_info().rss / (1024 ** 2)
    mem_delta = current_mem - previous_mem
    cumulative_mem += mem_delta
    print("Memory increase: {}, Cumulative memory: {}, and current {} in MB".format(mem_delta, cumulative_mem, current_mem))
    previous_mem = current_mem
    gc.collect()
コード例 #29
0
ファイル: main.py プロジェクト: abpalaciot/cb_apiv4
def funding_rounds(request):
    """
        To fetch and update the Funding Rounds entity.

        Queries the 'searches/funding_rounds' endpoint for entities whose
        updated_at is on or after yesterday's date, stamps each entity with
        today's date under 'insert_date' and inserts the pages into the
        'funding_rounds_entities' MongoDB collection. Paging continues, using
        the uuid of the last entity of a page as 'after_id', until the
        reported total has been fetched or a page comes back empty.

        `request` is not used by this function.

        Returns an error string when the first response could not be parsed
        (total count is None), otherwise jsonify() of a summary dict holding
        the number of fetched records.
    """

    print("\n-------Getting Funding Rounds entities-------\n")

    COLLECTION_NAME = 'funding_rounds_entities'
    END_POINT = 'searches/funding_rounds'
    TODAY_DATE = utils.get_today_date()
    YESTURDAY_DATE = utils.get_yesterday_date()

    QUERY = {
        "field_ids": [
            "announced_on",
            "closed_on",
            "created_at",
            "entity_def_id",
            "funded_organization_categories",
            "funded_organization_description",
            "funded_organization_diversity_spotlights",
            "funded_organization_funding_stage",
            "funded_organization_funding_total",
            "funded_organization_identifier",
            "funded_organization_location",
            "funded_organization_revenue_range",
            "identifier",
            "image_id",
            "investment_stage",
            "investment_type",
            "investor_identifiers",
            "is_equity",
            "lead_investor_identifiers",
            "money_raised",
            "name",
            "num_investors",
            "num_partners",
            "permalink",
            "post_money_valuation",
            "pre_money_valuation",
            "rank_funding_round",
            "short_description",
            "target_money_raised",
            "updated_at",
            "uuid",
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(YESTURDAY_DATE)]
            },
        ],
        "order": [{
            "field_id": "updated_at",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit":
        1000,
    }

    total_count, entities = utils.fetch_data(QUERY, END_POINT)

    # TODO to add this to all of the functions
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."

    print("total count: ", total_count)

    # get the funding rounds collection
    col = utils.get_mongodb_collection(COLLECTION_NAME)

    fetch_records_count = 0

    # storing into the database and pagination
    while fetch_records_count < total_count:
        # The first page was already fetched above; fetch again only for
        # the following pages.
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(QUERY, END_POINT)

        if not entities:
            # `entities` may be None as well as empty, so len() must not be
            # called on it directly.
            print("no entities left i.e., entities = %s. moving on." %
                  len(entities or []))
            break

        for e in entities:
            if e:
                e['insert_date'] = TODAY_DATE
            else:
                print("Entity is empty: ", e)

        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ",
              fetch_records_count)

        print("------------------------")

        # get the last record; its uuid is the cursor for the next page
        last_entity = entities[-1]
        after_id = last_entity.get('uuid', None) if last_entity else None
        if not after_id:
            # Without a cursor the unchanged query would be sent again and
            # the same page would be inserted a second time.
            print("Last entity has no uuid; cannot request the next batch.")
            break
        print("Get next batch after id: ", after_id)
        QUERY['after_id'] = after_id
        entities.clear()

    msg = {
        'entity': 'funding_rounds',
        'total_record_updated': fetch_records_count
    }
    return jsonify(msg)
コード例 #30
0
ファイル: nvdm.py プロジェクト: md-experiments/nvdm
def train(sess,
          model,
          train_url,
          test_url,
          batch_size,
          training_epochs=1000,
          alternate_epochs=10):
    """Train the NVDM model, reporting dev (and optionally test) metrics.

    Every epoch alternates `alternate_epochs` decoder passes with
    `alternate_epochs` encoder passes over freshly shuffled training batches,
    then scores the development split and, when FLAGS.test is set, the test
    set. The development split is the first 50 documents of the test data.
    """
    train_set, train_count = utils.data_set(train_url)
    test_set, test_count = utils.data_set(test_url)
    # development split: leading 50 test documents
    dev_set, dev_count = test_set[:50], test_count[:50]

    dev_batches = utils.create_batches(len(dev_set), batch_size, shuffle=False)
    test_batches = utils.create_batches(len(test_set),
                                        batch_size,
                                        shuffle=False)

    def run_pass(batches, docs, counts, optim=None):
        """Run one pass over `batches`; return (ppx, per-doc ppx, mean KLD).

        When `optim` is given it is run together with the loss tensors, so
        the pass also updates the model.
        """
        loss_total = 0.0
        kld_total = 0.0
        ppx_total = 0.0
        n_words = 0
        n_docs = 0
        for idx_batch in batches:
            data_batch, count_batch, mask = utils.fetch_data(
                docs, counts, idx_batch, FLAGS.vocab_size)
            input_feed = {model.x.name: data_batch, model.mask.name: mask}
            if optim is None:
                loss, kld = sess.run([model.objective, model.kld], input_feed)
            else:
                _, (loss, kld) = sess.run(
                    (optim, [model.objective, model.kld]), input_feed)
            loss_total += np.sum(loss)
            kld_total += np.sum(kld) / np.sum(mask)
            n_words += np.sum(count_batch)
            # small offset so the per-document division cannot produce nan
            count_batch = np.add(count_batch, 1e-12)
            ppx_total += np.sum(np.divide(loss, count_batch))
            n_docs += np.sum(mask)
        corpus_ppx = np.exp(loss_total / n_words)
        per_doc_ppx = np.exp(ppx_total / n_docs)
        mean_kld = kld_total / len(batches)
        return corpus_ppx, per_doc_ppx, mean_kld

    for epoch in range(training_epochs):
        train_batches = utils.create_batches(len(train_set),
                                             batch_size,
                                             shuffle=True)
        # ---- train: decoder passes first, then encoder passes ----
        for switch in range(0, 2):
            if switch == 0:
                optim, print_mode = model.optim_dec, 'updating decoder'
            else:
                optim, print_mode = model.optim_enc, 'updating encoder'
            for i in range(alternate_epochs):
                print_ppx, print_ppx_perdoc, print_kld = run_pass(
                    train_batches, train_set, train_count, optim)
                print('| Epoch train: {:d} |'.format(epoch + 1),
                      print_mode,
                      '{:d}'.format(i),
                      '| Corpus ppx: {:.5f}'.format(print_ppx),
                      '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                      '| KLD: {:.5}'.format(print_kld))

        # ---- dev ----
        print_ppx, print_ppx_perdoc, print_kld = run_pass(
            dev_batches, dev_set, dev_count)
        print('| Epoch dev: {:d} |'.format(epoch + 1),
              '| Perplexity: {:.9f}'.format(print_ppx),
              '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
              '| KLD: {:.5}'.format(print_kld))

        # ---- test ----
        if FLAGS.test:
            print_ppx, print_ppx_perdoc, print_kld = run_pass(
                test_batches, test_set, test_count)
            print('| Epoch test: {:d} |'.format(epoch + 1),
                  '| Perplexity: {:.9f}'.format(print_ppx),
                  '| Per doc ppx: {:.5f}'.format(print_ppx_perdoc),
                  '| KLD: {:.5}'.format(print_kld))
コード例 #31
0
ファイル: main.py プロジェクト: abpalaciot/cb_apiv4
def people(request):
    """
        To fetch and update the People entity.

        Queries the 'searches/people' endpoint for entities whose updated_at
        is on or after yesterday's date, stamps each entity with today's date
        under 'insert_date' and inserts the pages into the 'people_entities'
        MongoDB collection. Paging continues, using the uuid of the last
        entity of a page as 'after_id', until the reported total has been
        fetched or a page comes back empty.

        `request` is not used by this function.

        Returns an error string when the first response could not be parsed
        (total count is None), otherwise jsonify() of a summary dict holding
        the number of fetched records.
    """
    print("\n-------Getting people entities-------\n")

    END_POINT = 'searches/people'
    COLLECTION_NAME = 'people_entities'
    YESTURDAY_DATE = utils.get_yesterday_date()
    TODAY_DATE = utils.get_today_date()

    QUERY = {
        "field_ids": [
            "aliases",
            "born_on",
            "created_at",
            "description",
            "died_on",
            "entity_def_id",
            "facebook",
            "facet_ids",
            "first_name",
            "gender",
            "identifier",
            "image_id",
            "image_url",
            "investor_stage",
            "investor_type",
            "last_name",
            "layout_id",
            "linkedin",
            "location_group_identifiers",
            "location_identifiers",
            "middle_name",
            "name",
            "num_articles",
            "num_current_advisor_jobs",
            "num_current_jobs",
            "num_diversity_spotlight_investments",
            "num_event_appearances",
            "num_exits",
            "num_exits_ipo",
            "num_founded_organizations",
            "num_investments",
            "num_jobs",
            "num_lead_investments",
            "num_partner_investments",
            "num_past_advisor_jobs",
            "num_past_jobs",
            "num_portfolio_organizations",
            "override_layout_id",
            "permalink",
            "permalink_aliases",
            "primary_job_title",
            "primary_organization",
            "rank_delta_d30",
            "rank_delta_d7",
            "rank_delta_d90",
            "rank_person",
            "rank_principal",
            "short_description",
            "twitter",
            "updated_at",
            "uuid",
            "website",
            "website_url",
        ],
        "query": [
            {
                "type": "predicate",
                "field_id": "updated_at",
                "operator_id": "gte",
                "values": [str(YESTURDAY_DATE)]
            },
        ],
        "order": [{
            "field_id": "rank_person",
            "sort": "asc",
            "nulls": "last"
        }],
        "limit":
        1000,
    }

    total_count, entities = utils.fetch_data(QUERY, END_POINT)
    if total_count is None:
        return "Error in parsing the API response. Please check the logs."

    print("total count: ", total_count)

    # get the people collection
    col = utils.get_mongodb_collection(COLLECTION_NAME)

    fetch_records_count = 0

    # storing into the database and pagination
    while fetch_records_count < total_count:
        # The first page was already fetched above; fetch again only for
        # the following pages.
        if fetch_records_count != 0:
            _, entities = utils.fetch_data(QUERY, END_POINT)

        if not entities:
            # `entities` may be None as well as empty, so len() must not be
            # called on it directly.
            print("no entities left i.e., entities = %s. moving on." %
                  len(entities or []))
            break

        for e in entities:
            if e:
                e['insert_date'] = TODAY_DATE
            else:
                print("Entity is empty: ", e)

        inserted = col.insert_many(entities)
        fetch_records_count += len(entities)
        print("inserted records: ")
        pprint(inserted.inserted_ids)
        print("total_count: ", total_count, ", fetched records: ",
              fetch_records_count)

        print("------------------------")

        # get the last record; its uuid is the cursor for the next page
        last_entity = entities[-1]
        after_id = last_entity.get('uuid', None) if last_entity else None
        if not after_id:
            # Without a cursor the unchanged query would be sent again and
            # the same page would be inserted a second time.
            print("Last entity has no uuid; cannot request the next batch.")
            break
        print("Get next batch after id: ", after_id)
        QUERY['after_id'] = after_id
        entities.clear()

    # NOTE(review): 'Poeple' is misspelled; left unchanged in case a consumer
    # of this response matches on the value.
    msg = {'entity': 'Poeple', 'total_record_updated': fetch_records_count}
    return jsonify(msg)