Example no. 1
def _commit(confirmed_, unconfirmed_):
    from utils import load_data
    from utils import save_data
    
    auditing = load_data(_params['filenames'][5])
    confirmed = load_data(_params['filenames'][6])
    if not confirmed: confirmed = []
    unconfirmed = load_data(_params['filenames'][7])
    if not unconfirmed: unconfirmed = []
    
    i = 0
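    # Move audited entries whose top-ranked venue match was confirmed or rejected into the
    # corresponding lists, stepping the index back after each pop so no entry is skipped.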
    while i < len(auditing):
        if  auditing[i]['matching'][0]['venue_id'] in confirmed_:
            auditing[i]['status'] = 'confirmed'
            a = auditing.pop(i)
            confirmed.append(a)
            i -= 1
        elif auditing[i]['matching'][0]['venue_id'] in unconfirmed_:
            auditing[i]['status'] = 'unconfirmed'
            a = auditing.pop(i)
            unconfirmed.append(a)
            i -= 1
        i += 1
    
    save_data(auditing, _params['filenames'][5])
    save_data(confirmed, _params['filenames'][6])
    save_data(unconfirmed, _params['filenames'][7])
Example no. 2
    def evaluate(self, goldp = None, silverp = None,
                       gold_data = None, silver_data = None,
                       print_score = True):
        """
        * Compares two syllabified lists in string format
          (e.g. ser-uaes):
            gold = ground truth
            silver = as predicted by system
        * Both lists can be passed as lists (`gold_data`,
          `silver_data`) or can be loaded from files 
          (`goldp`, `silverp`).
        * Will return the token-level accuracy and hyphenation
          accuracy of the silver predictions (will print these
          if `print_score` is True).

        """
        if goldp:
            gold_data = utils.load_data(goldp)
        if silverp:
            silver_data = utils.load_data(silverp)

        _, gold_Y = self.vectorize(gold_data)
        _, silver_Y = self.vectorize(silver_data)

        token_acc, hyphen_acc = utils.metrics(utils.pred_to_classes(gold_Y),
                                        utils.pred_to_classes(silver_Y))

        if print_score:
            print('\t- evaluation scores:')
            print('\t\t + token acc:', round(token_acc, 2))
            print('\t\t + hyphen acc:', round(hyphen_acc, 2))

        return token_acc, hyphen_acc
Example no. 3
def run():
    # load in members, orient by bioguide ID
    print("Loading current legislators...")
    current = load_data("legislators-current.yaml")

    current_bioguide = { }
    for m in current:
      if "bioguide" in m["id"]:
        current_bioguide[m["id"]["bioguide"]] = m

    # remove out-of-office people from current committee membership
    print("Sweeping committee membership...")
    membership_current = load_data("committee-membership-current.yaml")
    for committee_id in list(membership_current.keys()):
      for member in list(membership_current[committee_id]):
        if member["bioguide"] not in current_bioguide:
          print("\t[%s] Ding ding ding! (%s)" % (member["bioguide"], member["name"]))
          membership_current[committee_id].remove(member)
    save_data(membership_current, "committee-membership-current.yaml")

    # remove out-of-office people from social media info
    print("Sweeping social media accounts...")
    socialmedia_current = load_data("legislators-social-media.yaml")
    for member in list(socialmedia_current):
      if member["id"]["bioguide"] not in current_bioguide:
        print("\t[%s] Ding ding ding! (%s)" % (member["id"]["bioguide"], member["social"]))
        socialmedia_current.remove(member)
    save_data(socialmedia_current, "legislators-social-media.yaml")
Example no. 4
def main(state_num):
    matches_filename = 'matches_%d' % state_num
    print 'Loading %s ...' % matches_filename
    matches = utils.load_data(matches_filename)

    matches_reduced_filename = 'matches_reduced'
    try:
        print "Loading matches_reduced ..."
        matches_reduced = utils.load_data(matches_reduced_filename)
    except IOError:
        print "Matches_reduced doesn't exist, creating a new one."
        matches_reduced = {}

    num_matches = len(matches.keys())

    for keyIdx, matchId in enumerate(matches.keys()):
        print "\rMatch %d out of %d [%0.1f%%]" % (keyIdx + 1, num_matches, (keyIdx + 1) / float(num_matches) * 100),

        summoners = []
        num_summoners = len(matches[matchId]['participants'])
        for i in range(num_summoners):
            champLevel = matches[matchId]['participants'][i]['stats']['champLevel']
            summonerId = matches[matchId]['participantIdentities'][i]['player']['summonerId']
            winner = matches[matchId]['participants'][i]['stats']['winner']
            summoners += [{'champLevel': champLevel, 'summonerId': summonerId, 'winner': winner}]
        matches_reduced[matchId] = {'summoners': summoners}

    print "Saving %s ..." % matches_reduced_filename
    utils.save_data(matches_reduced, matches_reduced_filename)
    print "Done!"
Example no. 5
def main():
    parser = argparse.ArgumentParser(description='Generate input files for hunalign')
    parser.add_argument('ja', help='tokenized ja json')
    parser.add_argument('en', help='tokenized en json')
    parser.add_argument('prefix', help='output prefix')
    parser.add_argument('batchfile', help='output batchfile')
    parser.add_argument('--b', help='approximate batch size', type=int, default=5000)
    args = parser.parse_args()

    recipes_ja = utils.load_data(args.ja)
    recipes_en = utils.load_data(args.en)

    iteration = 1
    langs = ('ja', 'en')
    num_lines = [0 for _ in langs]  # keep track of the number of lines printed out
    output_filenames = [get_filename(args.prefix, iteration, lang) for lang in langs]
    output_files = [open(filename, 'w') for filename in output_filenames]

    batchfile_output = [(output_filenames[0], output_filenames[1],
                         get_filename(args.prefix, iteration, 'align'))]

    for recipes in izip(recipes_ja, recipes_en):
        for index, lang in enumerate(langs):
            recipe = recipes[index]
            output_file = output_files[index]

            print_item_to_file(output_file, recipe['name'])
            print_items_to_file(output_file, recipe['description'])
            print_itemss_to_file(output_file, recipe['instructions'])
            print_items_to_file(output_file, recipe['advice'])
            print_items_to_file(output_file, recipe['history'])

            num_lines[index] += (1 + # name
                                 len(recipe['description']) +
                                 sum(map(lambda inst: len(inst), recipe['instructions'])) +
                                 len(recipe['advice']) +
                                 len(recipe['history']))


        if any(map(lambda num_line: num_line > args.b, num_lines)):
            for output_file in output_files:
                output_file.close()

            # reset
            iteration += 1
            num_lines = [0 for _ in langs]
            output_filenames = [get_filename(args.prefix, iteration, lang) for lang in langs]
            output_files = [open(filename, 'w') for filename in output_filenames]

            batchfile_output.append((output_filenames[0], output_filenames[1],
                                     get_filename(args.prefix, iteration, 'align')))

    for output_file in output_files:
        output_file.close()

    with open(args.batchfile, 'w') as f:
        for output in batchfile_output:
            f.write('{0}\t{1}\t{2}\n'.format(*output))
Example no. 6
def main():
    train_url = "e:/data/comment_sentiment/train_set.csv"
    test_url = "e:/data/comment_sentiment/test_set.csv"
    x, y = utils.load_data(train_url)
    test_x, test_y = utils.load_data(test_url)

    lr = LogisticRegression()
    lr.train_model(x, y)
    pre_y = lr.predict(test_x)
    utils.show_result(test_y, pre_y, "logistic_comment")
Example no. 7
def main():
    train_url = "train_set.csv"
    test_url = "test_set.csv"
    x, y = utils.load_data(train_url)
    test_x, test_y = utils.load_data(test_url)

    gda = GDA()
    gda.train_model(x, y)
    pre_y = gda.predict(test_x)
    utils.show_result(test_y, pre_y, "gda_comment")
Example no. 8
def main():
    train_url = "e:/data/comment_sentiment/train_set.csv"
    test_url = "e:/data/comment_sentiment/test_set.csv"
    train_x, train_y = utils.load_data(train_url)
    test_x, test_y = utils.load_data(test_url)

    bayes = Bayes()
    bayes.train_model(train_x, train_y)
    pre_y = bayes.predict(test_x)
    utils.show_result(test_y, np.array([pre_y]).T, "bayes_comment")
Example no. 9
def process_categories():
	reviews = load_data("vegas_reviews.json")
	def pizza(line):
		return 'Pizza' in line['categories']
	vegas_cats = set(food_lib.map_to_arg(load_data("food_businesses.json", pizza), 'business_id'))
	times = []
	for r in reviews:
		if r['business_id'] in vegas_cats:
			times.append(r['date'])
	print(sorted(times))
Example no. 10
def _load_dbs():
    global user_dict
    global movie_dict
    try:
        movie_dict = load_data(movie_pkl_file_name)
    except IOError:
        print 'there is no pkl file named %s' % movie_pkl_file_name
    try: 
        user_dict = load_data(user_pkl_file_name)
    except IOError:
        print 'there is no pkl file named %s' % user_pkl_file_name
Example no. 11
    def create_sorted_dict(self, offline=False):

        if offline:
            # Offline takes too long; it does not really seem necessary.
            # TODO: add a try-except here, since the files may not exist.
            self.cf_simsorted_dict = load_data(PKL + 'cf_simsorted.pkl')
            self.cb_simsorted_dict = load_data(PKL + 'cb_simsorted.pkl')
        else:
            if self.cb_prox is None or self.cf_prox is None:
                self.create_proximity_matrices()
            self.cb_simsorted_dict = utils.sortSparseMatrix(self.cb_prox)
            print "cb dict has been calculated"
            self.cf_simsorted_dict = utils.sortSparseMatrix(self.cf_prox)
Example no. 12
def main():
    parser = argparse.ArgumentParser(description='Check whether ja-en recipes '
                                                 'have same number of ingredients')
    parser.add_argument('ja', help='sorted ja recipes')
    parser.add_argument('en', help='sorted en recipes')
    args = parser.parse_args()

    recipes_ja = utils.load_data(args.ja)
    recipes_en = utils.load_data(args.en)

    for recipe_ja, recipe_en in izip(recipes_ja, recipes_en):
        assert(recipe_ja['id'] == recipe_en['id'])
        if len(recipe_ja['ingredients']) != len(recipe_en['ingredients']):
            print(recipe_ja['id'])
Example no. 13
def main(args):
	'''Module main function'''
	global database
	global genetic_algorithm
	global joint_positions
	global goal_positions
	pygame.init()
	random.seed()
	database = utils.initialize_database(args, 'RobotTrainingData')
	database.set_objective_names(['Tiempo', r'Error en $\theta_1$', r'Error en $\theta_2$', r'Error en $\theta_3$', 'Energía'])
	problem = EV3Problem()
	generation = database.properties['highest_population']
	population_size = database.properties['population_size']
	genetic_algorithm = evolution.NSGA(problem, population_size)

	x_path = os.path.abspath(pkg_resources.resource_filename('resources.ev3', 'x_train.txt'))
	y_path = os.path.abspath(pkg_resources.resource_filename('resources.ev3', 'y_train.txt'))
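	# Select this generation's window of N_GOALS joint/goal positions; the window cycles every 10 generations.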
	batch_start = (generation % 10) * N_GOALS
	joint_positions = np.loadtxt(x_path)[batch_start : batch_start + N_GOALS, :]
	goal_positions = np.loadtxt(y_path)[batch_start : batch_start + N_GOALS, :]

	if generation > 0:
		parents, children = utils.load_data(database)
		genetic_algorithm.set_population(parents)
		genetic_algorithm.set_children(children)
	for _ in range(args.iterations):
		generation += 1
		print('Starting generation ' + str(generation))
		genetic_algorithm.iterate()
		database.create_population()
		utils.save_data(genetic_algorithm, database)
		print('=' * (SCREEN_WIDTH - 1))
Example no. 14
def compute_pval_rsa(seed):
    stim, voxels = load_data(n_samples, n_features, model=model, seed=seed,
                             heteroscedastic=heteroscedastic)

    # compute similarity
    stim_ = stim
    if stim.shape[1] == 1:
        stim_ = np.hstack((stim, - stim))

    stim_similarity = square_pdist(stim_)  # np.corrcoef(stim_)
    voxels_similarity = square_pdist(voxels)  # np.corrcoef(voxels)

    # indices to extract the upper triangular part of a matrix (excluding the diagonal)
    lw_idx = np.triu_indices(n_samples, k=1)

    stim_vsim = stim_similarity[lw_idx]
    voxels_vsim = voxels_similarity[lw_idx]

    # compute the statistic
    # T = np.corrcoef(stim_vsim, voxels_vsim)[0, 1]
    T = spearmanr(voxels_vsim, stim_vsim)[0]
    T_perm = []
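    # Build a null distribution by recomputing the statistic on row-permuted voxel data.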
    for i in range(n_draws):
        # permute the labels
        perm = np.random.permutation(n_samples)
        # voxels_vsim_perm = np.corrcoef(voxels[perm])[lw_idx]
        voxels_vsim_perm = square_pdist(voxels[perm])[lw_idx]
        # compute the test statistic
        # T_perm.append(np.corrcoef(voxels_vsim_perm, stim_vsim)[0, 1])
        T_perm.append(spearmanr(voxels_vsim_perm, stim_vsim)[0])

    pval = 1 - percentileofscore(np.array(T_perm), T) / 100.
    return pval
Example no. 15
def t1_3():
    data = utils.load_data('data2D.npy').astype("float32")
    for k in [3]:
        rvals = kmeans(data, 1e-3, k, epochs=1000)
        t_loss = rvals['training_loss']
        v_loss = rvals['validation_loss']
        mu = rvals['mu']
        plt.clf()
        fig = plt.figure(1, figsize=(16,12))
        plt.plot(np.arange(len(t_loss)), t_loss)
        plt.savefig("t12_2_k%d.png" % k)

        t = classify(data, mu)
        colors = iter(cm.rainbow(np.linspace(0, 1, len(t))))
        plt.clf()
        #fig = plt.figure(1, figsize=(16,12))
        for i in range(len(t)):
            print 'plotting scatter...'
            print 'cluster x, y shape ', t[i][:, 0].shape, t[i][:, 1].shape
            
            s = plt.scatter(t[i][:, 0], t[i][:, 1], color=next(colors))
            #print "returned ", s
        plt.savefig('t12_3_scatter_k%d.png' % k)
        plt.show()
Example no. 16
def main():
  if len(sys.argv) != 2:
    print 'Usage:\t./next_day_prediction.py TICKER_SYMBOL'
    print 'Ex:\t./next_day_prediction.py NFLX'
    sys.exit()


  k = 10

  D = utils.load_data('i30/stocks/' + sys.argv[1] + '.csv')
  #load in past year's data
  train = D[-365:]

  X_train, y_train = utils.timestep_transform(train, k)
  model, scaler = utils.generate_model(X_train, y_train)
  pred_val = model.predict(scaler.transform([utils.day_transform(D, k)])).tolist()
  
  print 'Current day closing value:'
  print '\t', D[-1][-2]

  print 'Projected change in closing value:'
  print '\t', 100*(pred_val[0]-1)

  print 'Projected next day closing value:'
  print '\t', pred_val[0]*D[-1][-2]
Example no. 17
def _load_statistics():
    global _statistics
    filename = get_path('datasets', 'statistics.json')
    _statistics = load_data(filename, verbose=False)
    if not _statistics or not isinstance(_statistics, dict):
        _statistics = {}
    print ('')
Example no. 18
def main():
    parser = argparse.ArgumentParser(description='Tokenize all')
    parser.add_argument('recipes', help='recipes.json')
    parser.add_argument('--lang', choices=('en', 'ja'))
    args = parser.parse_args()

    recipes = utils.load_data(args.recipes)

    for recipe in recipes:
        name = word_tokenize(recipe['name'], args.lang)
        description = sent_word_tokenize(recipe['description'], args.lang)

        ingredients_name = map(lambda ing_name: word_tokenize(ing_name, args.lang),
                               map(lambda ing: ing['name'], recipe['ingredients']))
        ingredients_quantity = map(lambda ing_qt: word_tokenize(ing_qt, args.lang),
                                   map(lambda ing: ing['quantity'], recipe['ingredients']))
        ingredients = map(lambda pair: {'name': pair[0], 'quantity': pair[1]},
                          zip(ingredients_name, ingredients_quantity))
        instructions = map(lambda inst: sent_word_tokenize(inst, args.lang),
                           recipe['instructions'])

        advice = sent_word_tokenize(recipe['advice'], args.lang)
        history = sent_word_tokenize(recipe['history'], args.lang)

        recipe = {
            'id': recipe['id'],
            'name': name,
            'description': description,
            'ingredients': ingredients,
            'instructions': instructions,
            'advice': advice,
            'history': history,
        }
        print(json.dumps(recipe))
Example no. 19
def next_day_prediction(ticker_symbol, training_days):
  
  k = 10
  D = utils.load_data('i30/stocks/' + ticker_symbol + '.csv')
  #load in past year's data
  train = D[-1*training_days:]

  X_train, y_train = utils.timestep_transform(train, k)
  model, scaler = utils.generate_model(X_train, y_train)
  pred_val = model.predict(scaler.transform([utils.day_transform(D, k)])).tolist()
  
  curr_close = D[-1][-2]
  change = pred_val[0]-1
  next_close = pred_val[0]*curr_close

  '''
  print 'Current day closing value:'
  print '\t', curr_close

  print 'Projected change in closing value:'
  print '\t', 100*change

  print 'Projected next day closing value:'
  print '\t', next_close
  '''

  return (curr_close, change, next_close)
Example no. 20
def train(in_file):
    xvals, yvals = utils.load_data(in_file)
    xvals, yvals = utils.randomize(xvals, yvals)
    network = build_network()
    model = tflearn.DNN(network)
    model.fit(xvals, yvals, n_epoch=200, validation_set=0.2)
    model.save('circle.tflearn')
Example no. 21
def run():
    options = utils.flags()
    debug = options.get('debug', False)

    filename = "legislators-current.yaml"
    args = utils.args()
    legislators = load_data(filename)

    if len(args) != 0:
        bioguides = args
        print("Fetching contact forms for %s..." % ', '.join(bioguides))
    else:
        bioguides = [member['id']['bioguide'] for member in legislators]
        print("Fetching contact forms for all current members...")

    for legislator in legislators:
        bioguide = legislator['id']['bioguide']
        if bioguide not in bioguides: continue
        if bioguide in SKIP_BIOGUIDES: continue

        if debug: print("Downloading form for %s" % bioguide, flush=True)

        try:
            steps = contact_steps_for(bioguide)
        except LegislatorNotFoundError as e:
            if debug: print("skipping, %s..." % e, flush=True)
            continue

        legislator['terms'][-1]['contact_form'] = steps['contact_form']['steps'][0]['visit']

    print("Saving data to %s..." % filename)
    save_data(legislators, filename)
Example no. 22
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):

        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)
        
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
Example no. 23
def stacker_data_v2(cutoff, num_least_correlated_cols):
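    # Use the out-of-fold / leaderboard predictions as stacking features and keep only the
    # least mutually correlated prediction columns.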
    X, y_train = load_data()
    oof_predictions, lb_predictions, oof_ginis = load_predictions_with_cutoff(PREDICTION_PATH, cutoff)
    X_train, X_test = oof_predictions, lb_predictions
    new_cols = least_correlated_cols(X_train, num_least_correlated_cols)
    X_train, X_test = X_train[new_cols], X_test[new_cols]
    return X_train, y_train, X_test
Example no. 24
def data_v1():
    X, y_train = load_data()
    X = pd.get_dummies(X)

    is_train_obs = X.index.get_level_values('obs_type') == 'train'
    X_train, X_test = X[is_train_obs], X[~is_train_obs]
    return X_train, y_train, X_test
Example no. 25
def test_cA(learning_rate=0.01, training_epochs=20,
            dataset='../datasets/mnist.pkl.gz',
            batch_size=10, output_folder='cA_plots', contraction_level=.1):

    datasets = load_data(dataset)
    
    train_set_x, train_set_y = datasets[0]
    
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] 
    n_train_batches /= batch_size
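    # number of minibatches per training epoch (integer division under Python 2)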

    index = T.lscalar() 
    x = T.matrix('x')

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    rng = numpy.random.RandomState(123)

    ca = cA(numpy_rng=rng, input=x,
            n_visible=28 * 28, n_hidden=500, n_batchsize=batch_size)

    cost, updates = ca.get_cost_updates(contraction_level=contraction_level,
                                        learning_rate=learning_rate)

    train_ca = theano.function(
        [index],
        [T.mean(ca.L_rec), ca.L_jacob],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size]
        }
    )

    start_time = timeit.default_timer()

    for epoch in xrange(training_epochs):
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_ca(batch_index))

        c_array = numpy.vstack(c)
        print 'Training epoch %d, reconstruction cost ' % epoch, numpy.mean(
            c_array[0]), ' jacobian norm ', numpy.mean(numpy.sqrt(c_array[1]))

    end_time = timeit.default_timer()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))
    image = Image.fromarray(tile_raster_images(
        X=ca.W.get_value(borrow=True).T,
        img_shape=(28, 28), tile_shape=(10, 10),
        tile_spacing=(1, 1)))

    image.save('cae_filters.png')

    os.chdir('../')
Example no. 26
def load_data(random_state=1066, n=1000, max_phrase_length=100):
    data = utils.load_data(random_state=random_state,
                           n=n,
                           max_phrase_length=max_phrase_length)

    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    X_train = X_train.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_valid = X_valid.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_test = X_test.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)

    # Robert: what about reshaping this data for 1D convs?
    # hstack() instead of vstack() when creating X in utils?

    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        #input_height=X_train.shape[2], # what's the equivalent in our vectors?
        #input_width=X_train.shape[3],
        output_dim=5, # since five sentiment class
        )
Example no. 27
def _match_from_models(model_filename, search_func, get_entity_by_id_func, filenames, threshold, prompt, verbose=True):
    from utils import load_data
    
    if verbose: print('Loading models...')
    models = load_data(model_filename)
    if verbose: print('Done.')
    models = _pre_process_models(models, filenames)
    _run_match(models, search_func, get_entity_by_id_func, filenames, threshold, prompt)
Example no. 28
def data_v5():
    X, y_train = load_data()
    X = pd.get_dummies(X)
    X.drop(['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10'], axis=1, inplace=True)

    is_train_obs = X.index.get_level_values('obs_type') == 'train'
    X_train, X_test = X[is_train_obs], X[~is_train_obs]
    return X_train, y_train, X_test
Example no. 29
def load_triangle():
    """
    Get the text into a 2D array of ints.
    """
    triangle_string = utils.load_data(FILENAME)
    triangle_lines = [line for line in triangle_string.split('\n')]
    triangle = [[int(x) for x in line.split()] for line in triangle_lines]
    return triangle
Example no. 30
def load_higgs_data(data_file, valid_size, normalize):
    # we get back a tuple of train data, test data, train weights, train labels, and test labels
    dataset = load_data(data_file, valid_size, encoding='integer', normalize=normalize)

    train_set_x, train_set_y = load_shared_dataset((dataset[0], dataset[3]))
    valid_set_x, valid_set_y = load_shared_dataset((dataset[1], dataset[4]))

    return [(train_set_x, train_set_y), (valid_set_x, valid_set_y)]
Example no. 31
import project1 as p1
import utils
import numpy as np

#-------------------------------------------------------------------------------
# Data loading. There is no need to edit code in this section.
#-------------------------------------------------------------------------------

train_data = utils.load_data('reviews_train.tsv')
val_data = utils.load_data('reviews_val.tsv')
test_data = utils.load_data('reviews_test.tsv')

train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data))
test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data))

dictionary = p1.bag_of_words(train_texts)

train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary)
val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary)
test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary)

#-------------------------------------------------------------------------------
# Problem 5
#-------------------------------------------------------------------------------

toy_features, toy_labels = toy_data = utils.load_toy_data('toy_data.tsv')

T = 10
L = 0.2
Example no. 32
batch_size = 128
epochs = 10

img_size = 224  # input image dimensions
channel_size = 1
label_size = 1  # label dimensions

img_dims = (img_size, img_size, channel_size)
label_dims = (label_size, label_size)

filepath_labels = 'lol_labels.txt'
filepath_data = 'lol_images.zip'
data_size = 60000

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = load_data()
x_train = x_train.reshape(x_train.shape[0], img_size, img_size, channel_size)
x_test = x_test.reshape(x_test.shape[0], img_size, img_size, channel_size)
input_shape = (img_size, img_size, channel_size)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

###### Model 0 ##############################
model = Sequential()
model.add(
Example no. 33
def load_normalize_data(path):

    data, label = load_data(path)
    processed_data = data_preprocessing(data)

    return processed_data, label
Example no. 34
        y_batch = y_batch.to(device=args.device)

        out = net(P, roots, X_batch, X_batch_daily, X_batch_weekly, r2sDic,
                  s2rDic, randomtrajs, mask1)
        loss = masked_mae_loss(out, y_batch)
        loss.backward()
        optimizer.step()

        epoch_training_losses.append(loss.detach().cpu().numpy())
    return sum(epoch_training_losses) / len(epoch_training_losses)


if __name__ == '__main__':
    torch.manual_seed(1)

    X, r2sDic, s2rDic, trajDic, keys = load_data(pathNum, pathLen)
    split_line1 = int(X.shape[2] * 0.7)
    split_line2 = int(X.shape[2] * 0.8)
    split_line3 = int(X.shape[2])

    np.save("train_cd.npy", X[:, :, :split_line1])
    np.save("val_cd.npy", X[:, :, split_line1:split_line2])
    np.save("test_cd.npy", X[:, :, split_line2:])
    means = np.mean(X[:, :, :split_line1], axis=(0, 2))
    stds = np.std(X[:, :, :split_line1], axis=(0, 2))
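    # Standardize using the first feature's mean/std computed on the training split only.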
    X = X - means[0]
    X = X / stds[0]
    print(means)
    print(stds)
    print(X.shape)
Example no. 35
            else:
                return False

parser = argparse.ArgumentParser()
parser.add_argument('--lr', type=float, default=1e-2, help='Learning rate for the parameters')
parser.add_argument('--wd', type=float, default=1e-2, help='Weight decay for the parameters')
parser.add_argument('--n_hid', type=int, default=112, help='hidden layer for RNN')
parser.add_argument('--n_iter', type=int, default=9, help='(time-steps + 1) for RNN')
parser.add_argument('--dataset', type=str, default='cora', help='dataset, also use "citeseer" or "pubmed"')
parser.add_argument('--ps', type=int, default=5, help='patience for early stopping')
parser.add_argument('--d1', type=float, default=0.2, help='dropout rate for RNN')
parser.add_argument('--d2', type=float, default=0.2, help='dropout rate for dense(attention)')
parser.add_argument('--d3', type=float, default=0.4, help='dropout rate for dense(classification)')
            
arg = parser.parse_args()
features_, labels_, adj, deg, deg_inv = load_data(arg.dataset)
P = torch.from_numpy(deg_inv.dot(adj.todense()))
features = torch.from_numpy(features_.todense())
labels = torch.from_numpy(labels_).long()
n_nodes, n_feats = features_.shape[0], features_.shape[1]
n_class = int(np.max(labels_) + 1)
### Belows are the hyperparameters
n_hids = arg.n_hid
n_iters = arg.n_iter
d1 = arg.d1 # Dropout rate for RNN
d2 = arg.d2 # Dropout rate for attention
d3 = arg.d3 # Dropout rate for dense(classification)
n_epochs = arg.n_iter
lr = arg.lr # Learning rate for the parameters
wd = arg.wd # Weight decay for the parameters
ps = arg.ps #Patience rate for Early Stopping
Example no. 36
import utils

n_players, max_marble = utils.load_data()

print(utils.compute_max_score(n_players, max_marble))
Example no. 37
def train(model, supervisor, num_label):
    trX, trY, num_tr_batch, valX, valY, num_val_batch = load_data(
        cfg.dataset, cfg.batch_size, is_training=True)
    Y = valY[:num_val_batch * cfg.batch_size].reshape((-1, 1))

    fd_train_acc, fd_loss, fd_val_acc = save_to()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with supervisor.managed_session(config=config) as sess:
        print("\nNote: all of results will be saved to directory: " +
              cfg.results)
        for epoch in range(cfg.epoch):
            print("Training for epoch %d/%d:" % (epoch, cfg.epoch))
            if supervisor.should_stop():
                print('supervisor stopped!')
                break
            for step in tqdm(range(num_tr_batch),
                             total=num_tr_batch,
                             ncols=70,
                             leave=False,
                             unit='b'):
                start = step * cfg.batch_size
                end = start + cfg.batch_size
                global_step = epoch * num_tr_batch + step

                if global_step % cfg.train_sum_freq == 0:
                    _, loss, train_acc, summary_str = sess.run([
                        model.train_op, model.total_loss, model.accuracy,
                        model.train_summary
                    ])
                    assert not np.isnan(
                        loss), 'Something wrong! loss is nan...'
                    supervisor.summary_writer.add_summary(
                        summary_str, global_step)

                    fd_loss.write(str(global_step) + ',' + str(loss) + "\n")
                    fd_loss.flush()
                    fd_train_acc.write(
                        str(global_step) + ',' +
                        str(train_acc / cfg.batch_size) + "\n")
                    fd_train_acc.flush()
                else:
                    sess.run(model.train_op)

                if cfg.val_sum_freq != 0 and (
                        global_step) % cfg.val_sum_freq == 0:
                    val_acc = 0
                    for i in range(num_val_batch):
                        start = i * cfg.batch_size
                        end = start + cfg.batch_size
                        acc = sess.run(
                            model.accuracy, {
                                model.X: valX[start:end],
                                model.labels: valY[start:end]
                            })
                        val_acc += acc
                    val_acc = val_acc / (cfg.batch_size * num_val_batch)
                    fd_val_acc.write(
                        str(global_step) + ',' + str(val_acc) + '\n')
                    fd_val_acc.flush()

            if (epoch + 1) % cfg.save_freq == 0:
                supervisor.saver.save(
                    sess, cfg.logdir + '/model_epoch_%04d_step_%02d' %
                    (epoch, global_step))

        fd_val_acc.close()
        fd_train_acc.close()
        fd_loss.close()
Example no. 38
    model = DQN(state_dim, NUM_ACTIONS, NUM_OBJECTS)
    optimizer = optim.SGD(model.parameters(), lr=ALPHA)

    single_run_epoch_rewards_test = []
    pbar = tqdm(range(NUM_EPOCHS), ncols=80)
    for _ in pbar:
        single_run_epoch_rewards_test.append(run_epoch())
        pbar.set_description(
            "Avg reward: {:0.6f} | Ewma reward: {:0.6f}".format(
                np.mean(single_run_epoch_rewards_test),
                utils.ewma(single_run_epoch_rewards_test)))
    return single_run_epoch_rewards_test


if __name__ == '__main__':
    state_texts = utils.load_data('game.tsv')
    dictionary = utils.bag_of_words(state_texts)
    state_dim = len(dictionary)

    # set up the game
    framework.load_game_data()

    epoch_rewards_test = []  # shape NUM_RUNS * NUM_EPOCHS

    for _ in range(NUM_RUNS):
        epoch_rewards_test.append(run())

    epoch_rewards_test = np.array(epoch_rewards_test)

    x = np.arange(NUM_EPOCHS)
    fig, axis = plt.subplots()
Example no. 39
    def train(self, epochs, batch_size=32, sample_interval=500, start_point=0):

        # Load the dataset
        X_train, y_train = utils.load_data(self.writer)

        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(start_point, epochs):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random batch of images
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            imgs = X_train[idx]

            # Sample noise as generator input
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # The labels of the digits that the generator tries to create an
            # image representation of
            sampled_labels = np.random.uniform(0, 1,
                                               (batch_size, self.num_classes))
            sampled_labels = np.around(sampled_labels)

            # Generate a half batch of new images
            gen_imgs = self.generator.predict([noise, sampled_labels])

            # Image labels. 0-9
            img_labels = y_train[idx]

            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch(
                imgs, [valid, img_labels])
            d_loss_fake = self.discriminator.train_on_batch(
                gen_imgs, [fake, sampled_labels])
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------

            # Train the generator
            g_loss = self.combined.train_on_batch([noise, sampled_labels],
                                                  [valid, sampled_labels])

            # Plot the progress
            print(
                "%d [D loss: %f, acc.: %.2f%%, op_acc: %.2f%%] [G loss: %f]" %
                (epoch, d_loss[0], 100 * d_loss[3], 100 * d_loss[4],
                 g_loss[0]))
            utils.write_log(
                self.writer,
                ['D loss', 'G loss', 'accuracy', 'class accuracy'],
                [d_loss[0], g_loss[0], 100 * d_loss[3], 100 * d_loss[4]],
                epoch)

            # If at save interval => save generated image samples
            if epoch % sample_interval == 0:
                utils.save_model(
                    '%s/' %
                    ('acgan' if self.flags.name is None else self.flags.name),
                    self.generator, self.discriminator, epoch)
                self.sample_images(epoch)
Example no. 40
def main(filename):
    """
    Main function for generating submissions.
    """
    y_pred_all = []
    X_train, y_train_all, X_test = load_data()
    for n in range(3):

        print(
            "############## working on dataset {} ###################".format(
                str(n + 1)))
        # process
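        # map the {0, 1} labels of this data block to {-1, +1}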
        y_train = 2 * np.array(y_train_all[2000 * n:2000 * (n + 1)]) - 1

        k, n_mismatch = 13, 3
        if n != 0:
            print("Compute gram matrix for first kernel")
            gram_train_13_3, gram_test_13_3 = get_gram_matrix(
                X_train[2000 * n:2000 * (n + 1)],
                X_test[1000 * n:1000 * (n + 1)],
                k=k,
                n_mismatch=n_mismatch,
                n_kernel=n + 1,
            )

        k, n_mismatch = 12, 2
        if n != 0:
            print("Compute gram matrix for  second kernel ")
            gram_train_12_2, gram_test_12_2 = get_gram_matrix(
                X_train[2000 * n:2000 * (n + 1)],
                X_test[1000 * n:1000 * (n + 1)],
                k=k,
                n_mismatch=n_mismatch,
                n_kernel=n + 1,
            )

        print("Compute gram matrix for third kernel ")
        k, n_mismatch = 13, 2
        gram_train_13_2, gram_test_13_2 = get_gram_matrix(
            X_train[2000 * n:2000 * (n + 1)],
            X_test[1000 * n:1000 * (n + 1)],
            k=k,
            n_mismatch=n_mismatch,
            n_kernel=n + 1,
        )

        print("Training and generating prediction")
        if n == 0:
            train_grams = [gram_train_13_2]
            test_grams = [gram_test_13_2]
            y_pred = predict_first_set(train_grams, test_grams, y_train)
        elif n == 1:
            train_grams = [gram_train_13_2, gram_train_12_2, gram_train_13_3]
            test_grams = [gram_test_13_2, gram_test_12_2, gram_test_13_3]
            y_pred = predict_second_set(train_grams, test_grams, y_train)
        else:
            train_grams = [gram_train_13_2, gram_train_12_2, gram_train_13_3]
            test_grams = [gram_test_13_2, gram_test_12_2, gram_test_13_3]
            y_pred = predict_third_set(train_grams, test_grams, y_train)

        y_pred = (y_pred + 1) / 2
        y_pred_all += list(y_pred)

    print("Saving prediction in CSV file")

    with open(filename, "w") as csvfile:
        fieldnames = ["Id", "Bound"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for i in tqdm(range(0, len(y_pred_all))):
            writer.writerow({"Id": i, "Bound": int(y_pred_all[i])})

    print("You can find results on " + filename)
Example no. 41
fieldmap = {
    "congbio": "bioguide",
    #"fec": "fec", # handled specially...
    "govtrack":
    "govtrack",  # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
    "opensecrets": "opensecrets",
    "votesmart": "votesmart",
    "cspan": "cspan",
}
int_fields = ("govtrack", "votesmart", "cspan")

# default to not caching
cache = utils.flags().get('cache', False)

# Load legislator files and map bioguide IDs.
y1 = utils.load_data("legislators-current.yaml")
y2 = utils.load_data("legislators-historical.yaml")
bioguides = {}
for y in y1 + y2:
    bioguides[y["id"]["bioguide"]] = y

# Okay now the Wikipedia stuff...


def get_matching_pages():
    # Does a Wikipedia API search for pages containing either of the
    # two templates. Returns the pages.

    page_titles = set()

    for template in ("CongLinks", "CongBio"):
Example no. 42
def run_fix_mask(args, seed):

    pruning.setup_seed(seed)
    adj, features, labels, idx_train, idx_val, idx_test = load_data(
        args['dataset'])

    node_num = features.size()[0]
    class_num = labels.numpy().max() + 1

    adj = adj.cuda()
    features = features.cuda()
    labels = labels.cuda()
    loss_func = nn.CrossEntropyLoss()

    net_gcn = net.net_gcn(embedding_dim=args['embedding_dim'], adj=adj)
    pruning.add_mask(net_gcn)
    net_gcn = net_gcn.cuda()

    print("load : {}".format(args['weight_dir']))
    encoder_weight = {}
    cl_ckpt = torch.load(args['weight_dir'], map_location='cuda')
    encoder_weight['weight_orig_weight'] = cl_ckpt['gcn.fc.weight']
    ori_state_dict = net_gcn.net_layer[0].state_dict()
    ori_state_dict.update(encoder_weight)
    net_gcn.net_layer[0].load_state_dict(ori_state_dict)

    for name, param in net_gcn.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    optimizer = torch.optim.Adam(net_gcn.parameters(),
                                 lr=args['lr'],
                                 weight_decay=args['weight_decay'])
    acc_test = 0.0
    best_val_acc = {'val_acc': 0, 'epoch': 0, 'test_acc': 0}

    for epoch in range(args['total_epoch']):

        optimizer.zero_grad()
        output = net_gcn(features, adj)
        loss = loss_func(output[idx_train], labels[idx_train])
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            output = net_gcn(features, adj, val_test=True)
            acc_val = f1_score(labels[idx_val].cpu().numpy(),
                               output[idx_val].cpu().numpy().argmax(axis=1),
                               average='micro')
            acc_test = f1_score(labels[idx_test].cpu().numpy(),
                                output[idx_test].cpu().numpy().argmax(axis=1),
                                average='micro')
            if acc_val > best_val_acc['val_acc']:
                best_val_acc['val_acc'] = acc_val
                best_val_acc['test_acc'] = acc_test
                best_val_acc['epoch'] = epoch

        print(
            "(Fix Mask) Epoch:[{}] Val:[{:.2f}] Test:[{:.2f}] | Final Val:[{:.2f}] Test:[{:.2f}] at Epoch:[{}]"
            .format(epoch, acc_val * 100, acc_test * 100,
                    best_val_acc['val_acc'] * 100,
                    best_val_acc['test_acc'] * 100, best_val_acc['epoch']))

    return best_val_acc['val_acc'], best_val_acc['test_acc'], best_val_acc[
        'epoch']
Example no. 43
        fn += 1 iff test_group[i][0] == 'M' and classify(test_group[i]) == 'B'"""
        FP, FN = 0, 0
        size = len(test_group)
        for i in range(size):
            if self.root.find_class_by_example(
                    test_group[i]) == 'M' and test_group[i][0] == 'B':
                FP += 1
            elif self.root.find_class_by_example(
                    test_group[i]) == 'B' and test_group[i][0] == 'M':
                FN += 1
        loss = lost(FP, FN, size)
        return loss


if __name__ == '__main__':
    data = load_data("train.csv")
    classifier = ID3(data)
    classifier.train()
    tester = load_data("test.csv")
    classifier.test(tester, True)
    """loss calc"""
    # loss = classifier.test_by_loss(tester)
    # print(loss)
    """this is the experiment"""
    # experiment("train.csv")
    """this is the accuracy check with M = 1"""
    # classifier = ID3(data, 1)
    # classifier.train()
    # classifier.test(tester, True)
Example no. 44
import project1 as p1
import utils

#-------------------------------------------------------------------------------
# Data loading. There is no need to edit code in this section.
#-------------------------------------------------------------------------------

train_data = utils.load_data('reviews_train.tsv')
# val_data = utils.load_data('reviews_val.tsv')
# test_data = utils.load_data('reviews_test.tsv')

train_texts, train_labels = zip(*((sample['text'], sample['sentiment']) for sample in train_data))
# val_texts, val_labels = zip(*((sample['text'], sample['sentiment']) for sample in val_data))
# test_texts, test_labels = zip(*((sample['text'], sample['sentiment']) for sample in test_data))

# dictionary = p1.bag_of_words(train_texts)
# dictionary_no_stopwords = p1.bag_of_words_removed_stopwords(train_texts)

# print("Length of Normal Dictionary:", len(dictionary), "\nLength of Dictionary Without Stopwords and Punc:", len(dictionary_no_stopwords))

# train_bow_features = p1.extract_bow_feature_vectors(train_texts, dictionary)
# val_bow_features = p1.extract_bow_feature_vectors(val_texts, dictionary)
# test_bow_features = p1.extract_bow_feature_vectors(test_texts, dictionary)

# # get the feature vectors with stopwords removed, punctuation removed, and words counted with frequency
# train_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(train_texts, dictionary_no_stopwords)
# val_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(val_texts, dictionary_no_stopwords)
# test_bow_features_no_stopwords = p1.extract_bow_feature_vectors_with_frequency(test_texts, dictionary_no_stopwords)


# # get the final features
Example no. 45
model.summary()


x_input = tf.placeholder(tf.float32, [None, 224, 224, 3])
y = model(x_input)
t = tf.placeholder(tf.float32, [None, 10])
learning_rate = tf.placeholder(tf.float32, [])
cost = earth_mover_loss(t, y)
train = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(cost)
######################################################################################
from sklearn.utils import shuffle
from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from skimage.transform import resize

train_X_raw, train_y_raw = utils.load_data()

train_X_raw = train_X_raw
train_y_raw = train_y_raw

train_X_raw, test_X_raw, train_y_raw, test_y_raw = train_test_split(train_X_raw, train_y_raw, test_size=0.1, random_state=42)

test_X = np.zeros([0, image_size, image_size, 3])
for image in range(test_X_raw.shape[0]):
    pic = test_X_raw[image]
    img = resize(pic,(image_size,image_size,3))
    temp = np.reshape(img, (1, image_size,image_size,3))
    test_X = np.append(test_X, temp, axis=0)
test_y = test_y_raw
print('test size:')
print(test_X.shape)
Example no. 46
def main(argv=None):
    print("Loading training data..")
    train_data = load_data(FLAGS.train_prefix, load_walks=True)
    print("Done loading training data..")
    train(train_data)
Example no. 47
if __name__ == "__main__":

    # argparse
    args = get_train_args()

    # check path_to_save existence
    if os.path.exists(args.path_to_save_folder):
        raise FileExistsError("save path folder already exists")

    # set seed and device
    set_global_seed(args.seed)
    device = torch.device(args.device)

    # load data
    data = load_data(path=args.path_to_data, verbose=args.verbose)

    # char2idx
    char2idx = get_char2idx(data, verbose=args.verbose)

    # dataset, collator, dataloader
    train_dataset = LMDataset(
        data,
        char2idx,
        max_length=args.max_length,
        verbose=args.verbose,
    )
    train_collator = LMCollator(padding_value=char2idx[EOS], )
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.batch_size,
Example no. 48
import os
import numpy as np
import random
from config import config_setting
from model import Model
from utils import load_data
from train import train
from torch import nn

if __name__ == '__main__':
    cfg = config_setting()
    train_loader, test_loader, features = load_data(cfg)
    model = Model(features, cfg)
    if cfg.use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        model = model.cuda()
    print(model)
    train(model, train_loader, test_loader, features, cfg)
Example no. 49
def run():

    # Field mapping. And which fields should be turned into integers.
    # See https://en.wikipedia.org/wiki/Template:CongLinks for what's possibly available.
    fieldmap = {
        "congbio": "bioguide",
        #"fec": "fec", # handled specially...
        "govtrack":
        "govtrack",  # for sanity checking since we definitely have this already (I caught some Wikipedia errors)
        "opensecrets": "opensecrets",
        "votesmart": "votesmart",
        "cspan": "cspan",
    }
    int_fields = ("govtrack", "votesmart", "cspan")

    # default to not caching
    cache = utils.flags().get('cache', False)

    # Load legislator files and map bioguide IDs.
    y1 = utils.load_data("legislators-current.yaml")
    y2 = utils.load_data("legislators-historical.yaml")
    bioguides = {}
    for y in y1 + y2:
        bioguides[y["id"]["bioguide"]] = y

    # Okay now the Wikipedia stuff...

    def get_matching_pages():
        # Does a Wikipedia API search for pages containing either of the
        # two templates. Returns the pages.

        page_titles = set()

        for template in ("CongLinks", "CongBio"):
            eicontinue = ""
            while True:
                # construct query URL, using the "eicontinue" of the last query to get the next batch
                url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
                if eicontinue: url += "&eicontinue=" + eicontinue

                # load the XML
                print("Getting %s pages (%d...)" %
                      (template, len(page_titles)))
                dom = lxml.etree.fromstring(utils.download(
                    url, None, True))  # can't cache eicontinue probably

                for pgname in dom.xpath("query/embeddedin/ei/@title"):
                    page_titles.add(pgname)

                # get the next eicontinue value and loop
                eicontinue = dom.xpath(
                    "string(query-continue/embeddedin/@eicontinue)")
                if not eicontinue: break

        return page_titles

    # Get the list of Wikipedia pages that use any of the templates we care about.
    page_list_cache_file = os.path.join(utils.cache_dir(),
                                        "legislators/wikipedia/page_titles")
    if cache and os.path.exists(page_list_cache_file):
        # Load from cache.
        matching_pages = open(page_list_cache_file).read().split("\n")
    else:
        # Query Wikipedia API and save to cache.
        matching_pages = get_matching_pages()
        utils.write(("\n".join(matching_pages)), page_list_cache_file)

    # Filter out things that aren't actually pages (User:, Talk:, etcetera, anything with a colon).
    matching_pages = [p for p in matching_pages if ":" not in p]

    # Load each page's content and parse the template.
    for p in sorted(matching_pages):
        if " campaign" in p: continue
        if " (surname)" in p: continue
        if "career of " in p: continue
        if "for Congress" in p: continue
        if p.startswith("List of "): continue
        if p in ("New York in the American Civil War",
                 "Upper Marlboro, Maryland"):
            continue

        # Query the Wikipedia API to get the raw page content in XML,
        # and then use XPath to get the raw page text.
        url = "http://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(
            p.encode("utf8")) + "&export&exportnowrap"
        cache_path = "legislators/wikipedia/pages/" + p
        dom = lxml.etree.fromstring(utils.download(url, cache_path, not cache))
        page_content = dom.xpath(
            "string(mw:page/mw:revision/mw:text)",
            namespaces={"mw": "http://www.mediawiki.org/xml/export-0.8/"})

        # Build a dict for the IDs that we want to insert into our files.
        new_ids = {
            "wikipedia":
            p  # Wikipedia page name, with spaces for spaces (not underscores)
        }

        if "CongLinks" in page_content:
            # Parse the key/val pairs in the template.
            m = re.search(r"\{\{\s*CongLinks\s+([^}]*\S)\s*\}\}", page_content)
            if not m: continue  # no template?
            for arg in m.group(1).split("|"):
                if "=" not in arg: continue
                key, val = arg.split("=", 1)
                key = key.strip()
                val = val.strip()
                if val and key in fieldmap:
                    try:
                        if fieldmap[key] in int_fields: val = int(val)
                    except ValueError:
                        print("invalid value", key, val)
                        continue

                    if key == "opensecrets":
                        val = val.replace("&newMem=Y", "").replace(
                            "&newmem=Y", "").replace("&cycle=2004",
                                                     "").upper()
                    new_ids[fieldmap[key]] = val

            if "bioguide" not in new_ids: continue
            new_ids["bioguide"] = new_ids["bioguide"].upper()  # hmm
            bioguide = new_ids["bioguide"]

        else:
            m = re.search(r"\{\{\s*CongBio\s*\|\s*(\w+)\s*\}\}", page_content)
            if not m: continue  # no template?
            bioguide = m.group(1).upper()

        if bioguide not in bioguides:
            print(
                "Member not found: " + bioguide, p,
                "(Might have been a delegate to the Constitutional Convention.)"
            )
            continue

        # handle FEC ids specially because they are stored in an array...
        fec_id = new_ids.get("fec")
        if fec_id: del new_ids["fec"]

        member = bioguides[bioguide]
        member["id"].update(new_ids)

        # ...finish the FEC id.
        if fec_id:
            if fec_id not in bioguides[bioguide]["id"].get("fec", []):
                bioguides[bioguide]["id"].setdefault("fec", []).append(fec_id)

        #print p.encode("utf8"), new_ids

    utils.save_data(y1, "legislators-current.yaml")
    utils.save_data(y2, "legislators-historical.yaml")
Example no. 50
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (23, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    utt2ark, utt2idx, all_list, utt2data = {}, {}, [], {}
    for idx, kaldi_data_dir in enumerate(args.kaldi_data_dirs):
        if not os.path.exists(args.emb_out_dirs[idx]):
            os.makedirs(args.emb_out_dirs[idx])
        feats_path = os.path.join(kaldi_data_dir, 'feats.scp')
        vad_path = os.path.join(kaldi_data_dir, 'vad.scp')
        assert os.path.exists(feats_path), 'Path `{}` does not exists.'.format(feats_path)

        with open(feats_path) as f:
            for line in f:
                key, ark = line.split()
                ark, position = ark.split(':')
                input_tuple = (key, ark, int(position))
                utt2data[key] = ut.load_data(input_tuple, mode='eval')
                utt2idx[key] = idx

        with open(vad_path) as f:
            for line in f:
                key, ark = line.split()
                ark, position = ark.split(':')
                vad_array = None
                for ark_key, vec in kaldi_io.read_vec_flt_ark(ark):
                    if key == ark_key:
                        vad_array = np.array(vec, dtype=bool)
                assert vad_array is not None

                assert vad_array.size == utt2data[key].shape[1], 'Shapes do not fit: vad {}, mfcc {}'.format(
                    vad_array.size, utt2data[key].shape[1])
                utt2data[key] = ut.apply_cmvn_sliding(utt2data[key]).T[vad_array]

    # ==> load pre-trained model ???
    if os.path.isfile(args.resume):
        network_eval.load_weights(os.path.join(args.resume), by_name=True)
        print('==> successfully loaded model {}.'.format(args.resume))
    else:
        raise IOError("==> no checkpoint found at '{}'".format(args.resume))

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    for idx, utt in enumerate(utt2data):
        embedding = network_eval.predict(utt2data[utt].T[np.newaxis, :, :, np.newaxis]).squeeze()
        ut.write_txt_vectors(
            os.path.join(args.emb_out_dirs[utt2idx[utt]], 'xvector.{}.txt'.format(idx)), {utt: embedding})
Esempio n. 51
0
from keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger
from keras.optimizers import SGD
from models import simple_CNN
from utils import load_data, preprocess_input
import keras.backend as K
import tensorflow as tf

data_path = '../datasets/fer2013/fer2013.csv'
model_save_path = '../trained_models/simpler_CNN.hdf5'
faces, emotions = load_data(data_path)
faces = preprocess_input(faces)
num_classes = emotions.shape[1]
image_size = faces.shape[1:]
batch_size = 128
num_epochs = 1000

model = simple_CNN(image_size, num_classes)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
csv_logger = CSVLogger('training.log')
early_stop = EarlyStopping('val_acc', patience=200, verbose=1)
model_checkpoint = ModelCheckpoint(model_save_path, 'val_acc', verbose=1,
                                   save_best_only=True)

model_callbacks = [early_stop, model_checkpoint, csv_logger]
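# EarlyStopping halts training once val_acc has not improved for 200 epochs;
# ModelCheckpoint keeps only the best-scoring weights; CSVLogger records the history.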

# Work around a Keras/TensorFlow initialization issue by running the global
# variable initializer explicitly.
K.get_session().run(tf.global_variables_initializer())
model.fit(faces, emotions, batch_size, num_epochs, verbose=1,
          callbacks=model_callbacks)
Esempio n. 52
0
import logging
import os
from argparse import ArgumentParser

from deepy.trainers import SGDTrainer, LearningRateAnnealer, AdamTrainer
from deepy.layers import LSTM
from layers import FullOutputLayer
# NeuralLM and load_data are provided elsewhere in this project.

logging.basicConfig(level=logging.INFO)

default_model = os.path.join(os.path.dirname(__file__), "models",
                             "lstm_rnnlm.gz")

if __name__ == '__main__':
    ap = ArgumentParser()
    ap.add_argument("--model", default="")
    ap.add_argument("--small", action="store_true")
    args = ap.parse_args()

    vocab, lmdata = load_data(small=args.small, history_len=5, batch_size=64)
    model = NeuralLM(vocab.size)
    model.stack(
        LSTM(hidden_size=100,
             output_type="sequence",
             persistent_state=True,
             batch_size=lmdata.size,
             reset_state_for_input=0), FullOutputLayer(vocab.size))

    if os.path.exists(args.model):
        model.load_params(args.model)

    trainer = SGDTrainer(
        model, {
            "learning_rate": LearningRateAnnealer.learning_rate(1.2),
            "weight_l2": 1e-7,
        })
Esempio n. 53
0
    # parse training arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--epochs', type=int, default=10000, help='Number of epochs to train.')
    parser.add_argument('--lr', type=float, default=0.005, help='Initial learning rate.')
    parser.add_argument('--weight_decay', type=float, default=5e-4, help='Weight decay (L2 loss on parameters).')
    parser.add_argument('--hidden', type=int, default=8, help='Number of hidden units.')
    parser.add_argument('--n_heads', type=int, default=8, help='Number of attention heads.')
    parser.add_argument('--dropout', type=float, default=0.6, help='Dropout rate (1 - keep probability).')
    parser.add_argument('--alpha', type=float, default=0.2, help='Alpha for the leaky_relu.')
    parser.add_argument('--patience', type=int, default=100, help='Patience.')

    args = parser.parse_args()
    args.use_cuda = torch.cuda.is_available()
    
    # load data
    adj, features, labels, idx_train, idx_val, idx_test = load_data()
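    # load_data (from the project's utils) is expected to return the normalized
    # adjacency matrix, node features, labels, and the train/val/test index splits.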
    
    model = GAT(n_input = features.shape[1], n_hidden = args.hidden,
                n_classes = int(labels.max()) + 1, dropout = args.dropout,
                alpha = args.alpha, n_heads = args.n_heads)
    
    
    if args.use_cuda:
        model.cuda()
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
        idx_train = idx_train.cuda()
        idx_val = idx_val.cuda()
        idx_test = idx_test.cuda()
        
Esempio n. 54
0
                              shuffle=data_type == "train",
                              drop_last=False)
            data_loader = DataLoader(dataset,
                                     batch_sampler=sampler,
                                     collate_fn=EdgeSeqDataset.batchify,
                                     pin_memory=data_type == "train")
        data_loaders[data_type] = data_loader
        logger.info("data (data_type: {:<5s}, len: {}) generated".format(
            data_type, len(dataset.data)))
        logger.info(
            "data_loader (data_type: {:<5s}, len: {}, batch_size: {}) generated"
            .format(data_type, len(data_loader),
                    finetune_config["batch_size"]))
else:
    data = load_data(finetune_config["graph_dir"],
                     finetune_config["pattern_dir"],
                     finetune_config["metadata_dir"],
                     num_workers=finetune_config["num_workers"])
    logger.info("{}/{}/{} data loaded".format(len(data["train"]),
                                              len(data["dev"]),
                                              len(data["test"])))
    for data_type, x in data.items():
        if finetune_config["model"] in ["RGCN", "RGIN", "RSIN"]:
            if os.path.exists(
                    os.path.join(finetune_config["save_data_dir"],
                                 "%s_dgl_dataset.pt" % (data_type))):
                dataset = GraphAdjDataset(list())
                dataset.load(
                    os.path.join(finetune_config["save_data_dir"],
                                 "%s_dgl_dataset.pt" % (data_type)))
            else:
                dataset = GraphAdjDataset(x)
Esempio n. 55
0
prompt += ('====================================\n')
print(prompt, end='')

with open('{0}/opt.txt'.format(run_dir), 'w') as f:
    f.write(prompt)

if torch.cuda.is_available():
    # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # os.environ["CUDA_VISIBLE_DEVICES"] = "{0}".format(opt.gpu)
    torch.cuda.set_device(opt.gpu)
    # device = torch.device('cuda:{0}'.format(opt.gpu))
# Configure data loader

import utils
trainset, trainset2, testset = utils.load_data(opt=opt)
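# InfiniteSampler presumably yields indices without end, so the two training
# loaders can be drawn from indefinitely; the test loader is a plain shuffled one.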
train_loader = torch.utils.data.DataLoader(trainset,
                                           batch_size=opt.batch_size,
                                           drop_last=True,
                                           sampler=InfiniteSampler(
                                               len(trainset)))  # model
train_loader2 = torch.utils.data.DataLoader(trainset2,
                                            batch_size=opt.batch_size,
                                            drop_last=True,
                                            sampler=InfiniteSampler(
                                                len(trainset2)))  # model
test_loader = torch.utils.data.DataLoader(testset,
                                          batch_size=opt.batch_size,
                                          shuffle=True,
                                          drop_last=True)  # model
Esempio n. 56
0
    trend_rdd = yeargrowth_rdd \
        .map(lambda row: ((row[0][0], row[0][1]),
                          str(row[0][2]) + ":" + utils.prettify_growth(row[1]))) \
        .groupByKey() \
        .map(lambda row: (row[1], row[0])) \
        .cache()
    # A .mapValues(iterate) step can be inserted after groupByKey to inspect the
    # contents of each trend.
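    # trend_rdd is keyed by its collection of "year:growth" strings, so the
    # self-join below pairs up companies whose growth trends coincide.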

    similartrendingcompanies_rdd = trend_rdd.join(trend_rdd) \
        .filter(filter_couples) \
        .collect()

    # To sort each pair and drop symmetric duplicates ((A, B) vs. (B, A)) explicitly:
    # .map(lambda row: (row[0], tuple(sorted(row[1])))).distinct()
    # In practice the self-join already seems to emit each combination only once.

    for kv in similartrendingcompanies_rdd:
        print(kv)


if __name__ == "__main__":
    spark = utils.create_session("job2")
    sc = spark.sparkContext
    # sqlContext = SQLContext(sc)

    history_rdd = utils.load_data(spark, HISTORY_PATH, preview=False)
    legend_rdd = utils.load_data(spark, LEGEND_PATH, preview=False)
    run_job(history_rdd, legend_rdd)
Esempio n. 57
0
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))

    publicTest = pd.read_csv("/content/VoveDataset/public-test.csv")

    list1 = addPath(np.array(publicTest["audio_1"]))
    list2 = addPath(np.array(publicTest["audio_2"]))

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)
    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load the pre-trained model
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = "/content/VGG-Speaker-Recognition/result"
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(pbar(unique_list)):
        specs = ut.load_data(ID,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
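        # Add batch and channel axes: (freq, time) -> (1, freq, time, 1).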
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)

        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)
    np.save("/content/feats.npy", feats)
Esempio n. 58
0
	preds_train = aclf.predict(x_train)
	return train_accuracy, test_accuracy, f1(y_train, preds_train), f1(y_test, preds)
###################################################
# Modify for running your experiments accordingly #
###################################################
if __name__ == '__main__':
	args = load_args()
	accuracy_train = []
	f1_tscore = []
	accuracy = []
	f1_score = []
	x_axis = np.arange(1, 26)
	max_features = [1,2,5,8,10,20,25,35,50]
	# x_axis = np.arange(10,220,10)
	random = np.arange(1,11)
	x_train, y_train, x_test, y_test = load_data(args.root_dir)
	L = np.arange(10,210,10)
	if args.county_dict == 1:
		county_info(args)
	if args.decision_tree == 1:
		for x in range(1, 26):  # depth values must line up with x_axis = np.arange(1, 26)
			train_acc, test_acc, f1_train, f1_test = decision_tree_testing(x_train, y_train, x_test, y_test,x)
			accuracy_train.append(train_acc)
			accuracy.append(test_acc)
			f1_tscore.append(f1_train)
			f1_score.append(f1_test)
		plt.plot(x_axis, accuracy, label="Testing Accuracy")
		plt.plot(x_axis, f1_score, label = "Training F1 Score")
		plt.plot(x_axis, accuracy_train, label = "Training Accuracy")
		plt.plot(x_axis, f1_tscore, label = "Testing F1 score")
		plt.ylabel("Accuracy")
Esempio n. 59
0
    for proposal_folder in proposal_folders:
        fn_clusters = sorted(glob.glob(os.path.join(proposal_folder, fn_node_pattern)))
        proposals.extend(fn_clusters)
    assert len(proposals) == len(probs)

    pos_lst = []
    for idx, prob in enumerate(probs):
        if prob < args.th_pos:
            continue
        pos_lst.append([idx, prob])
    pos_lst = sorted(pos_lst, key=lambda x:x[1], reverse=True)
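    # pos_lst now holds [index, probability] pairs for proposals whose confidence
    # reaches th_pos, ordered from most to least confident.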

    # get all clusters
    clusters = []
    for idx, _ in tqdm(pos_lst):
        cluster = load_data(proposals[idx])
        clusters.append(cluster)

    idx2lb, idx2lbs = nms(clusters, args.th_iou)
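    # idx2lb maps each instance to its final cluster label; idx2lbs keeps every
    # label an instance received, which is used below to count multi-label cases.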

    # output stats
    multi_lb_num = 0
    for _, lbs in idx2lbs.items():
        if len(lbs) > 1:
            multi_lb_num += 1
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))

    print('#inst: {}, #class: {}, #multi-label: {}'.format(inst_num, cls_num, multi_lb_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))
Esempio n. 60
0
from config import args
from utils import load_data, build_vocab, gen_submission, gen_final_submission, eval_based_on_outputs
from model import Model

if __name__ == '__main__':
    if not args.pretrained:
        print('No pretrained model specified.')
        exit(1)  # abort: a pretrained model is required
    build_vocab()

    if args.test_mode:
        dev_data = load_data('../data/test-data-processed.json')
    else:
        dev_data = load_data('../data/dev-data-processed.json')
    model_path_list = args.pretrained.split(',')
    for model_path in model_path_list:
        print('Load model from %s...' % model_path)
        args.pretrained = model_path
        model = Model(args)

        # evaluate on development dataset
        dev_acc = model.evaluate(dev_data)
        print('dev accuracy: %f' % dev_acc)

        # generate submission zip file for Codalab
        prediction = model.predict(dev_data)
        gen_submission(dev_data, prediction)

    gen_final_submission(dev_data)
    eval_based_on_outputs('./answer.txt')