def evaluate_multi(url, url2, time_lags=24):
    cr = Crawling()
    preds = utils.load_file(url)
    preds = np.array(preds)
    lt = len(preds)
    labels = utils.load_file(url2)
    labels = np.array(labels)
    loss_mae0, loss_mae1 = 0.0, 0.0
    loss_rmse0, loss_rmse1 = 0.0, 0.0
    r2_0, r2_1 = 0.0, 0.0
    for i, d in enumerate(preds):
        lb_i = i * pr.strides + time_lags + 1
        mae0, mse0, r2 = get_evaluation(d[:time_lags, :], labels[lb_i:(lb_i + time_lags), :, 0])
        # mae1, mse1 = get_evaluation(d[:time_lags,:,1], labels[lb_i:(lb_i+time_lags),:,1])
        loss_rmse0 += mse0
        # loss_rmse1 += mse1
        loss_mae0 += mae0
        # loss_mae1 += mae1
        r2_0 += r2
    loss_mae0 = loss_mae0 / lt * 300
    loss_mae1 = loss_mae1 / lt * 300
    loss_rmse0 = sqrt(loss_rmse0 / lt) * 300
    loss_rmse1 = sqrt(loss_rmse1 / lt) * 300
    r2_0 = r2_0 / lt
    print("MAE: %.6f %.6f" % (loss_mae0, cr.ConcPM25(loss_mae0)))
    print("RMSE: %.6f %.6f" % (loss_rmse0, cr.ConcPM25(loss_rmse0)))
    print("R2 Score: %.6f" % r2_0)
def execute_gan(path, attention_url, url_weight, model, session, saver, batch_size, encoder_length, decoder_length, is_test, train_writer=None, offset=0):
    # if restore and not is_test:
    #     tf.reset_default_graph()
    #     print(tf.get_default_graph())
    # with tf.device('/%s' % p.device):
    #     model.init_ops(not is_test)
    #     # model.add_placeholders()
    # trainable_vars = tf.trainable_variables()
    # saver = tf.train.Saver(trainable_vars)
    print("==> Loading dataset")
    dataset = utils.load_file(path)
    if dataset:
        dataset = np.asarray(dataset, dtype=np.float32)
    lt = len(dataset)
    train, _ = utils.process_data_grid(lt, batch_size, encoder_length, decoder_length, True)
    attention_data = None
    if attention_url:
        attention_data = utils.load_file(attention_url)
    model.set_data(dataset, train, None, attention_data)
    # with tf.Session(config=gpu_configs) as session:
    #     init = tf.global_variables_initializer()
    #     session.run(init)
    model.assign_datasets(session)
    if not is_test:
        print("start training")
        for epoch in xrange(100):
            _ = model.run_epoch(session, train, offset + epoch, train_writer, train=True, verbose=False, stride=2)
        saver.save(session, 'weights/%s.weights' % url_weight)
    else:
        # saver.restore(session, url_weight)
        print('==> running model')
        _, preds = model.run_epoch(session, train, train=False, verbose=False, shuffle=False, stride=2)
        save_gan_preds(url_weight, preds)
def get_grammatical_data(train_filename, test_filename, dict_filename,
                         translate_emojis=True, replace_slang=True, lowercase=True):
    # Load the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/tokens/tokens_" + train_filename)
    train_pos = utils.load_file(path + "/res/pos/pos_" + train_filename)
    test_tokens = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    test_pos = utils.load_file(path + "/res/pos/pos_" + test_filename)
    if translate_emojis and replace_slang and lowercase:
        save_path = path + "/res/data/finest_grammatical_"
    else:
        save_path = path + "/res/data/grammatical_"
    # Clean the data and bring it to the most *grammatical* form possible
    gramm_train = grammatical_clean(
        train_tokens, train_pos, path + "/res/" + dict_filename, save_path + train_filename,
        translate_emojis=translate_emojis, replace_slang=replace_slang, lowercase=lowercase)
    gramm_test = grammatical_clean(
        test_tokens, test_pos, path + "/res/" + dict_filename, save_path + test_filename,
        translate_emojis=translate_emojis, replace_slang=replace_slang, lowercase=lowercase)
    return gramm_train, gramm_test
def get_traffic(**kwargs):
    global TRAFFIC_WRAPPER
    # t0 = time.time()
    if TRAFFIC_WRAPPER is None:
        wrapperFile = 'wrappers/traffic_wrapper.json'
        synonyms = load_synonyms('./datasets/sinonimos.csv')
        words = load_words()
        if os.path.isfile(wrapperFile):
            with open(wrapperFile, 'r+') as rwjson:
                TRAFFIC_WRAPPER = ClassifierWrapper()
                TRAFFIC_WRAPPER.jsonLoads(rwjson.read())
                TRAFFIC_WRAPPER.dataset.dataset = list(load_file('./datasets/traffic2.csv'))
                TRAFFIC_WRAPPER.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.words = copy.deepcopy(words)
                TRAFFIC_WRAPPER.dataset.synonyms = copy.deepcopy(synonyms)
                TRAFFIC_WRAPPER.dataset.words = copy.deepcopy(words)
                return TRAFFIC_WRAPPER
        clf = kwargs.pop('clf', LogisticRegression(C=8.5))
        dataWrapperDataset = list(load_file('./datasets/traffic2.csv'))
        dataWrapper = DataWrapper(dataset=dataWrapperDataset, synonyms=copy.deepcopy(synonyms), words=copy.deepcopy(words))
        dataWrapper.resolveMatrix()
        wrapper = ClassifierWrapper(clf=clf, dataset=dataWrapper, synonyms=copy.deepcopy(synonyms), words=copy.deepcopy(words))
        cross_validate = kwargs.pop('cross_validate', True)
        if cross_validate:
            wrapper.cross_validate()
        wrapper.train()
        # print time.time() - t0, "seconds from the multiclass classifier"
        TRAFFIC_WRAPPER = wrapper
        with open(wrapperFile, 'w') as rw_json:
            json.dump(TRAFFIC_WRAPPER.toDict(), rw_json)
    return TRAFFIC_WRAPPER
def main(data_path):
    if path.exists(data_path + "/dict_char_en.pkl"):
        dict_char_en = utils.load_file(data_path + "/dict_char_en.pkl")
    else:
        dict_char_en = generateCharacterDict(properties.en_char)
        utils.save_file(data_path + "/dict_char_en.pkl", dict_char_en)
    if path.exists(data_path + "/dict_char_vi.pkl"):
        dict_char_vi = utils.load_file(data_path + "/dict_char_vi.pkl")
    else:
        dict_char_vi = generateCharacterDict(properties.vi_char)
        utils.save_file(data_path + "/dict_char_vi.pkl", dict_char_vi)
    if path.exists(data_path + "/dict_en.pkl"):
        dict_en = utils.load_file(data_path + "/dict_en.pkl", True)
    else:
        dict_en = build_dictionary(data_path, properties.vocab_en)
        utils.save_file(data_path + "/dict_en.pkl", dict_en, True)
    if path.exists(data_path + "/dict_vi.pkl"):
        dict_vi = utils.load_file(data_path + "/dict_vi.pkl", True)
    else:
        dict_vi = build_dictionary(data_path, properties.vocab_vi)
        utils.save_file(data_path + "/dict_vi.pkl", dict_vi, True)
    dataset_en, unknown_en = map_sentence_idx(data_path + "/" + properties.train_en, dict_en, dict_char_en)
    dataset_vi, unknown_vi = map_sentence_idx(data_path + "/" + properties.train_vi, dict_vi, dict_char_vi)
    utils.save_file(data_path + "/dataset_en.pkl", (dataset_en, unknown_en))
    utils.save_file(data_path + "/dataset_vi.pkl", (dataset_vi, unknown_vi))
def read_data(args):
    data_sources = []
    header = ''
    if (args.f != None):
        if not isinstance(args.f, basestring):
            parts = []
            for afile in args.f:
                part_of_data = utils.load_file(afile)
                if args.e != None and args.e == 'y':
                    if header == '':
                        header = part_of_data[0]
                    part_of_data = part_of_data[1:len(part_of_data)]
                parts.append(part_of_data.tolist())
                if args.s != None and args.s == 'y':
                    parts.append("\n")
            parts = [item for sublist in parts for item in sublist]
            data_sources = array(parts)
        else:
            data_sources = utils.load_file(args.f)
    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)
    return output
def evaluate_transportation(url, url2, pred_length=8):
    preds = utils.load_file(url)
    preds = np.array(preds)
    lt = len(preds)
    labels = utils.load_file(url2)
    labels = np.array(labels)
    labels = labels.reshape(len(labels), 32, 32)
    shape = np.shape(preds)
    if preds.shape[-1] < pred_length:
        print("data shape is ", preds.shape)
        # clamp the requested horizon to what the predictions actually contain
        pred_length = preds.shape[-1]
    loss_mae0 = [0.0] * pred_length
    loss_rmse0 = [0.0] * pred_length
    r2_total = 0.0
    for i, d in enumerate(preds):
        # 8 is encoder_length
        lb_i = i + 8
        # labels[lb_i:(pred_length+lb_i),:,:]
        for x in xrange(pred_length):
            mae0, mse0, _ = get_evaluation(d[x, :, :], labels[lb_i + x, :, :])
            # mae0, mse0, r2 = get_evaluation(d[0,:,:], labels[lb_i,:,:])
            loss_rmse0[x] += mse0
            loss_mae0[x] += mae0
            # r2_total += r2
    loss_mae0 = [(x / lt * 131) for x in loss_mae0]
    loss_rmse0 = [(sqrt(x / lt) * 131) for x in loss_rmse0]
    # r2_total = r2_total / lt
    # print("MAE: %.6f" % loss_mae0)
    # print("RMSE: %.6f" % loss_rmse0)
    # print("R2 Score: %.6f" % r2_total)
    print_accumulate_error(loss_mae0, loss_rmse0, pred_length, 0)
def get_data(vocabs=""): print("==> Load Word Embedding") word_embedding = utils.load_glove(use_index=True) validation_data = [] training_data = [] if not vocabs: non_words = utils.load_file(p.non_word, False) for w in non_words: w_ = w.replace('\n', '').split(' ') validation_data.append(int(w_[-1])) training_data = utils.sub(range(len(word_embedding)), validation_data) else: vocabs_set = utils.load_file(vocabs) print("vc", len(vocabs_set)) training_data = [w for _, w in vocabs_set.iteritems()] tm = range(len(word_embedding)) validation_data = list(utils.sub(set(tm), set(training_data))) length = int(math.ceil(len(training_data) * 1.0 / p.compression_batch_size)) * p.compression_batch_size - len(training_data) print('before', 'vd', len(validation_data), 'td', len(training_data)) if length: add_on = np.random.choice(validation_data, length) training_data += add_on.tolist() validation_data = utils.sub(set(validation_data), set(add_on)) print('vd', len(validation_data), 'td', len(training_data)) # utils.save_file(p.glove_path, training_data) return word_embedding, training_data, validation_data
def execute_gan(path, attention_url, label_path, url_weight, model, session, saver, batch_size, encoder_length, decoder_length, is_test, train_writer=None, offset=0, gpu_nums=1):
    print("==> Loading dataset")
    dataset = utils.load_file(path)
    if dataset:
        dataset = np.asarray(dataset, dtype=np.float32)
    lt = len(dataset)
    train, _ = utils.process_data_grid(lt, batch_size, encoder_length, decoder_length, True)
    attention_data = None
    if attention_url:
        attention_data = utils.load_file(attention_url)
    labels = None
    if label_path:
        labels = utils.load_file(label_path)
    model.set_data(dataset, train, None, attention_data, labels)
    model.assign_datasets(session)
    if not is_test:
        print('==> starting training')
        suffix = p.weight_saving_break
        for epoch in xrange(p.total_iteration):
            _ = model.run_epoch(session, train, offset + epoch, train_writer, train=True, verbose=False, stride=4)
            tmp_e = epoch + 1
            if tmp_e % 10 == 0:
                suffix = math.ceil(float(tmp_e) / p.weight_saving_break)
                # utils.update_progress((epoch + 1) * 1.0 / p.total_iteration)
                saver.save(session, 'weights/%s_%i.weights' % (url_weight, suffix))
        saver.save(session, 'weights/%s_%i.weights' % (url_weight, suffix))
    else:
        # saver.restore(session, url_weight)
        print('==> running model')
        _, preds = model.run_epoch(session, train, train=False, verbose=False, shuffle=False, stride=2)
        save_gan_preds(url_weight, preds)
def convert_vocab_to_text(vocabs):
    vocab_str = ""
    length = len(vocabs)
    i = 0
    vocab_idx = dict()
    vocab_lst = list()
    idx_file = '%s/%s' % (folder, 'vocabs_idx.pkl')
    if u.check_file(idx_file):
        vocab_idx = u.load_file(idx_file)
    else:
        for key, value in vocabs.iteritems():
            vocab_idx[value] = key
        u.save_file(idx_file, vocab_idx)
    lst_file = '%s/%s' % (folder, 'vocabs_list.pkl')
    if u.check_file(lst_file):
        vocab_lst = u.load_file(lst_file)
    else:
        for key in sorted(vocab_idx.iterkeys()):
            vocab_lst.append(vocab_idx[key])
        u.save_file(lst_file, vocab_lst)
    regex = RegexpTokenizer(r'\w+')
    for w in vocab_lst:
        words = regex.tokenize(w)
        if len(words) != 0:
            w_ = '_'.join(words)
            i += 1
            if i % 10000 == 0:
                print('Processed %i' % i)
                # break
            if i == length:
                vocab_str += '%s' % w_
            else:
                vocab_str += '%s\n' % w_
    return vocab_str
def build_emoji_sentiment_dictionary():
    new_emoji_sentiment_filename = path + "/res/emoji/emoji_sentiment_dictionary.txt"
    if not os.path.exists(new_emoji_sentiment_filename):
        filename = path + "/res/emoji/emoji_sentiment_raw.txt"
        emojis = utils.load_file(filename)[1:]
        lines = []
        for line in emojis:
            line = line.split(",")
            emoji = line[0]
            occurences = line[2]
            negative = float(line[4]) / float(occurences)
            neutral = float(line[5]) / float(occurences)
            positive = float(line[6]) / float(occurences)
            description = line[7]
            lines.append(str(emoji) + "\t" + str(negative) + "\t" + str(neutral)
                         + "\t" + str(positive) + "\t" + description.lower())
        utils.save_file(lines, new_emoji_sentiment_filename)
    emoji_sentiment_data = utils.load_file(new_emoji_sentiment_filename)
    emoji_sentiment_dict = {}
    for line in emoji_sentiment_data:
        line = line.split("\t")
        # Get emoji characteristics as a list [negative, neutral, positive, description]
        emoji_sentiment_dict[line[0]] = [line[1], line[2], line[3], line[4]]
    return emoji_sentiment_dict
def exe(word_vectors_file, vector_preloaded_path, train_path, dev_path, test_path,
        hsi, hso, maxlen, pep, fep, ppat, fpat, plr, flr, mix):
    global word_vectors, vocabs
    if os.path.exists(train_path) and os.path.exists(dev_path) and os.path.exists(test_path):
        train = utils.load_file(train_path)
        dev = utils.load_file(dev_path)
        test = utils.load_file(test_path)
    else:
        raise NotImplementedError()
    if word_vectors is None or vocabs is None:
        word_vectors, vocabs = utils.loadWordVectors(word_vectors_file, vector_preloaded_path)
    if not maxlen:
        maxlen = properties.maxlen
    lstm = Model(word_vectors, hidden_sizes=[hsi, hso], epochs=pep, patience=ppat, learning_rate=plr)
    lstm_params = lstm.train(train, dev, test, maxlen)
    # compare strings with ==, not identity (`is`)
    if mix == 'Y':
        combined = LSTM_CNN(word_vectors, hidden_sizes=[hsi, hso], epochs=fep, lstm_params=lstm_params)
        combined.train(train, dev, test, maxlen)
def evaluate_single_pred(url, url2, decoder_length=8):
    cr = Crawling()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, 25))
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        pred_t = np.asarray(d).flatten()
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        lbg = lbt[decoder_length - 1, :].flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 score: %.6f" % r2_total)
def prepare_data(shuffle=False, labels_to_categorical=True):
    path = os.getcwd()[:os.getcwd().rfind("/")]
    to_write_filename = path + "/stats/data_prep_for_lstm_visualization.txt"
    utils.initialize_writer(to_write_filename)
    train_filename = "train.txt"
    test_filename = "test.txt"
    tokens_filename = "clean_original_"  # other types of tokens to experiment with in /res/tokens/
    data_path = path + "/res/tokens/tokens_"
    # Load the data
    train_data = utils.load_file(data_path + tokens_filename + train_filename)
    test_data = utils.load_file(data_path + tokens_filename + test_filename)
    if shuffle:
        train_data = utils.shuffle_words(train_data)
        test_data = utils.shuffle_words(test_data)
        print("DATA IS SHUFFLED")
    # Load the labels
    train_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + train_filename)]
    test_labels = [int(l) for l in utils.load_file(path + "/res/datasets/ghosh/labels_" + test_filename)]
    # Get the max length of the train tweets
    max_tweet_length = utils.get_max_len_info(train_data)
    # Convert all tweets into sequences of word indices
    tokenizer, train_indices, test_indices = utils.encode_text_as_word_indexes(train_data, test_data, lower=True)
    vocab_size = len(tokenizer.word_counts) + 1
    word_to_index = tokenizer.word_index
    print("There are %s unique tokens." % len(word_to_index))
    # Pad sequences with 0s (can do it post or pre - post works better here)
    x_train = pad_sequences(train_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)
    x_test = pad_sequences(test_indices, maxlen=max_tweet_length, padding="post", truncating="post", value=0.)
    # Transform the output into categorical data or just keep it as it is (in a numpy array)
    if labels_to_categorical:
        train_labels = to_categorical(np.asarray(train_labels))
        test_labels = to_categorical(np.asarray(test_labels))
    else:
        train_labels = np.array(train_labels)
        test_labels = np.array(test_labels)
    return x_train, train_labels, x_test, test_labels, vocab_size, tokenizer, max_tweet_length
def evaluate_sp(url, url2, decoder_length=24, is_grid=True, grid_eval=True):
    cr = Crawling()
    map_ = heatmap.build_map()
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    if len(data.shape) == 4:
        lt = data.shape[0] * data.shape[1]
    else:
        lt = data.shape[0]
    if is_grid:
        data = np.reshape(data, (lt, data.shape[-2], 25, 25))
    else:
        data = np.reshape(data, (lt, data.shape[-2], 25))
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    loss_mae = 0.0
    loss_rmse = 0.0
    r2_total = 0.0
    for i, d in enumerate(data):
        d = d[:decoder_length, :, :]
        pred_t = []
        if is_grid:
            for d_ in d:
                d_t = heatmap.clear_interpolate_bound(np.asarray(d_), map_)
                pred_t.append(d_t)
        else:
            if grid_eval:
                for d_ in d:
                    d_t = heatmap.fill_map(d_, map_)
                    pred_t.append(d_t)
            else:
                pred_t = d
        lb_i = i * pr.strides + 24
        lbt = labels[lb_i:(lb_i + decoder_length), :, 0]
        if grid_eval:
            lbg = []
            for x in lbt:
                x_l = heatmap.fill_map(x, map_)
                lbg.append(x_l)
            lbg = np.asarray(lbg)
            lbg = lbg.flatten()
        else:
            lbg = lbt.flatten()
        pred_t = np.asarray(pred_t)
        pred_t = pred_t.flatten()
        mae, mse, r2 = get_evaluation(pred_t, lbg)
        loss_mae += mae
        loss_rmse += mse
        r2_total += r2
        utils.update_progress((i + 1.0) / lt)
    loss_mae = loss_mae / lt * 300
    loss_rmse = sqrt(loss_rmse / lt) * 300
    r2_total = r2_total / lt
    print("MAE: %.6f %.6f" % (loss_mae, cr.ConcPM25(loss_mae)))
    print("RMSE: %.6f %.6f" % (loss_rmse, cr.ConcPM25(loss_rmse)))
    print("R2 Score: %.6f" % r2_total)
def accuracy_metric(sample_list_file, result_file):
    sample_list = load_file(sample_list_file)
    group = {'CW': [], 'CH': [], 'TN': [], 'TC': [], 'DC': [], 'DL': [], 'DO': []}
    for id, row in sample_list.iterrows():
        qns_id = str(row['video']) + '_' + str(row['qid'])
        qtype = str(row['type'])
        # (combine temporal qns of previous and next as 'TN')
        if qtype == 'TP':
            qtype = 'TN'
        group[qtype].append(qns_id)
    preds = load_file(result_file)
    group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
    overall_acc = {'C': 0, 'T': 0, 'D': 0}
    overall_cnt = {'C': 0, 'T': 0, 'D': 0}
    all_acc = 0
    all_cnt = 0
    for qtype, qns_ids in group.items():
        cnt = 0
        acc = 0
        for qid in qns_ids:
            cnt += 1
            answer = preds[qid]['answer']
            pred = preds[qid]['prediction']
            if answer == pred:
                acc += 1
        group_cnt[qtype] = cnt
        group_acc[qtype] += acc
        overall_acc[qtype[0]] += acc
        overall_cnt[qtype[0]] += cnt
        all_acc += acc
        all_cnt += cnt
    for qtype, value in overall_acc.items():
        group_acc[qtype] = value
        group_cnt[qtype] = overall_cnt[qtype]
    for qtype in group_acc:
        print(map_name[qtype], end='\t')
    print('')
    for qtype, acc in group_acc.items():
        print('{:.2f}'.format(acc * 100.0 / group_cnt[qtype]), end='\t')
    print('')
    print('Acc: {:.2f}'.format(all_acc * 100.0 / all_cnt))
def main():
    parser = get_parser()
    args = parser.parse_args()
    if args.doc:
        print __doc__
        sys.exit()
    g = geosearchclass.GeoSearchClass()
    if args.params:
        print 'Using parameters from ' + str(args.params)
        # turn parameter file into dictionary
        g.set_params_from_file(args.params)
    if args.address:
        print "Finding geocoordates for address:\n{}".format(args.address)
        coords = geo_converter.get_geocoords_from_address(args.address)
        if coords:
            g.latitude = coords[0]
            print "Found this latitude:"
            print g.latitude
            g.longitude = coords[1]
            print "Found this longitude:"
            print g.longitude
        else:
            print "Failed to find coordinates. Exiting."
            sys.exit()
    if args.input:
        text = utils.load_file(args.input)
        tokens = utils.tokenize_normal_words(text)
        for_poem = utils.filter_words(tokens)
    else:
        for_poem = get_default_words()
    if args.markov:
        if args.input:
            raise StandardError("Can only input a single text file. \
use --markov <your_text_file.txt>")
        else:
            text = utils.load_file(args.markov)
            # ngram = ngrams.make_ngram(text, 2)
            ngram = ngrams.make_bigram_trigram_dictionary(text)
            formatted_poem = create_poem(g, for_poem, ngram)
    else:
        formatted_poem = create_poem(g, for_poem)
    if args.output:
        print '\nwriting formatted poem to ' + str(args.output)
        output_file = args.output
    else:
        print "\nwriting formatted poem to poem.txt"
        output_file = "poem.txt"
    utils.save_file(output_file, formatted_poem)
def load_trained_params(self):
    lstm = utils.load_file('lstm_cb.txt')
    hidden_lstm = utils.load_file('hidden_cb.txt')
    hidden_relu_lstm = utils.load_file('hidden_relu_cb.txt')
    full_connect_lstm = utils.load_file('full_connect_cb.txt')
    convs = list()
    for x in range(len(self.filter_sizes)):
        conv = utils.load_file('convolution_%s.txt' % x)
        convs.append(conv)
    return lstm, hidden_lstm, hidden_relu_lstm, full_connect_lstm, convs
def get_filtered_clean_data(train_filename, test_filename):
    # Load the train and test sets
    print("Loading data...")
    train_tokens = utils.load_file(path + "/res/data/" + train_filename)
    test_tokens = utils.load_file(path + "/res/data/" + test_filename)
    filtered_train_tokens = ulterior_clean(train_tokens, path + "/res/data/filtered_" + train_filename)
    filtered_test_tokens = ulterior_clean(test_tokens, path + "/res/data/filtered_" + test_filename)
    return filtered_train_tokens, filtered_test_tokens
def evaluate_lstm(url, url2, decoder_length=24, forecast_factor=0, is_classify=False):
    data = utils.load_file(url)
    if type(data) is list:
        data = np.asarray(data)
    lt = data.shape[0] * data.shape[1]
    data = np.reshape(data, (lt, data.shape[-1]))
    if decoder_length > data.shape[-1]:
        decoder_length = data.shape[-1]
    dtl = len(data)
    labels = utils.load_file(url2)
    labels = np.asarray(labels)
    if not is_classify:
        loss_mae = [0.0] * decoder_length
        loss_rmse = [0.0] * decoder_length
    else:
        acc = 0.
    # r2_total = 0.0
    cr = Crawling()
    for i, d in enumerate(data):
        if decoder_length < data.shape[-1]:
            pred_t = d[:decoder_length]
        else:
            pred_t = d
        lb_i = i * pr.strides + 24
        lbt = np.mean(labels[lb_i:(lb_i + decoder_length), :, forecast_factor], axis=1)
        a = 0.
        for t_i, (p, l) in enumerate(zip(pred_t, lbt)):
            if not is_classify:
                # mae, mse, _ = get_evaluation(p, l)
                mae = abs(cr.ConcPM10(p * 300) - cr.ConcPM10(l * 300))
                loss_mae[t_i] += mae
                # loss_rmse[t_i] += mse
            else:
                a += classify_data(pred_t, lbt, forecast_factor)
        if is_classify:
            a = a / decoder_length
            acc += a
        # r2_total += r2
        utils.update_progress((i + 1.0) / dtl)
    if not is_classify:
        loss_mae = np.array(loss_mae) / lt
        # loss_rmse = [sqrt(x / lt) * 300 for x in loss_rmse]
        # print("R2 score: %.6f" % r2_total)
        print_accumulate_error(loss_mae, loss_rmse, decoder_length, forecast_factor=forecast_factor)
    else:
        acc = acc / lt * 100
        print("accuracy %.4f" % acc)
def get_strict_data(train_filename, test_filename):
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/data/" + train_filename)
    test_tweets = utils.load_file(path + "/res/data/" + test_filename)
    # Initial clean of data
    strict_tweets_train = strict_clean(train_tweets, path + "/res/data/strict_" + train_filename)
    strict_tweets_test = strict_clean(test_tweets, path + "/res/data/strict_" + test_filename)
    return strict_tweets_train, strict_tweets_test
def __init_docs(self, file_names):
    docs = []  # type: List[Document]
    for f_name in file_names:
        # Read the text document
        text = load_file(self.root_path, f_name, "txt")  # type: str
        # Read the entities and entity pairs
        ann_data = load_file(self.root_path, f_name, "ann")
        entities, entity_pairs = self.__get_entities_and_pairs(ann_data)  # type: NamedEntitySet, List[EntityPair]
        d = Document(f_name, self.root_path, text, entities, entity_pairs)
        docs.append(d)
    self._docs = docs
def transform(self, X, **transform_params):
    docs_topics_vectors = []
    lda_model = load_file("models/LDAbow_fbpac.pickle")
    lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
    for doc in X:
        try:
            bow_vector = lda_dictionary.doc2bow(pre_process(doc))
            docs_topics_vectors.append(lda_model[bow_vector])
        except Exception as e:
            print(e)
            print("Error in computing topic vector")
    n, nx, ny = np.array(docs_topics_vectors).shape
    d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
    return d2_all_docs[:, 1::2]
def main(urls, file=False):
    global loaded
    a_load = utils.load_file('cached.pkl')
    if a_load:
        loaded = a_load
    else:
        loaded = dict()
    if file:
        urls = utils.load_file(urls)
        # bad = utils.load_file('sitemap_bad.txt')
    elif urls:
        urls = urls.split(',')
    scrape_list(urls)
    utils.save_file('cached.pkl', loaded)
def get_clean_dl_data(train_filename, test_filename, word_list):
    vocab_filename = "dnn_vocabulary_" + train_filename
    # Load the train and test sets
    print("Loading data...")
    train_tweets = utils.load_file(path + "/res/tokens/tokens_" + train_filename)
    test_tweets = utils.load_file(path + "/res/tokens/tokens_" + test_filename)
    vocabulary = build_vocabulary_for_dnn_tasks(path + "/res/vocabulary/" + vocab_filename, train_tweets)
    clean_train_tweets, train_indices = vocabulary_filtering(vocabulary, train_tweets)
    clean_test_tweets, test_indices = vocabulary_filtering(vocabulary, test_tweets)
    return clean_train_tweets, train_indices, clean_test_tweets, test_indices, len(vocabulary)
def convert_data_to_grid(url, out_url, url_att="", out_url_att="", part=1):
    grid = heatmap.build_map(pr.map_size)
    data = utils.load_file(url)
    lt = len(data)
    attention_data = None
    att_part = None
    print(url_att)
    if url_att:
        attention_data = utils.load_file(url_att)
        alt = len(attention_data)
        if lt != alt:
            raise ValueError("Attention & Main Data need same length while %s and %s" % (lt, alt))
        data = zip(data, attention_data)
        att_part = []
    res = []
    if part != 1:
        bound = int(math.ceil(float(lt) / part))
    else:
        bound = lt
    for i, row in enumerate(data):
        if url_att:
            t, a = row
        else:
            t = row
        if i and (i % bound) == 0:
            p_i = i / bound
            out_url_name = out_url + "_" + str(p_i)
            utils.save_file(out_url_name, res)
            if url_att:
                att_out_url_name = out_url_att + "_" + str(p_i)
                utils.save_file(att_out_url_name, att_part)
            res = []
            att_part = []
        g = heatmap.fill_map(t, grid)
        res.append(g)
        if url_att:
            att_part.append(a)
        utils.update_progress(float(i) / lt)
    if part == 1:
        out_url_name = out_url
    else:
        out_url_name = out_url + "_" + str(part)
    utils.save_file(out_url_name, res)
    if url_att:
        att_out_url_name = out_url_att + "_" + str(part)
        utils.save_file(att_out_url_name, att_part)
def load_multi_data(args):
    header = []
    data = {}
    if (args.f != None):
        if not isinstance(args.f, basestring):
            for afile in args.f:
                file_lines = utils.load_file(afile)
                count = 0
                for line in file_lines:
                    org_line = line
                    line = line.rsplit(',')
                    if (len(line) <= 1):
                        line = org_line.rsplit("\t")
                    if (count == 0):
                        if (len(header) > 0):
                            header.append(', '.join(line[1:len(line)]))
                        else:
                            header.append(', '.join(line))
                        count = count + 1
                        continue
                    if (line[0] not in data):
                        data[line[0]] = []
                    data[line[0]].append(', '.join(line[1:len(line)]))
    return (header, data)
def pdb_to_lh5(traj, field):
    path = getattr(traj, field)
    data = load_file(path)
    new_fn = os.path.splitext(path)[0] + '.lh5'
    save_file(new_fn, data)
    os.unlink(path)
    setattr(traj, field, new_fn)
def read_data(args): data_sources = [] header = "" if args.f != None: parts = [] afile = args.f part_of_data = utils.load_file(afile) if args.e != None and args.e == "y": if header == "": header = part_of_data[0] part_of_data = part_of_data[1 : len(part_of_data)] part_of_data = calculat_hsv_model(part_of_data, args.t) parts.append(part_of_data) parts = [item for sublist in parts for item in sublist] data_sources = array(parts) output = [] if header != "": output.append(header) for item in data_sources: output.append(item) return output
def main(args):
    # Load file content.
    content = load_file(args.input, encoding=args.encoding)
    # Clean content.
    cleaned = clean(content, args.pattern)
    # Save cleaned content.
    save_file(args.output, cleaned, encoding=args.encoding)
def split_hashtag_long_version(hashtag):
    word_file = path + "/res/word_list.txt"
    word_list = utils.load_file(word_file).split()
    word_dictionary = list(set(words.words()))
    for alphabet in "bcdefghjklmnopqrstuvwxyz":
        word_dictionary.remove(alphabet)
    all_poss = split_hashtag_to_words_all_possibilities(hashtag.lower(), word_dictionary)
    max_p = 0
    min_len = 1000
    found = False
    best_p = []
    for poss in all_poss:
        counter = 0
        for p in poss:
            if p in word_list:
                counter += 1
        if counter == len(poss) and min_len > counter:
            found = True
            min_len = counter
            best_p = poss
        else:
            if counter > max_p and not found:
                max_p = counter
                best_p = poss
    best_p_v2 = split_hashtag(hashtag, word_list)
    if best_p != [] and best_p_v2 != []:
        split_words = best_p if len(best_p) < len(best_p_v2) else best_p_v2
    else:
        if best_p == [] and best_p_v2 == []:
            split_words = [hashtag]
        else:
            split_words = best_p if best_p_v2 == [] else best_p_v2
    split_words = ['#' + str(s) for s in split_words]
    return split_words
def read_data(args):
    [operator, comparing_field_index, threshold] = args.t.rsplit(',')
    data_sources = []
    header = ''
    if (args.f != None):
        parts = []
        afile = args.f
        part_of_data = utils.load_file(afile)
        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]
        part_of_data = filter_content(part_of_data, operator, comparing_field_index, threshold)
        parts.append(part_of_data)
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)
    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)
    return output
def read_data(args):
    data_sources = []
    header = ''
    if (args.f != None):
        parts = []
        afile = args.f
        part_of_data = utils.load_file(afile)
        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]
        part_of_data = calculat_facedetection_model(part_of_data, args.t, args.c)
        parts.append(part_of_data)
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)
    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)
    return output
def on_return(self, task):
    """Called by main thread on the return of data from the workers.
    Post-processing"""
    logger.info('Retrieved task %s', task.tag)
    traj = Session.query(models.Trajectory).get(int(task.tag))
    try:
        # save lh5 version of the trajectory
        conf = load_file(self.project.pdb_topology_file)
        coordinates = msmbuilder.Trajectory.load_trajectory_file(str(traj.dry_xtc_fn), Conf=conf)
        save_file(traj.lh5_fn, coordinates)
    except Exception as e:
        logger.error('When postprocessing %s, convert to lh5 failed!', traj)
        logger.exception(e)
        raise
    # convert last_wet_snapshot to lh5
    pdb_to_lh5(traj, 'last_wet_snapshot_fn')
    pdb_to_lh5(traj, 'init_pdb_fn')
    traj.host = task.host
    traj.returned_time = datetime.now()
    traj.length = len(coordinates)
    logger.info('Finished converting new traj to lh5 sucessfully')
def on_open1_activate(self, widget=None, file=''):
    self.on_quit2_activate()  # this takes care of saving content yes/no
    if not file:
        dlg = FileDialog(action='open', title=_("Open GvR world"), ext='wld')
        response = dlg.run()
        if response == Gtk.ResponseType.OK:
            file = dlg.get_filename()
            if os.path.splitext(file)[1] != '.wld':
                self.show_error(_("Selected path is not a world file"))
                dlg.destroy()
                return
        elif response == Gtk.ResponseType.CANCEL:
            self.logger.debug('Closed, no files selected')
            dlg.destroy()
            return
        dlg.destroy()
    txt = utils.load_file(file)
    if txt:
        self.set_text(file, txt)
        self.parent.on_button_reload()
    return
def on_open1_activate(self, widget=None, file=''):
    if not file:
        dlg = FileDialog(action='open', title=_("Open GvR program"), ext='gvr')
        response = dlg.run()
        if response == Gtk.ResponseType.OK:
            file = dlg.get_filename()
            if os.path.splitext(file)[1] != '.gvr':
                self.show_error(_("Selected path is not a program file"))
                dlg.destroy()
                return
        elif response == Gtk.ResponseType.CANCEL:
            self.logger.debug('Closed, no files selected')
            dlg.destroy()
            return
        dlg.destroy()
    txt = utils.load_file(file)
    if txt:
        self.set_text(file, txt)
        for b in ('execute', 'step', 'abort'):
            self.parent._set_sensitive_button(b, True)
    return
def read_file(afile, fields, filters, data_sources, data_labels, padding=None):
    part_of_data = utils.load_file(afile)
    part_of_data = part_of_data[1:len(part_of_data)]
    content = {}
    count = 0
    for line in part_of_data:
        line_fields = line.rsplit(',')
        if (len(line_fields) == 1):
            line_fields = line.rsplit("\t")
        selected_line = ''
        if (len(filters) == 0 or data_labels[count] in filters):
            selected_line = [line_fields[int(index)] for index in fields]
        else:
            if padding != None:
                selected_line = padding
            else:
                selected_line = ['0' for index in fields]
        # content.append(','.join(selected_line))
        content[line_fields[0]] = ','.join(selected_line)
        count += 1
    output_content = []
    for item in data_sources:
        if (item in content):
            output_content.append(content[item])
        else:
            print("key " + item + " is not in input file.")
    return "\n".join(output_content)
def main():
    parser = argparse.ArgumentParser(description='Exporting data matrix from HIT summary result.')
    parser.add_argument('-f', action='append', help='The CSV files.')
    parser.add_argument('-c', help='The exporting columns separated with comma.')
    parser.add_argument('-o', help='The output file.')
    parser.add_argument('-t', help='The types used to filter out data row.')
    parser.add_argument('-p', default='0', help='The padding for filtered rows.')
    parser.add_argument('-d', help='The data source file.')
    args = parser.parse_args()
    data_sources = []
    data_labels = []
    data_ids = []
    if (args.d != None):
        data_sources = utils.load_file(args.d)
        data_metainfo = regex_datasource(data_sources)
        # data_labels: flickr high interesting 1, flickr low interesting 2, pinterest [3, 4, 5]
        data_labels = data_metainfo[0]
        # data_ids: (flickr, pinterest) image id
        data_ids = data_metainfo[1]
    output = read_data(args, data_sources, data_labels)
    if (args.o != None):
        utils.write_file(output, args.o)
def load_rules(path, *, encoding='utf-8'):
    """Load and parse a file with a 'pseudo' tree of rules.

    Args:
        path: string with a path to the file with rules,
        encoding: encoding of the file (default='utf-8')

    Return:
        dict: dictionary with a pseudo tree structure, representing
            the hierarchy of rules

    NOTE: Expected file structure:
        +- rule1_from --> rule1_to
        |  +- sub_rule1_from --> sub_rule1_to
        |  |  +- sub_sub_rule1_from --> sub_sub_rule1_to
        +- rule2_from --> rule2_to
        +- rule3_from --> rule3_to
        |  +- sub_rule3_from --> sub_rule3_to
    """
    # Load rules from a file.
    raw_rules = load_file(path, encoding=encoding)
    # Parse rules into 'pseudo' tree structure.
    return parse_rules(list(raw_rules))
def read_file(filename, n_fold):
    data_sources = []
    parts = []
    part_of_data = utils.load_file(filename)
    part_of_data = part_of_data[1:len(part_of_data)]
    part_of_data = filter_content(part_of_data)
    parts.append(part_of_data)
    parts = [item for sublist in parts for item in sublist]
    data_sources = array(parts)
    random.shuffle(data_sources)
    data_count_limit = len(data_sources) / n_fold
    folds = []
    count = 0
    for begin_index in range(0, len(data_sources), data_count_limit):
        end_index = begin_index + data_count_limit
        if (count == n_fold - 1):
            end_index = len(data_sources)
        print("begin: " + str(begin_index))
        print("end: " + str(end_index))
        folds.append(data_sources[begin_index:end_index])
        count += 1
    return folds
def read_data(args):
    data_sources = []
    header = ''
    if (args.f != None):
        parts = []
        afile = args.f
        part_of_data = utils.load_file(afile)
        if args.e != None and args.e == 'y':
            if header == '':
                header = part_of_data[0]
            part_of_data = part_of_data[1:len(part_of_data)]
        part_of_data = calculat_hsv_model(part_of_data, args.t)
        parts.append(part_of_data)
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)
    output = []
    if header != '':
        output.append(header)
    for item in data_sources:
        output.append(item)
    return output
def main():
    parser = argparse.ArgumentParser(description='Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help='The mtk data source file.')
    parser.add_argument('-o', help='The output file of used data.')
    args = parser.parse_args()
    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
    random.shuffle(data_sources)
    db_collections = hit.setup_mongodb()
    data_metainfo = hit.regex_datasource(data_sources)
    images_metainfo = hit.query_imagedata_from_db(db_collections, data_metainfo)
    # data_labels: flickr high interesting 1, flickr low interesting 2, pinterest [3, 4, 5]
    data_labels = data_metainfo[0]
    # data_ids: (flickr, pinterest) image id
    data_ids = data_metainfo[1]
    data_count_limit = 50
    for begin_index in range(0, len(data_sources), data_count_limit):
        print("index: " + str(begin_index))
        generate_hits(data_sources[begin_index:begin_index + data_count_limit], begin_index, args,
                      data_ids[begin_index:begin_index + data_count_limit], images_metainfo)
    sys.exit(0)
def main():
    parser = argparse.ArgumentParser(description='Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help='The mtk data source file.')
    parser.add_argument('-o', help='The output file of used data.')
    parser.add_argument('-m', default='normal', help='The running mode in {normal, qua_init, qua}.')
    parser.add_argument('-q', help='The qualification type id.')
    parser.add_argument('-t', default='sandbox', help='The type of Mechanical Turk.')
    args = parser.parse_args()
    if (args.m == 'qua' and args.q == None):
        print('Please give qualification type id if running in qualification mode.')
        sys.exit(0)
    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
    if (args.m != 'qua'):
        random.shuffle(data_sources)
    data_count_limit = 100
    for begin_index in range(0, len(data_sources), data_count_limit):
        print("index: " + str(begin_index))
        generate_hits(args.t, data_sources[begin_index:begin_index + data_count_limit], begin_index, args)
    sys.exit(0)
def welcome(update, context):
    chat_id = update.effective_chat.id
    for new_member in update.message.new_chat_members:
        chat_title = update.message.chat.title
        if not new_member.is_bot:
            first_name = new_member.first_name
            last_name = new_member.last_name
            username = new_member.username
            if last_name is None:
                last_name = ""
            message = WELCOME_NEW_MEMBER_MESSAGE.format(
                first_name=first_name,
                last_name=last_name,
                username=username,
                chat_title=chat_title,
            )
            context.bot.sendPhoto(
                chat_id=chat_id,
                photo=load_file(
                    context=context,
                    pickle_file=DEFAULT_PICKEL_FILE_PHOTO,
                    default_photo=DEFAULT_PHOTO_WELCOME,
                    chat_id=chat_id,
                ),
                caption=message,
            )
        elif new_member.full_name == "PyLadies Brasil Bot":
            context.bot.send_message(chat_id=chat_id, text=HELLO_MESSAGE.format(chat_title=chat_title))
def add_hits(filename, all_hits):
    hits = utils.load_file(filename)
    hits = hits[0:len(hits)]
    all_hits.append(hits)
    return all_hits
def read_data(args):
    x_column = int(args.x)
    y_column = int(args.y)
    bin_length = int(args.b)
    threshold = float(args.t)
    x_threshold = int(args.d)
    data_sources = []
    if (args.f != None):
        parts = []
        afile = args.f
        part_of_data = utils.load_file(afile)
        part_of_data = calculat_hsv_figure(part_of_data, x_column, y_column, bin_length, threshold, x_threshold)
        parts.append(part_of_data)
        parts = [item for sublist in parts for item in sublist]
        data_sources = array(parts)
    output = []
    for item in data_sources:
        output.append(item)
    return output
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)
    mapped = list(do_mapping(content_generator, rules))
    formatted = format_data(mapped)
    save_file(args.output, formatted, encoding=args.encoding)
def load_csv(path, encoding='utf-8'):
    for item in load_file(path, encoding=encoding):
        splitted = item.split(';')
        # Change to SINGLE word file format
        splitted[2] = splitted[2][:-3]
        splitted[3] = splitted[3][:-3]
        yield splitted
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)
    mapped = do_mapping(content_generator, rules)
    cleaned = clean(mapped)
    formatted = mlf_format_data(cleaned)
    save_file(args.output, formatted, encoding=args.encoding)
def load(cls):
    """
    Load rules from config file.
    """
    if cls._loaded_rules is None:
        log("Loading contextual rules...", "CYAN", True)
        lx = load_file("corpus/contextual_rules.rls")
        cls._loaded_rules = [r for r in lx.split(u"\n") if len(r) > 1]
    return cls._loaded_rules
def load(cls): if cls._loaded_rules is None: log("Loading lemmatizer rules...", "CYAN", True) lx = load_file("corpus/lemmatizer_rules.rls") cls._loaded_rules = [] for line in lx.split(u"\n"): els = line.split(u"\t") if els[0] != u"": cls._loaded_rules.append(els[0]) return cls._loaded_rules
def main(args):
    # Load alignment of the phonemes
    alignment = load_mlf_to_dict(args.mlf, clean=False)
    # Merge items in the mlf if the output is from the AP decoder
    if (args.a):
        alignment = process_ap(alignment)
    # Load paths to the processed WAV files; they should correspond to the alignment
    paths = load_file(args.scp)
    # Load WAV files
    waves = get_wave(paths)
    # Use the Noiser library to change the tempo
    process(waves, alignment, args.output, int(args.tempo), args.skip)
def setUp(self):
    super(TestPublish, self).setUp()
    self.user = User('*****@*****.**')
    self.login('*****@*****.**')
    self.issue = models.Issue(subject='test')
    self.issue.local_base = False
    self.issue.put()
    self.ps = models.PatchSet(parent=self.issue, issue=self.issue)
    self.ps.data = load_file('ps1.diff')
    self.ps.save()
    self.patches = engine.ParsePatchSet(self.ps)
    db.put(self.patches)
def setUp(self):
    super(TestStatusListener, self).setUp()
    self.user = users.User('*****@*****.**')
    self.login('*****@*****.**')
    self.issue = models.Issue(subject='test')
    self.issue.local_base = False
    self.issue.put()
    self.ps = models.PatchSet(parent=self.issue.key, issue_key=self.issue.key)
    self.ps.data = load_file('ps1.diff')
    self.ps.put()
    self.patches = engine.ParsePatchSet(self.ps)
    ndb.put_multi(self.patches)
    self.logout()  # Need to log out for /status_listener to work
def read_data(args):
    data_sources = []
    if (args.f != None):
        if not isinstance(args.f, basestring):
            parts = []
            for afile in args.f:
                part_of_data = utils.load_file(afile)
                parts.append(part_of_data.tolist())
            parts = [item for sublist in parts for item in sublist]
            data_sources = array(parts)
        else:
            data_sources = utils.load_file(args.f)
    hits = []
    head_of_hits = ''
    if (args.m != None):
        if not isinstance(args.m, basestring):
            parts = []
            for afile in args.m:
                part_of_data = utils.load_file(afile)
                if head_of_hits == '':
                    head_of_hits = part_of_data[0]
                part_of_data = part_of_data[1:len(part_of_data)]
                parts.append(part_of_data.tolist())
            parts = [item for sublist in parts for item in sublist]
            hits = array(parts)
        else:
            hits = utils.load_file(args.m)
            hits = hits[0]
            hits = hits[1:len(hits)]
    return (data_sources, hits, head_of_hits)
def main():
    parser = argparse.ArgumentParser(description='Generate HITs for Amazon Mechanical Turk workers.')
    parser.add_argument('-f', help='The mtk data source file.')
    parser.add_argument('-o', help='The output file of used data.')
    args = parser.parse_args()
    data_sources = []
    if (args.f != None):
        data_sources = utils.load_file(args.f)
    random.shuffle(data_sources)
    generate_hits(data_sources, args)
    sys.exit(0)
def loaded(self):
    """
    Load lexicon in RAM, from file.
    The representation will be a dict {"word1": [{tag1: lemme1}]}
    """
    if not self.PATH in self._loaded:  # Caching and lazy loading
        sulci_logger.debug("Loading lexicon...", "RED", True)
        lx = load_file("%s/lexicon.lxc" % self.PATH)
        self._loaded[self.PATH] = {}
        for line in lx.split("\n"):
            if line:
                lexicon_entity = LexiconEntity(line)
                self.add_factors(lexicon_entity.word)
                self._loaded[self.PATH][lexicon_entity.word] = lexicon_entity
    return self._loaded[self.PATH]
def read_file(filename):
    data_sources = []
    parts = []
    part_of_data = utils.load_file(filename)
    part_of_data = part_of_data[1:len(part_of_data)]
    part_of_data = filter_content(part_of_data)
    parts.append(part_of_data)
    parts = [item for sublist in parts for item in sublist]
    dictionary = {}
    for item in parts:
        dictionary[item] = 1
    return dictionary