from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def run(input_file, test_file, k):
    clf = RandomForestClassifier(n_estimators=k)
    df = preprocess_data(input_file)
    X, label_dict, feature_dict = extract_features(df)
    r, c = X.shape
    dft = preprocess_testdata(test_file)
    Xt, yt = extract_testfeatures(dft, label_dict, feature_dict)
    # The last column of X holds the labels; everything before it is a feature.
    clf.fit(X[:, 0:c - 1], X[:, c - 1])
    z = clf.predict(Xt)
    print(accuracy_score(yt, z))
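# Hypothetical invocation of run(); the file names and tree count below are
# placeholders, not taken from the original project.
if __name__ == '__main__':
    run('train.csv', 'test.csv', k=100)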
def build_model_input():
    print("Import class names")
    cols = ['Label', 'Latin Name', 'Common Name', 'Train Images',
            'Validation Images']
    info = pd.read_csv("monkey_labels.txt", names=cols, skiprows=1)
    global LABELS
    LABELS = info['Common Name']

    util.display_image('training/n0/n0018.jpg')

    print("Data augmentation/Preprocessing")
    height, width, channels = 299, 299, 3
    train_datagen = ImageDataGenerator(rescale=1. / 255)
    train_generator = train_datagen.flow_from_directory(
        TRAIN_DIR,
        target_size=(height, width),
        batch_size=BATCH_SIZE,
        class_mode='categorical')
    test_datagen = ImageDataGenerator(rescale=1. / 255)
    test_generator = test_datagen.flow_from_directory(
        TEST_DIR,
        target_size=(height, width),
        batch_size=BATCH_SIZE,
        class_mode='categorical')

    print("Import pretrained Xception model")
    base_model = Xception(weights=INCEPTION_DIR,
                          include_top=False,
                          input_shape=(height, width, channels))

    print("Extract features")
    train_features, train_labels = util.extract_features(
        1097, BATCH_SIZE, train_generator, base_model)
    test_features, test_labels = util.extract_features(
        272, BATCH_SIZE, test_generator, base_model)
    return train_features, train_labels, test_features, test_labels
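# A minimal sketch of what util.extract_features above might look like; this is
# an assumption based on the common Keras pattern of running a frozen base model
# over a generator, not the project's actual helper. The 10x10x2048 output shape
# matches Xception with include_top=False on 299x299 inputs.
import numpy as np


def extract_features(sample_count, batch_size, generator, base_model):
    features = np.zeros((sample_count, 10, 10, 2048))
    labels = np.zeros((sample_count, generator.num_classes))
    i = 0
    for inputs_batch, labels_batch in generator:
        # Run the frozen convolutional base on one batch of images.
        batch_features = base_model.predict(inputs_batch)
        n = min(batch_features.shape[0], sample_count - i)
        features[i:i + n] = batch_features[:n]
        labels[i:i + n] = labels_batch[:n]
        i += n
        if i >= sample_count:
            # The generator loops indefinitely, so stop once we have enough samples.
            break
    return features, labels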
def configure_df(static, dynamic, params):
    # consolidate static/dynamic features, select features, scale values
    master_df = consolidate_features(static, dynamic)

    # configure clustering dataframe
    features = util.extract_features(params['clustering_features'])
    cluster_df = util.choose_features(master_df,
                                      ['dt_entropy', 'num_unique_words'])

    # return master feature df and scaled cluster df
    return master_df, scale_features(cluster_df)
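# scale_features is not defined in this snippet. A possible implementation,
# shown only as an illustration: z-score scaling via sklearn's StandardScaler
# is an assumption, not the project's actual helper.
import pandas as pd
from sklearn.preprocessing import StandardScaler


def scale_features(df):
    # Standardise every column to zero mean and unit variance, keeping the
    # original index and column names.
    scaled = StandardScaler().fit_transform(df)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)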
def _extract_optic_flow_features(info_df: pd.DataFrame, dir_path: str,
                                 pickle_path: str):
    data = None
    print('Extracting optic flow features!')
    for index, row in info_df.iterrows():
        for camera_no in [1, 2]:
            file_path = os.path.join(
                dir_path, row['Initials'], str(row['Session']),
                f"{row['Session']}K{camera_no}play{row['Segment_No']}.txt")
            current = pd.read_csv(file_path, delim_whitespace=True, header=None)
            child, therapist = current.iloc[:, :3].copy(), current.iloc[:, 3:]
            for p in range(2):
                # Columns 0-2 belong to one person, columns 3-5 to the other.
                cur_person = current.iloc[:, p * 3:(p + 1) * 3].copy()
                cur_person.columns = ['x', 'y', 'm']
                cur_person['Key'] = row['Key']
                # Drop sentinel rows (-10000000 marks missing optic flow).
                cur_feats = util.extract_features(
                    cur_person[cur_person['x'] != -10000000],
                    columns=['x', 'y', 'm'])
                cur_feats = cur_feats.reset_index()
                cur_feats['CameraNo'], cur_feats['Person'] = camera_no, p
                if data is None:
                    data = cur_feats.copy()
                elif len(cur_feats) == 0:
                    # No valid samples: reuse the previous feature row,
                    # filled with zeros.
                    prev_feats.loc[:, 'x_mean':'m_var'] = 0
                    prev_feats['CameraNo'], prev_feats['Person'] = camera_no, p
                    prev_feats['Key'] = row['Key']
                    data = pd.concat([data, prev_feats], ignore_index=True)
                else:
                    data = pd.concat([data, cur_feats], ignore_index=True)
                prev_feats = cur_feats.copy()
    data.to_pickle(pickle_path)
    return data
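# util.extract_features is not defined in this snippet. Based on the column
# names it appears to produce ('x_mean' ... 'm_var') and the reset_index()/'Key'
# handling above, a plausible (assumed) implementation is a per-key
# mean/variance aggregation:
def extract_features(df, columns):
    agg = df.groupby('Key')[columns].agg(['mean', 'var'])
    # Flatten the MultiIndex columns to x_mean, x_var, ..., m_var.
    agg.columns = ['%s_%s' % (col, stat) for col, stat in agg.columns]
    return agg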
        if reverse_word_dict[word_id_list[idx]] not in kb_dict:
            kb_dict[reverse_word_dict[word_id_list[idx]]] = [
                int(item) for item in line.strip('\r\n').split(',')]
f.close()

kb_emb_dict = {}
with open(sys.argv[13], 'r') as f:
    for idx, line in enumerate(f):
        # key is word index in dictionary, value is embedding vector
        kb_emb_dict[idx] = [float(item)
                            for item in line.strip('\r\n').split(',')]

new_feature_dict = {}
for k, v in word_dict.items():
    features = extract_features(use_other_info, k, word_dict, word_emb_dict,
                                pos_dict, pos_emb_dict, parser_dict,
                                parser_emb_dict, dict_desc_dict,
                                kb_dict, kb_emb_dict)
    if features is not None:
        new_feature_dict[k] = features
    else:
        sys.stderr.write("%s feature extraction failed\n" % k)
print(len(new_feature_dict))

counter = 0
real_counter = 0
top_1 = 0
top_3 = 0
for com_item in analogy_list:
    counter += 1
    flag = False
def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    train_toks = tokenizer.tokenize(open(args.train_file).read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]

    val_toks = tokenizer.tokenize(open(args.val_file).read())
    num_val_toks = int(args.val_fraction * len(val_toks))
    print('Using %d tokens for validation (%g%% of %d)' %
          (num_val_toks, 100 * args.val_fraction, len(val_toks)))
    val_toks = val_toks[:num_val_toks]

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' %
          (min(args.vocab, len(train_ngram_counts[0])),
           len(train_ngram_counts[0])))
    vocab = [tup[0] for tup, _ in train_ngram_counts[0].most_common(args.vocab)]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    if args.features == 'basic1':
        feature_extractor = util.basic_features1
    elif args.features == 'basic1suffix3':
        feature_extractor = util.basic_features1_suffix3  # TODO: Implement
    elif args.features == 'basic2':
        feature_extractor = util.basic_features2
    else:
        raise ValueError('Unknown feature extractor type.')

    # We cheat and cache features for the validation data as well to make
    # things faster for this assignment. The correct thing to do would be
    # to extract features from train_toks only:
    #   f2i, fcache, num_feats_cached, x2ys = \
    #       util.extract_features(train_toks, feature_extractor)
    f2i, fcache, num_feats_cached, x2ys = \
        util.extract_features(train_toks + val_toks, feature_extractor)
    print('%d feature types extracted' % len(f2i))
    print('%d feature values cached for %d window types' %
          (num_feats_cached, len(fcache)))

    for seed in [82, 95, 11, 29, 49, 8, 42, 36, 71, 65]:
        # The language model assumes a truncated vocab and a feature definition.
        lm = util.LogLinearLanguageModel(args.model, vocab, args.unk,
                                         feature_extractor, f2i, fcache, x2ys,
                                         init=args.init, lr=args.lr,
                                         check_interval=args.check_interval,
                                         seed=seed)
        if args.test:
            # Load trained parameters.
            lm.load()
        else:
            # Estimate parameters.
            lm.train(train_toks, val_toks, args.epochs)

        val_ppl = lm.test(val_toks)
        print('Optimized Perplexity: %f' % val_ppl)
        with open('results/1.8.txt', 'a') as sample:
            print('%d,%f' % (seed, val_ppl), file=sample)
        print('-' * 79)
        for (i, f, w) in lm.topK_feats(args.K):
            print('{:10d}: {:40s} ({:8.4f})'.format(i, f, w))
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_val_score

data_dir = '../../data'
cleaned_file = 'trauma_los_cleaned.csv'
extract_file = 'trauma_los_cleaned_extract.csv'
desired_headings = [
    'sex', 'normalvitals', 'gcs1', 'iss8', 'age65', 'transfer', 'penetrating',
    'mechcode', 'bodyregions', 'headany', 'faceany', 'neckany', 'chestany',
    'abdoany', 'spineany', 'upperlimbany', 'lowerlimbany', 'head3', 'face3',
    'neck3', 'chest3', 'abdo3', 'spine3', 'upper3', 'lower3', 'operation',
    'neurosurgery', 'laparotomy', 'thoracotomy', 'married', 'english',
    'mentalhealth', 'comorbidity', 'ssa'
]

# trim the input file into only the features we want to use
extract_features(data_dir, cleaned_file, extract_file, desired_headings)
X, y, headings = separate_data(True, data_dir, extract_file)

# convert all strings to ints
X, y = map(lambda x: list(map(int, x)), X), map(int, y)
X, y = np.array(list(X)), np.array(list(y))
n_samples, n_features = X.shape

# create a stratified 10-fold cross-validation splitter; the train/test
# indices come from cv.split(X, y)
cv = StratifiedKFold(n_splits=10, shuffle=True)

# create a support vector machine classifier
clf_svm = svm.SVC(kernel='linear', probability=True)

# create a Gaussian naive Bayes classifier
clf_gauss_nb = GaussianNB()
def queryaudio():
    result_song = ''
    begin = 0
    end = 0
    artists = []
    song_artist = []

    if request.method == "POST":
        print("FORM DATA RECEIVED")
        if "file" not in request.files:
            return redirect(request.url)
        file = request.files["file"]
        # file1 = file.filename.replace(".mp3", ".wav")
        # sound = AudioSegment.from_mp3(file.filename)
        # sound.export(file1, format="wav")
        if file.filename == "":
            return redirect(request.url)

        if file:
            def first_feature(feat, index_song, temp, u):
                # Re-index the stored feature vectors of the matched song so we
                # can locate where inside that song the query segment starts.
                vectors = []
                for i in range(int(temp[index_song]), int(temp[index_song + 1])):
                    vectors.append(u.get_item_vector(i))
                z = AnnoyIndex(100, metric='angular')
                for i, v in enumerate(vectors):
                    z.add_item(i, v)
                z.build(100)
                x = []
                for i in range(int(feat.shape[0] / 50)):
                    crop_feat = util.crop_feature(feat, i, nb_step=10)
                    result1 = z.get_nns_by_vector(crop_feat, n=1)
                    x.append(result1[0])
                return x

            def most_common(values):
                return mode(values)

            def begin_second(count_s_begin):
                count = 0
                for i in range(0, len(count_s_begin), 5):
                    k = count_s_begin[i:i + 5]
                    try:
                        return most_common(k) - count
                    except Exception:
                        pass
                    count += 1

            ef, sr = librosa.load(file.filename, sr=16000)
            exact_feature = util.extract_features(ef)
            results, count_s_begin = util.predict(exact_feature, songs, model1)

            # Keep the top five matches; file names follow "<song> -<artist>.mp3".
            result_song = []
            top = 5
            for q in range(top):
                k = results[q][0]
                song = k.split("E:/Computer Science/HK5/CS336/Audio_query/data/")[1]
                song_artist.append(song)
                s_a = song.split(" -")
                artists.append(s_a[1].split(".mp3")[0])
                result_song.append(s_a[0])

            # Estimate where the query starts inside the best-matching song.
            index_song = list_song.index(result_song[0] + " -" + artists[0] + ".mp3")
            second_begin = first_feature(exact_feature, index_song, time, model1)
            begin = int(begin_second(second_begin)) * 0.1
            end = begin + (len(ef) / 16000)

    return render_template("queryaudio.html",
                           songs=result_song,
                           song_artist=song_artist,
                           artists=artists,
                           begin=begin,
                           end=round(end, 1))
import numpy as np
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

data_dir = '../../data'
cleaned_file = 'trauma_los_cleaned.csv'
extract_file = 'trauma_los_cleaned_extract.csv'
desired_headings = [
    'sex', 'normalvitals', 'gcs1', 'iss8', 'age65', 'transfer', 'penetrating',
    'mechcode', 'bodyregions', 'headany', 'faceany', 'neckany', 'chestany',
    'abdoany', 'spineany', 'upperlimbany', 'lowerlimbany', 'head3', 'face3',
    'neck3', 'chest3', 'abdo3', 'spine3', 'upper3', 'lower3', 'operation',
    'neurosurgery', 'laparotomy', 'thoracotomy', 'married', 'english',
    'mentalhealth', 'comorbidity', 'ssa'
]

# trim the input file into only the features we want to use
extract_features(data_dir, cleaned_file, extract_file, desired_headings)
X, y, headings = separate_data(True, data_dir, extract_file)

# convert all strings to ints
X, y = map(lambda x: list(map(int, x)), X), map(int, y)
X, y = np.array(list(X)), np.array(list(y))
n_samples, n_features = X.shape

# create a stratified 10-fold cross-validation splitter; the train/test
# indices come from cv.split(X, y)
cv = StratifiedKFold(n_splits=10, shuffle=True)

# create a support vector machine classifier
clf_svm = svm.SVC(kernel='linear', probability=True)

# create a Gaussian naive Bayes classifier
clf_gauss_nb = GaussianNB()
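# A minimal follow-up sketch (not part of the original script): score both
# classifiers with the 10-fold splitter defined above, using ROC AUC as the
# metric implied by the roc_auc_score import.
for name, clf in [('SVM', clf_svm), ('Gaussian NB', clf_gauss_nb)]:
    scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
    print('%s: mean ROC AUC %.3f (std %.3f)' % (name, scores.mean(), scores.std()))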
for idx, line in enumerate(f):
    real_counter += 1
    elements = line.strip('\r\n').split('\t')
    word_1 = elements[0]
    word_2 = elements[1]
    score = float(elements[2])
    if use_other_info == 'false':
        if word_1 not in word_dict or word_2 not in word_dict:
            continue
        pred_score_dict[counter] = cosin_distance(word_emb_dict[word_dict[word_1]],
                                                  word_emb_dict[word_dict[word_2]])
        index_dict[counter] = score
        counter += 1
    else:
        sim_vec1 = extract_features(use_other_info, word_1, word_dict,
                                    word_emb_dict, pos_dict, pos_emb_dict,
                                    parser_dict, parser_emb_dict,
                                    dict_desc_dict, kb_dict, kb_emb_dict)
        if sim_vec1 is None:
            continue
        sim_vec2 = extract_features(use_other_info, word_2, word_dict,
                                    word_emb_dict, pos_dict, pos_emb_dict,
                                    parser_dict, parser_emb_dict,
                                    dict_desc_dict, kb_dict, kb_emb_dict)
        if sim_vec2 is None:
            continue
        pred_score_dict[counter] = cosin_distance(sim_vec1, sim_vec2)
        index_dict[counter] = score
        counter += 1
f.close()

print("counter is %d, real_counter is %d" % (counter, real_counter))
sorted_x = sorted(index_dict.items(), key=operator.itemgetter(1))
real_value_list = []
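# cosin_distance is not defined in this fragment; despite the name, it likely
# computes cosine similarity between two embedding vectors. An assumed
# reference implementation:
import math


def cosin_distance(vec1, vec2):
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)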
def features():
    # display a page with the plots of the zcr and spectral centroid
    # (possibly other spectral features) variations here
    f = extract_features(signal=session['samples'])
def classify(model, wav):
    x = extract_features(wav)
    # predict_proba returns one row per sample; column 1 is the positive class.
    prob = model.predict_proba(x)[0, 1]
    return prob
def load(path):
    wav, sr = librosa.load(path)
    x1 = extract_features(wav)
    return x1
def load(path):
    wav, _ = librosa.load(path)
    x = extract_features(wav)
    return x
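# Hedged usage sketch tying load() and classify() together; the model file name
# and audio path below are placeholders, and loading the classifier with joblib
# is an assumption rather than part of the original snippets.
import joblib
import librosa

model = joblib.load("classifier.joblib")

# classify() expects a raw waveform, so load the audio first.
wav, _ = librosa.load("example.wav")
print("P(positive class) = %.3f" % classify(model, wav))

# Alternatively, precompute the features with load() and query the model directly.
x = load("example.wav")
print(model.predict_proba(x)[0, 1])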
def main():
    util.set_api_key()

    messenger.print_task('Fetching song list')
    genres_songs = collector.fetch_songs_from_artists(genres_artists.MAP)

    messenger.print_task('Fetching songs analyses')
    try:
        with open(settings.REGISTRY) as fd:
            downloaded_songs = simplejson.load(fd)
    except JSONDecodeError:
        downloaded_songs = {}

    for genre, songs in genres_songs.iteritems():
        for ith_song, song in enumerate(songs):
            if song.id in downloaded_songs:
                continue
            try:
                analysis = collector.fetch_song_analysis(song)
            except Exception as exc:
                messenger.print_subtask_error(exc)
                continue
            analysis_file = '%s_%d.json' % (genre, ith_song)
            analysis_file = os.path.join(settings.ANALYSES_DIR, analysis_file)
            with open(analysis_file, 'w') as analysis_fd:
                simplejson.dump(analysis, analysis_fd)
            downloaded_songs[song.id] = analysis_file
            # Record which songs have already been downloaded so their
            # analyses are not re-fetched in the future. Ugly and bad as hell;
            # appending lines to a file or using sqlite is not worth it here.
            with open(settings.REGISTRY, 'w') as fd:
                simplejson.dump(downloaded_songs, fd)

    messenger.print_task('Constructing dataset')
    analysis_files = os.listdir(settings.ANALYSES_DIR)
    # construct a dict behaving like an enum struct: {'genre': number}
    genres = set([filename.split('_')[0] for filename in analysis_files])
    genres -= settings.IGNORE_GENRES
    genres = dict([(genre, i) for i, genre in enumerate(genres)])

    with open(settings.DATASET_FILE, 'w') as dataset_fd:
        dataset_writer = csv.writer(dataset_fd)
        headers = util.song_features_names()
        dataset_writer.writerow(headers + ['genre'])
        for analysis_file in analysis_files:
            # dirty hack: get genre from filename
            song_genre = analysis_file.split('_')[0]
            if song_genre not in genres:
                continue
            file_path = os.path.join(settings.ANALYSES_DIR, analysis_file)
            with open(file_path) as fd:
                analysis = simplejson.load(fd)
            song_features = util.extract_features(analysis)
            # Construct a csv row containing all the features of a song, sorted
            # by feature name. The last column is a number representing a
            # specific genre.
            csv_row = [song_features[f] for f in headers]
            csv_row.append(genres[song_genre])
            dataset_writer.writerow(csv_row)

    print 'Done.'