Example #1
def run(input_file, test_file, k):
    clf = RandomForestClassifier(n_estimators=k)
    df = preprocess_data(input_file)
    # the returned feature matrix stores the label in its last column
    X, label_dict, feat_dict = extract_features(df)
    r, c = X.shape
    dft = preprocess_testdata(test_file)
    Xt, yt = extract_testfeatures(dft, label_dict, feat_dict)
    clf.fit(X[:, 0:c - 1], X[:, c - 1])
    z = clf.predict(Xt)
    print(accuracy_score(yt, z))
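The preprocessing helpers called above (preprocess_data, extract_features, extract_testfeatures) come from the surrounding project and are not shown; a hypothetical invocation, with placeholder file names and tree count, would look like:

run('train.csv', 'test.csv', k=100)  # paths and k are illustrative only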
Example #2
def build_model_input():

    print("Import class names")
    cols = [
        'Label', 'Latin Name', 'Common Name', 'Train Images',
        'Validation Images'
    ]
    info = pd.read_csv("monkey_labels.txt", names=cols, skiprows=1)
    global LABELS
    LABELS = info['Common Name']

    util.display_image('training/n0/n0018.jpg')

    print("Data augmentation/Preprocessing")
    height, width, channels = 299, 299, 3

    train_datagen = ImageDataGenerator(rescale=1. / 255)
    train_generator = train_datagen.flow_from_directory(
        TRAIN_DIR,
        target_size=(height, width),
        batch_size=BATCH_SIZE,
        class_mode='categorical')

    test_datagen = ImageDataGenerator(rescale=1. / 255)
    test_generator = test_datagen.flow_from_directory(TEST_DIR,
                                                      target_size=(height,
                                                                   width),
                                                      batch_size=BATCH_SIZE,
                                                      class_mode='categorical')

    print("Import pretrained Inception module")
    base_model = Xception(weights=INCEPTION_DIR,
                          include_top=False,
                          input_shape=(height, width, channels))

    print("Extract features")
    train_features, train_labels = util.extract_features(
        1097, BATCH_SIZE, train_generator, base_model)
    test_features, test_labels = util.extract_features(272, BATCH_SIZE,
                                                       test_generator,
                                                       base_model)

    return train_features, train_labels, test_features, test_labels
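util.extract_features is not shown in this example; a minimal sketch of what such a helper typically does with a Keras generator and a frozen base model (an assumption based on the call sites above, not the project's actual code) might be:

import numpy as np

def extract_features_sketch(sample_count, batch_size, generator, base_model):
    # Hypothetical helper: run each batch through the frozen convolutional base
    # and collect the feature maps until sample_count images have been seen
    # (batch_size is implied by the generator here).
    features, labels = [], []
    seen = 0
    for inputs_batch, labels_batch in generator:
        features.append(base_model.predict(inputs_batch))
        labels.append(labels_batch)
        seen += len(inputs_batch)
        if seen >= sample_count:
            break  # directory generators loop forever, so stop explicitly
    return (np.concatenate(features)[:sample_count],
            np.concatenate(labels)[:sample_count])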
Example #3
def configure_df(static, dynamic, params):
    # consolidate static/dynamic features, select features, scale values
    master_df = consolidate_features(static, dynamic)

    # configure clustering dataframe
    features = util.extract_features(params['clustering_features'])
    cluster_df = util.choose_features(master_df,
                                      ['dt_entropy', 'num_unique_words'])

    # return master feature df and scaled cluster df
    return master_df, scale_features(cluster_df)
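consolidate_features, util.choose_features and scale_features are project helpers that are not shown here (note that the features list built from params is not actually passed to choose_features above). A minimal sketch of a scale_features helper, assuming it simply standardizes each column with scikit-learn, could be:

from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_features_sketch(df):
    # Hypothetical: z-score every column, preserving index and column names.
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(df), index=df.index, columns=df.columns)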
Example #4
def _extract_optic_flow_features(info_df: pd.DataFrame, dir_path: str,
                                 pickle_path: str):
    data = None
    print('Extracting optic flow features!')
    for index, row in info_df.iterrows():
        for camera_no in [1, 2]:
            file_path = os.path.join(
                dir_path, row['Initials'], str(row['Session']),
                f"{row['Session']}K{camera_no}play{row['Segment_No']}.txt")
            current = pd.read_csv(file_path,
                                  delim_whitespace=True,
                                  header=None)
            child, therapist = current.iloc[:, :3].copy(), current.iloc[:, 3:]
            for p in range(2):
                cur_person = current.iloc[:, p * 3:(p + 1) * 3].copy()
                cur_person.columns = ['x', 'y', 'm']
                cur_person['Key'] = row['Key']
                cur_feats = util.extract_features(
                    cur_person[cur_person['x'] != -10000000],
                    columns=['x', 'y', 'm'])
                cur_feats = cur_feats.reset_index()
                cur_feats['CameraNo'], cur_feats['Person'] = camera_no, p
                if data is None:
                    data = cur_feats.copy()
                    prev_feats = cur_feats.copy()
                elif len(cur_feats) == 0:
                    # No valid rows for this person/camera: reuse the previous
                    # feature frame with all statistics zeroed out.
                    prev_feats.loc[:, 'x_mean':'m_var'] = 0
                    prev_feats['CameraNo'], prev_feats['Person'] = camera_no, p
                    prev_feats['Key'] = row['Key']
                    # DataFrame.append was removed in pandas 2.x; use pd.concat
                    data = pd.concat([data, prev_feats], ignore_index=True)
                else:
                    data = pd.concat([data, cur_feats], ignore_index=True)
                    prev_feats = cur_feats.copy()
    data.to_pickle(pickle_path)
    return data
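util.extract_features here is expected to return per-'Key' summary statistics (the zero-fill branch references columns 'x_mean' through 'm_var'). A rough sketch of such a helper, assuming mean and variance per requested column, might be:

import pandas as pd

def extract_features_sketch(df, columns):
    # Hypothetical: aggregate each column by 'Key' and flatten the MultiIndex
    # into names like 'x_mean', 'x_var', ..., 'm_var'.
    stats = df.groupby('Key')[columns].agg(['mean', 'var'])
    stats.columns = ['%s_%s' % (col, stat) for col, stat in stats.columns]
    return stats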
Example #5
                if reverse_word_dict[word_id_list[idx]] not in kb_dict:
                    kb_dict[reverse_word_dict[word_id_list[idx]]] = [int(item) for item in
                                                                     line.strip('\r\n').split(',')]
            f.close()

        kb_emb_dict = {}
        with open(sys.argv[13], 'r') as f:
            for idx, line in enumerate(f):
                # key is word index in dictionary, value is embedding vector
                kb_emb_dict[idx] = [float(item) for item in line.strip('\r\n').split(',')]

        new_feature_dict = {}
        for k in word_dict:
            features = extract_features(use_other_info, k, word_dict, word_emb_dict, pos_dict,
                                        pos_emb_dict, parser_dict, parser_emb_dict, dict_desc_dict,
                                        kb_dict, kb_emb_dict)
            if features is not None:
                new_feature_dict[k] = features
            else:
                sys.stderr.write("%s feature extraction failed\n" % k)

        print(len(new_feature_dict))

        counter = 0
        real_counter = 0
        top_1 = 0
        top_3 = 0
        for com_item in analogy_list:
            counter += 1
            flag = False
Example #6
def main(args):
    tokenizer = util.Tokenizer(tokenize_type=args.tok, lowercase=True)

    train_toks = tokenizer.tokenize(open(args.train_file).read())
    num_train_toks = int(args.train_fraction * len(train_toks))
    print('-' * 79)
    print('Using %d tokens for training (%g%% of %d)' %
          (num_train_toks, 100 * args.train_fraction, len(train_toks)))
    train_toks = train_toks[:num_train_toks]
    val_toks = tokenizer.tokenize(open(args.val_file).read())
    num_val_toks = int(args.val_fraction * len(val_toks))
    print('Using %d tokens for validation (%g%% of %d)' %
          (num_val_toks, 100 * args.val_fraction, len(val_toks)))
    val_toks = val_toks[:num_val_toks]

    train_ngram_counts = tokenizer.count_ngrams(train_toks)

    # Get vocab and threshold.
    print('Using vocab size %d (excluding UNK) (original %d)' % (min(
        args.vocab, len(train_ngram_counts[0])), len(train_ngram_counts[0])))
    vocab = [
        tup[0] for tup, _ in train_ngram_counts[0].most_common(args.vocab)
    ]
    train_toks = tokenizer.threshold(train_toks, vocab, args.unk)
    val_toks = tokenizer.threshold(val_toks, vocab, args.unk)

    if args.features == 'basic1':
        feature_extractor = util.basic_features1
    elif args.features == 'basic1suffix3':
        feature_extractor = util.basic_features1_suffix3  # TODO: Implement
    elif args.features == 'basic2':
        feature_extractor = util.basic_features2
    else:
        raise ValueError('Unknown feature extractor type.')

    # for feature_extractor in [util.basic_features1, util.basic_features2, util.basic_features1_suffix3, util.basic_features2_prefix3, util.basic_features2_suffix3]:
    # We'll cheat and cache features for validation data to make things faster
    # for this assignment. The correct thing to do here would be
    #
    # f2i, fcache, num_feats_cached, x2ys \
    #     = util.extract_features(train_toks, feature_extractor)
    #
    f2i, fcache, num_feats_cached, x2ys \
        = util.extract_features(train_toks + val_toks, feature_extractor)

    print('%d feature types extracted' % len(f2i))
    print('%d feature values cached for %d window types' %
          (num_feats_cached, len(fcache)))

    for seed in [82, 95, 11, 29, 49, 8, 42, 36, 71, 65]:

        # The language model assumes a truncated vocab and a feature definition.
        lm = util.LogLinearLanguageModel(args.model,
                                         vocab,
                                         args.unk,
                                         feature_extractor,
                                         f2i,
                                         fcache,
                                         x2ys,
                                         init=args.init,
                                         lr=args.lr,
                                         check_interval=args.check_interval,
                                         seed=seed)
        # lm = util.LogLinearLanguageModel(args.model, vocab, args.unk,
        #                                  feature_extractor, f2i, fcache, x2ys,
        #                                  init=args.init, lr=args.lr,
        #                                  check_interval=args.check_interval,
        #                                  seed=args.seed)

        if args.test:
            # Load trained parameters
            lm.load()
        else:
            # Estimate parameters.
            lm.train(train_toks, val_toks, args.epochs)

        val_ppl = lm.test(val_toks)
        print('Optimized Perplexity: %f' % (val_ppl))
        with open('results/1.8.txt', 'a') as sample:
            print('%d,%f' % (seed, val_ppl), file=sample)

        print('-' * 79)
        for (i, f, w) in lm.topK_feats(args.K):
            print('{:10d}: {:40s} ({:8.4f})'.format(i, f, w))
Example #7
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import StratifiedKFold, cross_val_score

data_dir = '../../data'
cleaned_file = 'trauma_los_cleaned.csv'
extract_file = 'trauma_los_cleaned_extract.csv'
desired_headings = [
    'sex', 'normalvitals', 'gcs1', 'iss8', 'age65', 'transfer', 'penetrating',
    'mechcode', 'bodyregions', 'headany', 'faceany', 'neckany', 'chestany',
    'abdoany', 'spineany', 'upperlimbany', 'lowerlimbany', 'head3', 'face3',
    'neck3', 'chest3', 'abdo3', 'spine3', 'upper3', 'lower3', 'operation',
    'neurosurgery', 'laparotomy', 'thoracotomy', 'married', 'english',
    'mentalhealth', 'comorbidity', 'ssa'
]

# trim the input file into only the features we want to use
extract_features(data_dir, cleaned_file, extract_file, desired_headings)
X, y, headings = separate_data(True, data_dir, extract_file)
# convert all strings to ints
X, y = map(lambda x: list(map(int, x)), X), map(int, y)
X, y = np.array(list(X)), np.array(list(y))
n_samples, n_features = X.shape

# create a stratified 10-fold cross-validation splitter; in current scikit-learn
# the labels are passed to split()/cross_val_score, not to the constructor
cv = StratifiedKFold(n_splits=10, shuffle=True)

# create a support vector machine classifier
clf_svm = svm.SVC(kernel='linear', probability=True)

# create a Gaussian naive Bayes classifier
clf_gauss_nb = GaussianNB()
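The snippet ends before either classifier is evaluated; a typical next step (an assumption, not shown in the source) would score both models with the splitter defined above:

# Hypothetical evaluation step using the cv splitter defined above.
for name, clf in [('SVM', clf_svm), ('Gaussian NB', clf_gauss_nb)]:
    scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
    print('%s mean ROC AUC: %.3f' % (name, scores.mean()))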
Example #8
def queryaudio():
    result_song = ''
    begin = 0
    end = 0
    artists = []
    song_artist = []
    if request.method == "POST":
        print("FORM DATA RECEIVED")

        if "file" not in request.files:
            return redirect(request.url)

        file = request.files["file"]
        # file1 =file.filename.replace(".mp3",".wav")
        # sound = AudioSegment.from_mp3(file.filename)
        # sound.export(file1, format="wav")
        if file.filename == "":
            return redirect(request.url)

        if file:

            def first_feature(feat, index_song, temp, u):
                l = []
                for i in range(int(temp[index_song]),
                               int(temp[index_song + 1]), 1):
                    l.append((u.get_item_vector(i)))
                z = AnnoyIndex(100, metric='angular')
                for i in range(len(l)):
                    v = l[i]
                    z.add_item(i, v)
                z.build(100)
                x = []
                for i in range(0, int(feat.shape[0] / 50), 1):
                    crop_feat = util.crop_feature(feat, i, nb_step=10)
                    result1 = z.get_nns_by_vector(crop_feat, n=1)
                    x.append(result1[0])
                return x

            def most_common(values):
                return mode(values)

            def begin_second(count_s_begin):
                count = 0
                for i in range(0, len(count_s_begin), 5):
                    k = count_s_begin[i:i + 5]
                    try:
                        return most_common(k) - count
                    except Exception:
                        # no unique mode in this window; try the next one
                        pass
                    count += 1

            ef, sr = librosa.load(file.filename, sr=16000)
            exact_feature = util.extract_features(ef)
            results, count_s_begin = util.predict(exact_feature, songs, model1)
            result_song = []

            top = 5
            for q in range(top):
                k = results[q][0]
                song = k.split(
                    "E:/Computer Science/HK5/CS336/Audio_query/data/")[1]
                song_artist.append(song)
                s_a = song.split(" -")
                artists.append(s_a[1].split(".mp3")[0])
                result_song.append(s_a[0])
            index_song = list_song.index(result_song[0] + " -" + artists[0] +
                                         ".mp3")
            second_begin = first_feature(exact_feature, index_song, time,
                                         model1)
            begin = begin_second(second_begin)
            begin = int(begin) * 0.1
            end = begin + (len(ef) / 16000)
    return render_template("queryaudio.html",
                           songs=result_song,
                           song_artist=song_artist,
                           artists=artists,
                           begin=begin,
                           end=round(end, 1))
Example #9
from sklearn.metrics import roc_auc_score
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import StratifiedKFold, cross_val_score

data_dir = '../../data'
cleaned_file = 'trauma_los_cleaned.csv'
extract_file = 'trauma_los_cleaned_extract.csv'
desired_headings = ['sex', 'normalvitals', 'gcs1', 'iss8', 'age65', 'transfer',
    'penetrating', 'mechcode', 'bodyregions', 'headany', 'faceany', 'neckany',
    'chestany', 'abdoany', 'spineany', 'upperlimbany', 'lowerlimbany', 'head3',
    'face3', 'neck3', 'chest3', 'abdo3', 'spine3', 'upper3', 'lower3',
    'operation', 'neurosurgery', 'laparotomy', 'thoracotomy', 'married',
    'english', 'mentalhealth', 'comorbidity', 'ssa']

# trim the input file into only the features we want to use
extract_features(data_dir, cleaned_file, extract_file, desired_headings)
X, y, headings = separate_data(True, data_dir, extract_file)
# convert all strings to ints
X, y = map(lambda x:list(map(int, x)), X), map(int, y)
X, y = np.array(list(X)), np.array(list(y))
n_samples, n_features = X.shape

# create a stratified 10-fold cross-validation splitter; in current scikit-learn
# the labels are passed to split()/cross_val_score, not to the constructor
cv = StratifiedKFold(n_splits=10, shuffle=True)

# create a support vector machine classifier
clf_svm = svm.SVC(kernel='linear', probability=True)

# create a Gaussian naive Bayes classifier
clf_gauss_nb = GaussianNB()
Example #10
     for idx, line in enumerate(f):
         real_counter += 1
         elements = line.strip('\r\n').split('\t')
         word_1 = elements[0]
         word_2 = elements[1]
         score = float(elements[2])
         if use_other_info == 'false':
             if word_1 not in word_dict or word_2 not in word_dict:
                 continue
             pred_score_dict[counter] = cosin_distance(word_emb_dict[word_dict[word_1]],
                                                       word_emb_dict[word_dict[word_2]])
             index_dict[counter] = score
             counter += 1
         else:
             sim_vec1 = extract_features(use_other_info, word_1, word_dict, word_emb_dict,
                                                     pos_dict, pos_emb_dict, parser_dict, parser_emb_dict,
                                                     dict_desc_dict, kb_dict, kb_emb_dict)
             if sim_vec1 is None:
                 continue
             sim_vec2 = extract_features(use_other_info, word_2, word_dict, word_emb_dict,
                                         pos_dict, pos_emb_dict, parser_dict, parser_emb_dict,
                                         dict_desc_dict, kb_dict, kb_emb_dict)
             if sim_vec2 is None:
                 continue
             pred_score_dict[counter] = cosin_distance(sim_vec1, sim_vec2)
             index_dict[counter] = score
             counter += 1
     f.close()
 print("counter is %d, real_counter is %d" % (counter, real_counter))
 sorted_x = sorted(index_dict.items(), key=operator.itemgetter(1))
 real_value_list = []
Example #11
def features():
    # display a page with the plots of the zcr and spectral centroid
    # (possibly other spectral features) variations here
    f = extract_features(signal=session['samples'])
def classify(model, wav):
    x = extract_features(wav)
    prob = model.predict_proba(x)[0, 1]
    return prob
def load(path):
    wav, sr = librosa.load(path)
    x1 = extract_features(wav)
    return x1
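Note that classify expects a raw waveform while load already returns extracted features; a hypothetical call sequence (the model and the file path are placeholders) would be:

wav, _ = librosa.load('clip.wav', sr=None)  # placeholder path
prob = classify(model, wav)                 # 'model' is assumed to be a fitted classifier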
Example #14
def load(path):
    wav, _ = librosa.load(path)
    x = extract_features(wav)
    return x
def main():
    util.set_api_key()

    messenger.print_task('Fetching song list')
    genres_songs = collector.fetch_songs_from_artists(genres_artists.MAP)

    messenger.print_task('Fetching songs analyses')
    try:
        with open(settings.REGISTRY) as fd:
            downloaded_songs = simplejson.load(fd)
    except JSONDecodeError:
        downloaded_songs = {}

    for genre, songs in genres_songs.items():  # .iteritems() is Python 2 only
        for ith_song, song in enumerate(songs):
            if song.id in downloaded_songs:
                continue

            try:
                analysis = collector.fetch_song_analysis(song)
            except Exception as exc:
                messenger.print_subtask_error(exc)
                continue

            analysis_file = '%s_%d.json' % (genre, ith_song)
            analysis_file = os.path.join(settings.ANALYSES_DIR, analysis_file)
            with open(analysis_file, 'w') as analysis_fd:
                simplejson.dump(analysis, analysis_fd)

            downloaded_songs[song.id] = analysis_file

            # save which songs have already been downloaded to avoid
            # re-downloading their analysis in the future.
            # Ugly and bad as hell. I could append lines to a file or use
            # sqlite but it's not worth it.
            with open(settings.REGISTRY, 'w') as fd:
                simplejson.dump(downloaded_songs, fd)

    messenger.print_task('Constructing dataset')
    analysis_files = os.listdir(settings.ANALYSES_DIR)

    # construct a dict behaving like an enum struct: {'genre': number}
    genres = set([filename.split('_')[0] for filename in analysis_files])
    genres -= settings.IGNORE_GENRES
    genres = dict([(genre, i) for i, genre in enumerate(genres)])

    with open(settings.DATASET_FILE, 'w') as dataset_fd:
        dataset_writer = csv.writer(dataset_fd)
        headers = util.song_features_names()
        dataset_writer.writerow(headers + ['genre'])

        for analysis_file in analysis_files:
            # dirty hack: get genre from filename
            song_genre = analysis_file.split('_')[0]
            if song_genre not in genres:
                continue

            file_path = os.path.join(settings.ANALYSES_DIR, analysis_file)
            with open(file_path) as fd:
                analysis = simplejson.load(fd)

            song_features = util.extract_features(analysis)
            # Construct a csv row containing all the features of a song, sorted
            # by feature name. The last column is a number representing a
            # specific genre.
            csv_row = [song_features[f] for f in headers]
            csv_row.append(genres[song_genre])
            dataset_writer.writerow(csv_row)

print('Done.')
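The CSV written above has one row per song, features in header order, and the numeric genre as the last column; a minimal follow-up sketch (not part of the source) for loading it into a classifier could be:

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv(settings.DATASET_FILE)
X, y = df.drop(columns=['genre']), df['genre']
clf = RandomForestClassifier().fit(X, y)  # any classifier would do here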