def extract(self, NAME):
    """
    Run feature extraction for the given chat.
    :param NAME: chat name e.g. chat3-AbbasJaf-HamzaNaj
    :return: None
    """
    fe.feature_extraction(False, NAME)
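A hypothetical call site for the wrapper above (the enclosing class name `ChatPredictor` and the module-level `fe` helper it delegates to are assumptions):

predictor = ChatPredictor()                   # hypothetical wrapper class
predictor.extract("chat3-AbbasJaf-HamzaNaj")  # runs fe.feature_extraction(False, NAME)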
Example No. 2
def main(args):
    #-----------------------------------------------------#
    #             2D/3D Convolutional Autoencoder         #
    #-----------------------------------------------------#
    if args.program == 'CAE':
        cae = CAE(input_dir=args.data_dir,
                  patch_size=ast.literal_eval(args.patch_size),
                  batch_size=args.batch_size,
                  test_size=args.test_size,
                  prepare_batches=args.prepare_batches)

        cae.prepare_data(args.sampler_type, args.max_patches, args.resample,
                         ast.literal_eval(args.patch_overlap),
                         args.min_lab_vox, args.label_prob, args.load_data)
        if args.model_dir is None:
            cae.train(args.epochs)
        cae.predict(args.model_dir)

    #-----------------------------------------------------#
    #               Patient classification                #
    #-----------------------------------------------------#
    # Automatic segmentation step, currently disabled:
    """
    if args.program == 'AutSeg':
        asg = AutomaticSegmentation(model_name=args.model_name,
                                    patch_size=args.patch_size,
                                    patch_overlap=args.patch_overlap,
                                    input_dir=args.data_dir,
                                    model_dir=args.model_dir)
        asg.run()
        asg.run_postprocessing()
    """

    if args.program == 'CLUS':
        clustering = Clustering(num_iters=args.iterations,
                                num_clusters=args.num_clusters,
                                input_dir=args.data_dir)
        clustering.run()

    if args.program == 'FeEx':
        fe = FeatureExtraction(model_name=args.model_name,
                               patch_size=ast.literal_eval(args.patch_size),
                               patch_overlap=ast.literal_eval(
                                   args.patch_overlap),
                               num_clusters=args.num_clusters,
                               cluster_selection=args.cluster_selection,
                               resample=args.resample,
                               encoded_layer_num=args.encoded_layer_num,
                               model_dir=args.model_dir,
                               input_dir=args.data_dir)
        fe.run(batch_size=20)

    if args.program == 'SVM':
        svm = SvmClassifier(feature_dir=args.feature_dir,
                            ffr_dir=args.ffr_dir,
                            ffr_filename=args.ffr_filename,
                            input_dir=args.data_dir,
                            ffr_cut_off=args.ffr_cut_off,
                            test_size=args.test_size)
        svm.train()
        svm.predict()
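For context, a minimal argparse sketch that could supply the attributes `main(args)` reads above. Only the attribute names are taken from the code; the flags, types, and defaults are assumptions, not the project's actual CLI.

import argparse

def build_parser():
    # Hypothetical CLI; attribute names mirror what main() accesses above.
    p = argparse.ArgumentParser()
    p.add_argument('--program', choices=['CAE', 'CLUS', 'FeEx', 'SVM'])
    p.add_argument('--data_dir')
    p.add_argument('--model_dir', default=None)
    p.add_argument('--model_name')
    p.add_argument('--patch_size', default='[32, 32, 32]')     # parsed with ast.literal_eval
    p.add_argument('--patch_overlap', default='[16, 16, 16]')  # parsed with ast.literal_eval
    p.add_argument('--batch_size', type=int, default=32)
    p.add_argument('--test_size', type=float, default=0.2)
    p.add_argument('--prepare_batches', action='store_true')
    p.add_argument('--sampler_type')
    p.add_argument('--max_patches', type=int)
    p.add_argument('--resample', type=float)
    p.add_argument('--min_lab_vox', type=int)
    p.add_argument('--label_prob', type=float)
    p.add_argument('--load_data', action='store_true')
    p.add_argument('--epochs', type=int, default=100)
    p.add_argument('--iterations', type=int)
    p.add_argument('--num_clusters', type=int)
    p.add_argument('--cluster_selection')
    p.add_argument('--encoded_layer_num', type=int)
    p.add_argument('--feature_dir')
    p.add_argument('--ffr_dir')
    p.add_argument('--ffr_filename')
    p.add_argument('--ffr_cut_off', type=float)
    return p

if __name__ == '__main__':
    main(build_parser().parse_args())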
	def __init__(self,graph_path,label_path):
		self.graph_path = graph_path
		self.label_path = label_path
		self.graph = tf_Graph()
		self.sample_rate = 16000  # Sample Rate: 16000
		self.window_len = 0.03   # Window Size: 30ms = 480 Samples 960 Bytes
		self.frame_shift_ms= 0.01   # Frame Shift: 10ms = 160 Samples 320 Bytes
		self.melcount = 40 
		self.frame_shift = int(self.frame_shift_ms*self.sample_rate)
		self.bitsize = 2
		self.blocksize = 20
		self.recognition_threshold = 0.9
		self.lower_frequency = 20 
		self.higher_frequency = 8000
		self.prediction_every = 20 #Number of mel steps between predictions
		self.gain = 1.0
		self.detection_cooldown = 8
		self.cooldown = 0
		self.sensitivity = 0.5
		self.mel_spectrogram = np.zeros((1,self.melcount*98), dtype=np.float32) 
		self.mel = FeatureExtraction(nfilt=self.melcount,lowerf=self.lower_frequency,upperf=self.higher_frequency,
			samprate=self.sample_rate,wlen=self.window_len,nfft=512,datalen=512)

		self.input_name = "fingerprint_input:0"
		self.output_name = "labels_softmax:0"

		self.sess = tf_Session(graph=self.graph)

		self.labels_list = self._load_labels(label_path)
		self._load_graph(graph_path)

		self.last_frames = {}

		self.softmax_tensor = self.sess.graph.get_tensor_by_name(self.output_name) 
		self._warmup()
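A hedged inference sketch for the class above, assuming an instance `kws` of it and a NumPy block of raw audio samples. `signal_to_mel` appears in Example No. 31 later in this collection; the ring-buffer update and the argmax over the softmax output are assumptions about how the pieces fit together.

import numpy as np

def predict_block(kws, audio_block):
    # Sketch only: turn one audio block into mel frames, roll them into the
    # fixed-size (1, melcount*98) buffer, then run the frozen graph once.
    mel_frames = np.ravel(kws.mel.signal_to_mel(audio_block * kws.gain))
    kws.mel_spectrogram = np.roll(kws.mel_spectrogram, -len(mel_frames), axis=1)
    kws.mel_spectrogram[0, -len(mel_frames):] = mel_frames
    predictions = kws.sess.run(kws.softmax_tensor,
                               {kws.input_name: kws.mel_spectrogram})
    return kws.labels_list[int(np.argmax(predictions))]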
Example No. 4
    def ekstrakBanyak(self):
        dlg = filedialog.askdirectory()
        print(dlg)
        if dlg != '':
            self.folderCitra = dlg
            fe = FeatureExtraction()
            fitur2 = fe.ekstrakFiturBanyak(self.folderCitra)

            pass
    def get_frame_hog(ctrans_tosearch):
        ch1 = ctrans_tosearch[:, :, 0]
        ch2 = ctrans_tosearch[:, :, 1]
        ch3 = ctrans_tosearch[:, :, 2]

        # Compute individual channel HOG features for the entire image
        # Y channel
        hog1 = FeatureExtraction.get_hog_features(ch1, folder="../buffer/hog-features/")
        # Cr  channel
        hog2 = FeatureExtraction.get_hog_features(ch2)
        # Cb channel
        hog3 = FeatureExtraction.get_hog_features(ch3)
        return hog1, hog2, hog3
Example No. 6
    def klasifikasiCitraBanyak(self, folder, method):
        self.folder = folder
        helper = Helper()
        files = helper.listFiles(folder)
        scale, proc, klas = self.loadModel(method)
        fitur_banyak = []
        hasil = []
        for fil in files:
            fe = FeatureExtraction()
            fitur_banyak.append(fe.ekstraksifitur(fil))
        hasil = self.klaf(scale, fitur_banyak, proc, method, klas)

        return hasil
Example No. 7
    def build_data(self, path):

        """Builds the Datafile

        Parameters
        ----------
        path : str
            The location of the images

        Returns
        -------
        None
        """
        rootdir = path
        for subdir, dirs, files in os.walk(rootdir): # for every sub directory of path 
            for file in files:   #for every file
                filepath = subdir+os.sep+file 
                if filepath.endswith('.png'):
                    image = cv2.imread(filepath)  # read image
                    fe = FeatureExtraction()  # use a local extractor instead of rebinding self
                    face = fe.viola_jones(image)  # find faces
                    fd = fe.calc_hogs(face)  # calculate hog descriptor
                    print (filepath) #print filepath for image
                    if len(fd) != 0: #If face found
                        with open('./assets/data.csv', mode ='a') as csv_file:
                            writer = csv.writer(csv_file)
                            data = [subdir.replace(rootdir+os.sep,'')] #Write Y and x
                            fs=fd[0]
                            for feature in fs:
                                data.append(feature)
                            writer.writerow(data)
        print("COMPLETE DATA BUILD")
Example No. 8
    def __init__(self,
                 min_num_utt_per_spkr,
                 min_utt_duration,
                 sample_rate,
                 num_fft,
                 fft_window_size,
                 fft_hop_size,
                 num_mel,
                 vad_mode,
                 preprocess_multiprocessing=False):
        """
        Note:
        
        Args:
            min_num_utt_per_spkr: integer, minimum number of utterances per speaker
            min_utt_duration: float, minimum duration of an audio file (in frames)
            sample_rate: integer, sampling rate
            num_fft: integer, fft window size
            fft_window_size: integer, window length in msec; the window is zero-padded to match num_fft
            fft_hop_size: integer, hop between consecutive STFT columns in msec
            num_mel: integer, feature dimension
            vad_mode: integer in the range [1, 3], degree of strictness in VAD
            preprocess_multiprocessing: Boolean, True when processing a large DB and saving it to .npy

        Returns:
            
        """

        ### General settings
        self._min_num_utt_per_spkr = min_num_utt_per_spkr
        self._min_utt_duration = ((
            (min_utt_duration - 1) * fft_hop_size) + fft_window_size) / 1000
        self._min_utt_duration_fr = min_utt_duration

        ### VAD
        self.vad = VAD(vad_mode, "unused")

        ### FFT
        fft_window_size_frames = int((fft_window_size / 1000) * sample_rate)
        fft_hop_size_frames = int((fft_hop_size / 1000) * sample_rate)
        self.feature_extractor = FeatureExtraction(sample_rate, num_fft,
                                                   fft_window_size_frames,
                                                   fft_hop_size_frames,
                                                   num_mel)
        self._multiprocessing = preprocess_multiprocessing
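To make the frame-to-seconds conversion of min_utt_duration concrete, a quick worked example with assumed values (10 ms hop, 25 ms window, 160-frame minimum):

fft_hop_size = 10        # msec, illustrative value
fft_window_size = 25     # msec, illustrative value
min_utt_duration = 160   # frames

min_utt_duration_sec = (((min_utt_duration - 1) * fft_hop_size) + fft_window_size) / 1000
print(min_utt_duration_sec)  # ((159 * 10) + 25) / 1000 = 1.615 seconds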
Example No. 9
    def prepare_data(self):
        """
        prepare training dataset with feature extraction methods
        :return:
        """
        regex_path = re.compile(r'^[0-9]')
        audio_dir_path = [
            i for i in os.listdir(self.audio_path) if regex_path.match(i)
        ]

        regex_file = re.compile(r'.*\.(wav|ogg)$')
        dataset = pd.DataFrame()
        labels = []

        for directory in audio_dir_path:

            feature_extraction = FeatureExtraction(label=directory)
            file_list = [
                i for i in os.listdir(os.path.join(self.audio_path, directory))
                if regex_file.match(i)
            ]

            for audio_file in file_list:
                audio_file_abspath = os.path.join(self.audio_path, directory,
                                                  audio_file)

                audio_data, sr = librosa.load(audio_file_abspath,
                                              sr=44100,
                                              mono=True,
                                              duration=5)
                features, label = feature_extraction.extract_feature(
                    audio_data)

                # append one row of features (DataFrame.append was removed in pandas 2.x)
                dataset = pd.concat([dataset, pd.Series(features).to_frame().T],
                                    ignore_index=True)
                labels.append(label)

        label_df = pd.DataFrame(labels, columns=['label'])
        dataset = pd.concat([dataset, label_df], axis=1)
        dataset.to_csv(self.dataset_save_file,
                       sep=",",
                       index=False,
                       encoding="utf8")
Example No. 10
    def extract_single_img_features(img):
        """
        combine spatial bin, color histogram and gradient histogram features for a single image
        """
        # Create a list to append feature vectors to
        features = []

        # apply color conversion if other than 'RGB'
        feature_image = Helper.change_cspace(img)

        # get hog features for either specific channel or for all channels
        if config["hog_channel"] == 'ALL':
            hog_features = []
            channels = feature_image.shape[2]
            # get features for all 3 channels
            for channel in range(channels):
                hog_features.append(
                    FeatureExtraction.get_hog_features(feature_image[:, :,
                                                                     channel],
                                                       feature_vec=True))
                hog_features = np.ravel(hog_features)
        else:
            # get features for specific channel
            hog_features = FeatureExtraction.get_hog_features(
                feature_image[:, :, config["hog_channel"]], feature_vec=True)

        # Apply bin_spatial() to get spatial color features
        bin_features = FeatureExtraction.bin_spatial(feature_image,
                                                     config["spatial_size"])

        # Apply color_hist() to get color histogram features
        color_hist_features = FeatureExtraction.color_hist(
            feature_image, config["hist_bins"])

        # concatenate all 3 types of features
        feature = np.concatenate(
            (bin_features, color_hist_features, hog_features), axis=0)

        # Append the new feature vector to the features list
        features.append(feature)

        # Return list of feature vectors
        return features
    def get_features(onset_clips, sr):
        nyq = sr / 2
        return FeatureExtraction(onset_clips, sr) \
            .with_spectral_centroid() \
            .with_zero_crossing_rate() \
            .with_rms() \
            .with_rms_of_filter(np.divide([49, 50], nyq), np.divide([0.01, 2000], nyq), 0.01, 62) \
            .with_rms_of_filter(np.divide([200, 201], nyq), np.divide([1, 1300], nyq), 0.01, 20) \
            .with_rms_of_filter(np.divide([5100, 16300], nyq), np.divide([65, 22000], nyq), 0.05, 60) \
            .with_crest_factor() \
            .with_spectral_bandwith() \
            .with_spectral_kurtosis() \
            .with_spectral_skewness() \
            .with_spectral_rolloff() \
            .with_spectral_flatness() \
            .with_mfcc() \
            .with_row_operation(3, 2, np.subtract) \
            .with_row_operation(4, 2, np.subtract) \
            .with_row_operation(5, 2, np.subtract) \
            .with_row_operation(3, 4, np.subtract) \
            .with_row_operation(3, 5, np.subtract) \
            .with_row_operation(4, 5, np.subtract) \
            .get_feature_matrix()
Example No. 12
def feature_extraction(data):
    '''
    Perform feature extraction on the data
    :return:
    '''
    feature_data = FeatureExtraction()
    feature_data.transfer_txt(data)
    data_list, label_list = feature_data.loadDataSet()  # load the data
    features = feature_data.feature_select(data_list)  # TF-IDF value of every word
    data = open(path + "\\data\\aut.txt", 'w', encoding="utf-8")
    data2 = open(path + "\\data\\aut2.txt", 'w', encoding="utf-8")
    for i in range(len(features)):
        # count = count + 1
        # print(count)
        if features[i][1] > 0.0006:
            # print(features[i][1])
            data2.write(features[i][0] + " " + str(features[i][1] * 100) +
                        "%" + "\n")
            data.write(str(i + 1) + " " + features[i][0] + "\n")
            # data3.write(features[i][0] + "\n")
            # print(features[i][0], features[i][1])
    data.close()
    data2.close()
Example No. 13
def Analysis(lyric, mod=True):
    if mod == False:
        pos = []
        neg = []
        with open(
                "D:\\Academic_work\\01_ERG3010\\Project\\corpus\\doubandata.txt",
                'r',
                encoding='utf-8-sig') as f:
            for line in f:
                line = line.split("##")
                try:
                    star = int(line[1])
                except (IndexError, ValueError):
                    continue
                if star == 1 or star == 2:
                    neg.append(line[2].strip('\n'))
                elif star == 4 or star == 5:
                    pos.append(line[2].strip('\n'))
        ''' segment '''
        seg_pos = Seg().seg_from_datalist(pos)
        seg_neg = Seg().seg_from_datalist(neg)
        ''' training & test  '''
        word_list = []
        lable_list = []
        data = []
        train_data = []
        shuffle(seg_pos)
        shuffle(seg_neg)
        for k in seg_pos[:500]:
            train_data.append(('pos', k))
            word_list.append(k)
            lable_list.append('pos')
        for k in seg_neg[:500]:
            train_data.append(('neg', k))
            word_list.append(k)
            lable_list.append('neg')
        ''' train, test'''
        fe = FeatureExtraction(word_list, lable_list)
        best_words = fe.best_words(3000, False)
        best_words = "D:\Academic_work\01_ERG3010\Project\lyricsAnalysis2\svmmodel-bestwords.dat"
        model = Sentiment(best_words)
        model.train_model(train_data)
        model.save_model(root_path + "\\lyricsAnalysis2\\svmmodel")
    else:
        model = Sentiment()
        model.load_model(root_path + "\\lyricsAnalysis2\\svmmodel")

    result = model.predict_datalist(lyric)  # lyric is a list, one entry per song
    data = []
    count = 1
    for prob in result:
        time = "{}/{}".format((count // 12), count // 30)
        data.append([count, prob, "Pos"])
        data.append([count, 1 - prob, "Neg"])
        count += 1
    ''' text visualization '''
    tr = ThemeRiver("Sentiment", title_color="#274C77", title_text_size=20)
    tr.add(['Pos', 'Neg'],
           data,
           is_label_show=True,
           is_datazoom_show=True,
           legend_text_color="#274C77",
           legend_text_size=15)
    tr.render("ThemeRiver.html")
Example No. 14
        #print metric
        #self.metrics.append(metric)
        #joblib.dump(self.dnn.fe.whiten_high, self.dnn.name + '-high.pkl')
        #joblib.dump(self.dnn.fe.whiten_low, self.dnn.name + '-low.pkl')
        #self.model.save(self.dnn.name)
        return


def psnr_metric(y_true, y_pred):
    l2_loss = K.mean((y_true - y_pred)**2)
    psnr = 20. * K.log(K.max(y_true) / K.sqrt(l2_loss) + 1e-8) / K.log(10.)
    return psnr
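The metric above is the usual PSNR, 20*log10(MAX / sqrt(MSE)) (the 1e-8 is a small numerical-stability epsilon). A quick NumPy check on assumed arrays, confirming it matches the equivalent 10*log10(MAX^2 / MSE) form:

import numpy as np

y_true = np.array([0.2, 0.5, 0.9])
y_pred = np.array([0.25, 0.45, 0.85])

mse = np.mean((y_true - y_pred) ** 2)
psnr_a = 20.0 * np.log10(np.max(y_true) / np.sqrt(mse))
psnr_b = 10.0 * np.log10(np.max(y_true) ** 2 / mse)
assert np.isclose(psnr_a, psnr_b)
print(psnr_a)  # ~25.1 dB for these values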


if __name__ == "__main__":
    fe = FeatureExtraction(train_subsample=0.25, val_subsample=1.0)

    batch_size = 128
    n_epochs = 100
    n_frames = 9

    data = fe.frame_generator(fe.X_train,
                              fe.Y_train,
                              n_frames=9,
                              batch_size=128)
    n_train, _ = fe.X_train.shape

    data_val = fe.frame_generator(fe.X_val,
                                  fe.Y_val,
                                  n_frames=9,
                                  batch_size=128)
Example No. 15
    def predict(self, chat_message, modelType = "ensemble", chatModel = "chat1-model"):
        NAME = ""
        for root, dirs, files in os.walk("./chat-data/processed-chat"):
            for filename in files:
                # print(filename[:-4])
                BASE_NAME = filename[:-4]
                chatModel = chatModel.split(".")[0]
                chatModelRegex = "(" + chatModel.split("-")[0] + "-)" 
                if re.search(chatModelRegex, BASE_NAME):
                    CHAT_NO = BASE_NAME.split("-")[0]
                    CHAT_NAME = BASE_NAME.split("-")[1] + '-' + BASE_NAME.split("-")[2]
                    NAME = CHAT_NO + "-" + CHAT_NAME
        feature_table = []
        static_feature_table = []
        dynamic_feature_table = []

        CONFIG = FeatureExtraction.set_variables(NAME)
        print(NAME)

        # f = open("D:\MSc\Chat Parser Script\chat-data\extracted-features\chat1-MustafaAbid-MurtazaAn-feature-set.json", encoding="utf8")
        f = open("D:\\MSc\\Chat Parser Script\\chat-data\\extracted-features\\" + NAME + "-feature-set.json", encoding="utf8")

        data_dictionary = json.load(f)
        print(data_dictionary)
        f.close()

        CSV_OUTPUT_FILE_NAME = 'D:\\MSc\\Chat Parser Script\\chat-data\\extracted-features\\' + NAME + '-partial.csv'
        CHAT_MODEL_BASE_PATH = "D:\\MSc\\Chat Parser Script\\models\\"
        # chat_message = "hahah. no. have to go home. Bro is there a format to send invites? No no you invite him. Your more close to him"
        # chat_message = "I explained it to u yesterday that the reason we didn't call because there was no update to give, we itself were looking for places, and when u called we were still not planned but eventually then and their we Decided to go to Ramada. How is it obvious that ull were getting wet? There are 2 possibilities either u got shelter and werent getting wet or ul didn't find any and got wet. So the obvious part gets eliminated when there are 2 possibilities.. Didn't no where.to go. Cause I know if it was my vehicle what ever the situ I wouldve taken ull inside.. 👆🏽. Judgement"
        # chat_message = "Let me know pricing. Also gym is empty these days. Let's play badminton"
        # chat_message = "No bro. Ill join after dinner. Let me know where ull r going."

        feature_table, static_feature_table, dynamic_feature_table = FeatureExtraction.generate_values(CONFIG, chat_message, data_dictionary, feature_table, static_feature_table, dynamic_feature_table, 1)

        dataframe = pd.read_csv(CSV_OUTPUT_FILE_NAME)
        train_dict = dataframe.to_dict('records')

        train_dict.append(feature_table[0])

        normalizedData = FeatureExtraction.normalize_data(train_dict)
        chat_features = train_dict[len(normalizedData) - 1]

        chat_features = normalizedData[len(normalizedData) - 1]

        if modelType == "svm":
            df = pd.DataFrame(chat_features, index=[0])
            
            result = self.svmInstance.predict_svm(CHAT_MODEL_BASE_PATH + "svm\\" + chatModel + ".pkl", df)
        if modelType == "mlp":
            df = pd.DataFrame(chat_features, index=[0])
            result = self.mlpInstance.predict_mlp(CHAT_MODEL_BASE_PATH + "mlp\\" + chatModel + ".pkl", df)
        if modelType == "svm-rbf":
            df = pd.DataFrame(chat_features, index=[0])
            result = self.svmInstance.predict_svm(CHAT_MODEL_BASE_PATH + "svm-rbf\\" + chatModel + ".pkl", df)
        if modelType == "tensorflow-RNN":
            df = pd.DataFrame(chat_features, index=[0])
            result = self.tensorflowNNInstance.predict_tensorflow_nn(CHAT_MODEL_BASE_PATH + "tensorflow-RNN\\" + chatModel, df)[0][0]
            result = result.tolist()
        if modelType == "ensemble":
            df = pd.DataFrame(chat_features, index=[0])
            svm_result = self.svmInstance.predict_svm(CHAT_MODEL_BASE_PATH + "svm\\" + chatModel + ".pkl", df)
            mlp_result = self.mlpInstance.predict_mlp(CHAT_MODEL_BASE_PATH + "mlp\\" + chatModel + ".pkl", df)
            # tf_result = self.tensorflowNNInstance.predict_tensorflow_nn(CHAT_MODEL_BASE_PATH + "tensorflow-RNN\\" + chatModel, df)
            # tf_result_formatted = []
            # for elem in tf_result:
            #     tf_result_formatted.extend(elem)
            # data = {'tf_pred':tf_result_formatted[0], 'mlp_pred':mlp_result, 'svm_pred': svm_result, 'result': 0}
            data = {'mlp_pred':mlp_result, 'svm_pred': svm_result, 'result': 0}
            test_dataframe = pd.DataFrame(data, index=[0])
            result = self.ensemblingInstance.predict_ensemble(CHAT_MODEL_BASE_PATH + "ensemble\\" + chatModel, test_dataframe)[0][0]
            result = result.tolist()
        print(result)
        return result

# predict(NAME="chat3-AbbasJafferjee-HamzaNajmudeen", modelType="ensemble", chatModel="chat3-model")
    def get_bounding_boxes(img, classifier, x_start_stop, y_start_stop):

        # get window parameters
        n_xsteps, n_ysteps, w = WindowSearch.get_window_params(img,
                                                               x_start_stop,
                                                               y_start_stop)
        n_blocks_per_window, ctrans_tosearch = w

        # get hog features for full image
        hog1, hog2, hog3 = WindowSearch.get_frame_hog(ctrans_tosearch)

        svc, scaler = classifier
        x_start, x_stop = x_start_stop
        y_start, y_stop = y_start_stop
        bounding_boxes = []

        t_start = int(time.time())

        for xb in range(n_xsteps):
            for yb in range(n_ysteps):
                y_pos = yb * config["cells_per_step"]
                x_pos = xb * config["cells_per_step"]

                # Extract HOG for this patch
                hog_feat1 = hog1[y_pos:y_pos + n_blocks_per_window, x_pos:x_pos + n_blocks_per_window].ravel()
                hog_feat2 = hog2[y_pos:y_pos + n_blocks_per_window, x_pos:x_pos + n_blocks_per_window].ravel()
                hog_feat3 = hog3[y_pos:y_pos + n_blocks_per_window, x_pos:x_pos + n_blocks_per_window].ravel()

                x_left = x_pos * config["pix_per_cell"]
                # use a separate name for the patch's top edge so the outer
                # y_stop search boundary is not overwritten inside the loop
                y_top = y_pos * config["pix_per_cell"]

                # Extract the image patch
                sub_sample_img = cv.resize(
                    ctrans_tosearch[y_top:y_top + config["window_size"], x_left:x_left + config["window_size"]],
                    (config["window_size"], config["window_size"])
                )

                # Get color and gradient features for each image patch
                hog_features = np.hstack((hog_feat1, hog_feat2, hog_feat3))
                spatial_features = FeatureExtraction.bin_spatial(sub_sample_img, size=config["spatial_size"])
                hist_features = FeatureExtraction.color_hist(sub_sample_img, nbins=config["hist_bins"])

                # append merge features
                features = np.hstack((spatial_features, hist_features, hog_features))

                # normalize the features
                features = scaler.transform(np.array(features).reshape(1, -1))

                # predict the label for the features: 1 = car, 0 = not car
                predicted_labels = svc.predict(features)

                # get the bounding box for detected cars
                if predicted_labels == 1:
                    bounding_boxes.append(WindowSearch.get_box(x_start,
                                                               x_left,
                                                               y_start,
                                                               y_top))

        t_end = int(time.time())
        print("prediction time: {}".format(t_end - t_start))

        return bounding_boxes
def train(train_train_loader, train_test_loader, test_test_loader, model,
          log_dir, model_dir, pos_prob_dir, neg_prob_dir, pos_mask_dir,
          neg_mask_dir):

    logger = SummaryWriter(log_dir)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    feature_extractor = FeatureExtraction(
        feature_extraction_cnn=sim_feature_cnn,
        normalization=True,
        last_layer=','.join(sim_feature_layers))

    feature_extractor.eval()

    steps = 0

    npy_log = []

    for e in range(epoch_num):

        print('Epoch: {}'.format(e))

        model.train()

        for _, data in enumerate(tqdm(train_train_loader)):

            img = data['model_img'].to(device)
            pos_anchor = data['model_pos_anchor'].to(device).permute(
                0, 1, 3, 4, 2)
            neg_anchor = data['model_neg_anchor'].to(device).permute(
                0, 1, 3, 4, 2)

            optimizer.zero_grad()

            if merge_batches:

                clip_size = img.shape[1]

                img = img.view(-1, *img.shape[2:])
                pos_anchor = pos_anchor.view(-1, *pos_anchor.shape[2:])
                neg_anchor = neg_anchor.view(-1, *neg_anchor.shape[2:])

                logits = model(img)

                anchor_loss, anchor_loss_details = get_anchor_loss(
                    logits, -logits, pos_anchor, neg_anchor, pos_ratio,
                    neg_ratio)

                logits = logits.view(-1, clip_size, *logits.shape[1:])
                img = img.view(-1, clip_size, *img.shape[1:])

                logits_1, logits_2 = logits[:, 0, :, :, :], logits[:, 1, :, :, :]
                img_1, img_2 = img[:, 0, :, :, :], img[:, 1, :, :, :]

            else:

                img_1, img_2 = img[:, 0, :, :, :], img[:, 1, :, :, :]
                pos_anchor_1, pos_anchor_2 = pos_anchor[:, 0, :, :, :], pos_anchor[:, 1, :, :, :]
                neg_anchor_1, neg_anchor_2 = neg_anchor[:, 0, :, :, :], neg_anchor[:, 1, :, :, :]

                logits_1 = model(img_1)
                logits_2 = model(img_2)

                anchor_loss_1, anchor_loss_details_1 = get_anchor_loss(
                    logits_1, -logits_1, pos_anchor_1, neg_anchor_1, pos_ratio,
                    neg_ratio)

                anchor_loss_2, anchor_loss_details_2 = get_anchor_loss(
                    logits_2, -logits_2, pos_anchor_2, neg_anchor_2, pos_ratio,
                    neg_ratio)

                anchor_loss = anchor_loss_1 + anchor_loss_2

                anchor_loss_details = {
                    k: anchor_loss_details_1[k] + anchor_loss_details_2[k]
                    for k in anchor_loss_details_1.keys()
                }

            # Semantic diffusion
            with torch.no_grad():
                feature_maps_1 = feature_extractor(img_1)
                feature_maps_2 = feature_extractor(img_2)

            diffusion_loss = {}

            diffusion_loss_details = {}

            for i, key in enumerate(sim_feature_layers):

                feature_maps_1[i] = F.interpolate(feature_maps_1[i],
                                                  size=feature_map_size,
                                                  mode='bicubic',
                                                  align_corners=True)

                feature_maps_2[i] = F.interpolate(feature_maps_2[i],
                                                  size=feature_map_size,
                                                  mode='bicubic',
                                                  align_corners=True)

                _diff_loss, _diff_details = get_diffusion_loss(
                    feature_maps_1[i],
                    feature_maps_2[i],
                    logits_1,
                    logits_2,
                    fg_margin=sim_fg_margins[i],
                    bg_margin=sim_bg_margins[i],
                    fg_ratio=fg_ratio,
                    bg_ratio=bg_ratio,
                    naming='sim_{}'.format(key))

                diffusion_loss[key] = _diff_loss
                diffusion_loss_details[key] = _diff_details

            #######

            total_loss = anchor_loss

            for key in sim_feature_layers:
                total_loss += diffusion_loss[key]

            total_loss.backward()

            optimizer.step()

            for k in anchor_loss_details.keys():
                logger.add_scalar('Running-{}'.format(k),
                                  anchor_loss_details[k], steps)

            for v in diffusion_loss_details.values():
                for k in v.keys():
                    logger.add_scalar('Running-{}'.format(k), v[k], steps)

            steps += 1

        if e % 1 == 0:

            results = {}

            train_iou, train_dice = test(train_test_loader, model,
                                         pos_prob_dir + '-{}'.format(e),
                                         neg_prob_dir + '-{}'.format(e),
                                         pos_mask_dir + '-{}'.format(e),
                                         neg_mask_dir + '-{}'.format(e))

            results['train_iou'] = train_iou
            results['train_dice'] = train_dice

            if test_test_loader is not None:

                test_iou, test_dice = test(test_test_loader, model,
                                           pos_prob_dir + '-{}'.format(e),
                                           neg_prob_dir + '-{}'.format(e),
                                           pos_mask_dir + '-{}'.format(e),
                                           neg_mask_dir + '-{}'.format(e))

                results['test_iou'] = test_iou
                results['test_dice'] = test_dice

                npy_log.append([train_iou, train_dice, test_iou, test_dice])
            else:
                npy_log.append([train_iou, train_dice])

            for k1 in results.keys():
                for k2 in results[k1].keys():
                    logger.add_scalar('{}-{}'.format(k1, k2), results[k1][k2],
                                      steps)

            torch.save(model.state_dict(),
                       os.path.join(model_dir, 'model-{}'.format(steps)))

    logger.close()

    np.save(os.path.join(log_dir, 'npy_log.npy'), npy_log)
Example No. 18
import re


from data_process import DataProcess
from feature_extraction import FeatureExtraction
from evaluation.svm import EvalSVM
from evaluation.knn import EvalKnn
from evaluation.tree import EvalTree
from evaluation.nb import EvalNB
from evaluation.logReg import EvalLogReg



# Init the data process and feature extraction object
data_process = DataProcess()
feature_extraction = FeatureExtraction()


data_content, data_lable = data_process.load_data('dataset/5000_seq.csv')
data_process.extract_n_p_total(data_lable)

# preprocess data
processed_data = data_process.pre_process(data_content)
processed_data = data_process.lemmatizer(processed_data)

# pprint (processed_data)

# vectorizer data
vectorized_data = feature_extraction.tfidf_vectorizer(processed_data)

# vectorized_data = feature_extraction.fp_vectorizer(vectorized_data)
Example No. 19
    gcv = GridSearchCV(clf, params, iid=False, cv=5)
    gcv.fit(X, y)
    return gcv.best_score_, gcv.best_params_


if __name__ == '__main__':
    # Read the training data
    df = pd.read_csv('data/train.csv')
    data = df.values

    # Separate X and y
    X = data[:, 1:-1]
    y = data[:, -1]

    # Feature Engineering
    feature = FeatureExtraction(X, n_point_dft=32)

    X_fft = np.fft.fft(X, axis=1)
    X_fft = np.abs(X_fft)
    X_fft_128 = np.abs(np.fft.fft(X, n=128, axis=1))
    X_fft_256 = np.abs(np.fft.fft(X, n=256, axis=1))
    X_sample_fft_30 = sample_from_6000(X_fft, 30)
    X_sample_fft_100 = sample_from_6000(X_fft, 100)
    X_cumulants = feature.cumulants_
    X_combined_4_100 = np.hstack((X_sample_fft_100, X_cumulants))
    # X_sampled = sample_from_6000(X_fft, 30)

    params = {'C': [10**(-10+i) for i in range(20)], 'gamma': [10**(-10+i) for i in range(20)]}
    clf = SVC()

    best_scores = []
Example No. 20
    def do_all(self):
        self.dump()
        self.data_extraction()
        FeatureExtraction(self.get_dir_id()).extract()
Example No. 21
        },
        'doc2vec': {},
        'dtm': {},
        'sentiment_analysis': {},
        'ELMo': {},
        'lexical_diversity': {},
        'readability': {
            "Flesch.Kincaid", "Dale.Chall.old", "Wheeler.Smith",
            "meanSentenceLength", "meanWordSyllables", "Strain", "SMOG",
            "Scrabble", "FOG", "Farr.Jenkins.Paterson", "DRP", "Dale.Chall"
        },
        'topic_modeling': {},
    }

    full_data['clean_text'] = full_data.full_text.apply(clean_text)

    fs = FeatureExtraction(full_data.clean_text, features_dict)
    feature_df = import_features(features_dict)
    pd.concat([full_data, feature_df])

    train_data = full_data.clean_text[0:1088]
    dev_data = full_data.clean_text[1088:1388]
    test_data = full_data.clean_text[1388:1688]

    # select a number of features or all features
    # selected_feature = feature_tuning(train_data)

    # best_hyperparameter_list = tune(train_data_X, train_data_y, dev_data_X, dev_data_y, selected_feature)
    best_hyperparameter_list = tune(train_data_X, train_data_y, dev_data_X,
                                    dev_data_y)
Example No. 22
class Wav2Mel(object):
    """
    Note:
        Convert audio signal/file to mel feature
        VAD & feature extraction process
    
    Attributes:
        __init__: constructs Wav2Mel class, initializes FeatureExtraction and VAD classes
        process_db: processes all wav files under input directory path and save .npy to designated output path
        wav_to_mel: processes input signal and returns mel array
        _vad: VAD process
        _feature_extraction: log mel feature extraction process
        _one_wav_for_process_db: called in process_db, processes one wav file path and save a .npy
        _get_audio_list: get list of .wav or .m4a files, excluding speakers (folders) that contain fewer than the defined minimum number of utterances
        _remove_spkr_under_numutt: in process_db, after saving .npy, remove speaker directories that contain fewer than the defined minimum number of utterances
        _save: check directory and save .npy
        _make_dir_tree: when process_db with multiprocessing option, make all save dir tree in advance
        _dir_sanity_check: directory path filtering
    """
    def __init__(self,
                 min_num_utt_per_spkr,
                 min_utt_duration,
                 sample_rate,
                 num_fft,
                 fft_window_size,
                 fft_hop_size,
                 num_mel,
                 vad_mode,
                 preprocess_multiprocessing=False):
        """
        Note:
        
        Args:
            min_num_utt_per_spkr: integer, minimum number of utterances per speaker
            min_utt_duration: float, minimum duration of an audio file (in frames)
            sample_rate: integer, sampling rate
            num_fft: integer, fft window size
            fft_window_size: integer, window length in msec; the window is zero-padded to match num_fft
            fft_hop_size: integer, hop between consecutive STFT columns in msec
            num_mel: integer, feature dimension
            vad_mode: integer in the range [1, 3], degree of strictness in VAD
            preprocess_multiprocessing: Boolean, True when processing a large DB and saving it to .npy

        Returns:
            
        """

        ### General settings
        self._min_num_utt_per_spkr = min_num_utt_per_spkr
        self._min_utt_duration = ((
            (min_utt_duration - 1) * fft_hop_size) + fft_window_size) / 1000
        self._min_utt_duration_fr = min_utt_duration

        ### VAD
        self.vad = VAD(vad_mode, "unused")

        ### FFT
        fft_window_size_frames = int((fft_window_size / 1000) * sample_rate)
        fft_hop_size_frames = int((fft_hop_size / 1000) * sample_rate)
        self.feature_extractor = FeatureExtraction(sample_rate, num_fft,
                                                   fft_window_size_frames,
                                                   fft_hop_size_frames,
                                                   num_mel)
        self._multiprocessing = preprocess_multiprocessing

    def process_db(self, db_dir, mel_dir):
        """
        Note:
            does VAD & feature extraction of wav files from a directory and save processed mel features as .npy
            uses Pool if multiprocessing=True in __init__ config

        Args:
            db_dir : string that indicates a directory of the original DB
                     A wav file should be directely under its speaker folder
            mel_dir : string that indicates a directory where the processed mels are saved as npy
                      its sub-directories are the same as db_dir

        Returns:

        """

        db_dir = self._dir_sanity_check(db_dir, isexist=True)
        mel_dir = self._dir_sanity_check(mel_dir)
        wav_list, spkr_list = self._get_audio_list(db_dir)
        if self._multiprocessing:
            self._make_dir_tree(spkr_list, db_dir, mel_dir)
            db_dir_param = [db_dir] * len(wav_list)
            mel_dir_param = [mel_dir] * len(wav_list)
            params = zip(wav_list, db_dir_param, mel_dir_param)
            with Pool(processes=4) as pool:
                pool.map(self._one_wav_for_process_db, params)
        else:
            for wav_file in tqdm(wav_list):
                params = (wav_file, db_dir, mel_dir)
                self._one_wav_for_process_db(params)
        self._remove_spkr_under_numutt(mel_dir)

    def wav_to_mel(self, wav_file):
        """
        Note:
        does VAD & feature extraction of a wav file and return processed mels as an np array
            
        Args:
        wav_file: a string that indicates path of a .wav file

        Returns:
        mel: a np array of processed mel feature
        
        """

        sig, sr = librosa.load(wav_file, sr=None)
        wav_dur = len(sig) / sr
        if wav_dur < self._min_utt_duration:
            print(
                f"skipping RAW {wav_file} {wav_dur} < {self._min_utt_duration}"
            )
            return []
        else:
            processed_sig = self._vad(wav_file)
            if len(processed_sig) / sr < self._min_utt_duration:
                print(
                    f"skipping VAD output {wav_file} {len(processed_sig)/sr}< {self._min_utt_duration}"
                )
                return []
            else:
                mel, _ = self._feature_extraction(processed_sig)
                return mel

    def _vad(self, wav_file):
        """
        Note:
            does VAD from a file and return the result as an array

        Args:
            wav_file: a string that indicates path of a .wav file

        Returns:
            total_wav: an array of the VAD processed signal

        """

        total_wav, sample_rate = self.vad.run_vad(wav_file)
        return total_wav

    def _feature_extraction(self, sig):
        """
        Note:
            extracts log mel feature from a signal

        Args:
            sig: an array of input audio signal

        Returns:
            out_mel: an array of mel features

        """
        out_mel = self.feature_extractor.process_signal(sig)
        return out_mel

    def _one_wav_for_process_db(self, params):
        """
        Note:
            does VAD & feature extraction of a wav file and save the processed mel array to a new directory as .npy file
            this is designed for multiprocessing Pool

        Args:
            params: tuple of (wav_file, db_dir, mel_dir), strings of paths

        Returns:

        """

        wav_file = params[0]
        db_dir = params[1]
        mel_dir = params[2]
        fn = re.sub(db_dir, mel_dir, wav_file)
        fn = re.sub(r'\.[^/]+', '.npy', fn)
        new_path = os.path.join(mel_dir, fn)
        print(new_path)
        mel = self.wav_to_mel(wav_file)
        if len(mel) > 0:
            self._save(mel, new_path)

    def _get_audio_list(self, db_dir):
        """
        Note:
            glob .wav or .m4a(add more extensions if needed) files from input directory
            excludes utterances of which the speaker contains less than min_num_utt_per_spkr

        Args:
            db_dir: a string that indicates top directory containing audio files 

        Returns:
            out_wav_list: a list of strings which are paths of each audio file
            uniq_spkr: a list of strings which are directories representing each speaker
                       assuming direct upper directory of audio files are speaker directories

        """

        wav_list = []
        ext = ['wav', 'm4a']
        for e in ext:
            print(f"getting all {e} under {db_dir}")
            wav_list.extend(
                glob.glob(os.path.join(db_dir, '**/*.' + e), recursive=True))
        spkr_path_list = ['/'.join(i.split('/')[:-1]) for i in wav_list]
        uniq_spkr = list(set(spkr_path_list))
        out_wav_list = []
        for spkr_path in uniq_spkr:
            indices = [
                i for i, x in enumerate(spkr_path_list) if x == spkr_path
            ]
            if len(indices) >= self._min_num_utt_per_spkr:
                out_wav_list.extend([wav_list[i] for i in indices])
        return out_wav_list, uniq_spkr

    def _remove_spkr_under_numutt(self, mel_dir):
        """
        Note:
            remove speaker directories that contain less than min_num_utt_per_spkr

        Args:
            mel_dir: a string of a directory path

        Returns:

        """

        npy_list = glob.glob(os.path.join(mel_dir, '**/*.npy'), recursive=True)
        spkr_path_list = ['/'.join(i.split('/')[:-1]) for i in npy_list]
        uniq_spkr = list(set(spkr_path_list))
        for spkr_path in uniq_spkr:
            indices = [
                i for i, x in enumerate(spkr_path_list) if x == spkr_path
            ]
            if len(indices) < self._min_num_utt_per_spkr:
                shutil.rmtree(spkr_path)
                print(
                    f"{spkr_path} is erased since it contains less than {self._min_num_utt_per_spkr} npy files"
                )
        return

    def _save(self, arr, path):
        """
        Note:
            save an array with np.save

        Args:
            arr: numpy array that is to be saved
            path: a string that indicates a path where .npy will be saved

        Returns:

        """

        save_dir = os.path.dirname(path)
        if not os.path.exists(save_dir):
            if not self._multiprocessing:
                os.makedirs(save_dir)
            else:
                print(
                    f"directory {save_dir} is not prepared, check self._make_dir_tree"
                )
        #print(arr)
        np.save(path, arr)
        print(arr.shape)
        return

    def _make_dir_tree(self, spkr_dir_list, db_root, mel_root):
        """
        Note:
            make all directory tree for saving .npy files in advance
            this is for multiprocessing=True, because Pool tends to yield an error with os.path.exists & os.makedirs combination

        Args:
            spkr_dir_list: list of strings that indicate speaker folders from source database directory
            db_root: a string that indicates the top directory path of the source database
            mel_root: a string that indicates the top directory path of target saving location 

        Returns:

        """

        mel_dir_list = [re.sub(db_root, mel_root, x) for x in spkr_dir_list]
        for mel_dir in mel_dir_list:
            os.makedirs(mel_dir)
        return

    def _dir_sanity_check(self, path, isexist=False):
        """
        Note:
            deletes ending slash character that triggers error with os.path.join
            checks if directory exists if isexist=True

        Args:
            path: a string of a path
            isexist: if isexist=True, checks if the input path exists
                     default value is False

        Returns:
            path: a string of cleaned path

        """

        if isexist:
            if not os.path.exists(path):
                print(f"{path} does not exist; aborted")
                exit()
        if path[-1] == '/':
            path = path[:-1]
        return path
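A hedged end-to-end usage sketch for the Wav2Mel class above; the numeric settings are assumptions chosen only to illustrate the constructor signature, and the paths are placeholders.

# Hypothetical settings; see the Args docstring above for units.
wav2mel = Wav2Mel(min_num_utt_per_spkr=10,
                  min_utt_duration=160,     # frames
                  sample_rate=16000,
                  num_fft=512,
                  fft_window_size=25,       # msec
                  fft_hop_size=10,          # msec
                  num_mel=40,
                  vad_mode=2,
                  preprocess_multiprocessing=False)

# Batch mode: VAD + log-mel for every wav/m4a under db_dir, saved as .npy under mel_dir.
wav2mel.process_db('/path/to/wav_db', '/path/to/mel_out')

# Single-file mode: returns the mel array (or [] if the clip is too short).
mel = wav2mel.wav_to_mel('/path/to/wav_db/spkr0001/utt0001.wav')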
from sklearn.model_selection import train_test_split
from scipy import sparse
import numpy as np
from data_process import DataProcess
from feature_extraction import FeatureExtraction
from evaluation.svm import EvalSVM
from evaluation.tree import EvalTree
from evaluation.nb import EvalNB
from evaluation.logReg import EvalLogReg
from sklearn.decomposition import PCA

# Init the data process and feature extraction object
data_process = DataProcess()
feature_extraction = FeatureExtraction()

# data_content, data_lable, data_similarity = data_process.load_data('2w_sample_sim.csv')
data_content, data_label, data_similarity, data_b1, data_b2, data_b3, data_b4, data_b5 = data_process.load_data(
    '../../data/output/news_gra_sen_title_sample_sim.csv')
# print distribution
data_process.extract_n_p_total(data_label)
# print data_content
# print data_similarity
# print data_b5

# preapre features, similarity, location
data_similarity = np.array(data_similarity)
# data_b1 = np.array(data_b1)
# data_b2 = np.array(data_b2)
# data_b3 = np.array(data_b3)
# data_b4 = np.array(data_b4)
# data_b5 = np.array(data_b5)
Example No. 24
import collections
import itertools
import json
import typing
import numpy as np
from operator import attrgetter
from _bisect import bisect_left

from pepfrag import constants, Peptide, ModSite
from rPTMDetermine.peptide_spectrum_match import PSM
from rPTMDetermine.readers import PTMDB

from feature_extraction import FeatureExtraction

ptmdb = PTMDB()
featuregetter = FeatureExtraction()

Feature_names = ("NumPeaks", "TotInt", "PepMass", "Charge", "FracIon",
                 "FracIonInt", "NumSeriesbm", "NumSeriesym", "NumIona",
                 "NumIonynl", "NumIonbnl", "FracIonIntb_c1", "FracIonIntb_c2",
                 "FracIonInty_c1", "FracIonInty_c2", "FracIon20pc", "NumIonb",
                 "NumIony", "FracIonInty", "FracIonIntb", "MatchScore",
                 "SeqTagm")

target_pairs = {
    "Kmod_Biotinyl": "Kmod_Biotin",
    "Kmod_Propionyl": "Kmod_Propion",
    "Kmod_Ubiquitinyl": "Kmod_Glygly"
}

SynMatch = collections.namedtuple("SynMatch", [
Example No. 25
            self.feature_extraction.y_train)
        self.model = LGBMClassifier(n_estimators=3000,
                                    random_state=1994,
                                    nfold=5,
                                    learning_rate=0.03,
                                    colsample_bytree=0.2,
                                    objective='multiclass')
        self.model.fit(self.x_train_oversampled,
                       self.y_train_oversampled,
                       eval_metric='multi_logloss',
                       verbose=200)

    def run_pipeline(self, training: bool = False):
        '''Make a Pipeline for both training and prediction'''
        self.feature_extraction.feature_ext(training)
        training_variables = self.get_training_variables()
        if training:
            self.fit_model(training_variables)
        pred = self.model.predict(
            self.feature_extraction.dataframe[training_variables])
        print(pred)
        return pred


if __name__ == '__main__':
    DATAFRAME = pd.read_excel("Data_Train.xlsx")
    DATA_PREPROCESSING = DataPreprocessing(DATAFRAME)
    FEATURE_EXTRACTION = FeatureExtraction(DATA_PREPROCESSING)
    MANAGER = ModelManager(FEATURE_EXTRACTION)
    MANAGER.run_pipeline(training=True)
Example No. 26
def harmonic():
    # if 'username' not in session:
    #     return render_template('login.html')

    dbmodel = x.DBModel()
    get_stemming = dbmodel.get_data_all("Judul_Skripsi", "Stemming")
    get_dataawal = dbmodel.get_data_all("Judul_Skripsi", "datanya")

    stem = []
    for h in get_stemming:
        stem1 = []
        val1 = h.values()
        for k in val1:
            if k is not None:
                stem1.append(k)
        stem.append(stem1)
    documents = stem

    awal = []
    for c in get_dataawal:
        val = c.values()
        for d in val:
            d = d
        awal.append(d)

    # documents = awal

    feature_extraction = FeatureExtraction()
    feature = feature_extraction.fit(documents)
    score_ec = cosine_similarity(feature)
    # score_ec = euclidean_distances(feature, feature)
    print(score_ec)

    total_tf = []

    for tf_score in feature:
        total_tf.append(round(np.sum(tf_score), 3))

    k = int(request.form["kluster"])
    data_len = len(feature)

    max_iteration = 100
    lastoutput = kHarmonic(score_ec, k, data_len, total_tf)

    # get the sheet name from the most recent upload
    get_file = dbmodel.get_file_desc("Judul_Skripsi", "file")
    for w in get_file:
        values = w.values()
        for y in values:
            y = y

    # get the file name from the most recent upload
    get_file2 = dbmodel.get_file_desc2("Judul_Skripsi", "file")
    for w2 in get_file2:
        values2 = w2.values()
        for y2 in values2:
            y2 = y2

    # look in the database ("file" collection) for a sheet whose name matches the last uploaded sheet
    find_sheet = dbmodel.find_sheet("Judul_Skripsi", "file", y)
    # if a sheet with the same name is found, count how many already exist
    if find_sheet:
        count_sheet = dbmodel.count_sheet("Judul_Skripsi", "file", y)
        y = y + "(" + str(count_sheet - 1) + ")"  # then the sheet name becomes e.g. sheet(1)
        # update the database with the new sheet name
        dbmodel.update_file1("Judul_Skripsi", "file", y2, y, k)
        print(y)

    # update the file record, adding a column with the number of clusters
    dbmodel.update_file("Judul_Skripsi", "file", y2, y, k)

    # find the collection named after the last uploaded sheet
    find_collection = dbmodel.find_collection("Judul_Skripsi", y)

    if find_collection == True:
        dbmodel.delete_collection("Judul_Skripsi", y)

    s = 1
    all_out = []
    for out in lastoutput:
        temp_out = []
        for o in out:
            temp_out.append(awal[o])
            dbmodel.insert_hasil("Judul_Skripsi", y, awal[o], s)
        all_out.append(temp_out)
        s = s + 1

    get_hasil = dbmodel.get_data_all("Judul_Skripsi", y)
    table_hasil = pd.DataFrame(list(get_hasil))

    return render_template(
        "k-harmonic.html",
        tables=[
            table_hasil.to_html(
                classes='table table-striped table-bordered table-hover')
        ])
Example No. 27
def uni_and_bi_validation(lines):
    """
    + plots the classification F1-measure using bigrams and unigrams
    + prints a table containing the max accuracy and F1-measure obtained and the number of feature reached at
    :param lines: list of tweets
    :return:
    """
    accuracy_list_nb = []
    f_measure_list_nb = []
    accuracy_list_svm = []
    f_measure_list_svm = []
    accuracy_list_maxent = []
    f_measure_list_maxent = []


    random.shuffle(lines)

    hashtag_list = PatternsFeatures().get_most_frequent_pattern(PatternsFeatures().pattern_classifier(lines, '#'))
    name_list = PatternsFeatures().get_most_frequent_pattern(PatternsFeatures().pattern_classifier(lines, '@'))

    train_set_rate = int(len(lines)*0.75)
    train_set, test_set = lines[:train_set_rate], lines[train_set_rate:]
    all_tweets = " ".join([" ".join(line[1]) for line in train_set])

    ftr2 = FeatureExtraction(20)
    ftr2.most_frequent_bigrams(all_tweets)

    bigram_featuresets_test = [(ftr2.bigram_features(line[1]), line[0]) for line in test_set]
    bigram_featuresets_train = [(ftr2.bigram_features(line[1]), line[0]) for line in train_set]



    for i in range(10, 200, 20):
        ftr = FeatureExtraction(i)

        ftr.most_frequent_unigrams(all_tweets)

        for hashtag in hashtag_list:
            ftr.set_unigram_features_list(hashtag)
        for name in name_list:
            ftr.set_unigram_features_list(name)


        unigram_featuresets_test = [(ftr.unigram_features(line[1]), line[0]) for line in test_set]
        unigram_featuresets_train = [(ftr.unigram_features(line[1]), line[0]) for line in train_set]

        featuresets_test = bigram_featuresets_test + unigram_featuresets_test
        featuresets_train = bigram_featuresets_train + unigram_featuresets_train



##############################################################################


        classifier1 = NaiveBayesClassifier.train(featuresets_train)
        classifier2 = MaxentClassifier.train(featuresets_train)
        classifier3 = nltk.classify.SklearnClassifier(LinearSVC())
        classifier3.train(featuresets_train)


        refsets = collections.defaultdict(set)
        testsets1 = collections.defaultdict(set)
        testsets2 = collections.defaultdict(set)
        testsets3 = collections.defaultdict(set)

        for i, (feats, label) in enumerate(featuresets_test):
            refsets[label].add(i)
            observed1 = classifier1.classify(feats)
            observed2 = classifier2.classify(feats)
            observed3 = classifier3.classify(feats)
            testsets1[observed1].add(i)
            testsets2[observed2].add(i)
            testsets3[observed3].add(i)

        accuracy_list_nb.append(nltk.classify.accuracy(classifier1, featuresets_test))
        f_measure_list_nb.append(nltk.metrics.f_measure(refsets['not'], testsets1['not']))
        accuracy_list_svm.append(nltk.classify.accuracy(classifier3, featuresets_test))
        f_measure_list_svm.append(nltk.metrics.f_measure(refsets['not'], testsets3['not']))
        accuracy_list_maxent.append(nltk.classify.accuracy(classifier2, featuresets_test))
        f_measure_list_maxent.append(nltk.metrics.f_measure(refsets['not'], testsets2['not']))


 ################################################################################

    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\tbigram and unigram classification measurements"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax accuracy \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (max(accuracy_list_nb), (accuracy_list_nb.index(max(accuracy_list_nb))*20)+10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (max(accuracy_list_maxent), (accuracy_list_maxent.index(max(accuracy_list_maxent))*20)+10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (max(accuracy_list_svm), (accuracy_list_svm.index(max(accuracy_list_svm))*20)+10)
    print "+-----------------------------------------------------------------+"
    print "+-----------------------------------------------------------------+"
    print "\t\t\t\t\t\t\tmax f-measure \t number of features "
    print "Naive Bayes\t\t\t\t\t %f \t\t\t\t%d" % (max(f_measure_list_nb), (f_measure_list_nb.index(max(f_measure_list_nb))*20)+10)
    print "Maximum entropy\t\t\t\t %f \t\t\t\t%d" % (max(f_measure_list_maxent), (f_measure_list_maxent.index(max(f_measure_list_maxent))*20)+10)
    print "Support Vector Machine\t\t %f \t\t\t\t%d" % (max(f_measure_list_svm), (f_measure_list_svm.index(max(f_measure_list_svm))+1)*20)
    print "+-----------------------------------------------------------------+"
################################################################################

    print " time taken for the classification process %f sec " % (time() - t0)
#####################################################################################################
    x_axis = [i for i in range(10, 200, 20)]
    plt.figure(facecolor='white')
    fig1, = plt.plot(x_axis, accuracy_list_nb, 'r*-', label='Naive bayes accuracy')
    fig2, = plt.plot(x_axis, f_measure_list_nb, 'ro-', label='Naive bayes f-measure')
    fig3, = plt.plot(x_axis, accuracy_list_svm, 'g*-', label='SVM accuracy')
    fig4, = plt.plot(x_axis, f_measure_list_svm, 'go-', label='SVM f-measure')
    fig5, = plt.plot(x_axis, accuracy_list_maxent, '*-', label='max Entropy accuracy')
    fig6, = plt.plot(x_axis, f_measure_list_maxent, 'o-', label='max Entropy f-measure')

    plt.xlabel('Number of features')
    plt.ylabel('Results')
    plt.title('Results of the classification using unigrams and bigrams')
    plt.legend(handles=[fig1, fig2, fig3, fig4, fig5, fig6], loc=4)
    plt.show()
Example No. 28
    def get_testing_set(self):
        FeatureExtraction(self.get_dir_id()).get_testing_set()
Example No. 29
OUTPUT_DIR = '../../data/output/'


def create_path(params):
    path = ''
    for key in params.keys():
        path += key + '=' + str(params[key]) + '/'
    return path


# Loop over the experiments create necessary datasets and save to paths
for experiment in experiments:
    if experiment['dataset'] == 'speaker1':
        fe = FeatureExtraction(train_files=SPEAKER1_TRAIN,
                               val_files=SPEAKER1_VAL,
                               data_dir=SPEAKER1_DATA,
                               dataset='vctk',
                               upsample=experiment['upsample'])

        SAVE_DIR = OUTPUT_DIR + create_path(experiment)

    elif experiment['dataset'] == 'multispeaker':
        fe = FeatureExtraction(train_files=MULTISPEAKER_TRAIN,
                               val_files=MULTISPEAKER_VAL,
                               data_dir=MULTISPEAKER_DATA,
                               dataset='vctk',
                               upsample=experiment['upsample'],
                               train_subsample=experiment['subsample'])

        SAVE_DIR = OUTPUT_DIR + create_path(experiment)
Example No. 30
from feature_extraction import FeatureExtraction

import cProfile
import numpy as np

mel = FeatureExtraction(nfilt=40,
                        lowerf=20,
                        upperf=8000,
                        samprate=16000,
                        wlen=0.03,
                        nfft=512,
                        datalen=480)
data = np.zeros(480)
pr = cProfile.Profile()
pr.enable()
for i in range(10000):
    mel_data = mel.signal_to_mel(data)

pr.disable()

pr.print_stats(sort='time')
Example No. 31
from image_list_creator import ImageListCreator
from feature_extraction import FeatureExtraction
from EnumModels import Models

# First we need to create a image list of all the images in a folder.
# Creates images.txt listing the location of all images
il = ImageListCreator()
il.make_list_image_filenames("images")

# Now we have the images.txt file with all images
# We extract features from these images using various models

obj_fe = FeatureExtraction()
# Param 1 : Takes the model name from EnumModels.
# Param 2 : "models" is expected to be the folder containing this model and its prototxt file.
#           If it does not exist, a "models" folder is created and the necessary files are downloaded.
obj_fe.extract_features(Models.bvlc_alexnet.name, "models")
obj_fe.extract_features(Models.bvlc_googlenet.name, "models")
# Other models listed in the EnumModels class can be used the same way
Example No. 32

od = OnsetDetect(audio, sr)
onsets = od.get_times()
onset_clips = od.get_onset_clips(0.02)

#fe = FeatureExtraction(onset_clips, sr)

nyq = sr/2
X = FeatureExtraction(onset_clips, sr)\
    .with_spectral_centroid()\
    .with_zero_crossing_rate()\
    .with_rms()\
    .with_rms_of_filter(np.divide([49, 50], nyq), np.divide([0.01, 2000], nyq), 0.01, 62)\
    .with_rms_of_filter(np.divide([200, 201], nyq), np.divide([1, 1300], nyq), 0.01, 20)\
    .with_rms_of_filter(np.divide([5100, 16300], nyq), np.divide([65, 22000], nyq), 0.05, 60)\
    .with_spectral_kurtosis()\
    .with_spectral_skewness()\
    .with_spectral_rolloff()\
    .with_spectral_flatness()\
    .with_mfcc()\
    .get_feature_matrix()

annotator = DrumAnnotation("./trained_models/nov26.pkl")
predict = annotator.get_drum_prediction_times(audio, sr)
print(predict)
librosa.display.waveplot(audio, sr=sr)
plt.vlines(onsets, -audio.max(), audio.max(), color='r', alpha=0.9,
                   linestyle='--', label='Onsets')
plt.vlines(predict["Bass drum"], -audio.max(), audio.max(), color='b', alpha=0.5)
plt.vlines(predict["Hi-hat closed"], -audio.max(), audio.max(), color='g', alpha=0.5)