Example No. 1
    def __init__(self):
        self.preprocessor = Preprocessor()
        self.feature_extractor = FeatureExtractor()
        self.crf_analyzer = CRFAnalyzer()
        self.sentiment_analyzer = SentimentAnalyzer()

        print("\nAll module instantiated and ready to go....\n")
Example No. 2
def mfcc_stuff(cfg):
    """
  for dct, filter bands, etc
  """

    # plot path
    plot_path = '../docu/thesis/3_signal/figs/'

    # init feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # plot dct
    plot_dct(custom_dct_matrix(cfg['feature_params']['n_filter_bands']),
             plot_path=plot_path,
             name='signal_mfcc_dct',
             show_plot=False)
    plot_dct(custom_dct_matrix(cfg['feature_params']['n_filter_bands']),
             plot_path=plot_path,
             context='dct-div',
             name='signal_mfcc_dct-div',
             show_plot=False)

    # mel scale
    plot_mel_scale(plot_path=plot_path,
                   name='signal_mfcc_mel_scale',
                   show_plot=False)

    # plot mel bands
    plot_mel_band_weights(feature_extractor.w_f,
                          feature_extractor.w_mel,
                          feature_extractor.f,
                          feature_extractor.m,
                          plot_path=plot_path,
                          name='signal_mfcc_weights',
                          show_plot=True)
Example No. 3
  def __init__(self, dataset_cfg, feature_params, collect_wavs=False, verbose=False):

    # parent init
    super().__init__(dataset_cfg, feature_params, collect_wavs=collect_wavs, verbose=verbose)

    # feature extractor
    self.feature_extractor = FeatureExtractor(feature_params=self.feature_params)

    # short vars
    self.N = self.feature_extractor.N
    self.hop = self.feature_extractor.hop

    # create plot paths if they do not already exist
    create_folder(list(self.plot_paths.values()))

    # recreate
    if self.dataset_cfg['recreate'] or not check_folders_existance(self.wav_folders, empty_check=True):

      # delete old data
      delete_files_in_path(self.wav_folders, file_ext=self.dataset_cfg['file_ext'])

      # create wav folders
      create_folder(self.wav_folders)

      # create sets (specific to dataset)
      self.create_sets()

    # get audio files from sets
    self.get_audiofiles()
    self.get_annotation_files()
Example No. 4
    def __init__(self,
                 classifier,
                 mic_params,
                 is_audio_record=False,
                 root_path='./'):

        # arguments
        self.classifier = classifier
        self.mic_params = mic_params
        self.is_audio_record = is_audio_record
        self.root_path = root_path

        # plot path
        self.plot_path = self.root_path + self.mic_params['plot_path']

        # create folder for plot path
        create_folder([self.plot_path])

        # shortcuts
        self.feature_params = classifier.feature_params

        # feature extractor
        self.feature_extractor = FeatureExtractor(self.feature_params)

        # windowing params
        self.N, self.hop = self.feature_extractor.N, self.feature_extractor.hop

        # queue
        self.q = queue.Queue()

        # collector
        self.collector = Collector(
            N=self.N,
            hop=self.hop,
            frame_size=self.feature_params['frame_size'],
            update_size=self.mic_params['update_size'],
            frames_post=self.mic_params['frames_post'],
            is_audio_record=self.is_audio_record)

        # device
        self.device = (sd.default.device[0] if not self.mic_params['select_device']
                       else self.mic_params['device'])

        # determine downsample
        self.downsample = self.mic_params['fs_device'] // self.feature_params['fs']

        # get input devices
        self.input_dev_dict = self.extract_devices()

        # show devices
        print("\ndevice list: \n", sd.query_devices())
        print("\ninput devs: ", self.input_dev_dict.keys())

        # stream
        self.stream = None

        # change device flag
        self.change_device_flag = False
Example No. 5
def audio_set_wavs(cfg):
    """
  audio set wavs
  """

    # plot path
    plot_path = '../docu/thesis/5_exp/figs/'

    # audio sets
    a1 = AudioDataset(cfg['datasets']['speech_commands'],
                      cfg['feature_params'],
                      root_path='../')
    a2 = AudioDataset(cfg['datasets']['my_recordings'],
                      cfg['feature_params'],
                      root_path='../')

    # feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # get audio files
    a1.get_audiofiles()

    # random seed
    np.random.seed(1234)
    r = np.random.randint(low=0, high=150, size=len(a1.set_audio_files[1]))

    wav_grid = []

    # process wavs
    for wav in sorted([
            label_wavs[r[i]]
            for i, label_wavs in enumerate(a1.set_audio_files[1])
    ]):

        # info
        print("wav: ", wav)

        # get raw
        x, _ = a1.wav_pre_processing(wav)

        # extract feature vectors [m x l]
        _, bon_pos = feature_extractor.extract_mfcc(x,
                                                    reduce_to_best_onset=False)

        # append to wav grid
        wav_grid.append((librosa.util.normalize(x),
                         re.sub(r'[0-9]+-', '',
                                wav.split('/')[-1].split('.')[0]), bon_pos))

    # plot wav grid
    plot_wav_grid(wav_grid,
                  feature_params=a1.feature_params,
                  grid_size=(6, 5),
                  plot_path=plot_path,
                  name='wav_grid_c30',
                  show_plot=True)
Example No. 6
    def __init__(self,
                 classifier,
                 feature_params,
                 mic_params,
                 is_audio_record=False):

        # arguments
        self.classifier = classifier
        self.feature_params = feature_params
        self.mic_params = mic_params
        self.is_audio_record = is_audio_record

        # windowing params
        self.N = int(feature_params['N_s'] * feature_params['fs'])
        self.hop = int(feature_params['hop_s'] * feature_params['fs'])

        # queue
        self.q = queue.Queue()

        # collector
        self.collector = Collector(
            N=self.N,
            hop=self.hop,
            frame_size=self.feature_params['frame_size'],
            update_size=self.mic_params['update_size'],
            frames_post=self.mic_params['frames_post'],
            is_audio_record=self.is_audio_record)

        # feature extractor
        self.feature_extractor = FeatureExtractor(
            self.feature_params['fs'],
            N=self.N,
            hop=self.hop,
            n_filter_bands=self.feature_params['n_filter_bands'],
            n_ceps_coeff=self.feature_params['n_ceps_coeff'],
            frame_size=self.feature_params['frame_size'])

        # select microphone yourself (usually not necessary)
        if mic_params['select_device']:
            sd.default.device = self.mic_params['device']

        # determine downsample
        self.downsample = self.mic_params['fs_device'] // self.feature_params['fs']

        # show devices
        print("\ndevice list: \n", sd.query_devices())

        # setup stream sounddevice
        self.stream = sd.InputStream(samplerate=self.mic_params['fs_device'],
                                     blocksize=int(self.hop * self.downsample),
                                     channels=self.mic_params['channels'],
                                     callback=self.callback_mic)
Example No. 7
def time_measurements(x, u, feature_params):
    """
  time measurements
  """

    # create feature extractor
    feature_extractor = FeatureExtractor(feature_params)

    # n measurements
    delta_time_list = []

    for i in range(100):

        # measure extraction time - start
        start_time = time.time()

        # time: 0.030081419944763182
        #y = calc_mfcc39(x, fs, N=400, hop=160, n_filter_bands=32, n_ceps_coeff=12, use_librosa=False)

        # time: 0.009309711456298829
        #y = calc_mfcc39(x, fs, N=400, hop=160, n_filter_bands=32, n_ceps_coeff=12, use_librosa=True)

        # time: 0.00014737367630004883
        #y = (custom_dct(np.log(u), n_filter_bands).T)

        # time: 6.929159164428711e-05
        #y = scipy.fftpack.dct(np.log(u), type=2, n=n_filter_bands, axis=1, norm=None, overwrite_x=False).T

        # time: 0.00418839693069458 *** winner
        y, _ = feature_extractor.extract_mfcc(x)

        # time: 0.015525884628295898
        #y, _ = feature_extractor.extract_mfcc39_slow(x)

        # time: 0.011266257762908936s
        #y = custom_stft(x, N=N, hop=hop, norm=True)

        # time: 0.0005800390243530274s
        #y = 2 / N * librosa.stft(x, n_fft=N, hop_length=hop, win_length=N, window='hann', center=True, dtype=None, pad_mode='reflect')

        # time: 0.00044193744659423826s
        #_, _, y = scipy.signal.stft(x, fs=1.0, window='hann', nperseg=N, noverlap=N-hop, nfft=N, detrend=False, return_onesided=True, boundary='zeros', padded=False, axis=- 1)

        # result of measured time diff
        delta_time_list.append(time.time() - start_time)

    # data shape
    print("y: ", y.shape)

    # times
    print("delta_time: ", np.mean(delta_time_list))
Example No. 8
    def write_predictions(self):
        """Output the predictions to a text file."""

        res = FeatureExtractor("test").run()
        model = torch.load("model.pt")
        test = namedtuple("res", ["lsr", "feats", "scores"])(
            lsr=res.lsr.reshape(-1, 2048),
            feats=res.feats,
            scores=res.scores)
        dev_ = data_utils.TensorDataset(*[
            torch.tensor(getattr(test, i)).float()
            for i in ["lsr", "feats", "scores"]
        ])
        with torch.no_grad():
            preds = model.forward(*dev_.tensors[:2]).cpu().numpy()
        np.set_printoptions(suppress=True)
        np.savetxt("predictions.txt",
                   preds.astype(float),
                   delimiter="\n",
                   fmt="%f")
        print("Predictions saved to predictions.txt")
Example No. 9
def showcase_wavs(cfg,
                  raw_plot=True,
                  spec_plot=True,
                  mfcc_plot=True,
                  show_plot=False):
    """
  showcase wavs
  """

    # plot path
    plot_path = '../docu/thesis/3_signal/figs/'

    # change params
    feature_params = cfg['feature_params'].copy()
    feature_params['n_ceps_coeff'] = 32
    feature_params['norm_features'] = True

    # init feature extractor
    feature_extractor = FeatureExtractor(feature_params)

    # wav, anno dir
    wav_dir, anno_dir = '../ignore/my_recordings/showcase_wavs/', '../ignore/my_recordings/showcase_wavs/annotation/'

    # analyze some wavs
    for wav, anno in zip(glob(wav_dir + '*.wav'),
                         glob(anno_dir + '*.TextGrid')):

        # info
        print("\nwav: ", wav), print("anno: ", anno)

        # load file
        x, _ = librosa.load(wav, sr=feature_params['fs'])

        # raw waveform
        if raw_plot:
            plot_waveform(x,
                          feature_params['fs'],
                          anno_file=anno,
                          hop=feature_extractor.hop,
                          plot_path=plot_path,
                          name='signal_raw_' +
                          wav.split('/')[-1].split('.')[0] + '_my',
                          show_plot=show_plot)

        # spectrogram
        if spec_plot:
            plot_spec_profile(x,
                              feature_extractor.calc_spectogram(x).T,
                              feature_params['fs'],
                              feature_extractor.N,
                              feature_extractor.hop,
                              anno_file=anno,
                              plot_path=plot_path,
                              title=wav.split('/')[-1].split('.')[0] + '_my',
                              name='signal_spec-lin_' +
                              wav.split('/')[-1].split('.')[0] + '_my',
                              show_plot=show_plot)
            plot_spec_profile(x,
                              feature_extractor.calc_spectogram(x).T,
                              feature_params['fs'],
                              feature_extractor.N,
                              feature_extractor.hop,
                              log_scale=True,
                              anno_file=anno,
                              plot_path=plot_path,
                              title=wav.split('/')[-1].split('.')[0] + '_my',
                              name='signal_spec-log_' +
                              wav.split('/')[-1].split('.')[0] + '_my',
                              show_plot=show_plot)

        # mfcc
        if mfcc_plot:
            mfcc, bon_pos = feature_extractor.extract_mfcc(
                x, reduce_to_best_onset=False)
            plot_mfcc_profile(x,
                              cfg['feature_params']['fs'],
                              feature_extractor.N,
                              feature_extractor.hop,
                              mfcc,
                              anno_file=anno,
                              sep_features=True,
                              bon_pos=bon_pos,
                              frame_size=cfg['feature_params']['frame_size'],
                              plot_path=plot_path,
                              name='signal_mfcc_' +
                              wav.split('/')[-1].split('.')[0] + '_my',
                              close_plot=False,
                              show_plot=show_plot)
Example No. 10
def estimate_parameters(multinomial_nb=False,
                        bernoulli_nb=False,
                        k_nearest=False,
                        support_vm=False,
                        support_vmsgd=False,
                        bow=False,
                        tfidf=False):
    """
    This method performs a grid search on the given algorithm using a fixed
    set of parameter ranges.
    The values with the highest score are printed to stdout after evaluation.
    :param multinomial_nb: MultinomialNB
    :param bernoulli_nb: BernoulliNB
    :param k_nearest: KNearestClassifier
    :param support_vm: Linear SVM aka SVC
    :param support_vmsgd: SGDClassifier
    :param bow: CountVectorizer aka Bag-of-words
    :param tfidf: TfidfVectorizer
    """
    fe = FeatureExtractor()
    counts, targets = fe.fetch_data()

    MAX_DF = [0.25, 0.5, 0.75, 1.0]
    N_GRAMS = [(1, 1), (1, 2), (1, 3), (1, 4)]

    if multinomial_nb:
        CLF = MultinomialNB()
        parameters = {'clf__alpha': 10.0**-np.arange(5, 11)}
    elif bernoulli_nb:
        CLF = BernoulliNB()
        parameters = {'clf__alpha': 10.0**-np.arange(5, 11)}
    elif k_nearest:
        CLF = KNeighborsClassifier()
        parameters = {
            'clf__n_neighbors': range(2, 10),
            'clf__weights': ('uniform', 'distance'),
            'clf__algorithm': ('auto', 'brute'),
            'clf__leaf_size': (20, 30, 40)
        }
    elif support_vm:
        CLF = SVC()
        parameters = {
            'clf__kernel': ('linear', 'sigmoid', 'rbf', 'poly'),
            'clf__decision_function_shape': ('ovo', 'ovr'),
            'clf__C': (100, 1000, 10000, 100000, 1000000),
            'clf__gamma': (0.001, 0.01, 0.1, 1)
        }
    elif support_vmsgd:
        CLF = SGDClassifier(max_iter=50)
        parameters = {
            'clf__loss': ('hinge', 'modified_huber', 'squared_hinge'),
            'clf__penalty': ('l1', 'l2', 'elasticnet'),
            'clf__alpha': 10.0**-np.arange(1, 8),
            'clf__tol': (0.3, 0.2, 1e-2, 1e-3, 1e-4),
            'clf__n_iter': [np.ceil(10**6 / 1062)],
            'clf__eta0': (0.0, 0.2, 0.5, 0.7),
            'clf__learning_rate': ('constant', 'optimal', 'invscaling'),
            'clf__average': (True, False)
        }
    else:
        print('Please specify which algorithm to use')
        return

    # add feature extraction params and classifier to pipeline
    if bow:
        parameters.update({
            'vect__max_df': MAX_DF,
            'vect__ngram_range': N_GRAMS
        })

        pipeline = Pipeline([('vect', CountVectorizer()), ('clf', CLF)])
    elif tfidf:
        parameters.update({
            'tfidf__max_df': MAX_DF,
            'tfidf__ngram_range': N_GRAMS,
            'tfidf__analyzer': ('word', 'char'),
            'tfidf__sublinear_tf': (True, False),
            'tfidf__smooth_idf': (True, False),
            'tfidf__use_idf': (True, False),
            'tfidf__norm': ('l1', 'l2', None)
        })

        pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', CLF)])
    else:
        print('Please specify which vectorizer to use (bow or tfidf)')
        return

    # perform grid search on pipeline
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=parameters,
                               cv=15,
                               scoring='accuracy')
    print("parameters:")
    pprint(parameters)
    print("Starting grid search. This may take some time...")

    # learn vocabulary
    grid_search.fit(counts, targets)

    print("Best parameters: " + str(grid_search.best_params_))
    print("Best score: %0.3f" % grid_search.best_score_)

    filename = '/var/booking_categorizer/'
    with open(filename, 'a') as file:
        file.write("Best parameters: " + str(grid_search.best_params_) + "\n" +
                   "Best score: %0.3f" % grid_search.best_score_)
Example No. 11
    sys.path.append("../")

    from common import create_folder
    from feature_extraction import FeatureExtractor

    # plot path
    #plot_path = './ignore/plots/fe/'

    # create folder
    #create_folder([plot_path])

    # yaml config file
    cfg = yaml.safe_load(open("../config.yaml"))

    # init feature extractor
    feature_extractor = FeatureExtractor(cfg['feature_params'])

    # --
    # params

    fs = 16000
    N = 400
    hop = 160
    n_filter_bands = 16
    n_ceps_coeff = 12

    # --
    # test signal

    # generate test signal
    x = some_test_signal(fs, t=1, save_to_file=False)
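
    # --
    # hedged sketch (not in the original snippet): extract MFCC features from the
    # generated test signal with the extract_mfcc() interface used in the other
    # examples, assuming it returns the feature matrix and the best-onset position
    mfcc, bon_pos = feature_extractor.extract_mfcc(x, reduce_to_best_onset=False)
    print("mfcc: ", mfcc.shape)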
Example No. 12
def run(properties, vector_models, results_file, csv_line):
    """
		Script that runs the k-fold

		:param properties: dictionary containing the parameters specified in the config file for the current experiment
		:param vector_models: list of embedding models to be used in this experiment
		:param results_file: csv file where the accuracies are going to be written
		:param csv_line: line of csv config file corresponding to the current experiment
		:return: nothing
	"""
    kfold_folder_path = '../data/kfold/'  # folder containing the k partitions (the development set has already been split during preprocessing)
    print('Writing to output file:', results_file.name)
    k = properties['K']
    # copy the line from the config file to keep track of the used parameters
    results_file.write(csv_line.rstrip())
    r = range(1, k + 1)
    """
		If the bag-of-words feature is chosen, the script initializes dictionary for the list of all words in the data-set.
		It also initializes the index value available for the next unseen word, with the value of zero because no word has been added yet.
		This index value will be updated every time an unseen word occurs in the data-set.
	"""
    if properties['BAG_OF_WORDS']:
        feature_extractor = FeatureExtractor(
            properties,
            words_dict={},
            next_word_position=0,
            vector_models=vector_models,
            vectors_size=properties['VECTORS_SIZE'])
    else:
        feature_extractor = FeatureExtractor(
            properties,
            vector_models=vector_models,
            vectors_size=properties['VECTORS_SIZE'])
    results_dict = {
        'subj': 0.0,
        'opos': 0.0,
        'oneg': 0.0,
        'iro': 0.0,
        'polarity': 0.0
    }
    kfold_folder_path += 'conll/'
    """
		managing cross validation;
		the k partition have already been created during pre-processing
	"""
    if k > 1:
        for i in r:
            print()
            print('RUNNING ITERATION N.', str(i))
            kth_value_folder = kfold_folder_path + str(k) + '/'
            """ creates list of partition sorted by k value inside file names """
            partitions = sorted(os.listdir(kth_value_folder),
                                key=lambda x: (int(re.sub('\D', '', x)), x))
            test_file = kth_value_folder + 'fold_' + str(i)
            for index in range(len(partitions)):
                partitions[index] = kth_value_folder + partitions[index]
            partitions.pop(partitions.index(test_file))
            try:
                assert len(partitions) == k - 1
            except AssertionError:
                print('Error: invalid number of partitions')
            """ samples with word, emoj and embedding features """
            training_samples = []
            """ dictionaries of word occurrences in tweets for bag-of-words """
            training_words_dicts = []
            """ dictionary with training labels """
            training_labels = {
                'subj_s': [],
                'opos_s': [],
                'oneg_s': [],
                'iro_s': [],
                'lpos_s': [],
                'lneg_s': []
            }
            extraction_function = feature_extractor.extract_from_conll
            """ using partitions as training set, with the exception of kth one """
            for training_file in partitions:
                samples, dicts, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(
                    training_file)
                training_labels['subj_s'] += subj_s
                training_labels['opos_s'] += opos_s
                training_labels['oneg_s'] += oneg_s
                training_labels['iro_s'] += iro_s
                training_labels['lpos_s'] += lpos_s
                training_labels['lneg_s'] += lneg_s
                training_samples += samples
                training_words_dicts += dicts
            """ sets to zero the empty positions in bags-of-words of training tweets """
            fill_dicts(training_samples, training_words_dicts,
                       feature_extractor.next_word_position)
            test_labels = {}
            """ using kth partition as test-set """
            samples, test_words_dicts, id_s, top_s, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(
                test_file, test=True)
            """ delete embedding models """

            test_samples = samples
            test_labels['subj_s'] = subj_s
            test_labels['opos_s'] = opos_s
            test_labels['oneg_s'] = oneg_s
            test_labels['iro_s'] = iro_s
            test_labels['lpos_s'] = lpos_s
            test_labels['lneg_s'] = lneg_s
            test_labels['id_s'] = id_s
            """ sets to zero the empty positions in bags-of-words of test tweets """
            fill_dicts(test_samples, test_words_dicts,
                       feature_extractor.next_word_position)
            training_labels_vectors = [
                training_labels['subj_s'], training_labels['opos_s'],
                training_labels['oneg_s'], training_labels['iro_s'],
                training_labels['lpos_s'], training_labels['lneg_s']
            ]
            test_labels_vectors = [
                test_labels['subj_s'], test_labels['opos_s'],
                test_labels['oneg_s'], test_labels['iro_s'],
                test_labels['lpos_s'], test_labels['lneg_s']
            ]
            test_id_s = test_labels['id_s']
            predict_matrix = get_prediction_matrix(training_samples,
                                                   training_labels_vectors,
                                                   test_samples, test_id_s,
                                                   top_s, properties['KERNEL'])
            gold_matrix = get_gold_matrix(test_labels_vectors, test_id_s,
                                          top_s)
            prediction_lines = matrix2string(predict_matrix)
            test_lines = matrix2string(gold_matrix)
            """ write prediction and gold matrix to file for the evaluation script"""
            tmp_folder = '../tmp/'
            tmp_result_file = open(tmp_folder + 'tmp_res.txt', 'w')
            tmp_gold_file = open(tmp_folder + 'tmp_gold.txt', 'w')
            tmp_result_file.write(prediction_lines)
            tmp_gold_file.write(test_lines)
            tmp_result_file.close()
            tmp_gold_file.close()
            """ evaluate and write accuracies to temporary file"""
            tmp_out_file_name = 'tmp_out' + str(i) + '.txt'
            tmp_out_file = open(tmp_out_file_name, 'w')
            evaluate(tmp_folder + 'tmp_res.txt',
                     tmp_folder + 'tmp_gold.txt',
                     outfile=tmp_out_file,
                     verbose=False)
            tmp_out_file.close()
            """ parse temporary results file and updates the dictionary with experiment results"""
            with open(tmp_out_file_name, 'r') as infile:
                task = ''
                for line in infile:
                    if 'task' in line:
                        task = line.rstrip().split()[-1]
                    if line[0].isdigit():
                        """ add the accuracies values to the dictionary of accuracies """
                        results_dict[task] += float(line.rstrip().split()[-1])
        for key, value in results_dict.items():
            """ averages the results """
            results_dict[key] = value / k
    elif k == 1:
        """ if k == 1 it uses the official test-set as test """
        training_file_name = '/home/ruggero/MEGA/tesi_magistrale/classification/data/training_all.parsed'
        test_file_name = '/home/ruggero/MEGA/tesi_magistrale/classification/data/testset_annotated.parsed'
        training_labels = {
            'subj_s': [],
            'opos_s': [],
            'oneg_s': [],
            'iro_s': [],
            'lpos_s': [],
            'lneg_s': []
        }
        extraction_function = feature_extractor.extract_from_conll
        training_samples, training_words_dicts, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(
            training_file_name)
        training_labels['subj_s'] += subj_s
        training_labels['opos_s'] += opos_s
        training_labels['oneg_s'] += oneg_s
        training_labels['iro_s'] += iro_s
        training_labels['lpos_s'] += lpos_s
        training_labels['lneg_s'] += lneg_s
        fill_dicts(training_samples, training_words_dicts,
                   feature_extractor.next_word_position)
        test_labels = {}
        test_samples, test_words_dicts, id_s, top_s, subj_s, opos_s, oneg_s, iro_s, lpos_s, lneg_s = extraction_function(
            test_file_name, test=True)
        test_labels['subj_s'] = subj_s
        test_labels['opos_s'] = opos_s
        test_labels['oneg_s'] = oneg_s
        test_labels['iro_s'] = iro_s
        test_labels['lpos_s'] = lpos_s
        test_labels['lneg_s'] = lneg_s
        test_labels['id_s'] = id_s
        fill_dicts(test_samples, test_words_dicts,
                   feature_extractor.next_word_position)
        training_labels_vectors = [
            training_labels['subj_s'], training_labels['opos_s'],
            training_labels['oneg_s'], training_labels['iro_s'],
            training_labels['lpos_s'], training_labels['lneg_s']
        ]
        test_labels_vectors = [
            test_labels['subj_s'], test_labels['opos_s'],
            test_labels['oneg_s'], test_labels['iro_s'], test_labels['lpos_s'],
            test_labels['lneg_s']
        ]
        test_id_s = test_labels['id_s']
        predict_matrix = get_prediction_matrix(training_samples,
                                               training_labels_vectors,
                                               test_samples, test_id_s, top_s,
                                               properties['KERNEL'])
        gold_matrix = get_gold_matrix(test_labels_vectors, test_id_s, top_s)
        prediction_lines = matrix2string(predict_matrix)
        test_lines = matrix2string(gold_matrix)
        tmp_result_file = open('tmp_res.txt', 'w')
        tmp_gold_file = open('tmp_gold.txt', 'w')
        tmp_result_file.write(prediction_lines)
        tmp_gold_file.write(test_lines)
        tmp_result_file.close()
        tmp_gold_file.close()
        tmp_out_file_name = 'tmp_out' + '.txt'
        tmp_out_file = open(tmp_out_file_name, 'w')
        evaluate('tmp_res.txt',
                 'tmp_gold.txt',
                 outfile=tmp_out_file,
                 verbose=False)
        tmp_out_file.close()
        with open(tmp_out_file_name, 'r') as infile:
            task = ''
            for line in infile:
                if 'task' in line:
                    task = line.rstrip().split()[-1]
                if line[0].isdigit():
                    results_dict[task] += float(line.rstrip().split()[-1])
    write_results(results_file, results_dict)
Example No. 13
def main():
    parser = get_parser()
    args = parser.parse_args()
    feature_extractor = FeatureExtractor()
    if args.pipeline_type == "analysis":
        text_preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        analyser = DataAnalyser(input_file=args.input_file_path,
                                text_preprocessor=text_preprocessor)
        analyser.get_data_distribution(plot_bar=args.plot_bar)
        analyser.get_word_weights(word_thresh=args.word_thresh)
        if args.word_cloud:
            analyser.generate_word_cloud()
    elif args.pipeline_type == "model_selection":
        text_preprocessor = TextPreProcessor(
            stop_words_file_path=args.stopwords_file_path)
        training_data_df = load_training_data(args.train_file_path)
        training_data_df["sentence"] = training_data_df["sentence"].map(
            text_preprocessor.process)
        features = feature_extractor.get_features_for_training(
            training_data_df["sentence"], args.vectorizer)
        labels = training_data_df["class"]
        apply_cross_validation(
            features=features,
            labels=labels,
            k_folds=args.kfolds,
            use_svm=args.use_svm,
            use_naive_bayes=args.use_naive_bayes,
            use_random_forest=args.use_random_forest,
            use_logistic_regression=args.use_logistic_regression,
            use_xgboost=args.use_xgboost,
            use_gradient_boosting=args.use_gradient_boosting,
            plot_cv_graph=True,
        )
    elif args.pipeline_type == "training":
        trainer = Trainer(
            train_file_path=args.train_file_path,
            val_file_path=args.val_file_path,
            stop_words_file_path=args.stopwords_file_path,
            model_name=args.best_model,
            feature_extractor=feature_extractor,
        )
        training_data_df = load_training_data(args.train_file_path)
        trainer.train(
            training_data_df,
            split_test_size=args.split_size,
            vectorizer_name=args.vectorizer,
            get_classification_report=args.get_classification_report,
            get_confusion_matrix=args.get_confusion_matrix,
        )
        validation_data_df = load_validation_data(args.val_file_path)
        trainer.validate(validation_data_df, vectorizer_name=args.vectorizer)
        if args.model_check_point_path:
            trainer.save_trained_model(args.model_check_point_path)
    elif args.pipeline_type == "prediction":
        if not args.stopwords_file_path:
            predictor = Predictor()
        else:
            predictor = Predictor(stop_words_file=args.stopwords_file_path)
        if args.input_file_path:
            predictor.predict_csv(args.input_file_path, args.output_file_path,
                                  args.model_path)
        if args.test_input:
            model, vectorizer = predictor.unpickle_the_model(args.model_path)
            predictor.predict(args.test_input, model, vectorizer)
Example No. 14
import pandas as pd
from sklearn.model_selection import GridSearchCV

from feature_extraction import FeatureExtractor
from regressor import RandomForestClassifierAuc, XGBRegressor
from tools import fitStats, featureImportance

test_df = pd.read_csv("./data/test.csv", delimiter=";", header=0, index_col=0)
train_df = pd.read_csv("./data/train_preprocessed.csv", delimiter=";", header=0, index_col=0)
train_label = pd.read_csv("./data/label.csv",
                          delimiter=";", header=0, index_col=0)

extractor = FeatureExtractor()
train_df = extractor.fit_transform(train_df, train_label)
test_df = extractor.transform(test_df)

param_grid = dict(max_depth=[10], n_estimators=[15])

studyAuc = True
if studyAuc:
    reg = GridSearchCV(RandomForestClassifierAuc(max_depth=10, n_estimators=15), param_grid=param_grid)
    reg = GridSearchCV(XGBRegressor(), param_grid=dict())
else:
    reg = RandomForestClassifierAuc(max_depth=10, n_estimators=15)


X_train = train_df.values
y_train = train_label.values.ravel()
X_test = test_df.values
fit = reg.fit(X_train, y_train)
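
# hedged continuation (not in the original script): GridSearchCV delegates
# predict() to the best estimator found, so test-set predictions could be
# produced roughly like this
pred = reg.predict(X_test)
print(pred[:10])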
Example No. 15
    def __init__(self, cfg_tb, test_model_path, root_path='./'):

        # arguments
        self.cfg_tb = cfg_tb
        self.test_model_path = test_model_path
        self.root_path = root_path

        # shortcuts
        self.feature_params, self.data_size = None, None

        # paths
        self.paths = dict(
            (k, self.root_path + v) for k, v in self.cfg_tb['paths'].items())

        # test model path
        self.test_model_name = self.test_model_path.split('/')[-2]

        # determine available model files
        model_files_av = [
            f.split('/')[-1] for f in glob(self.test_model_path + '*model.pth')
        ]

        # model file
        self.model_files = [
            self.test_model_path + f for f in model_files_av
            if f in self.cfg_tb['model_file_names']
        ]

        # pick just the first one (errors should not occur)
        self.model_file = self.model_files[0]

        # param file
        self.params_file = self.test_model_path + self.cfg_tb[
            'params_file_name']

        # wavs
        self.test_wavs = [
            self.root_path + wav for wav in self.cfg_tb['test_wavs']
        ]

        # create folder
        create_folder(list(self.paths.values()))

        # parameter loading
        net_params = np.load(self.params_file, allow_pickle=True)

        # extract params
        self.nn_arch = net_params['nn_arch'][()]
        self.train_params = net_params['train_params'][()]
        self.class_dict = net_params['class_dict'][()]

        # legacy stuff
        #self.data_size, self.feature_params = self.legacy_adjustments_tb(net_params)

        # legacy stuff
        self.data_size, self.feature_params = legacy_adjustments_net_params(
            net_params)

        # init feature extractor
        self.feature_extractor = FeatureExtractor(self.feature_params)

        # init net handler
        self.net_handler = NetHandler(nn_arch=self.nn_arch,
                                      class_dict=self.class_dict,
                                      data_size=self.data_size,
                                      use_cpu=True)

        # load model
        self.net_handler.load_models(model_files=[self.model_file])

        # set evaluation mode
        self.net_handler.set_eval_mode()
Example No. 16
    concat = Concatenate()([pooled_conv_dropped_a, pooled_conv_dropped_b])
    output = TimeDistributed(Dense(units=1,
                                   activation='sigmoid'))(concat)

    model = Model(my_input, output)

    model.compile(loss='categorical_hinge',
                  optimizer='adagrad',
                  metrics=['accuracy'])

    return model


# print(model.summary())
fe = FeatureExtractor(6)
fe.set_w2v(w2v_pathname, 500, keep_alive=True)

for epoch in range(2, 6):
    model = load_model('model/cnw100_{}.h5'.format(epoch - 1))
    for i in range(1, 1001):
        true_filename = 'data/batch/bengio/6/{}.txt'.format(i)
        false_filename = 'data/batch/cnw/6/{}.txt'.format(i)

        with open(true_filename) as file:
            true_data = file.readlines()
        with open(false_filename) as file:
            false_data = file.readlines()

        true_word_seq = []
        false_word_seq = []
Example No. 17
    def run(self):
        """Run whole data loading, feature extraction, model training and regressing pipeline."""
        if self.mode == "extract":
            print("Extracting features")
            train = FeatureExtractor("train").run()
            dev = FeatureExtractor("dev").run()

            print("Saving features")
            np.save("saved_features/train_lsr", train.lsr)
            np.save("saved_features/train_nlp", train.feats)
            np.save("saved_features/train_scores", train.scores)
            np.save("saved_features/dev_lsr", dev.lsr)
            np.save("saved_features/dev_nlp", dev.feats)
            np.save("saved_features/dev_scores", dev.scores)
        else:  # Load saved extracted features
            print("Loading saved features")
            split = not self.full_data
            train, dev = load_features(split=split, nt=True)

        if self.params["upsample"]:
            train = self.upsample(train)

        train_loader = create_loader(train, self.params["batch_size_train"])
        dev_loader = create_loader(dev, validate=True)

        # Make cuDNN deterministic so that results are reproducible,
        # and use a CUDA GPU if one is available.
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            GPU = True
        else:
            GPU = False
        device_idx = 0
        if GPU:
            device = torch.device(
                "cuda:" +
                str(device_idx) if torch.cuda.is_available() else "cpu")
        else:
            device = torch.device("cpu")
        print(f"Running on {device}")

        if self.bUseConv:
            model = RecursiveNN(
                ModelBlock,
                self.params["conv_dict"],
                self.params["conv_ffnn_dict"],
                BASELINE_dim=self.params["NBaseline"],
            )
        else:
            model = RecursiveNN_Linear(
                in_features=2048,
                N1=self.params["N1"],
                N2=self.params["N2"],
                out_features=self.params["out_features"],
                dropout=self.params["dropout"],
                leaky_relu=self.params["leaky_relu"],
            )

        model = model.to(device)

        weights_initialiser = True
        if weights_initialiser:
            model.apply(weights_init)
        params_net = sum(p.numel() for p in model.parameters()
                         if p.requires_grad)

        print("Total number of parameters in Model is: {}".format(params_net))
        print(model)

        optimizer = optim.Adam(model.parameters(), lr=self.params["lr"])
        scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.params["step_size"],
            gamma=self.params["gamma"])

        date_string = (str(datetime.datetime.now())[:16]
                       .replace(":", "-").replace(" ", "-"))
        writer = SummaryWriter(logdir + date_string)
        print("Running model")
        for epoch in range(self.params["epochs"]):
            train_model(
                model,
                train_loader,
                optimizer,
                epoch,
                log_interval=1000,
                scheduler=scheduler,
                writer=writer,
            )
            test_loss = test_model(model, dev_loader, epoch, writer=writer)

        torch.save(model, "model.pt")
        self.model = model
Example No. 18
    def __init__(self):
        self.face_detector = FaceDetector()
        self.preprocessor = Preprocessor()
        self.extractor = FeatureExtractor()
Example No. 19
                       batch_size=32,
                       epochs=max_epoch,
                       validation_data=[x_val, y_val],
                       callbacks=[mc])

        self.model = load_model(save_filename, custom_objects={'f1': f1})

    def evaluate(self, x_test, y_test):
        print(self.model.evaluate(x_test, y_test))


aspect_list = list_from_file('resource/aspect.txt')
n_aspect = len(aspect_list)
w2v_pathname = 'resource/w2v_path.txt'
max_length = 50
fe = FeatureExtractor(max_length)
fe.set_w2v(w2v_pathname, 500, keep_alive=True)


def prepare_feature(filename):
    with open(filename) as file:
        data = json.load(file)

    sentences = [datum['sentence'] for datum in data]
    aspects = [datum['aspect'] for datum in data]

    sequences = [text_to_word_sequence(s) for s in sentences]

    label = []
    for i in range(len(aspects)):
        label.append(np.zeros(n_aspect))