# Assumed imports for this module; the exact module paths and the constants
# module name may differ in the original project.
import time
import types
import copyreg
from multiprocessing import Pool, cpu_count

from DBManager import DBManager
from GeneralHelpers import GeneralHelpers
from PlotManager import PlotManager
from ImportManager import ImportManager
from FeatureManager import FeatureManager
from ExperimentManager import ExperimentManager
from constants import (MODEL_NAME, ARFF_FILE_EXTENSION, PROJECT_ROOT_DIRECTORY,
                       DATASET_ARFF_DIR_NAME, N_EXPERIMENTS)


class Main:
    """
    Main class, makes the necessary function calls to the necessary classes.
    """

    def __init__(self):
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()
        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs the ImportManager to retrieve and import tweets.
        :param file_path_of_ids: string, file path of tweet IDs to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes the necessary function calls to extract features for the given year
        and to generate an ARFF file.
        :param n: int, n-gram length
        :param analyzer: string, 'word' or 'char'
        :param year: string, '2012', '2013', '2014', '2015' or 'ALL'
        :return: string, path of the generated ARFF file
        """
        # Getting tweets for the year
        print("Getting tweets for year " + year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)

        print("Formatting the data for the arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name: 1Gram, 2Gram, 3Gram... or Word
        experiment_name = str(n) + 'Gram' if analyzer == 'char' else 'Word'

        # File name, e.g. TTNet_3Gram_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year

        # Randomized file name, e.g. TTNet_3Gram_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)

        # ARFF file path, e.g. .../DataSet-ARFF/3Gram/TTNet/
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'

        # Generating the file with the data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)
        print("Arff file generated at path: " + arff_file_path + file_name)

    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes the necessary method calls to run the experiment on scikit-learn.
        :param n: int, the n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: void
        """
        # Retrieving all tweets from the database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}

        # Iterating over all years
        for year in self.years:
            # Retrieving tweets for the year
            tweets_for_year = self.__db_manager.get_tweets_for_year(year)
            tweets_for_all_years[year] = tweets_for_year

        # Creating one big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []

        # Appending all tweets together (dict.items() replaces the Python 2 iteritems())
        for year, tweets in tweets_for_all_years.items():
            all_tweets += tweets

        # Generating the document
        print("Generating document and classes by preprocessing.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting each year's tweet count
        print("Getting years' tweets counts.")
        years_tweets_counts = {}
        for year in self.years:
            years_tweets_counts[year] = len(tweets_for_all_years[year])

        all_processes = []
        self.all_experiments_results = []
        pool = Pool(cpu_count() - 1 or 1)

        # Register a reducer so bound methods can be pickled across processes
        # (copyreg is the Python 3 name of the Python 2 copy_reg module).
        copyreg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment: " + str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            r = pool.apply_async(experiment_manager.run_experiment,
                                 args=(document, classes),
                                 callback=self._accumulate_experiments_scores)
            all_processes.append(r)

        for a_process in all_processes:
            a_process.wait()

        t1 = time.time()
        print("Elapsed time:", t1 - t0, "seconds")

        pool.close()
        pool.join()

        print("Accumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

    def _reduce_method(self, m):
        """
        Tells pickle how to serialize a bound or unbound method.
        :param m: method to reduce
        :return: tuple, (getattr, args) as expected by copyreg.pickle
        """
        # Python 3 method attributes (__self__/__func__) replace the
        # Python 2 im_self/im_func/func_name used in the original.
        if m.__self__ is None:
            return getattr, (m.__class__, m.__func__.__name__)
        else:
            return getattr, (m.__self__, m.__func__.__name__)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates experiments' scores.
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots an experiment's results from log files.
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments.
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes the necessary function calls to plot years' scores.
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes the necessary function calls to plot 2012-vs-REST scores.
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes the necessary function calls to plot top features' frequencies across years.
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds the frequency of each feature for the given year.
        :param year: string
        :return: dict
        """
        # For this particular method find_roots=True, n=1, analyzer='word',
        # because we're working with top information-gain words.
        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')

        # Sum the document-term matrix column-wise to get each term's corpus
        # frequency, then sort terms by descending frequency.
        terms = vectorizer.get_feature_names()
        freqs = X.sum(axis=0).A1
        result = sorted(zip(freqs, terms), reverse=True)
        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]
        final_result = dict(zip(terms, freqs))
        return final_result

    def plot_years_intersection_scores(self):
        """
        Makes the necessary function calls to plot a matrix showing the
        similarity of the years' vocabularies.
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports new tweets from CSV files under the given root path.
        :param root_path: string
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
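

# --- Hedged usage sketch (not part of the original module) ---
# A minimal driver showing how Main's methods might be invoked in sequence;
# the order of calls and the shape of the results dict passed to the plotting
# method are assumptions based on the method signatures above.
if __name__ == "__main__":
    main = Main()
    # main.retrieve_tweets("data/tweet_ids.txt")  # hypothetical path
    main.extract_features_and_generate_arff(n=3, analyzer='char', year='2012')
    results = main.run_experiment_with_scikit_learn(n=1, analyzer='word')
    main.plot_all_experiment_results_with_scikit_learn(results)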
import os
from itertools import chain, combinations

from sklearn.metrics import accuracy_score

# Assumed project-local imports; exact module paths may differ.
from ClassifierLogger import ClassifierLogger
from Classifier import Classifier
from FeatureManager import FeatureManager


def evaluate(self, clf):
    """Evaluate the given classifier on the held-out test split."""
    # Predict the response for the test dataset
    y_pred = clf.predict(self.X_test)
    self.logger.info("-> Predicted: {}".format(y_pred))
    self.logger.info("-> Correct: {}".format(self.y_test))
    self.logger.info("-> Accuracy: {}".format(accuracy_score(self.y_test, y_pred)))


if __name__ == "__main__":
    logger = ClassifierLogger().get_logger()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    corpus = os.path.join(dir_path, "resources", "News_category_train.json")

    features = ['authors', 'headline', 'short_description']
    # Enumerate every subset of the feature list; renamed from "combinations"
    # so the itertools import is not shadowed.
    feature_combinations = chain(
        *map(lambda x: combinations(features, x), range(0, len(features) + 1)))

    for combination in feature_combinations:
        if len(combination) == 0:
            continue
        feat_manager = FeatureManager(corpus, combination, logger)
        # classifierNB = Classifier(feat_manager, ["Naive Bayes", "Decision Tree",
        #                                          "Adaboost", "Support Vector Machine",
        #                                          "Random Forest", "Gradient Descent"])
        classifierNB = Classifier(
            feat_manager, ["Support Vector Machine", "Gradient Descent"], logger)
        # logger.info("TRAINING WITHOUT CLEANSING")
        # classifierNB.train(cleanse=False)
        classifierNB.train(cleanse=True)
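

# --- Hedged illustration (not part of the original script) ---
# The powerset idiom used above, shown in isolation: chaining together the
# combinations of every length enumerates all subsets of the feature list.
from itertools import chain, combinations


def powerset(items):
    """Yield every subset of items, from the empty tuple to the full tuple."""
    return chain.from_iterable(combinations(items, r) for r in range(len(items) + 1))


# list(powerset(['a', 'b'])) -> [(), ('a',), ('b',), ('a', 'b')]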
import glob
import os

from FeatureManager import FeatureManager

features_path = 'epadb/test/data'
conf_path = 'conf'
epadb_root_path = 'EpaDB'
text_path = 'epadb/test/text'

feature_manager = FeatureManager(epadb_root_path, features_path, conf_path)
feature_manager.extract_features_using_kaldi(text_path)

# Create symbolic links to labels used in the evaluation stage
for file in sorted(glob.glob('EpaDB/*/labels/*')):
    fullpath = os.path.abspath(file)
    basename = os.path.basename(file)

    # Get speaker id
    spkr = fullpath.split('/')[-3]
    labels_dir_for_spkr = 'evaluate/epadb_30/' + spkr + '/labels/'

    # Create directory for the speaker's labels
    if not os.path.exists(labels_dir_for_spkr):
        os.system('mkdir -p ' + labels_dir_for_spkr)

    # Make a symbolic link to the speaker's labels from the EpaDB directory
    if not os.path.exists(labels_dir_for_spkr + '/' + basename):
        os.system('ln -s ' + fullpath + ' ' + labels_dir_for_spkr + '/')

# Handle the symbolic link for the reference transcriptions used in the evaluation stage
if not os.path.exists('evaluate/epadb_30/reference_transcriptions.txt'):
    current_path = os.getcwd()
    # The source is truncated after this print; presumably the same command
    # is also executed to create the link, e.g. via os.system.
    cmd = ('ln -s ' + current_path + '/EpaDB/reference_transcriptions.txt ' +
           'evaluate/epadb_30/reference_transcriptions.txt')
    print(cmd)
    os.system(cmd)
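

# --- Hedged alternative (not part of the original script) ---
# The same label-linking step using the standard library directly instead of
# shelling out to mkdir/ln; behaviour is assumed equivalent to the os.system
# calls above, and the helper name is hypothetical.
import os


def link_labels(fullpath, labels_dir_for_spkr):
    """Create the speaker's label directory and symlink one label file into it."""
    os.makedirs(labels_dir_for_spkr, exist_ok=True)
    target = os.path.join(labels_dir_for_spkr, os.path.basename(fullpath))
    if not os.path.exists(target):
        os.symlink(fullpath, target)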
# NOTE: this script is truncated at the start in the source; path variables
# such as transition_model_path, tree_path, lang_graph, symbols_path, disam,
# phones, acoustic_model_path, epadb_root_path, data_path, conf_path,
# loglikes_wspec and sample_list_path are assumed to be defined in the
# missing header. The imports below are likewise assumptions.
import torch
from kaldi.alignment import MappedAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfo, WordBoundaryInfoNewOpts
from kaldi.matrix import Matrix
from kaldi.util.table import DoubleMatrixWriter

from FeatureManager import FeatureManager
from FTDNN import FTDNN  # module location of the acoustic model class is assumed

# The aligner construction is truncated in the source; a from_files call with
# these trailing arguments is assumed:
aligner = MappedAligner.from_files(transition_model_path, tree_path,
                                   lang_graph, symbols_path, disam,
                                   acoustic_scale=1.0)
phones = SymbolTable.read_text(phones)
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(),
    "data/lang_test_tgsmall/phones/word_boundary.int")

# Instantiate the PyTorch acoustic model (subclass of torch.nn.Module)
model = FTDNN()
model.load_state_dict(torch.load(acoustic_model_path))
model.eval()

# Create the feature manager
feature_manager = FeatureManager(epadb_root_path, data_path, conf_path)

align_out_file = open("gop/align_output", "w+")

# Decode and write output lattices
with DoubleMatrixWriter(loglikes_wspec) as loglikes_writer:
    for line in open(sample_list_path, 'r').readlines():
        logid = line.split()[0]
        # key, text = line.strip().split(None, 1)
        feats, text = feature_manager.get_features_for_logid(logid)
        text = text.upper()
        feats = torch.unsqueeze(feats, 0)
        loglikes = model(feats)                          # Compute log-likelihoods
        loglikes = Matrix(loglikes.detach().numpy()[0])  # Convert to a PyKaldi matrix
        loglikes_writer[logid] = loglikes
        out = aligner.align(loglikes, text)
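

# --- Hedged usage sketch (not part of the original script) ---
# Reading back the log-likelihood matrices written above. The write specifier
# is not shown in the fragment, so the "ark:gop/loglikes.ark" path here is a
# hypothetical example; it must match whatever loglikes_wspec was set to.
from kaldi.util.table import SequentialDoubleMatrixReader

with SequentialDoubleMatrixReader("ark:gop/loglikes.ark") as reader:
    for logid, loglikes in reader:
        # Each entry is a (frames x pdf-ids) matrix of log-likelihoods
        print(logid, loglikes.num_rows, loglikes.num_cols)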