def __init__(self):
    """Wire up the collaborating manager/helper objects used by Main."""
    # Construction order is kept as-is; the managers are presumably
    # independent, but DB access is set up first.
    self.__db_manager = DBManager()
    self.__helper = GeneralHelpers()
    self.__plot_manager = PlotManager()
    self.__import_manager = ImportManager()
    self.__feature_manager = FeatureManager()
    # Years covered by the data set, as strings.
    self.years = tuple(str(y) for y in range(2012, 2016))
def __init__(self, path=None):
    """Sets up and starts the `AppServer`.

    `path` is the working directory for the AppServer
    (directory in which AppServer is contained, by default)

    This method loads plugins, creates the Application object,
    and starts the request handling loop.
    """
    # Not "running" until the very end of initialization succeeds.
    self._running = 0
    self._startTime = time()

    # Enforce the process-wide singleton: only one AppServer may exist.
    global globalAppServer
    if globalAppServer:
        raise ProcessRunning('More than one AppServer'
            ' or __init__() invoked more than once.')
    globalAppServer = self

    # Set up the import manager:
    self._imp = ImportManager()

    ConfigurableForServerSidePath.__init__(self)
    if path is None:
        path = os.path.dirname(__file__)  # os.getcwd()
    self._serverSidePath = os.path.abspath(path)
    self._webKitPath = os.path.abspath(os.path.dirname(__file__))
    # Webware is assumed to be the parent directory of WebKit.
    self._webwarePath = os.path.dirname(self._webKitPath)

    # Write our PID out so a second start (or a stop script) can find us.
    self.recordPID()

    # SilentURIs is only consulted when Verbose logging is on; it is a
    # regex of request URIs to leave out of the verbose log.
    self._verbose = self.setting('Verbose')
    if self._verbose:
        self._silentURIs = self.setting('SilentURIs')
        if self._silentURIs:
            import re
            self._silentURIs = re.compile(self._silentURIs)
    else:
        self._silentURIs = None

    self._plugIns = []
    self._requestID = 0

    self.checkForInstall()
    self.config()  # cache the config
    self.printStartUpMessage()
    # NOTE(review): sys.setcheckinterval was removed in Python 3.9;
    # this code appears to target an older interpreter — confirm.
    if self.setting('CheckInterval') is not None:
        sys.setcheckinterval(self.setting('CheckInterval'))
    self._app = self.createApplication()
    self.loadPlugIns()

    # @@ 2003-03 ib: shouldn't this just be in a subclass's __init__?
    # Persistent servers get a dedicated thread that waits for the
    # close event and shuts the server down.
    if self.isPersistent():
        self._closeEvent = Event()
        self._closeThread = Thread(target=self.closeThread,
            name="CloseThread")
        # self._closeThread.setDaemon(1)
        self._closeThread.start()
    self._running = 1
def userConfig(self):
    """Return the user config overrides.

    These settings can be found in the optional config file.
    Returns {} if there is no such file.

    The config filename is taken from configFilename().

    Raises ConfigurationError if the file uses the obsolete dict-literal
    format or cannot be executed as Python code.
    """
    # pylint: disable=assignment-from-no-return
    filename = self.configFilename()
    if not filename:
        return {}
    try:
        with open(filename) as f:
            contents = f.read()
    except IOError as e:
        # Best effort: an unreadable/missing config just means no overrides.
        print('WARNING: Config file', filename, 'not loaded:', e.strerror)
        print()
        return {}
    if contents.lstrip().startswith('{'):
        raise ConfigurationError(
            'Configuration via a dict literal is not supported anymore.')
    # Watching the file is optional; failure to watch must not block startup.
    try:
        from ImportManager import ImportManager
        ImportManager().watchFile(filename)
    except Exception as e:
        print('WARNING: Config file', filename, 'cannot be watched:', e)
        print()
    config = self.configReplacementValues().copy()
    try:
        # NOTE: the config file is executed as trusted Python code;
        # never point configFilename() at untrusted input.
        exec(contents, config)
        # Drop private names, including the __builtins__ key that exec
        # injects into the globals dict.
        keys = [key for key in config if key.startswith('_')]
        for key in keys:
            del config[key]
    except Exception as e:
        # BUG FIX: the message previously contained the literal "(unknown)"
        # instead of the offending config filename.
        raise ConfigurationError(
            f'Invalid configuration file, {filename} ({e}).')
    return config
class Main:
    """
    Main class, makes necessary function calls to necessary classes
    """

    def __init__(self):
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()
        # Years covered by the data set.
        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs Import Manager to retrieve and import tweets
        :param file_path_of_ids: String, file path of tweets to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes necessary function calls to extract features for given year and to generate arff file
        :param n: int, ngram count
        :param analyzer: string, word or char
        :param year: string, 2012, 2013, 2014, 2015 or ALL
        :return: string, path of generated arff file
        """
        # Getting tweets with year
        print("Getting tweets for year "+ year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)

        print("Formatting the data for arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name, 1grams, 2grams, 3grams.. or words
        experiment_name = str(n)+'Gram' if analyzer == 'char' else 'Word'
        # File name, TTNet_3grams_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year
        # File name randomized TTNet_3grams_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)
        # Arff file path ...../DataSet-ARFF/3Gram/TTNet/TTNet_3grams_2012_asfas12.arff
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'
        # Generating the file with data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)
        print("Arff file generated at path:"+arff_file_path+file_name)
        # BUG FIX: the docstring promises the generated path, but nothing
        # was returned; return it so callers can actually use the file.
        return arff_file_path + file_name

    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes necessary method calls to run the experiment on scikit learn.
        :param n: int, count n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: dict, cumulated scores of all experiments
        """
        # Retrieving all tweets from database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}
        # Iterating over all years
        for year in self.years:
            # Retrieving tweets for the year
            tweets_for_all_years[year] = self.__db_manager.get_tweets_for_year(year)

        # Creating a big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []
        # BUG FIX: dict.iteritems() is Python 2 only and raises
        # AttributeError on Python 3; use items().
        for year, tweets in tweets_for_all_years.items():
            all_tweets += tweets

        # Generating document
        print("Generating document and classes by preprocessing")
        # Preprocessing and generation of document
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting years' tweets counts
        print("Getting years' tweets counts.")
        years_tweets_counts = {}
        for year in self.years:
            years_tweets_counts[year] = len(tweets_for_all_years[year])

        all_processes = []
        self.all_experiments_results = []
        pool = Pool(cpu_count()-1 or 1)
        # BUG FIX: the copy_reg module was renamed copyreg in Python 3;
        # import whichever is available so bound methods stay picklable.
        try:
            import copyreg
        except ImportError:
            import copy_reg as copyreg
        copyreg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment:"+str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            r = pool.apply_async(experiment_manager.run_experiment, args=(document, classes,),
                                 callback=self._accumulate_experiments_scores)
            all_processes.append(r)
        for a_process in all_processes:
            a_process.wait()
        t1 = time.time()
        print("Elapsed time:", t1- t0, " seconds")
        pool.close()
        pool.join()

        print("Cumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

    def _reduce_method(self, m):
        """
        Pickle reduction for (bound) methods so they can be sent to
        worker processes via multiprocessing.
        :param m: method object
        :return: tuple, (getattr, args) reduction understood by pickle
        """
        # BUG FIX: im_self/im_class/im_func.func_name are Python 2 only;
        # use the Python 3 names, falling back to the old ones.
        func = getattr(m, '__func__', getattr(m, 'im_func', m))
        obj = getattr(m, '__self__', getattr(m, 'im_self', None))
        if obj is None:
            # Unbound method (Python 2 only): rebind through its class.
            return getattr, (m.im_class, func.__name__)
        return getattr, (obj, func.__name__)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates experiments' scores (used as a Pool callback)
        :param an_experiments_result: result of one experiment run
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots experiment's results from log files
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes necessary function calls to plot years scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes necessary function calls to plot 2012 vs REST scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes necessary function calls to plot top features frequencies' in years
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds frequencies of each feature for given year
        :param year: string
        :return: dict, term -> frequency
        """
        # For this particular method, find_roots=True, n=1, analyzer=word
        # because we're working with top info gain words
        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')
        # COMPAT FIX: get_feature_names() was removed in scikit-learn 1.2;
        # prefer get_feature_names_out() when available.
        try:
            terms = vectorizer.get_feature_names_out()
        except AttributeError:
            terms = vectorizer.get_feature_names()
        freqs = X.sum(axis=0).A1
        # Sort terms by descending frequency, then rebuild as a dict.
        result = sorted(zip(freqs, terms), reverse=True)
        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]
        final_result = dict(zip(terms, freqs))
        return final_result

    def plot_years_intersection_scores(self):
        """
        Makes necessary function calls to plot a matrix which shows years' vocabularies similarities
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports new tweets from CSV files under the given root directory.
        :param root_path: string, root directory containing the CSV files
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
def main(args):
    """Scan directories for doc-annotated files and emit a JSON graph.

    args[1:-1] are the directories to walk; args[-1] is the output JSON
    file consumed by doc_grapher.html. Exits with status 1 on bad usage
    or when no annotated files are found.
    """
    if len(args) < 3:
        sys.stderr.writelines([
            "usage: {} <directories> <output.json>\n".format(args[0]),
            "\t<directories>: list of space-separated directories to examine\n",
            "\t<output>: json file describing graphs, interpreted by doc_grapher.html\n",
        ])
        sys.exit(1)
    scan_dirs, out_path = args[1:-1], args[-1]

    # Walk each tree and collect every annotated file, keyed by node name.
    nodes_by_name = collections.OrderedDict()
    file_count = 0
    for scan_dir in scan_dirs:
        for root, _dirs, files in os.walk(scan_dir):
            for fname in files:
                file_count += 1
                node = parse_docfile(os.path.join(root, fname))
                if node is not None:
                    nodes_by_name[node.name] = node
                # Unannotated files are skipped silently.

    # Resolve any auto-import annotations across the collected nodes.
    ImportManager().add_auto_imports(list(nodes_by_name.values()))

    # Keep only edges whose target node actually exists.
    rejected = []
    for node in nodes_by_name.values():
        kept = []
        for edge in node.edges:
            (kept if edge['id'] in nodes_by_name else rejected).append(edge)
        node.edges = kept
    print('Rejected {} edge{}'.format(len(rejected),
                                      's' if len(rejected) != 1 else ''))
    if rejected:
        print(rejected)

    # Color connected segments consistently (parents share colors with
    # children; new colors are assigned at the top of each chain).
    ColorAssigner().assign_colors(nodes_by_name)

    node_config, edge_config = {'size': 10}, {'size': 3}
    graph_nodes, graph_edges = [], []
    for node in nodes_by_name.values():
        graph_nodes.append(node.graph_node(node_config))
        graph_edges.extend(node.graph_edges(edge_config))

    if not graph_nodes:
        sys.stderr.write("No annotated files found! Not writing output file.\n")
        sys.exit(1)
    print("Extracted {} nodes with {} edges from {} files".format(
        len(graph_nodes), len(graph_edges), file_count))

    with open(out_path, 'w') as out_file:
        json.dump({'nodes': graph_nodes, 'edges': graph_edges},
                  out_file, indent=4)
def __init__(self, path=None, settings=None, development=None):
    """Sets up the Application.

    You can specify the path of the application working directory,
    a dictionary of settings to override in the configuration,
    and whether the application should run in development mode.

    Raises RuntimeError if the configuration file does not exist.
    """
    ConfigurableForServerSidePath.__init__(self)
    if path is None:
        path = os.getcwd()
    self._serverSidePath = os.path.abspath(path)
    self._webwarePath = os.path.abspath(os.path.dirname(__file__))
    # Fail fast when the configuration file is missing.
    if not os.path.isfile(self.configFilename()):
        print("ERROR: The application cannot be started:")
        print(f"Configuration file {self.configFilename()} not found.")
        raise RuntimeError('Configuration file not found')
    # Development mode defaults to the WEBWARE_DEVELOPMENT env variable.
    if development is None:
        development = bool(os.environ.get('WEBWARE_DEVELOPMENT'))
    self._development = development
    self.initVersions()
    self._shutDownHandlers = []
    self._plugIns = {}
    self._requestID = 0
    self._imp = ImportManager()
    appConfig = self.config()  # get and cache the configuration
    # Caller-supplied settings override the config file.
    if settings:
        appConfig.update(settings)
    # SilentURIs is only consulted when Verbose logging is on; it is a
    # regex of request URIs to leave out of the verbose log.
    self._verbose = self.setting('Verbose')
    if self._verbose:
        self._silentURIs = self.setting('SilentURIs')
        if self._silentURIs:
            import re
            self._silentURIs = re.compile(self._silentURIs)
    else:
        self._silentURIs = None
    self._outputEncoding = self.setting('OutputEncoding')
    self._responseBufferSize = self.setting('ResponseBufferSize')
    self._wsgiWrite = self.setting('WSGIWrite')
    if self.setting('CheckInterval') is not None:
        sys.setswitchinterval(self.setting('CheckInterval'))
    # Optionally redirect both stdout and stderr to a line-buffered
    # application log file.
    logFilename = self.setting('AppLogFilename')
    if logFilename:
        sys.stderr = sys.stdout = open(logFilename, 'a', buffering=1)
    self.initErrorPage()
    self.printStartUpMessage()
    # Initialize task manager:
    if self.setting('RunTasks'):
        self._taskManager = Scheduler(
            daemon=True, exceptionHandler=self.handleException)
        self._taskManager.start()
    else:
        self._taskManager = None
    # Define this before initializing URLParser, so that contexts have a
    # chance to override this. Also be sure to define it before loading
    # the sessions, in case the loading of the sessions causes an
    # exception.
    self._exceptionHandlerClass = ExceptionHandler
    self.makeDirs()
    self.initSessions()
    URLParser.initApp(self)
    self._rootURLParser = URLParser.ContextParser(self)
    self._startTime = time()
    if self.setting('UseSessionSweeper'):
        self.startSessionSweeper()
    self._plugInLoader = None
    self.loadPlugIns()
    # Register shutdown handling for normal exit and termination signals.
    self._needsShutDown = [True]
    atexit.register(self.shutDown)
    self._sigTerm = signal.signal(signal.SIGTERM, self.sigTerm)
    try:
        self._sigHup = signal.signal(signal.SIGHUP, self.sigTerm)
    except AttributeError:
        pass  # SIGHUP does not exist on Windows
def _collect_docnodes(directories):
    # Recursively walk every directory and parse each file for doc
    # annotations; returns (name -> node mapping, number of files seen).
    docnodes = collections.OrderedDict()
    total_files = 0
    for directory in directories:
        for root, dirs, files in os.walk(directory):
            for fname in files:
                total_files += 1
                parsed = parse_docfile(os.path.join(root, fname))
                if parsed is None:
                    # File has no annotations; skip it silently.
                    continue
                docnodes[parsed.name] = parsed
    return docnodes, total_files


def _prune_edges(docnodes):
    # Drop edges pointing at unknown nodes; return the dropped edges.
    dropped = []
    for docnode in docnodes.values():
        valid = []
        for edge in docnode.edges:
            if edge['id'] in docnodes:
                valid.append(edge)
            else:
                dropped.append(edge)
        docnode.edges = valid
    return dropped


def main(args):
    """Build the doc-annotation graph and write it as JSON.

    args[1:-1] are directories to examine; args[-1] is the output JSON
    file interpreted by doc_grapher.html. Exits with status 1 on bad
    usage or when no annotated files were found.
    """
    if len(args) < 3:
        sys.stderr.write("usage: {} <directories> <output.json>\n".format(args[0]))
        sys.stderr.write("\t<directories>: list of space-separated directories to examine\n")
        sys.stderr.write("\t<output>: json file describing graphs, interpreted by doc_grapher.html\n")
        sys.exit(1)
    docnodes, filecount = _collect_docnodes(args[1:-1])
    outfname = args[-1]

    # Honor any auto-import annotations on the collected nodes.
    import_manager = ImportManager()
    import_manager.add_auto_imports(list(docnodes.values()))

    # Validate all parents & siblings - make sure they actually exist.
    rejectedEdges = _prune_edges(docnodes)
    print('Rejected {} edge{}'.format(
        len(rejectedEdges), 's' if len(rejectedEdges) != 1 else ''))
    if len(rejectedEdges) > 0:
        print(rejectedEdges)

    # Assign colors to distinct segments: climb each chain of parents,
    # reuse a parent's color when found, otherwise assign a new one and
    # bubble it down (links are not necessarily a tree).
    assigner = ColorAssigner()
    assigner.assign_colors(docnodes)

    node_config = {'size': 10}
    edge_config = {'size': 3}
    nodes = [d.graph_node(node_config) for d in docnodes.values()]
    edges = []
    for d in docnodes.values():
        edges += d.graph_edges(edge_config)

    if len(nodes) == 0:
        sys.stderr.write("No annotated files found! Not writing output file.\n")
        sys.exit(1)
    print("Extracted {} nodes with {} edges from {} files".format(
        len(nodes), len(edges), filecount))

    graph = {'nodes': nodes, 'edges': edges}
    with open(outfname, 'w') as f:
        json.dump(graph, f, indent=4)