def __init__(self):
        """Create the collaborator objects and the fixed range of years."""
        # NOTE(review): this fragment's enclosing class is not visible here;
        # the name-mangled attributes below imply it belongs inside a class.
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()

        # Years covered by the data set.
        self.years = ("2012", "2013", "2014", "2015")
# Example #2
# 0
    def __init__(self, path=None):
        """Sets up and starts the `AppServer`.

        `path` is the working directory for the AppServer
        (directory in which AppServer is contained, by default)

        This method loads plugins, creates the Application object,
        and starts the request handling loop.

        Raises ProcessRunning if another AppServer has already been
        created in this process.
        """
        self._running = 0  # flipped to 1 once initialization completes
        self._startTime = time()

        # Enforce a single AppServer per process via a module-level global.
        global globalAppServer
        if globalAppServer:
            raise ProcessRunning('More than one AppServer'
                ' or __init__() invoked more than once.')
        globalAppServer = self

        # Set up the import manager:
        self._imp = ImportManager()

        ConfigurableForServerSidePath.__init__(self)
        if path is None:
            path = os.path.dirname(__file__)  # os.getcwd()
        self._serverSidePath = os.path.abspath(path)
        self._webKitPath = os.path.abspath(os.path.dirname(__file__))
        self._webwarePath = os.path.dirname(self._webKitPath)

        self.recordPID()

        # Optional request logging: when Verbose, URIs matching the
        # SilentURIs regex are excluded from the log.
        self._verbose = self.setting('Verbose')
        if self._verbose:
            self._silentURIs = self.setting('SilentURIs')
            if self._silentURIs:
                import re
                self._silentURIs = re.compile(self._silentURIs)
        else:
            self._silentURIs = None
        self._plugIns = []
        self._requestID = 0

        self.checkForInstall()
        self.config()  # cache the config
        self.printStartUpMessage()
        if self.setting('CheckInterval') is not None:
            # NOTE(review): sys.setcheckinterval was removed in Python 3.9;
            # confirm which interpreter versions this must support.
            sys.setcheckinterval(self.setting('CheckInterval'))
        self._app = self.createApplication()
        self.loadPlugIns()

        # @@ 2003-03 ib: shouldn't this just be in a subclass's __init__?
        if self.isPersistent():
            self._closeEvent = Event()
            self._closeThread = Thread(target=self.closeThread,
                name="CloseThread")
            # self._closeThread.setDaemon(1)
            self._closeThread.start()
        self._running = 1  # mark the server as up
# Example #3
# 0
    def userConfig(self):
        """Return the user's configuration overrides as a dict.

        The overrides come from the optional config file named by
        configFilename(). An empty dict is returned when no filename
        is configured or the file cannot be read.

        Raises ConfigurationError for dict-literal style files and for
        files that fail to execute.
        """
        # pylint: disable=assignment-from-no-return
        filename = self.configFilename()
        if not filename:
            return {}
        try:
            with open(filename) as f:
                source = f.read()
        except IOError as e:
            # Best effort: a missing/unreadable file is only a warning.
            print('WARNING: Config file', filename, 'not loaded:', e.strerror)
            print()
            return {}
        if source.lstrip().startswith('{'):
            raise ConfigurationError(
                'Configuration via a dict literal is not supported anymore.')
        try:
            # Register the file for change-watching; failure is non-fatal.
            from ImportManager import ImportManager
            ImportManager().watchFile(filename)
        except Exception as e:
            print('WARNING: Config file', filename, 'cannot be watched:', e)
            print()
        config = self.configReplacementValues().copy()
        try:
            # Execute the file's statements with `config` as its globals,
            # then drop every underscore-prefixed name (incl. __builtins__).
            exec(source, config)
            for key in [k for k in config if k.startswith('_')]:
                del config[key]
        except Exception as e:
            raise ConfigurationError(
                f'Invalid configuration file, (unknown) ({e}).')
        return config
class Main:
    """
    Main class, makes necessary function calls to necessary classes
    """

    def __init__(self):
        # Collaborators for DB access, generic helpers, plotting,
        # tweet importing and feature extraction.
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()

        # Years covered by the data set.
        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs Import Manager to retrieve and import tweets
        :param file_path_of_ids: String, file path of tweets to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes necessary function calls to extract features for given year and to generate arff file
        :param n: int, ngram count
        :param analyzer: string, word or char
        :param year: string, 2012, 2013, 2014, 2015 or ALL
        :return: string, path of generated arff file
        """
        # Getting tweets with year
        print("Getting tweets for year " + year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)

        print("Formatting the data for arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name, 1grams, 2grams, 3grams.. or words
        experiment_name = str(n)+'Gram' if analyzer == 'char' else 'Word'

        # File name, TTNet_3grams_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year

        # File name randomized TTNet_3grams_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)

        # Arff file path ...../DataSet-ARFF/3Gram/TTNet/TTNet_3grams_2012_asfas12.arff
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'

        # Generating the file with data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)

        print("Arff file generated at path:"+arff_file_path+file_name)

        # BUGFIX: the docstring promises the generated path but nothing was
        # returned; return it so callers can locate the file.
        return arff_file_path + file_name

    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes necessary method calls to run the experiment on scikit learn.
        :param n: int, count n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: dict, cumulated scores of all experiments
        """
        # Retrieving all tweets from database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}
        for year in self.years:
            tweets_for_all_years[year] = self.__db_manager.get_tweets_for_year(year)

        # Creating a big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []
        # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError
        # on Python 3 (which this file otherwise targets); use items().
        for year, tweets in tweets_for_all_years.items():
            all_tweets += tweets

        # Preprocessing and generation of document
        print("Generating document and classes by preprocessing")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting years' tweets counts
        print("Getting years' tweets counts.")
        years_tweets_counts = {year: len(tweets_for_all_years[year]) for year in self.years}

        all_processes = []
        self.all_experiments_results = []

        # Leave one core free; fall back to a single worker on 1-core boxes.
        pool = Pool(cpu_count()-1 or 1)
        # BUGFIX: copy_reg is the Python 2 module name; Python 3 renamed it
        # to copyreg. The reducer makes bound methods picklable so they can
        # be shipped to the worker processes.
        import copyreg
        copyreg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment:"+str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            r = pool.apply_async(experiment_manager.run_experiment, args=(document, classes,), callback=self._accumulate_experiments_scores)
            all_processes.append(r)

        # Block until every experiment has finished.
        for a_process in all_processes:
            a_process.wait()

        t1 = time.time()

        print("Elapsed time:", t1- t0, " seconds")

        pool.close()
        pool.join()

        print("Cumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

    def _reduce_method(self, m):
        """
        Pickle reducer for bound methods, registered via copyreg so
        instance methods can be sent to multiprocessing workers.
        :param m: types.MethodType, the bound method being pickled
        :return: tuple, (getattr, (instance, method_name)) reduce pair
        """
        # BUGFIX: im_self/im_class/im_func/func_name are Python 2 attributes.
        # On Python 3 a bound method always has __self__ and __func__ (the
        # old "unbound method" case no longer exists).
        return getattr, (m.__self__, m.__func__.__name__)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates experiments' scores (apply_async callback).
        :param an_experiments_result: a single experiment's raw scores
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots experiment's results from log files
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes necessary function calls to plot years scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes necessary function calls to plot 2012 vs REST scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes necessary function calls to plot top features frequencies' in years
        :return: void
        """
        years_features_counts = {}

        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)

        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds frequencies of each feature for given year
        :param year: string
        :return: dict, term -> frequency
        """
        # For this particular method, find_roots=True, n=1, analyzer=word
        # because we're working with top info gain words
        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')

        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # in favour of get_feature_names_out(); confirm the pinned version.
        terms = vectorizer.get_feature_names()
        # Presumably X is a scipy sparse matrix: .A1 flattens the per-term
        # column sums to a 1-D array -- TODO confirm.
        freqs = X.sum(axis=0).A1

        # Pair and sort by descending frequency.
        result = sorted(zip(freqs, terms), reverse=True)

        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]

        final_result = dict(zip(terms, freqs))

        return final_result

    def plot_years_intersection_scores(self):
        """
        Makes necessary function calls to plot a matrix which shows years' vocabularies similarities
        :return: void
        """
        years_features_counts = {}

        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)

        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports new tweets from csv files found under the given root path.
        :param root_path: string, directory containing the csv files
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
def main(args):
    """Build a documentation graph from annotated source files.

    `args` is an argv-style list: args[1:-1] are directories to scan
    recursively and args[-1] is the output JSON filename (interpreted
    by doc_grapher.html). Exits with status 1 on usage errors or when
    no annotated files are found.
    """
    if len(args) < 3:
        sys.stderr.write("usage: {} <directories> <output.json>\n".format(
            args[0]))
        sys.stderr.write(
            "\t<directories>: list of space-separated directories to examine\n"
        )
        sys.stderr.write(
            "\t<output>: json file describing graphs, interpreted by doc_grapher.html\n"
        )
        sys.exit(1)

    directories = args[1:-1]
    outfname = args[-1]

    # For each file in each directory, recursively on down, search for
    # doc annotations and create objects appropriately.
    docnodes = collections.OrderedDict()
    filecount = 0
    for directory in directories:
        for root, _dirs, files in os.walk(directory):
            for fname in files:
                filecount += 1

                docnode = parse_docfile(os.path.join(root, fname))
                if docnode is None:
                    # Unannotated files are silently skipped.
                    continue
                docnodes[docnode.name] = docnode

    # If any docnodes have auto import set up, take care of that.
    import_manager = ImportManager()
    import_manager.add_auto_imports(list(docnodes.values()))

    # Validate all parents & siblings: keep only edges whose target exists.
    rejectedEdges = []
    for name in docnodes:
        docnode = docnodes[name]
        verified_edges = []
        for edge in docnode.edges:
            if edge['id'] in docnodes:
                verified_edges.append(edge)
            else:
                rejectedEdges.append(edge)
        docnode.edges = verified_edges
    # Report any rejected edges.
    print('Rejected {} edge{}'.format(len(rejectedEdges),
                                      's' if len(rejectedEdges) != 1 else ''))
    if rejectedEdges:  # idiomatic truthiness instead of len(...) > 0
        print(rejectedEdges)

    # Assign colors to distinct segments by climbing the parent chain:
    # inherit a parent's color when one is assigned, otherwise assign a
    # fresh color at the top and bubble it down. Nodes are marked "seen"
    # so non-tree link structures terminate.
    assigner = ColorAssigner()
    assigner.assign_colors(docnodes)

    nodes = []
    edges = []
    node_config = {'size': 10}
    edge_config = {'size': 3}
    for name in docnodes:
        docnode = docnodes[name]

        nodes.append(docnode.graph_node(node_config))
        edges += docnode.graph_edges(edge_config)

    if not nodes:  # idiomatic truthiness instead of len(...) == 0
        sys.stderr.write(
            "No annotated files found! Not writing output file.\n")
        sys.exit(1)

    print("Extracted {} nodes with {} edges from {} files".format(
        len(nodes), len(edges), filecount))
    graph = {'nodes': nodes, 'edges': edges}

    with open(outfname, 'w') as f:
        json.dump(graph, f, indent=4)
# Example #6
# 0
    def __init__(self, path=None, settings=None, development=None):
        """Sets up the Application.

        You can specify the path of the application working directory,
        a dictionary of settings to override in the configuration,
        and whether the application should run in development mode.

        Raises RuntimeError when the configuration file is missing.
        """
        ConfigurableForServerSidePath.__init__(self)
        if path is None:
            path = os.getcwd()
        self._serverSidePath = os.path.abspath(path)
        self._webwarePath = os.path.abspath(os.path.dirname(__file__))

        # Refuse to start without a configuration file.
        if not os.path.isfile(self.configFilename()):
            print("ERROR: The application cannot be started:")
            print(f"Configuration file {self.configFilename()} not found.")
            raise RuntimeError('Configuration file not found')

        # Development mode can be forced via the WEBWARE_DEVELOPMENT env var.
        if development is None:
            development = bool(os.environ.get('WEBWARE_DEVELOPMENT'))
        self._development = development

        self.initVersions()

        self._shutDownHandlers = []
        self._plugIns = {}
        self._requestID = 0

        self._imp = ImportManager()

        appConfig = self.config()  # get and cache the configuration
        if settings:
            appConfig.update(settings)

        # Optional request logging: when Verbose, URIs matching the
        # SilentURIs regex are excluded from the log.
        self._verbose = self.setting('Verbose')
        if self._verbose:
            self._silentURIs = self.setting('SilentURIs')
            if self._silentURIs:
                import re
                self._silentURIs = re.compile(self._silentURIs)
        else:
            self._silentURIs = None
        self._outputEncoding = self.setting('OutputEncoding')
        self._responseBufferSize = self.setting('ResponseBufferSize')
        self._wsgiWrite = self.setting('WSGIWrite')
        if self.setting('CheckInterval') is not None:
            sys.setswitchinterval(self.setting('CheckInterval'))

        # Redirect stdout/stderr to the application log, line-buffered.
        logFilename = self.setting('AppLogFilename')
        if logFilename:
            sys.stderr = sys.stdout = open(logFilename, 'a', buffering=1)

        self.initErrorPage()
        self.printStartUpMessage()

        # Initialize task manager:
        if self.setting('RunTasks'):
            self._taskManager = Scheduler(
                daemon=True, exceptionHandler=self.handleException)
            self._taskManager.start()
        else:
            self._taskManager = None

        # Define this before initializing URLParser, so that contexts have a
        # chance to override this. Also be sure to define it before loading the
        # sessions, in case the loading of the sessions causes an exception.
        self._exceptionHandlerClass = ExceptionHandler

        self.makeDirs()
        self.initSessions()

        URLParser.initApp(self)
        self._rootURLParser = URLParser.ContextParser(self)

        self._startTime = time()

        if self.setting('UseSessionSweeper'):
            self.startSessionSweeper()

        self._plugInLoader = None
        self.loadPlugIns()

        # Register shutdown hooks: atexit plus SIGTERM (and SIGHUP where
        # available) all funnel into shutDown().
        self._needsShutDown = [True]
        atexit.register(self.shutDown)
        self._sigTerm = signal.signal(signal.SIGTERM, self.sigTerm)
        try:
            self._sigHup = signal.signal(signal.SIGHUP, self.sigTerm)
        except AttributeError:
            pass  # SIGHUP does not exist on Windows
def main(args):
    """Build a documentation graph from annotated source files.

    `args` is an argv-style list: args[1:-1] are directories to scan
    recursively and args[-1] is the output JSON filename (interpreted
    by doc_grapher.html). Exits with status 1 on usage errors or when
    no annotated files are found.
    """
    if len(args) < 3:
        sys.stderr.write("usage: {} <directories> <output.json>\n".format(args[0]))
        sys.stderr.write("\t<directories>: list of space-separated directories to examine\n")
        sys.stderr.write("\t<output>: json file describing graphs, interpreted by doc_grapher.html\n")
        sys.exit(1)

    directories = args[1:-1]
    outfname = args[-1]

    # For each file in each directory, recursively on down, search for
    # doc annotations and create objects appropriately.
    docnodes = collections.OrderedDict()
    filecount = 0
    for directory in directories:
        for root, _dirs, files in os.walk(directory):
            for fname in files:
                filecount += 1

                docnode = parse_docfile(os.path.join(root, fname))
                if docnode is None:
                    # Unannotated files are silently skipped.
                    continue
                docnodes[docnode.name] = docnode

    # If any docnodes have auto import set up, take care of that.
    import_manager = ImportManager()
    import_manager.add_auto_imports(list(docnodes.values()))

    # Validate all parents & siblings: keep only edges whose target exists.
    rejectedEdges = []
    for name in docnodes:
        docnode = docnodes[name]
        verified_edges = []
        for edge in docnode.edges:
            if edge['id'] in docnodes:
                verified_edges.append(edge)
            else:
                rejectedEdges.append(edge)
        docnode.edges = verified_edges
    # Report any rejected edges.
    print('Rejected {} edge{}'.format(
        len(rejectedEdges),
        's' if len(rejectedEdges) != 1 else ''))
    if rejectedEdges:  # idiomatic truthiness instead of len(...) > 0
        print(rejectedEdges)

    # Assign colors to distinct segments by climbing the parent chain:
    # inherit a parent's color when one is assigned, otherwise assign a
    # fresh color at the top and bubble it down. Nodes are marked "seen"
    # so non-tree link structures terminate.
    assigner = ColorAssigner()
    assigner.assign_colors(docnodes)

    nodes = []
    edges = []
    node_config = {'size': 10}
    edge_config = {'size': 3}
    for name in docnodes:
        docnode = docnodes[name]

        nodes.append(docnode.graph_node(node_config))
        edges += docnode.graph_edges(edge_config)

    if not nodes:  # idiomatic truthiness instead of len(...) == 0
        sys.stderr.write("No annotated files found! Not writing output file.\n")
        sys.exit(1)

    print("Extracted {} nodes with {} edges from {} files"
          .format(len(nodes), len(edges), filecount))
    graph = {'nodes': nodes, 'edges': edges}

    with open(outfname, 'w') as f:
        json.dump(graph, f, indent=4)