def __init__(self, db_name, forum_id, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type forum_id: int :param forum_id: the id of an existing forum in the DB :type interval: list of int :param interval: a list of topic ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._interval = interval self._db_name = db_name self._forum_id = forum_id self._config = config self._logging_util = LoggingUtil() self._date_util = DateUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None
def __init__(self, config, db_name, log_root_path): """ :type config: dict :param config: the DB configuration file :type db_name: str :param db_name: the name of an existing DB :type log_root_path: str :param log_root_path: the log path """ self._dsl_util = DslUtil() self._date_util = DateUtil() self._db_util = DbUtil() self._logging_util = LoggingUtil() self._log_path = log_root_path + "export-report-" + db_name + ".log" self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._db_name = db_name self._config = config self._cnx = self._db_util.get_connection(self._config) self._db_util.set_database(self._cnx, self._db_name) self._db_util.set_settings(self._cnx) self._chart_generator = ChartGenerator(self._cnx, self._logger) self._html_generator = HtmlGenerator(self._logger)
def __init__(self, cnx, logger): """ :type cnx: Object :param cnx: DB connection :type logger: Object :param logger: logger """ self._cnx = cnx self._logger = logger self._date_util = DateUtil()
def __init__(self, db_name, project_name, type, forum_name, url, before_date, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type type: str :param type: type of the forum (Stackoverflow, Eclipse forum) :type forum_name: str :param forum_name: the name of the forum to import :type url: str :param url: the URL of the forum :type before_date: str :param before_date: import data before date (YYYY-mm-dd) :type num_processes: int :param num_processes: number of processes to import the data (default 2) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "import-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name self._type = type self._url = url self._forum_name = forum_name self._project_name = project_name self._db_name = db_name self._before_date = before_date config.update({'database': db_name}) self._config = config if num_processes: self._num_processes = num_processes else: self._num_processes = EclipseForum2DbMain.NUM_PROCESSES self._logging_util = LoggingUtil() self._date_util = DateUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None
def __init__(self, config, logger): """ :type config: dict :param config: the DB configuration file :type logger: Object :param logger: logger """ self._config = config self._logger = logger self._git_dao = GitDao(self._config, self._logger) self._date_util = DateUtil()
def __init__(self, url, product, logger): """ :type url: str :param url: the URL of the Bugzilla issue tracker :type product: str :param product: the name of the product to import from the Bugzilla issue tracker :type logger: Object :param logger: logger """ self._logger = logger self._bzapi = self._init_bzapi(url) self._product = product self._date_util = DateUtil()
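# A hedged sketch of the _init_bzapi helper referenced above, assuming the python-bugzilla
# package (its bugzilla.Bugzilla(url=...) constructor returns the handle stored as
# self._bzapi); the project's actual initialization may differ.
import bugzilla

def _init_bzapi(self, url):
    # connect to the Bugzilla instance reachable at the given URL
    return bugzilla.Bugzilla(url=url)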
def __init__(self, config, db_name, log_root_path): """ :type config: dict :param config: the DB configuration file :type db_name: str :param db_name: the name of an existing DB :type log_root_path: str :param log_root_path: the log path """ self._date_util = DateUtil() self._db_util = DbUtil() self._logging_util = LoggingUtil() self._log_path = log_root_path + "export-file-json-" + db_name self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._db_name = db_name config.update({'database': db_name}) self._config = config self._cnx = self._db_util.get_connection(self._config) self._db_util.set_database(self._cnx, self._db_name) self._db_util.set_settings(self._cnx) self._file_util = FileUtil(self._config, self._logger)
def __init__(self, git_repo_path, logger): """ :type git_repo_path: str :param git_repo_path: local path of the Git repository :type logger: Object :param logger: logger """ try: self._logger = logger self._repo = Repo(git_repo_path, odbt=GitCmdObjectDB) self._gitt = self._repo.git self._date_util = DateUtil() except Exception: self._logger.error("GitQuerier init failed") raise
def __init__(self, token, logger): """ :type token: str :param token: the token to access the Slack API :type logger: Object :param logger: logger """ try: self._token = token self._logger = logger self._date_util = DateUtil() self._slack = Slacker(self._token) except Exception: self._logger.error("SlackQuerier init failed") raise
def __init__(self, db_name, project_name, forum_name, eclipse_forum_url, num_processes, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type project_name: str :param project_name: the name of an existing project in the DB :type forum_name: str :param forum_name: the name of an existing forum in the DB to update :type eclipse_forum_url: str :param eclipse_forum_url: the URL of the forum :type num_processes: int :param num_processes: number of processes to import the data (default 2) :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_path = log_root_path + "update-eclipse-forum-" + db_name + "-" + project_name + "-" + forum_name self._project_name = project_name self._url = eclipse_forum_url self._db_name = db_name self._forum_name = forum_name config.update({'database': db_name}) self._config = config if num_processes: self._num_processes = num_processes else: self._num_processes = EclipseForum2DbUpdate.NUM_PROCESSES self._logging_util = LoggingUtil() self._date_util = DateUtil() self._logger = None self._fileHandler = None self._querier = None self._dao = None
def get_order_time_lst(self): order_time = [] close_lst = list(set(self.close_time_list)) for time_str in close_lst: t_item = time_str.split(":") hour, minute = DateUtil.date_diff_min(int(t_item[0]), int(t_item[1]), int(ConfigUtil.instance().ahead_min) * -1) order_time.append(OrderTimeItem(hour, minute)) return order_time
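# A minimal sketch of the DateUtil.date_diff_min behavior assumed by the call site above:
# offset an (hour, minute) pair by a signed number of minutes and wrap around midnight.
# The signature comes from the caller; the wrap-around rule is an assumption.
def date_diff_min(hour, minute, diff_minutes):
    total = (hour * 60 + minute + diff_minutes) % (24 * 60)  # minutes since 00:00
    return total // 60, total % 60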
def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-topic2db-" + str( self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler( self._logger, log_path, "info") try: self._querier = EclipseForumQuerier(None, self._logger) self._dao = EclipseForumDao(self._config, self._logger) self.extract() except Exception: self._logger.error("EclipseTopic2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
def __init__(self, db_name, repo_id, issue_tracker_id, url, product, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: the URL of the Bugzilla issue tracker :type product: str :param product: the name of the product in the Bugzilla issue tracker :type interval: list of int :param interval: a list of issue ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._product = product self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._config = config self._logging_util = LoggingUtil() self._date_util = DateUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None
def __init__(self, token, logger): """ :type token: str :param token: the token to access the Stackoverflow API :type logger: Object :param logger: logger """ try: self._token = token self._logger = logger self._token_util = TokenUtil(self._logger, "stackoverflow") self._date_util = DateUtil() self._so = stackexchange.Site(stackexchange.StackOverflow, app_key=self._token) self._so.impose_throttling = True self._so.throttle_stop = False except Exception: self._logger.error("StackOverflowQuerier init failed") raise
def __init__(self, url, token, logger): """ :type url: str :param url: full name of the GitHub repository :type token: str :param token: a GitHub token :type logger: Object :param logger: logger """ try: self._logger = logger self._url = url self._token = token self._github = Github(token) self._repo = self._load_repo(self._url) self._token_util = TokenUtil(self._logger, "github") self._date_util = DateUtil() except Exception: self._logger.error("GitHubQuerier init failed") raise
def order(meican): try: order_week = ConfigUtil.instance().order_week cur_week = str(DateUtil.curr_week()) if cur_week not in order_week: critical("meican | order week not in config, cur_week:<%s>" % cur_week) return info("meican | begin order") meican.order() except Exception: info(traceback.format_exc())
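# A hedged sketch of DateUtil.curr_week as used above: the caller only compares
# str(curr_week()) against the configured order_week string, so an ISO weekday number
# (Monday=1 .. Sunday=7) is assumed here; the real helper may use another convention.
from datetime import date

def curr_week():
    return date.today().isoweekday()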
def two_rate_model_test(local_test_dir) -> None: """ Approval test for ShortRateModel with single calibration across all currencies. The test is successful if output files match git state. """ country_count: int = 2 seed = 0 rand = np.random.RandomState(seed) lag_months = 6 lag_label = DateUtil.get_lag_label(lag_months=lag_months) # Perform simulation model = ShortRateModel() model.year_count = 1 model.seed = seed model.countries = [ "C" + str(country_index + 1).zfill(4) for country_index in range(country_count) ] model.vol = [0.01] * country_count model.rev = [0.2] * country_count model.cap_rev = [0.2] * country_count model.floor_rev = [0.5] * country_count model.soft_cap = [0.10] * country_count model.soft_floor = [0.02] * country_count model.target = [0.05] * country_count model.short_rate_0 = [ rand.uniform(-0.1, 0.30) for c in range(country_count) ] model.simulate(caller_file=__file__) # Create history plots short_rate_plot = LinePlot() short_rate_plot.input_files = ["history.short_rate"] short_rate_plot.title = "history.short_rate" short_rate_plot.save_plot(caller_file=__file__) # Create sample with time shift sample = LagSample() sample.features = ["short_rate"] sample.lag_months = lag_months sample.create_sample(caller_file=__file__) # Create sample plot lag_sample_plot = ScatterPlot() lag_sample_plot.input_file = "lag_sample" lag_sample_plot.columns = ["short_rate(t)", f"short_rate(t{lag_label})"] lag_sample_plot.title = "lag_sample.short_rate" lag_sample_plot.save_plot(caller_file=__file__)
def get_lag_label_test(self): """Test for get_lag_label method.""" assert DateUtil.get_lag_label(lag_months=6) == "+6m" assert DateUtil.get_lag_label(lag_months=-6) == "-6m" assert DateUtil.get_lag_label(lag_months=12) == "+1y" assert DateUtil.get_lag_label(lag_months=-12) == "-1y" assert DateUtil.get_lag_label(lag_months=24) == "+2y" assert DateUtil.get_lag_label(lag_months=-24) == "-2y"
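# A minimal implementation sketch consistent with the assertions above: lags that are
# whole years are rendered as signed years, everything else as signed months. This is
# reconstructed from the test, not taken from the project's DateUtil.
def get_lag_label(*, lag_months: int) -> str:
    sign = "+" if lag_months >= 0 else "-"
    magnitude = abs(lag_months)
    if magnitude % 12 == 0 and magnitude != 0:
        return f"{sign}{magnitude // 12}y"
    return f"{sign}{magnitude}m"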
def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-issue2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") try: self._querier = GitHubQuerier(self._url, self._token, self._logger) self._dao = GitHubDao(self._config, self._logger) self.extract() except Exception: self._logger.error("GitHubIssue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-pr2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") try: self._querier = GitHubQuerier(self._url, self._token, self._logger) self._dao = GitHubDao(self._config, self._logger) self._git_dao = GitDao(self._config, self._logger) self.extract() except Exception: self._logger.error("GitHubPullRequest2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() if getattr(self, "_git_dao", None): self._git_dao.close_connection()
def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-issue2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") try: self._querier = BugzillaQuerier(self._url, self._product, self._logger) self._dao = BugzillaDao(self._config, self._logger) self.extract() except Exception: self._logger.error("BugzillaIssue2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection()
def save_plot(self, *, caller_file: str) -> None: """ Create plot from sample. Pass __file__ variable of the caller script as caller_file parameter. It will be used as both input and output file prefix. """ # Prefix for all data files caller_name = FileUtil.get_caller_name(caller_file=caller_file) fig = go.Figure() plot_title = self.title x_axis_label = "Month" y_axis_label = "Value" for input_file in self.input_files: df = pd.read_csv(f'{caller_name}.{input_file}.csv') df = df.loc[df['FREQUENCY'] == 'M'] if self.countries is not None: df = df.loc[df['LOCATION'].isin(self.countries)] countries = df['LOCATION'].unique().tolist() for country in countries: times = [DateUtil.get_sequential_month(year_month=t) for t in df.loc[df['LOCATION'] == country]['TIME']] values = df.loc[df['LOCATION'] == country]['Value'] fig.add_trace( go.Scatter( x=times, y=values, mode='lines', line=dict(width=3.0), name=input_file + "." + country)) fig.update_layout(margin=dict(l=80, r=20, t=80, b=40), title={ 'text': plot_title, 'font': {'family': "Roboto", 'size': 18}, 'x': 0.5 }, xaxis=dict(showgrid=True, tickangle=0, title={'text': x_axis_label, 'font': {'family': "Roboto", 'size': 13}}), yaxis=dict(showgrid=True, tickformat='.2f', nticks=20, title={'text': y_axis_label, 'font': {'family': "Roboto", 'size': 13}}) ) # Save plot file file_name = f"{caller_name}.{self.title.lower()}.png" # fig.update_layout(template=plot_util.get_plot_template()) fig.write_image(file_name)
def __init__(self, db_name, repo_id, issue_tracker_id, url, interval, token, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type repo_id: int :param repo_id: the id of an existing repository in the DB :type issue_tracker_id: int :param issue_tracker_id: the id of an existing issue tracker in the DB :type url: str :param url: full name of the GitHub repository :type interval: list of int :param interval: a list of issue ids to import :type token: str :param token: a GitHub token :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._url = url self._db_name = db_name self._repo_id = repo_id self._issue_tracker_id = issue_tracker_id self._interval = interval self._token = token self._config = config self._logging_util = LoggingUtil() self._date_util = DateUtil() self._fileHandler = None self._logger = None self._querier = None self._dao = None
import numpy as np from model.short_rate_model import ShortRateModel from util.date_util import DateUtil from util.line_plot import LinePlot from util.lag_sample import LagSample from util.scatter_plot import ScatterPlot if __name__ == "__main__": # Run two rate model with single calibration across all currencies. country_count: int = 50 seed = 0 rand = np.random.RandomState(seed) lag_months = 60 lag_label = DateUtil.get_lag_label(lag_months=lag_months) # Perform simulation model = ShortRateModel() model.year_count = 30 model.seed = seed model.countries = [ "C" + str(country_index + 1).zfill(4) for country_index in range(country_count) ] model.vol = [0.01] * country_count model.rev = [0.05] * country_count model.cap_rev = [0.2] * country_count model.floor_rev = [0.5] * country_count model.soft_cap = [0.10] * country_count model.soft_floor = [0.02] * country_count
class ChartGenerator(): """ This class handles the generation of charts """ def __init__(self, cnx, logger): """ :type cnx: Object :param cnx: DB connection :type logger: Object :param logger: logger """ self._cnx = cnx self._logger = logger self._date_util = DateUtil() def _get_db_data(self, query): # queries the database cursor = self._cnx.cursor() cursor.execute(query) results_y = [] results_x = [] row = cursor.fetchone() while row: counter = int(row[0]) span = int(row[1]) results_y.append(counter) results_x.append(span) row = cursor.fetchone() cursor.close() return results_x, results_y def create(self, query, x_label, y_label, time_dimension): """ creates the charts :type query: str :param query: SQL query :type x_label: str :param x_label: name of the x label :type y_label: str :param y_label: name of the y label :type time_dimension: str :param time_dimension: time dimension (week, month, year) """ intervals, counters = self._get_db_data(query) if "year" in time_dimension: span = [self._date_util.get_month_from_int(i) for i in intervals] elif "month" in time_dimension: span = intervals elif "week" in time_dimension: span = [ self._date_util.get_weekday_from_int(i - 1) for i in intervals if i <= 7 ] if '_' in y_label: y_label = y_label.replace('_', ' ') line_chart = pygal.Bar(style=LightColorizedStyle) line_chart.title = y_label + " * " + x_label line_chart.x_labels = span line_chart.add(y_label, counters) chart = line_chart.render() return chart
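# Illustrative only: ChartGenerator._get_db_data expects every row to carry the count in
# the first column and the time-span bucket in the second, so a query of roughly this
# shape satisfies create(); the table and column names below are hypothetical, not
# Gitana's actual schema.
query = ("SELECT COUNT(*) AS counter, WEEKDAY(m.created_at) + 1 AS span "
         "FROM message m GROUP BY span ORDER BY span")
# chart = chart_generator.create(query, "day", "message_count", "this_week")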
def create_sample(self, *, caller_file: str) -> None: """ Create sample from history record. Pass __file__ variable of the caller script as caller_file parameter. It will be used as both input and output file prefix. """ # Prefix for all data files caller_name = FileUtil.get_caller_name(caller_file=caller_file) # Create DF where the results will be merged sample_df = None shifted_sample_df = None for feature in self.features: # Read and transform time series for each feature time_series_df = pd.read_csv(f"{caller_name}.history.{feature}.csv") # Filter by monthly frequency time_series_df = time_series_df[time_series_df["FREQUENCY"] == "M"] # Filter by country if country list is specified if self.countries is not None: time_series_df = time_series_df[time_series_df["LOCATION"].isin(self.countries)] # Create sequential month list unshifted_months = [ DateUtil.get_sequential_month(year_month=ym) for ym in time_series_df["TIME"] ] # Create DF with unshifted data values = time_series_df["Value"] location = time_series_df["LOCATION"] unshifted_df = pd.DataFrame({ "LOCATION": location, "Month": unshifted_months, f"{feature}(t)": values.values }) # Merge unshifted time series for the feature if sample_df is None: sample_df = unshifted_df else: sample_df = sample_df.merge(unshifted_df) # Add features with the specified time shift if not None if self.lag_months is not None: # Create sequential month list shifted backwards(!) by the specified time shift shifted_months = [ m - self.lag_months for m in unshifted_months ] shift_label = DateUtil.get_lag_label(lag_months=self.lag_months) # Merge shifted data shifted_df = pd.DataFrame({ "LOCATION": location, "Month": shifted_months, f"{feature}(t{shift_label})": values.values }) if shifted_sample_df is None: shifted_sample_df = shifted_df else: shifted_sample_df = shifted_sample_df.merge(shifted_df) sample_df = sample_df.merge(shifted_sample_df) # Drop the Month merge key (LOCATION is kept in the sample) sample_df.drop(["Month"], axis=1, inplace=True) # Save sample to file sample_df.to_csv(f"{caller_name}.lag_sample.csv", index=False, float_format="%.6f")
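# A hedged sketch of DateUtil.get_sequential_month as used above: the merge keys and the
# lag arithmetic (m - self.lag_months) only need an index that grows by one per calendar
# month, so a "YYYY-MM" TIME value is assumed to map to year * 12 + month.
def get_sequential_month(*, year_month: str) -> int:
    year, month = year_month.split("-")[:2]
    return int(year) * 12 + int(month)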
class FileUtil(): """ This class provides utilities for the files stored in the Gitana DB """ def __init__(self, config, logger): """ :type config: dict :param config: the DB configuration file :type logger: Object :param logger: logger """ self._config = config self._logger = logger self._git_dao = GitDao(self._config, self._logger) self._date_util = DateUtil() def _get_directory_path(self, path_elements): directory_path = '' path_elements.reverse() for p in path_elements: directory_path = directory_path + p + '/' return directory_path def get_directories(self, file_path): """ extracts the directories where the file is located :type file_path: str :param file_path: path of the file """ directories = [] dir = file_path.split('/')[:-1] dir.reverse() for d in range(0, len(dir)): dir_path = self._get_directory_path(dir[d:]) directories.append(dir_path) if not directories: directories.append("/") return directories def _process_date(self, d): if d: if not self._date_util.check_format_timestamp(d, "%Y-%m-%d"): self._logger.warning("the date " + str(d) + " does not follow the pattern %Y-%m-%d, all changes will be retrieved") d = None return d def get_file_history_by_id(self, file_id, ref_id, reversed=False, before_date=None): """ get file history for a given file id within a reference and before a given date :type file_id: int :param file_id: the id of the target file :type ref_id: str :param ref_id: the id of the reference :type reversed: bool :param reversed: if True, it returns the changes from the most recent to the earliest :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the last version of the file before the given date """ before_date = self._process_date(before_date) previous_renamings = [file_id] changes = self._git_dao.select_file_changes(file_id, ref_id, before_date, patch=False, code=True) renamings_to_process = self._git_dao.select_file_renamings(file_id, ref_id) if renamings_to_process: while renamings_to_process != []: current_renamings = renamings_to_process for previous_file_id in current_renamings: changes = changes + self._git_dao.select_file_changes(previous_file_id, ref_id, before_date) previous_renamings.append(previous_file_id) renamings_to_process = renamings_to_process + self._git_dao.select_file_renamings(previous_file_id, ref_id) renamings_to_process = list(set(renamings_to_process) - set(previous_renamings)) return sorted(changes, key=lambda k: k['authored_date'], reverse=reversed) def get_file_history_by_name(self, repo_name, file_name, reference_name, reversed=False, before_date=None): """ get file history for a given file name within a reference and before a given date :type repo_name: str :param repo_name: the name of the repository to import. It cannot be null :type file_name: str :param file_name: the name of the target file :type reference_name: str :param reference_name: the name of the reference :type reversed: bool :param reversed: if True, it returns the changes from the most recent to the earliest :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the last version of the file before the given date """ history = [] try: repo_id = self._git_dao.select_repo_id(repo_name) file_id = self._git_dao.select_file_id(repo_id, file_name) reference_id = self._git_dao.select_reference_id(repo_id, reference_name) history = self.get_file_history_by_id(file_id, reference_id, reversed, before_date) except Exception: self._logger.error("FileUtil failed", exc_info=True) finally: if self._git_dao: self._git_dao.close_connection() return history def get_file_version_by_id(self, file_id, ref_id, before_date=None): """ get file version for a given file id within a reference and before a given date :type file_id: int :param file_id: the id of the target file :type ref_id: str :param ref_id: the id of the reference :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the last version of the file before the given date """ before_date = self._process_date(before_date) changes = self._git_dao.select_file_changes(file_id, ref_id, before_date, patch=True) changes = sorted(changes, key=lambda k: k['committed_date'], reverse=False) # the digestion is needed because the library diff-match-patch requires that the preamble of the diff information (@@ -.. +.. @@) appears alone in one line. Sometimes GitPython returns such a preamble mixed with other data diff_util = diff_match_patch() diff_util.Diff_Timeout = 0 diff_util.Match_Distance = 5000 diff_util.Match_Threshold = 0.8 diff_util.Patch_DeleteThreshold = 0.8 content = "" res_merge = [] for change in changes: digested_patches = [] p = change.get('patch') for line in p.split('\n'): m = re.match(r"^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@", line) if m: rest = line.split(m.group())[1] digested_patches.append(m.group()) if rest: digested_patches.append(rest.rstrip()) else: digested_patches.append(line) ps = diff_util.patch_fromText("\n".join(digested_patches)) res = diff_util.patch_apply(ps, content) content = res[0] res_merge = res_merge + res[1] self._logger.info(str(len([r for r in res_merge if r])) + " out of " + str(len(res_merge)) + " patches were successfully used to rebuild the file") return content def get_file_version_by_name(self, repo_name, file_name, reference_name, before_date=None): """ get file version for a given file name within a reference and before a given date :type repo_name: str :param repo_name: the name of the repository to import. It cannot be null :type file_name: str :param file_name: the name of the target file :type reference_name: str :param reference_name: the name of the reference :type before_date: str (YYYY-mm-dd) :param before_date: if not null, it returns the last version of the file before the given date """ content = "" try: repo_id = self._git_dao.select_repo_id(repo_name) file_id = self._git_dao.select_file_id(repo_id, file_name) reference_id = self._git_dao.select_reference_id(repo_id, reference_name) content = self.get_file_version_by_id(file_id, reference_id, before_date) except Exception: self._logger.error("FileUtil failed", exc_info=True) finally: if self._git_dao: self._git_dao.close_connection() return content
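# Hedged usage sketch for FileUtil; the repository, file, and branch names below are
# placeholders, and config must hold valid Gitana DB connection settings.
# file_util = FileUtil(config, logger)
# history = file_util.get_file_history_by_name("my_repo", "src/main.py", "master", reversed=True, before_date="2016-01-01")
# content = file_util.get_file_version_by_name("my_repo", "src/main.py", "master", before_date="2016-01-01")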
class EclipseTopic2Db(object): """ This class handles the import of Eclipse forum topics """ TOPIC_URL = 'https://www.eclipse.org/forums/index.php/t/' def __init__(self, db_name, forum_id, interval, config, log_root_path): """ :type db_name: str :param db_name: the name of an existing DB :type forum_id: int :param forum_id: the id of an existing forum in the DB :type interval: list of int :param interval: a list of topic ids to import :type config: dict :param config: the DB configuration file :type log_root_path: str :param log_root_path: the log path """ self._log_root_path = log_root_path self._interval = interval self._db_name = db_name self._forum_id = forum_id self._config = config self._fileHandler = None self._logger = None self._querier = None self._dao = None def __call__(self): self._logging_util = LoggingUtil() self._date_util = DateUtil() log_path = self._log_root_path + "-topic2db-" + str(self._interval[0]) + "-" + str(self._interval[-1]) self._logger = self._logging_util.get_logger(log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, log_path, "info") try: self._querier = EclipseForumQuerier(None, self._logger) self._dao = EclipseForumDao(self._config, self._logger) self.extract() except Exception: self._logger.error("EclipseTopic2Db failed", exc_info=True) finally: if self._dao: self._dao.close_connection() def _get_message_attachments_info(self, message_id, message): # gets the attachment information of a message attachments = self._querier.message_get_attachments(message) for a in attachments: url = self._querier.get_attachment_url(a) own_id = self._querier.get_attachment_own_id(a) name = self._querier.get_attachment_name(a) extension = name.split('.')[-1].strip().lower() size = self._querier.get_attachment_size(a) self._dao.insert_message_attachment(url, own_id, name, extension, size, message_id) def _get_message_info(self, topic_id, message, pos): # gets the information of topic messages own_id = self._querier.get_message_own_id(message) created_at = self._date_util.get_timestamp(self._querier.get_created_at(message), "%a, %d %B %Y %H:%M") body = self._querier.get_message_body(message) author_name = self._querier.get_message_author_name(message) message_id = self._dao.insert_message(own_id, pos, self._dao.get_message_type_id("reply"), topic_id, body, None, self._dao.get_user_id(author_name), created_at) if self._querier.message_has_attachments(message): self._get_message_attachments_info(message_id, message) if pos == 1: self._dao.update_topic_created_at(topic_id, created_at, self._forum_id) def extract(self): """ extracts Eclipse forum topic data and stores it in the DB """ self._logger.info("EclipseTopic2Db started") start_time = datetime.now() for topic_id in self._interval: topic_own_id = self._dao.get_topic_own_id(self._forum_id, topic_id) self._querier.set_url(EclipseTopic2Db.TOPIC_URL + str(topic_own_id) + "/") self._querier.start_browser() time.sleep(3) if 'index.php/e/' in self._querier._url: self._logger.warning("No URL exists for the topic id " + str(topic_id) + " - " + str(self._forum_id)) next_page = True pos = 1 while next_page: messages_on_page = self._querier.get_messages() for message in messages_on_page: self._get_message_info(topic_id, message, pos) pos += 1 next_page = self._querier.go_next_page() self._querier.close_browser() end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("EclipseTopic2Db finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler)
class StackOverflowQuerier(): """ This class collects the data available on Stackoverflow via its API """ def __init__(self, token, logger): """ :type token: str :param token: the token to access the Stackoverflow API :type logger: Object :param logger: logger """ try: self._token = token self._logger = logger self._token_util = TokenUtil(self._logger, "stackoverflow") self._date_util = DateUtil() self._so = stackexchange.Site(stackexchange.StackOverflow, app_key=self._token) self._so.impose_throttling = True self._so.throttle_stop = False except Exception: self._logger.error("StackOverflowQuerier init failed") raise def get_topic_ids(self, search_query, before_date): """ gets the data source topic ids :type search_query: str :param search_query: a label used to mark questions in Stackoverflow :type before_date: str :param before_date: selects questions with creation date before a given date (YYYY-mm-dd) """ questions = [] self._token_util.wait_is_usable(self._so) for question in self._so.questions(tagged=[search_query], pagesize=10).fetch(): questions.append(question) self._token_util.wait_is_usable(self._so) if before_date: questions = [q for q in questions if q.creation_date <= self._date_util.get_timestamp(before_date, "%Y-%m-%d")] return [question.id for question in questions] def get_topic(self, question_id): """ gets the topic body :type question_id: int :param question_id: the data source question id """ try: self._token_util.wait_is_usable(self._so) question = self._so.question(question_id, body="True") except Exception: question = None return question def get_topic_name(self, question): """ gets the topic title :type question: Object :param question: the Object representing the question """ return question.title def get_container_own_id(self, container): """ gets the data source container id :type container: Object :param container: the Object representing the container """ return container.id def get_container_votes(self, container): """ gets the data source container votes :type container: Object :param container: the Object representing the container """ return container.score def get_topic_labels(self, question): """ gets the topic labels :type question: Object :param question: the Object representing the question """ try: labels = question.tags except Exception: labels = [] return labels def get_topic_views(self, question): """ gets the topic view count :type question: Object :param question: the Object representing the question """ return question.view_count def is_accepted_answer(self, answer): """ checks if the answer is the accepted one :type answer: Object :param answer: the Object representing the answer """ try: found = answer.accepted except Exception: found = False return found def get_container_created_at(self, container): """ gets the container creation date :type container: Object :param container: the Object representing the container """ return container.creation_date def get_topic_last_change_at(self, question): """ gets the topic last change date :type question: Object :param question: the Object representing the question """ return question.last_activity_date def get_container_body(self, container): """ gets the container body :type container: Object :param container: the Object representing the container """ return container.body def remove_html_tags(self, html_text): """ removes HTML tags from html text :type html_text: str :param html_text: the html text of a question/answer/comment """ return BeautifulSoup(html_text, "html.parser").text def get_container_author(self, container): """ gets the container author :type container: Object :param container: the Object representing the container """ self._token_util.wait_is_usable(self._so) user = self._so.user(container.owner_id).display_name return user def get_comments(self, container): """ gets the container comments :type container: Object :param container: the Object representing the container """ comments = [] try: self._token_util.wait_is_usable(self._so) for comment in container.comments.fetch(): comments.append(comment) self._token_util.wait_is_usable(self._so) except Exception: self._logger.error("Stackexchange error when retrieving comments") return comments def get_answers(self, question): """ gets the answers of a question :type question: Object :param question: the Object representing the question """ answers = [] self._token_util.wait_is_usable(self._so) for answer in question.answers: answers.append(answer) self._token_util.wait_is_usable(self._so) return answers def get_attachments(self, body): """ extracts the attachments from a text :type body: str :param body: text of a question/comment/answer """ p = re.compile("<a href=[^ ]*a>") matches = p.findall(body) attachments = [] for m in matches: attachments.append(m) return attachments def get_attachment_name(self, html_tag): """ extracts the attachment name :type html_tag: str :param html_tag: text """ p = re.compile(">.*</a>") matches = p.findall(html_tag) found = None if matches: found = matches[0][1:-4] else: self._logger.info("url name not extracted for: " + html_tag) return found def get_attachment_url(self, html_tag): """ extracts the attachment url :type html_tag: str :param html_tag: text """ p = re.compile(r"\".*\"") matches = p.findall(html_tag) found = None if matches: found = matches[0].strip('"') else: self._logger.info("url not extracted for: " + html_tag) return found def generate_attachment_id(self, message_id, pos): """ creates an id for the attachment using the message id and position :type message_id: int :param message_id: id of the message where the attachment was found :type pos: int :param pos: position of the message where the attachment was found """ return str(message_id) + str(pos)
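# Hedged usage sketch for StackOverflowQuerier; the tag and API key are placeholders.
# querier = StackOverflowQuerier("YOUR_STACKEXCHANGE_KEY", logger)
# for topic_id in querier.get_topic_ids("gitana", "2016-01-01"):
#     question = querier.get_topic(topic_id)
#     for answer in querier.get_answers(question):
#         body = querier.remove_html_tags(querier.get_container_body(answer))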
class ActivityReportExporter(): """ This class handles the generation of reports """ LOG_FOLDER_PATH = "logs" INPUT_PATH = os.path.join(os.path.dirname(resources.__file__), 'queries.json') def __init__(self, config, db_name, log_root_path): """ :type config: dict :param config: the DB configuration file :type db_name: str :param db_name: the name of an existing DB :type log_root_path: str :param log_root_path: the log path """ self._dsl_util = DslUtil() self._date_util = DateUtil() self._db_util = DbUtil() self._logging_util = LoggingUtil() self._log_path = log_root_path + "export-report-" + db_name + ".log" self._logger = self._logging_util.get_logger(self._log_path) self._fileHandler = self._logging_util.get_file_handler(self._logger, self._log_path, "info") self._db_name = db_name self._config = config self._cnx = self._db_util.get_connection(self._config) self._db_util.set_database(self._cnx, self._db_name) self._db_util.set_settings(self._cnx) self._chart_generator = ChartGenerator(self._cnx, self._logger) self._html_generator = HtmlGenerator(self._logger) def _create_log_folder(self, name): # creates the log folder if not os.path.exists(name): os.makedirs(name) def _create_output_file(self, filename): # creates the folder that will contain the output file if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # guard against race condition if exc.errno != errno.EEXIST: raise def _load_report_exporter_json(self, json_path): # loads the JSON that drives the report export process with open(json_path) as json_data: data = json.load(json_data) return data.get('report') def _find_entity_id(self, type, name): # finds the id of the tools stored in the DB found = None if type == "project": found = self._db_util.select_project_id(self._cnx, name, self._logger) elif type == "repo": found = self._db_util.select_repo_id(self._cnx, name, self._logger) elif type == "issuetracker": found = self._db_util.select_issue_tracker_id(self._cnx, name, self._logger) elif type == "forum": found = self._db_util.select_forum_id(self._cnx, name, self._logger) elif type == "instantmessaging": found = self._db_util.select_instant_messaging_id(self._cnx, name, self._logger) if not found: self._logger.error("ReportExporter: entity " + str(type) + " with name " + str(name) + " not found!") return found def _get_parameter(self, key, parameters): # gets the parameters of the JSON found = None if key in ["AFTERDATE", "INTERVAL"]: found = parameters.get(key.lower()) else: if key.endswith("ID"): found = parameters.get(key[:-2].lower()) if not found: self._logger.error("ReportExporter: parameter " + str(key) + " not found!") return found def _load_query_json(self, metric_name, parameters): # loads the queries in the JSON configuration file with open(ActivityReportExporter.INPUT_PATH) as json_data: data = json.load(json_data) metrics = data.get('queries') try: found = [m for m in metrics if m.get('name') == metric_name][0] query = found.get('query') for k in found.keys(): if k not in ['name', 'query']: k_value = str(self._get_parameter(k, parameters)) query = query.replace(k, k_value) return query except Exception: self._logger.error("ReportExporter: metric " + str(metric_name) + " not found!") def _get_activity_name(self, activity): # gets the name of the activity return activity.replace("_", " ") def _get_activity_type(self, activity): # gets the type of the activity return activity.replace("_activity", "").replace("_", "") def _generate_charts(self, activity, activity_data, project_id, time_span): # generates the charts entity2charts = {} after_date, interval = self._calculate_time_information(time_span) activity_type = self._get_activity_type(activity) names = activity_data.get('names') measures = activity_data.get('measures') for entity_name in names: entity_id = self._dsl_util.find_entity_id(self._cnx, activity_type, entity_name, self._logger) charts = [] for measure in measures: query = self._load_query_json(measure, {activity_type: entity_id, 'project': project_id, 'afterdate': after_date, 'interval': interval}) charts.append(self._chart_generator.create(query, interval.lower(), measure, time_span)) entity2charts.update({entity_name: charts}) return entity2charts def _calculate_time_information(self, time_span): # calculates the time span information start = None interval = None current_time = datetime.now() if time_span == "this_week": start = self._date_util.get_start_time_span(current_time, "week", "%Y-%m-%d") interval = "DAY" elif time_span == "this_month": start = self._date_util.get_start_time_span(current_time, "month", "%Y-%m-%d") interval = "DAY" elif time_span == "this_year": start = self._date_util.get_start_time_span(current_time, "year", "%Y-%m-%d") interval = "MONTH" else: self._logger.error("ReportExporter: time span " + str(time_span) + " not recognized! Options are: this_week, this_month, this_year") return start, interval def export(self, file_path, json_path): """ exports the Gitana data to a report :type file_path: str :param file_path: the path where to export the report :type json_path: str :param json_path: the path of the JSON that drives the export process """ try: self._logger.info("ReportExporter started") start_time = datetime.now() exporter_data = self._load_report_exporter_json(json_path) project_name = exporter_data.get('project') project_id = self._dsl_util.find_entity_id(self._cnx, "project", project_name, self._logger) time_span = exporter_data.get('time_span') activity2charts = {} for activity in [attr for attr in exporter_data.keys() if attr.endswith('activity')]: activity_name = self._get_activity_name(activity) charts = self._generate_charts(activity, exporter_data.get(activity), project_id, time_span) activity2charts.update({activity_name: charts}) html_page = self._html_generator.create(project_name, activity2charts) with codecs.open(file_path, 'w', encoding='utf8') as f: f.write(html_page) self._db_util.close_connection(self._cnx) end_time = datetime.now() minutes_and_seconds = self._logging_util.calculate_execution_time(end_time, start_time) self._logger.info("ReportExporter: process finished after " + str(minutes_and_seconds[0]) + " minutes and " + str(round(minutes_and_seconds[1], 1)) + " secs") self._logging_util.remove_file_handler_logger(self._logger, self._fileHandler) except Exception: self._logger.error("ReportExporter failed", exc_info=True)
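# Hedged usage sketch for ActivityReportExporter; the DB name, paths, and the JSON that
# drives the export are placeholders.
# exporter = ActivityReportExporter(config, "my_gitana_db", "logs/")
# exporter.export("reports/activity_report.html", "report_config.json")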