import csv
import datetime
import logging
import urlparse

import feedparser
import twisted.web.error
import twisted.web.http

import mysite.search.models
# Project-local names used below (TracTimeline, TracBugTimes, TracBugParser,
# BugImporter) are assumed to be importable from the surrounding codebase;
# their exact module paths are not shown in this excerpt.


class TracBugImporter(BugImporter):
    def __init__(self, *args, **kwargs):
        # Create a list to store bug ids obtained from queries.
        self.bug_ids = []
        # Call the parent __init__.
        super(TracBugImporter, self).__init__(*args, **kwargs)

    def process_queries(self, queries):
        # If this is an old Trac version, update the timeline.
        if self.tm.old_trac:
            # Set up the TracTimeline instance.
            try:
                self.timeline = TracTimeline.all_timelines.get(
                        base_url=self.tm.get_base_url())
            except TracTimeline.DoesNotExist:
                self.timeline = TracTimeline(base_url=self.tm.get_base_url())
            # Check when the timeline was last updated.
            timeline_age = datetime.datetime.utcnow() - self.timeline.last_polled
            # Set up the timeline URL.
            timeline_url = urlparse.urljoin(
                    self.timeline.base_url,
                    "timeline?ticket=on&daysback=%d&format=rss" %
                    (timeline_age.days + 1))
            # Add the URL to the waiting list.
            self.add_url_to_waiting_list(url=timeline_url,
                                         callback=self.handle_timeline_rss)

        # Add all the queries to the waiting list.
        for query in queries:
            query_url = query.get_query_url()
            self.add_url_to_waiting_list(url=query_url,
                                         callback=self.handle_query_csv)
            query.last_polled = datetime.datetime.utcnow()
            query.save()

        # URLs are now all prepped, so start pushing them onto the reactor.
        self.push_urls_onto_reactor()

    def handle_timeline_rss(self, timeline_rss):
        # There are two steps to updating the timeline.
        # The first step is to use the actual timeline to update the
        # date_reported and last_touched fields for each bug.
        # Parse the returned timeline RSS feed.
        feed = feedparser.parse(timeline_rss)
        for entry in feed.entries:
            # Format the data.
            entry_url = entry.link.rsplit("#", 1)[0]
            entry_date = datetime.datetime(*entry.date_parsed[0:6])
            entry_status = entry.title.split("): ", 1)[0].rsplit(" ", 1)[1]

            # Get or create a TracBugTimes object.
            try:
                tb_times = self.timeline.tracbugtimes_set.get(
                        canonical_bug_link=entry_url)
            except TracBugTimes.DoesNotExist:
                tb_times = TracBugTimes(canonical_bug_link=entry_url,
                                        timeline=self.timeline)

            # Set the date values as appropriate.
            if 'created' in entry_status:
                tb_times.date_reported = entry_date
            if tb_times.last_touched < entry_date:
                tb_times.last_touched = entry_date
                # Store the entry status as well for use in the second step.
                tb_times.latest_timeline_status = entry_status

            # Save the TracBugTimes object.
            tb_times.save()

        # The second step is to use the RSS feed for each individual bug to
        # update the last_touched field. This would be unnecessary if the
        # timeline showed update events as well as creation and closing ones,
        # and in fact later versions of Trac have this option - but then the
        # later versions of Trac also hyperlink to the timeline from the bug,
        # making this all moot. Also, we cannot just use the RSS feed for
        # everything, as it is missing the date_reported time, as well as a
        # lot of information about the bug itself (e.g. Priority).
        for tb_times in self.timeline.tracbugtimes_set.all():
            # Check that the bug has not been seen as 'closed' in the
            # timeline. This reduces network load by not grabbing the RSS
            # feed of bugs whose last_touched info is definitely correct.
            if 'closed' not in tb_times.latest_timeline_status:
                self.add_url_to_waiting_list(
                        url=tb_times.canonical_bug_link + '?format=rss',
                        callback=self.handle_bug_rss,
                        callback_args=tb_times)

        # URLs are now all prepped, so start pushing them onto the reactor.
        self.push_urls_onto_reactor()

    def handle_bug_rss(self, bug_rss, tb_times):
        feed = feedparser.parse(bug_rss)
        comment_dates = [datetime.datetime(*e.date_parsed[0:6])
                         for e in feed.entries]
        # Check if there are comments to grab from.
        if comment_dates:
            tb_times.last_touched = max(comment_dates)
            tb_times.save()

    def handle_query_csv(self, query_csv):
        # Turn the string into a list so csv.DictReader can handle it.
        query_csv_list = query_csv.split('\n')
        dictreader = csv.DictReader(query_csv_list)
        self.bug_ids.extend([int(line['id']) for line in dictreader])

    def prepare_bug_urls(self):
        # Pull bug ids out of the internal storage. This is done in case the
        # list is simultaneously being written to; just copying the entire
        # thing and then deleting the contents could lose IDs.
        bug_id_list = []
        while self.bug_ids:
            bug_id_list.append(self.bug_ids.pop())

        # Convert the obtained bug ids to URLs.
        bug_url_list = [urlparse.urljoin(self.tm.get_base_url(),
                                         "ticket/%d" % bug_id)
                        for bug_id in bug_id_list]

        # Get the sub-list of URLs that are fresh, i.e. polled within the
        # last day.
        fresh_bug_urls = mysite.search.models.Bug.all_bugs.filter(
                canonical_bug_link__in=bug_url_list,
                last_polled__gt=datetime.datetime.now() - datetime.timedelta(days=1)
                ).values_list('canonical_bug_link', flat=True)

        # Remove the fresh URLs, leaving only stale or new URLs.
        for bug_url in fresh_bug_urls:
            bug_url_list.remove(bug_url)

        # Put the bug list in the form required for process_bugs. The second
        # entry of each tuple is None, as Trac never supplies data via
        # queries.
        bug_list = [(bug_url, None) for bug_url in bug_url_list]

        # And now go on to process the bug list.
        self.process_bugs(bug_list)

    def process_bugs(self, bug_list):
        # If there are no bug URLs, finish now.
        if not bug_list:
            self.determine_if_finished()
            return

        for bug_url, _ in bug_list:
            # Create a TracBugParser instance to store the bug data.
            tbp = TracBugParser(bug_url)

            self.add_url_to_waiting_list(
                    url=tbp.bug_csv_url,
                    callback=self.handle_bug_csv,
                    c_args={'tbp': tbp},
                    errback=self.errback_bug_data,
                    e_args={'tbp': tbp})

        # URLs are now all prepped, so start pushing them onto the reactor.
        self.push_urls_onto_reactor()

    def handle_bug_csv(self, bug_csv, tbp):
        # Pass the TracBugParser the CSV data.
        tbp.set_bug_csv_data(bug_csv)

        # Now fetch the bug HTML.
        self.add_url_to_waiting_list(
                url=tbp.bug_html_url,
                callback=self.handle_bug_html,
                c_args={'tbp': tbp},
                errback=self.errback_bug_data,
                e_args={'tbp': tbp})

    def errback_bug_data(self, failure, tbp):
        # For some unknown reason, some trackers choose to delete some bugs
        # entirely instead of just marking them as closed. That is fine for
        # bugs we haven't yet pulled, but if the bug is already being tracked
        # then we get a 404 error. This catcher looks for a 404 and deletes
        # the bug if it occurs.
        if (failure.check(twisted.web.error.Error) and
                failure.value.status == twisted.web.http.NOT_FOUND):
            try:
                bug = mysite.search.models.Bug.all_bugs.get(
                        canonical_bug_link=tbp.bug_url)
                bug.delete()
            except mysite.search.models.Bug.DoesNotExist:
                pass
            # To keep the callback chain happy, explicitly return None.
            return None
        else:
            # Pass the Failure on.
            return failure

    def handle_bug_html(self, bug_html, tbp):
        # Pass the TracBugParser the HTML data.
        tbp.set_bug_html_data(bug_html)

        # Get the parsed data dict from the TracBugParser.
        data = tbp.get_parsed_data_dict(self.tm)
        if self.tm.old_trac:
            # This is an old version of Trac that doesn't link from the bugs
            # to the timeline, so fetch these times from the database built
            # earlier.
            (data['date_reported'], data['last_touched']) = \
                    self.timeline.get_times(tbp.bug_url)

        # Get or create a Bug object to put the parsed data in.
        try:
            bug = mysite.search.models.Bug.all_bugs.get(
                    canonical_bug_link=tbp.bug_url)
        except mysite.search.models.Bug.DoesNotExist:
            bug = mysite.search.models.Bug(canonical_bug_link=tbp.bug_url)

        # Fill the Bug.
        for key in data:
            value = data[key]
            setattr(bug, key, value)

        # Save the project onto it. The project name is generated from the
        # bug_project_name_format property of the TrackerModel.
        project_from_name, _ = mysite.search.models.Project.objects.get_or_create(
                name=self.generate_bug_project_name(tbp))
        # Manually save() the Project to ensure that if it was created then
        # it has a display_name.
        project_from_name.save()
        bug.project = project_from_name

        # Store the tracker that generated the Bug, update last_polled and
        # save it!
        bug.tracker = self.tm
        bug.last_polled = datetime.datetime.utcnow()
        bug.save()

    def generate_bug_project_name(self, tbp):
        return self.tm.bug_project_name_format.format(
                tracker_name=self.tm.tracker_name,
                component=tbp.component)

    def determine_if_finished(self):
        # If we got here then there are no more URLs in the waiting list,
        # so if self.bug_ids is also empty then we are done.
        if self.bug_ids:
            self.prepare_bug_urls()
        else:
            self.finish_import()
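# A minimal standalone sketch of the timeline-entry parsing used in
# handle_timeline_rss above. The sample link and title are hypothetical but
# follow the shape of an old-Trac timeline RSS item, e.g.
# "Ticket #1234 (defect created): summary text".
def _demo_timeline_entry_parsing():
    sample_link = "http://trac.example.com/ticket/1234#comment:3"
    sample_title = "Ticket #1234 (defect created): summary text"

    entry_url = sample_link.rsplit("#", 1)[0]
    # entry_url == "http://trac.example.com/ticket/1234" (fragment stripped)
    entry_status = sample_title.split("): ", 1)[0].rsplit(" ", 1)[1]
    # entry_status == "created": the event word just inside the parentheses,
    # which is what the 'created'/'closed' checks above key on.
    return entry_url, entry_status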
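# A small standalone sketch of the CSV handling in handle_query_csv, using a
# hypothetical two-ticket response in Trac's query CSV format. It shows why
# the response body is split on '\n' first: csv.DictReader wants an iterable
# of lines, not one big string.
def _demo_query_csv_parsing():
    sample_csv = "id,summary,status\n10,First bug,new\n11,Second bug,closed"
    dictreader = csv.DictReader(sample_csv.split('\n'))
    return [int(line['id']) for line in dictreader]  # -> [10, 11]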
class TracBugTimeline(object):
    def __init__(self, base_url, tracker_name):
        self.tracker_name = tracker_name
        try:
            self.timeline = TracTimeline.all_timelines.get(base_url=base_url)
        except TracTimeline.DoesNotExist:
            self.timeline = TracTimeline(base_url=base_url)
        # Unsure if this is required here, but it can't hurt.
        self.timeline.save()

    def generate_timeline_entries_from_rss(self, days_back):
        rss_url = urlparse.urljoin(
                self.timeline.base_url,
                "timeline?ticket=on&daysback=%d&format=rss" % days_back)
        logging.info("[Trac] Fetching timeline RSS...")
        feed = feedparser.parse(rss_url)
        for entry in feed.entries:
            yield entry

    def update(self):
        logging.info("[Trac] Started refreshing timeline for tracker named %s." %
                     self.tracker_name)

        # Check when the timeline was last updated.
        timeline_age = datetime.datetime.utcnow() - self.timeline.last_polled

        # The first step is to use the actual timeline to update the
        # date_reported and last_touched fields for each bug. Add one to the
        # day count here to account for possible timezone differences.
        for entry in self.generate_timeline_entries_from_rss(timeline_age.days + 1):
            # Format the data.
            entry_url = entry.link.rsplit("#", 1)[0]
            entry_date = datetime.datetime(*entry.date_parsed[0:6])
            entry_status = entry.title.split("): ", 1)[0].rsplit(" ", 1)[1]

            logging.info("[Trac] Updating %s entry on %s for %s" %
                         (entry_status, entry_date, entry_url))

            # Get or create a TracBugTimes object.
            try:
                tb_times = self.timeline.tracbugtimes_set.get(
                        canonical_bug_link=entry_url)
            except TracBugTimes.DoesNotExist:
                tb_times = TracBugTimes(canonical_bug_link=entry_url,
                                        timeline=self.timeline)

            # Set the date values as appropriate.
            if 'created' in entry_status:
                tb_times.date_reported = entry_date
            if tb_times.last_touched < entry_date:
                tb_times.last_touched = entry_date
                # Store the entry status as well for use in the second step.
                tb_times.latest_timeline_status = entry_status

            # Save the TracBugTimes object.
            tb_times.save()

        # The second step is to use the RSS feed for each individual bug to
        # update the last_touched field. This would be unnecessary if the
        # timeline showed update events as well as creation and closing ones,
        # and in fact later versions of Trac have this option - but then the
        # later versions of Trac also hyperlink to the timeline from the bug,
        # making this all moot. Also, we cannot just use the RSS feed for
        # everything, as it is missing the date_reported time, as well as a
        # lot of information about the bug itself (e.g. Priority).
        for tb_times in self.timeline.tracbugtimes_set.all():
            # Check that the bug has not been seen as 'closed' in the
            # timeline. This reduces network load by not grabbing the RSS
            # feed of bugs whose last_touched info is definitely correct.
            if 'closed' not in tb_times.latest_timeline_status:
                logging.info("[Trac] Grabbing RSS feed for %s" %
                             tb_times.canonical_bug_link)
                feed = feedparser.parse(tb_times.canonical_bug_link + '?format=rss')
                comment_dates = [datetime.datetime(*e.date_parsed[0:6])
                                 for e in feed.entries]
                # Check if there are comments to grab from.
                if comment_dates:
                    tb_times.last_touched = max(comment_dates)
                    tb_times.save()

        # Finally, update the timeline's last_polled.
        self.timeline.last_polled = datetime.datetime.utcnow()
        self.timeline.save()

    def get_times(self, bug_url):
        bug_times = self.timeline.tracbugtimes_set.get(canonical_bug_link=bug_url)
        return (bug_times.date_reported, bug_times.last_touched)
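# An illustrative, hedged sketch of driving TracBugTimeline by hand. The base
# URL, tracker name and ticket URL are hypothetical, and update() performs
# real network fetches against a live old-Trac instance, so treat this as a
# usage sketch rather than a runnable test.
def _demo_timeline_usage():
    timeline = TracBugTimeline(base_url="http://trac.example.com/",
                               tracker_name="Example Tracker")
    # Refresh date_reported/last_touched from the timeline and per-bug feeds.
    timeline.update()
    date_reported, last_touched = timeline.get_times(
            "http://trac.example.com/ticket/1234")
    logging.info("[Trac] Reported %s, last touched %s" %
                 (date_reported, last_touched))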