def parse_ticket(self, id):
    # Use CSV export to get ticket fields
    url = self.full_url(self.TICKET_URL % id, 'csv')
    f = self.csvopen(url)
    reader = csv.DictReader(f)
    ticket_fields = reader.next()
    ticket_fields['class'] = 'ARTIFACT'
    ticket = self.remap_fields(ticket_fields)

    # Use HTML export to get ticket description and comments
    import html2text
    html2text.BODY_WIDTH = 0
    url = self.full_url(self.TICKET_URL % id)
    self.log_url(url)
    d = BeautifulSoup(urlopen(url))
    self.clean_missing_wiki_links(d)
    desc = d.find('div', 'description').find('div', 'searchable')
    ticket['description'] = html2text.html2text(
        desc.renderContents('utf8').decode('utf8')) if desc else ''

    comments = []
    for comment in d.findAll('form', action='#comment'):
        c = {}
        c['submitter'] = re.sub(
            r'.* by ', '', comment.find('h3', 'change').text).strip()
        c['date'] = self.trac2z_date(
            comment.find('a', 'timeline')['title'].replace(' in Timeline', ''))
        changes = unicode(comment.find('ul', 'changes') or '')
        body = comment.find('div', 'comment')
        body = body.renderContents('utf8').decode('utf8') if body else ''
        c['comment'] = html2text.html2text(changes + body)
        c['class'] = 'COMMENT'
        comments.append(c)
    ticket['comments'] = comments
    return ticket
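# A minimal sketch of the html2text behavior parse_ticket relies on (example
# input is mine, not from the source). BODY_WIDTH = 0 disables hard wrapping
# so the converted Markdown is not broken mid-sentence. The module-level
# BODY_WIDTH global is the old html2text API; newer releases configure an
# instance instead (h = html2text.HTML2Text(); h.body_width = 0).
import html2text
html2text.BODY_WIDTH = 0
md = html2text.html2text(u'<p>A <b>bold</b> claim with a <a href="http://x">link</a>.</p>')
# md is now roughly: u'A **bold** claim with a [link](http://x).\n\n'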
def parse_ticket_attachments(self, id):
    SIZE_PATTERN = r'(\d+) bytes'
    TIMESTAMP_PATTERN = r'(.+) in Timeline'
    # Scrape HTML to get ticket attachments
    url = self.full_url(self.ATTACHMENT_LIST_URL % id)
    self.log_url(url)
    f = urlopen(url)
    soup = BeautifulSoup(f)
    attach = soup.find('div', id='attachments')
    attachments = []
    while attach:
        attach = attach.findNext('dt')
        if not attach:
            break
        d = {}
        d['filename'] = attach.a['href'].rsplit('/', 1)[1]
        d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
        size_s = attach.span['title']
        d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
        timestamp_s = attach.find('a', {'class': 'timeline'})['title']
        d['date'] = self.trac2z_date(self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
        d['by'] = attach.find(text=re.compile('added by')).nextSibling.renderContents()
        d['description'] = ''
        # Skip whitespace
        while attach.nextSibling and type(attach.nextSibling) is NavigableString:
            attach = attach.nextSibling
        # If there's a description, there will be a <dd> element;
        # otherwise the next element is immediately another <dt>.
        if attach.nextSibling and attach.nextSibling.name == 'dd':
            desc_el = attach.nextSibling
            if desc_el:
                # TODO: Convert to Allura link syntax as needed
                d['description'] = ''.join(desc_el.findAll(text=True)).strip()
        attachments.append(d)
    return attachments
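# parse_ticket_attachments assumes a match_pattern helper; a plausible
# reconstruction follows (hypothetical sketch, the project's actual helper
# may differ): it extracts the first capture group or fails loudly.
def match_pattern(self, regexp, string):
    m = re.match(regexp, string)
    assert m, 'Cannot match %r against %r' % (regexp, string)
    return m.group(1)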
def parse_ticket(self, id):
    # Use CSV export to get ticket fields
    url = self.full_url(self.TICKET_URL % id, 'csv')
    f = self.csvopen(url)
    reader = csv.DictReader(f)
    ticket_fields = reader.next()
    ticket_fields['class'] = 'ARTIFACT'
    ticket = self.remap_fields(ticket_fields)

    # Use RSS export to get ticket description and comments
    import html2text
    html2text.BODY_WIDTH = 0
    url = self.full_url(self.TICKET_URL % id, 'rss')
    self.log_url(url)
    d = feedparser.parse(urlopen(url))
    ticket['description'] = html2text.html2text(d.feed.description)

    comments = []
    for comment in d['entries']:
        c = {}
        c['submitter'] = getattr(comment, 'author', None)
        c['date'] = comment.updated_parsed
        c['comment'] = html2text.html2text(comment.summary)
        c['class'] = 'COMMENT'
        comments.append(c)
    ticket['comments'] = comments
    return ticket
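# A minimal sketch of the feedparser structures the RSS branch consumes
# (the URL is hypothetical; feedparser.parse() accepts a URL, a file-like
# object, or a string):
import feedparser
d = feedparser.parse('http://trac.example.com/ticket/42?format=rss')
print d.feed.description                   # ticket description, as HTML
for entry in d.entries:
    print getattr(entry, 'author', None)   # author may be absent, hence getattr
    print entry.updated_parsed             # a time.struct_time, stored as-is above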
def csvopen(self, url):
    self.log_url(url)
    f = urlopen(url)
    # Trac doesn't return a 403 error; it serves a normal 200 HTML page
    # saying that access is denied, so we emulate the 403 ourselves.
    # TODO: currently, any non-CSV result is treated as 403.
    if not f.info()['Content-Type'].startswith('text/csv'):
        raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
    return f
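# A usage sketch ('exporter' is a hypothetical instance): because csvopen
# raises a real urllib2.HTTPError, callers handle the emulated 403 exactly
# like a genuine one.
try:
    f = exporter.csvopen(url)
except urllib2.HTTPError, e:
    if e.code == 403:
        print 'Access denied (or non-CSV response) for %s' % url
    raise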
def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=120,
            unredirected_hdrs=None, **kw):
    req = six.moves.urllib.request.Request(url, **kw)
    if unredirected_hdrs:
        for key, val in unredirected_hdrs.items():
            req.add_unredirected_header(key, val)
    req.add_header('User-Agent', 'Allura Data Importer (https://allura.apache.org/)')
    return h.urlopen(req, retries=retries, codes=codes, timeout=timeout)
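# A usage sketch (hypothetical URL and cookie value): headers passed via
# unredirected_hdrs are attached to the initial request only, so urllib will
# not replay credentials if the server redirects to another host.
resp = urlopen(
    'https://trac.example.com/ticket/1?format=csv',
    unredirected_hdrs={'Cookie': 'trac_auth=abc123'},
)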
def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), **kw):
    req = urllib2.Request(url, **kw)
    req.add_header('User-Agent', 'Allura Data Importer (https://allura.apache.org/)')
    return h.urlopen(req, retries=retries, codes=codes)
def test_no_error(self, urllib2):
    r = h.urlopen('myurl')
    self.assertEqual(r, urllib2.urlopen.return_value)
    urllib2.urlopen.assert_called_once_with('myurl', timeout=None)
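# The injected `urllib2` parameter implies the module is mocked, e.g. with a
# decorator along these lines (the patch target path is an assumption, not
# taken from the source):
#
#     @mock.patch('allura.lib.helpers.urllib2')
#     def test_no_error(self, urllib2):
#         ...
#
# so urllib2.urlopen is a MagicMock, and the test verifies that h.urlopen
# forwards the default timeout=None to it.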
def urlopen(url, retries=3, codes=(408,), **kw):
    req = urllib2.Request(url, **kw)
    req.add_header('User-Agent',
                   'Allura Data Importer (https://forge-allura.apache.org/p/allura/)')
    return h.urlopen(req, retries=retries, codes=codes)
def fetch(self, url):
    return urlopen(url)
def urlopen(url, retries=3, codes=(408, 500, 502, 503, 504), timeout=120, **kw):
    req = urllib2.Request(url, **kw)
    req.add_header(
        'User-Agent', 'Allura Data Importer (https://allura.apache.org/)')
    return h.urlopen(req, retries=retries, codes=codes, timeout=timeout)