def parse(self):
    """Collect checked-out items from every row that mentions a due date.

    For each text node in ``self.soup`` matching "Due Date", the preceding
    ``tr`` is located and its title, author and due date are read through
    ``self.findcontent``. Each resulting ``models.Item`` is stored in
    ``self.itemsOut`` keyed by its cleaned title, so a later row with the
    same title replaces an earlier one.

    A row without an "Author" marker yields an empty author instead of
    failing, matching ``parse_itemlisting_style``.
    """
    # Compile the patterns once rather than on every row.
    due_date_re = re.compile("Due Date")
    title_re = re.compile("Title")
    print_title_re = re.compile("Print the title")
    author_re = re.compile("Author")

    for comment in self.soup.findAll(text=due_date_re):
        tr = comment.findPrevious('tr')
        item = models.Item()

        # Some pages label the title "Title", others "Print the title".
        marker = tr.find(text=title_re)
        if marker is None:
            marker = tr.find(text=print_title_re)
        title = util.unescape(self.findcontent(marker.parent))
        item.title = util.stripNonAscii(title)

        marker = tr.find(text=author_re)
        if marker is None:
            author = ''
        else:
            author = self.findcontent(marker.parent)
        # Keep only the first two comma-separated parts of the author.
        author = ','.join(author.split(',')[0:2])
        author = util.unescape(author)
        item.author = util.stripNonAscii(author)

        marker = tr.find(text=due_date_re)
        dueDate = self.findcontent(marker.parent)
        dueDate = dueDate.split(',')[0]  # strip time
        item.dueDate = util.toDatetime(dueDate)

        self.itemsOut[item.title] = item
def parse_itemlisting_style(self):
    """Collect checked-out items from pages that use 'itemlisting' cells.

    Every ``td`` with class ``itemlisting`` or ``itemlisting2`` is mapped to
    its preceding ``tr``. Within that row the title, author and due date
    are located through marker text nodes ("Print the title",
    "Print the author", "Print the date due"). Each ``models.Item`` is
    stored in ``self.itemsOut`` keyed by its cleaned title.

    A missing author marker, or one with no following sibling, produces an
    empty author.
    """
    # Compile the patterns once rather than on every cell.
    title_re = re.compile("Print the title")
    author_re = re.compile("Print the author")
    due_re = re.compile("Print the date due")

    item_tds = self.soup.findAll('td', {'class' : ('itemlisting', 'itemlisting2')})
    for td in item_tds:
        tr = td.findPrevious('tr')
        item = models.Item()

        marker = tr.find(text=title_re)
        title = util.unescape(marker.nextSibling.strip())
        item.title = util.stripNonAscii(title)

        marker = tr.find(text=author_re)
        if marker is None or marker.nextSibling is None:
            author = ''
        else:
            author = marker.nextSibling.strip().strip('.')
        # Keep only the first two comma-separated parts of the author.
        author = ','.join(author.split(',')[0:2])
        author = util.unescape(author)
        item.author = util.stripNonAscii(author)

        marker = tr.find(text=due_re)
        # <td>Due <!--Print the date due--> <strong>12/10/2011,....
        dueDate = marker.parent.find('strong').string.strip()
        dueDate = dueDate.split(',')[0]  # strip time
        item.dueDate = util.toDatetime(dueDate)

        self.itemsOut[item.title] = item
def parse(self):
    """Read per-title renewal outcomes from the 'panelMessage' div.

    Raises ``PendingFineException`` when a 'panelVerifyCharges' div is
    present. Otherwise every ``<i>`` element inside 'panelMessage' is taken
    as a title; the text right after it decides success ('is renewed.') or
    failure. On failure the message comes from the first ``<li>`` of the
    next sibling ``<ul>``, or 'Renewal failed' when there is none. Results
    are stored in ``self.renewalItems`` keyed by the cleaned title.
    """
    # look for pending fine
    pending_fine = self.soup.find('div', {'id':'panelVerifyCharges'})
    if pending_fine is not None:
        raise PendingFineException

    message_panel = self.soup.find('div', {'id':'panelMessage'})
    for title_tag in message_panel.findAll('i'):
        item = models.Item()
        outcome = title_tag.nextSibling.strip()

        if outcome != 'is renewed.':
            item.renewed = False
            failure_list = title_tag.findNextSibling('ul')
            if failure_list is None:
                item.renewalError = 'Renewal failed'
            else:
                item.renewalError = failure_list.li.string
        else:
            item.renewed = True
            item.renewalError = None

        key = util.stripNonAscii(util.unescape(title_tag.contents[0].strip()))
        self.renewalItems[key] = item
def parse(self):
    """Collect checked-out items from the table holding 'HASNOW' checkboxes.

    Stores the form named "hasnow" in ``self.form``. If the page has no
    ``input`` named 'HASNOW', nothing else is done. Otherwise the table
    preceding the first such input is scanned row by row; for each row
    carrying a 'HASNOW' input, the first 'globaltext' div supplies the
    title and the fifth supplies the due date. Items are stored in
    ``self.itemsOut`` keyed by the cleaned title.
    """
    self.form = self.soup.find("form", {"name" : "hasnow"})

    row = self.soup.find('input', {'name' : 'HASNOW'})
    if row is None:
        return

    table = row.findPrevious('table')
    for itemrow in table.findAll('tr'):
        # ignore the header row -- we know it's a header if there isn't a
        # renewal checkbox next to it
        if itemrow.find('input', {'name':'HASNOW'}) is None:
            continue

        item = models.Item()
        divs = itemrow.findAll('div', {'id' : 'globaltext'})

        title = util.unescape(divs[0].string.strip())
        item.title = util.stripNonAscii(title)

        dueDate = divs[4].string.strip()
        dueDate = dueDate.split(',')[0]  # strip time
        item.dueDate = util.toDatetime(dueDate)

        self.itemsOut[item.title] = item
def parse_title(self, td, item):
    """Fill ``item.title`` (and ``item.author`` when available) from ``td``.

    The title is the text of the first ``<a>`` in ``td`` with surrounding
    spaces, colons, slashes and periods trimmed, then unescaped and
    stripped of non-ASCII characters. When ``td`` contains a ``<span>``
    whose text is not None, that text (trimmed the same way) becomes the
    author. Returns ``item``.
    """
    trim_chars = ' :/.'

    anchor = td.find('a')
    raw_title = anchor.text.strip(trim_chars)
    item.title = util.stripNonAscii(util.unescape(raw_title))

    author_span = td.find('span')
    if author_span is None or author_span.text is None:
        return item

    item.author = author_span.text.strip(trim_chars)
    return item
def parseTitle(self, td, item):
    """Fill ``item.title`` and ``item.author`` from the links inside ``td``.

    Only ``<a>`` elements whose class is not "boldRedFont1" are considered:
    the first one supplies the title and the second one the author.
    Returns ``item``.
    """
    def is_not_bold_red(css_class):
        return css_class != "boldRedFont1"

    anchors = td.findAll("a", {"class": is_not_bold_red})

    # for some reason many title links have a superfluous ' /' at the end
    # -- remove this
    raw_title = anchors[0].string.rstrip(" /")
    item.title = util.stripNonAscii(util.unescape(raw_title))

    raw_author = anchors[1].string.rstrip(".")
    prefix = "by "
    if raw_author.startswith(prefix):
        raw_author = raw_author[len(prefix):]

    # sometimes there is extraneous information after the author's name,
    # ex: Dylan, Bob, 1941-
    name_parts = raw_author.split(",")[0:2]
    cleaned_author = util.unescape(",".join(name_parts))
    item.author = util.stripNonAscii(cleaned_author)
    return item
def parseTitle(self, td, item):
    """Fill ``item.title`` from the first ``<span>`` inside ``td``.

    If the span contains an ``<a>``, the link's first child is the title
    source; otherwise the span's own first child is used. The text is
    trimmed, unescaped and stripped of non-ASCII characters.
    Returns ``item``.
    """
    container = td.find('span')
    anchor = container.find('a')

    # Prefer the link text when the title is wrapped in an anchor.
    source = container if anchor is None else anchor
    raw_title = source.contents[0].strip()

    item.title = util.stripNonAscii(util.unescape(raw_title))
    return item
def parse(self):
    """Collect renewable items from the 'renewitems' form.

    Stores the form in ``self.form``, then creates one ``models.Item`` per
    checkbox input inside it. The checkbox's ``name`` attribute becomes
    ``item.renewitemkey``; the title is the third child of the ``<label>``
    in the next ``<td>`` after the checkbox. Items are stored in
    ``self.renewalitems`` keyed by the cleaned title.
    """
    self.form = self.soup.find('form', {'name' : 'renewitems'})

    for box in self.form.findAll('input', {'type' : 'checkbox'}):
        item = models.Item()
        item.renewitemkey = box['name']

        label = box.findNext('td').label
        raw_title = label.contents[2].strip()
        item.title = util.stripNonAscii(util.unescape(raw_title))

        self.renewalitems[item.title] = item
def parse(self):
    """Read per-title renewal outcomes from the page's ``<dd>`` elements.

    For every ``<dd>``, the previous ``<strong>`` element supplies the
    outcome text (via ``util.inner_text``). 'Item renewed' marks success;
    any other text marks failure and is recorded as the renewal error.
    The title is the first child of the ``<dd>``, trimmed, unescaped and
    stripped of non-ASCII characters. Results are stored in
    ``self.renewalItems`` keyed by that title.
    """
    for dd in self.soup.findAll('dd'):
        item = models.Item()

        reason = util.inner_text(dd.findPrevious('strong'))
        if reason == 'Item renewed':
            item.renewed = True
            item.renewalError = None
        else:
            item.renewed = False
            item.renewalError = reason

        title = dd.contents[0].strip()
        title = util.unescape(title)
        title = util.stripNonAscii(title)
        self.renewalItems[title] = item