def get(self):
    """Scrape the doctors of one department and render the parser page.

    Reads the optional ``code`` query parameter (default ``'0'``) and loads
    the first department whose ``dptCode`` is >= that value.  Every ``<a>``
    inside the first ``<table>`` of that department's page is treated as a
    doctor entry; a ``Doctor`` entity is stored for each non-empty doctor
    code that is not in the datastore yet.

    The response is ``templates/parser.html`` rendered with a link to the
    next department, or to ``/`` when there is no further department
    (including when no department matches ``code`` at all).

    Raises:
        ValueError: if ``code`` is present but not an integer.
    """
    start_code = int(self.request.get('code', '0'))
    query = Department.gql('WHERE dptCode >= :1 ORDER BY dptCode', start_code)
    # Two rows: the department to parse now and the one to link to next.
    dpts = query.fetch(limit=2)

    anchors = []
    if dpts:  # an empty result used to crash on dpts[0]
        page = urllib2.urlopen(dpts[0].dptLink)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the HTTP connection even if parsing fails.
            page.close()
        table = soup.table
        # A page without any <table> has no doctors to record.
        if table is not None:
            anchors = table.findAll('a')

    for anchor in anchors:
        text = anchor.text
        # The name is everything before the first digit of the link text.
        name = re.split('[0-9]', text)[0]
        # Dealing w/ the special cases: cut the code at the first space or
        # opening parenthesis.
        doc_code = text[len(name):].split(' ')[0].split('(')[0]
        if not doc_code:
            # Nothing to store; skip the datastore round-trip entirely.
            continue
        if not Doctor.all().filter('docCode =', doc_code).get():
            doc = Doctor()
            doc.docName = name
            doc.docCode = doc_code
            doc.put()

    if len(dpts) > 1:
        next_dpt = dpts[1]
        next_url = '/parse/doctor?code=%d' % next_dpt.dptCode
        next_name = next_dpt.dptName
    else:
        next_url = '/'
        next_name = 'END OF PARSING'

    context = {
        'type': 'Doctor',
        'nextUrl': next_url,
        'nextName': next_name,
    }
    # NOTE(review): '__file__' is a string literal here, so dirname() yields
    # '' and the template path is resolved relative to the working
    # directory.  Left unchanged because switching to the real __file__
    # could change which directory is searched -- confirm before fixing.
    path = os.path.join(os.path.dirname('__file__'), 'templates', 'parser.html')
    self.response.out.write(template.render(path, context))
def get(self):
    """Scrape the clinic schedule of one department and render the parser page.

    Reads the optional ``code`` query parameter (default ``'0'``) and loads
    the first department whose ``dptCode`` is >= that value.  In the first
    ``<table>`` of that department's page, every ``<tr align="left">`` is
    read as one date: the first cell holds ``(month/day)`` and each later
    cell holds the clinic links of one time slot (cell 1 -> ``'A'``,
    cell 2 -> ``'B'``, any later cell -> ``'C'``).  A ``Clinic`` entity is
    stored for every link whose doctor name is already known as a
    ``Doctor``.

    The response is ``templates/parser.html`` rendered with a link to the
    next department, or to ``/`` when there is no further department
    (including when no department matches ``code`` at all).

    Raises:
        ValueError: if ``code`` is present but not an integer.
    """
    start_code = int(self.request.get('code', '0'))
    query = Department.gql('WHERE dptCode >= :1 ORDER BY dptCode', start_code)
    # Two rows: the department to parse now and the one to link to next.
    dpts = query.fetch(limit=2)

    now_dpt = None
    rows = []
    if dpts:  # an empty result used to crash on dpts[0]
        now_dpt = dpts[0]
        page = urllib2.urlopen(now_dpt.dptLink)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the HTTP connection even if parsing fails.
            page.close()
        table = soup.table
        # A page without any <table> has no schedule to record.
        if table is not None:
            rows = table.findAll('tr', align='left')

    # NOTE(review): only month and day are read from the page, so the
    # current year is assumed; a schedule that spans New Year would get the
    # wrong year -- confirm whether that can happen.
    year = str(datetime.datetime.now().year)
    slot_by_column = {1: 'A', 2: 'B'}

    for row in rows:
        month = day = None
        for column, cell in enumerate(row.findAll('td')):
            if column == 0:
                # The first cell carries the date as "...(month/day)...".
                pieces = cell.text.split('(')
                date_parts = pieces[1].split(')')[0].split('/') if len(pieces) > 1 else []
                if len(date_parts) < 2:
                    # Not a dated schedule row: skip it instead of aborting
                    # the whole page with an IndexError.
                    break
                month, day = date_parts[0], date_parts[1]
                continue

            time_str = slot_by_column.get(column, 'C')
            # Only anchors carrying exactly two attributes are clinic links.
            links = cell.findAll(
                lambda tag: tag.name == 'a' and len(tag.attrs) == 2)
            for anchor in links:
                # The name is everything before the first digit of the text.
                name = re.split('[0-9]', anchor.text)[0]
                doc = Doctor.all().filter('docName = ', name).get()
                if not doc:
                    # Unknown doctor: no clinic is stored for this link.
                    continue
                link = anchor['href']
                clinic = Clinic()
                clinic.link = tzuPrifix + link
                # The clinic code sits between 'data=' and '&sLoc' in the href.
                clinic.code = link.split('data=')[1].split('&sLoc')[0]
                clinic.doctor = doc.key()
                clinic.dept = now_dpt.key()
                clinic.date = '-'.join([year, month, day, time_str])
                clinic.put()

    if len(dpts) > 1:
        next_dpt = dpts[1]
        next_url = '/parse/clinic?code=%d' % next_dpt.dptCode
        next_name = next_dpt.dptName
    else:
        next_url = '/'
        next_name = 'END OF PARSING'

    context = {
        'type': 'Clinic',
        'nextUrl': next_url,
        'nextName': next_name,
    }
    # NOTE(review): '__file__' is a string literal here, so dirname() yields
    # '' and the template path is resolved relative to the working
    # directory.  Left unchanged because switching to the real __file__
    # could change which directory is searched -- confirm before fixing.
    path = os.path.join(os.path.dirname('__file__'), 'templates', 'parser.html')
    self.response.out.write(template.render(path, context))