def parse_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        txt = utils.get_tag_contents(td)
        if i < len(order) and txt:
            txt = txt.strip()
            col = order[i]
            if col == 'gztype':
                words = txt.split('/')
                metainfo['gztype'] = words[0].strip()
                if len(words) > 1:
                    metainfo['partnum'] = words[1].strip()
                if len(words) > 2:
                    metainfo['district'] = words[2].strip()
            elif col == 'download':
                inp = td.find('input')
                if inp and inp.get('onclick'):
                    metainfo['download'] = inp.get('onclick')
            elif col in ['notification_num', 'gznum', 'department']:
                metainfo[col] = txt
            elif col == 'subject':
                metainfo.set_subject(txt)
        i += 1
    return metainfo
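# Illustrative sketch of the slash-separated 'gztype' cell that parse_row()
# above splits; the sample value is hypothetical:
#   'Extraordinary/Part-I/Bangalore'
#     -> gztype   = 'Extraordinary'
#        partnum  = 'Part-I'
#        district = 'Bangalore'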
def process_result_row(self, tr, metainfos, dateobj, order):
    gznum = None
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'gznum':
                gznum = txt
            elif col.startswith('partnum'):
                # Column names of the form 'partnum|<part>' carry the part
                # number after the '|' separator.
                h, partnum = col.split('|')
                metainfo = utils.MetaInfo()
                metainfos.append(metainfo)
                metainfo.set_date(dateobj)
                metainfo.set_gztype(self.gazette_type)
                if gznum:
                    metainfo['gznum'] = gznum
                metainfo['partnum'] = partnum
                inp = td.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo['download'] = name
        i += 1
def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'download':
                link = td.find('a')
                if link:
                    href = link.get('href')
                    if href:
                        metainfo['download'] = href
            elif col in ['partnum', 'division', 'subject']:
                metainfo[col] = txt
        i += 1

    if 'download' not in metainfo:
        self.logger.warn('No download link, ignoring: %s', tr)
    else:
        metainfos.append(metainfo)
def process_result_row(self, tr, metainfos, dateobj, order):
    tds = tr.find_all('td')
    if len(tds) != len(order):
        return

    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    i = 0
    for td in tds:
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'gznum':
                metainfo['gznum'] = txt

            link = td.find('a')
            if link and link.get('href'):
                metainfo['download'] = link.get('href')
        i += 1
def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            txt = utils.get_tag_contents(td)
            txt = txt.strip() if txt else ''
            if order[i] in ['gznum', 'department', 'notification_num',
                            'subject']:
                metainfo[order[i]] = txt
            elif order[i] == 'gzdate':
                nums = re.findall(r'\d+', txt)
                if len(nums) == 3:
                    try:
                        d = datetime.date(int(nums[2]), int(nums[1]),
                                          int(nums[0]))
                        metainfo['gzdate'] = d
                    except ValueError:
                        self.logger.warn('Unable to form date for %s', txt)
            elif order[i] == 'download':
                link = td.find('a')
                if link and link.get('href'):
                    metainfo['href'] = link.get('href')
        i += 1

    if 'href' in metainfo and 'gznum' in metainfo:
        return metainfo
    return None
def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    i = 0
    for td in tr.find_all('td'):
        if i >= len(order):
            break
        txt = utils.get_tag_contents(td)
        if txt and order[i] in ['subject', 'department', 'gznum']:
            txt, n = re.subn(r'\s+', ' ', txt)
            metainfo[order[i]] = txt.strip()
        elif txt and order[i] == 'date':
            nums = re.split('[./-]+', txt)
            if len(nums) < 3:
                self.logger.warn('Couldn\'t get date from %s for extraordinary gazette list', txt)
                i += 1
                continue
            nums = [re.subn(r'\s+', '', n)[0] for n in nums]
            nums = [n for n in nums if n]
            try:
                # Constructing the date can raise too, so keep it in the try
                d = datetime.date(int(nums[2]), int(nums[1]), int(nums[0]))
                metainfo.set_date(d)
            except (ValueError, IndexError):
                self.logger.warn('Could not parse date %s', txt)
        i += 1

    if metainfo.get_date() == dateobj:
        metainfo.set_gztype('Extraordinary')
        return metainfo
    return None
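# A minimal sketch of the date normalization done in process_row() above;
# the sample cell text is invented:
#   re.split('[./-]+', '12 - 01-2015')     -> ['12 ', ' 01', '2015']
#   after stripping whitespace and empties -> ['12', '01', '2015']
#   datetime.date(2015, 1, 12)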
def process_result_row(self, tr, metainfos, dateobj, order):
    download = None
    for link in tr.find_all('a'):
        txt = utils.get_tag_contents(link)
        if txt and re.match(r'\s*Download', txt, re.IGNORECASE):
            download = link.get('href')
            break

    if not download:
        return

    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    metainfo['download'] = download
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gznum':
                # The cell may span multiple lines; only the first line
                # carries the gazette number.
                metainfo['gznum'] = txt.splitlines()[0]
            elif col in ['subject', 'department', 'notification_num',
                         'gztype']:
                metainfo[col] = txt
        i += 1
def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfo.set_gztype(self.gazette_type)
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'subject':
                metainfo.set_subject(txt)
            elif col == 'gznum':
                # Keep only the first alphanumeric token of the cell
                reobj = re.search(r'\w+', txt)
                if reobj:
                    metainfo['gznum'] = txt[reobj.start():reobj.end()]
            elif col == 'notification_date':
                d = utils.parse_datestr(txt)
                if d:
                    metainfo[col] = d
            elif col in ['department', 'notification_num']:
                metainfo[col] = txt
            elif col == 'download':
                inp = tr.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[col] = name
        i += 1
def process_result_row(self, tr, metainfos, dateobj, order):
    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'ministry':
                metainfo.set_ministry(txt)
            elif col == 'subject':
                metainfo.set_subject(txt)
            elif col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'download':
                inp = td.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[col] = name
                else:
                    link = td.find('a')
                    if link:
                        metainfo[col] = link
            elif col in ['office', 'department', 'partnum', 'refnum']:
                metainfo[col] = txt
        i += 1
def get_metainfo(self, link, td):
    onclick = link.get('onclick')
    if onclick is None:
        return None

    reobj = re.search(r"loadFullImg\(\s*'(?P<gzyear>\w+)'\s*,\s*'(?P<month>\w+)'\s*,\s*'(?P<day>\w+)'\s*,\s*'(?P<accno>\w+)'\s*,\s*(?P<pdf_page>\w+)\s*,\s*(?P<gzpage>\w+)\)", onclick)
    if not reobj:
        return None

    groupdict = reobj.groupdict()
    gzyear = groupdict['gzyear']
    month  = groupdict['month']
    day    = groupdict['day']
    accno  = groupdict['accno']
    page   = int(groupdict['pdf_page'])
    gzpage = int(groupdict['gzpage'])

    # Zero-pad the page number to the digit width of the gazette's total
    # page count (gzpage), e.g. page 7 of a 150-page gazette -> '007'.
    pagenumber = '%d' % page
    if gzpage >= 10 and gzpage < 100:
        if page < 10:
            pagenumber = '0' + pagenumber
    elif gzpage >= 100 and gzpage < 1000:
        if page < 10:
            pagenumber = '00' + pagenumber
        elif page < 100:
            pagenumber = '0' + pagenumber
    elif gzpage >= 1000:
        if page < 10:
            pagenumber = '000' + pagenumber
        elif page < 100:
            pagenumber = '00' + pagenumber
        elif page < 1000:
            pagenumber = '0' + pagenumber

    month_num = utils.get_month_num(month, calendar.month_abbr)
    d = datetime.date(int(gzyear), month_num, int(day))
    gzurl = self.baseurl + self.gzurl_format % (gzyear, accno, accno,
                                                pagenumber)

    metainfo = utils.MetaInfo()
    metainfo.set_url(gzurl)
    metainfo.set_date(d)
    metainfo['relurl'] = '%s_Page_%s' % (accno, pagenumber)

    txt = utils.get_tag_contents(link)
    self.populate_link_metainfo(txt, metainfo)
    return metainfo
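# A hypothetical onclick value of the shape get_metainfo() above parses:
#   loadFullImg('2015', 'Jan', '05', 'GZT0042', 7, 150)
# would give d = 2015-01-05, accno = 'GZT0042', and, since the gazette runs
# to 150 pages, the page number zero-padded to three digits: '007'.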
def datepage_metainfos(self, url, dateobj):
    minfos = []
    response = self.download_url(url)
    if not response or not response.webpage:
        self.logger.warn('Unable to download %s. Skipping', url)
        return minfos

    d = utils.parse_webpage(response.webpage, self.parser)
    if not d:
        self.logger.warn('Unable to parse %s. Skipping.', url)
        return minfos

    partnum = None
    dept = None
    for td in d.find_all('td'):
        bgcolor = td.get('bgcolor')
        links = td.find_all('a')
        if bgcolor == '#91BAE8' and len(links) == 0:
            partnum = utils.get_tag_contents(td)
            partnum = utils.remove_spaces(partnum)
            dept = None
        elif len(links) > 0:
            reobj = re.compile('^(strong|a)$')
            for x in td.find_all(reobj):
                if x.name == 'strong':
                    dept = utils.get_tag_contents(x)
                    dept = utils.remove_spaces(dept)
                elif x.name == 'a' and partnum:
                    href = x.get('href')
                    if not href or not href.startswith('pdf'):
                        continue
                    title = utils.get_tag_contents(x)
                    title = utils.remove_spaces(title)

                    metainfo = utils.MetaInfo()
                    minfos.append(metainfo)
                    metainfo.set_title(title)
                    metainfo.set_date(dateobj)
                    metainfo['partnum'] = partnum
                    if dept:
                        metainfo['department'] = dept
                    gzurl = urllib.basejoin(url, href)
                    metainfo['url'] = gzurl
    return minfos
def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            if order[i] in ['notification_num', 'subject', 'issued_by']:
                txt = utils.get_tag_contents(td)
                if txt:
                    metainfo[order[i]] = txt
        i += 1

    link = tr.find('a')
    if link and link.get('href'):
        href = link.get('href')
        metainfo['url'] = urllib.basejoin(self.searchurl, href)
    return metainfo
def parse_listing_webpage(self, parturl, d, dateobj, partnum, gztype):
    minfos = []
    for li in d.find_all('li'):
        link = li.find('a')
        if link is None:
            continue
        href = link.get('href')
        if href and href.startswith('pdf'):
            url = urllib.basejoin(parturl, href)
            txt = utils.get_tag_contents(li)
            txt = txt.strip()
            if not txt:
                continue

            nums = re.findall(r'\d+', txt)
            if len(nums) < 4:
                self.logger.warn('Not able to parse. Ignoring %s', link)
                continue

            # The last three numbers are day, month and year; the rest
            # form the gazette number.
            gznum = ''.join(nums[:-3])
            try:
                date = datetime.date(int(nums[-1]), int(nums[-2]),
                                     int(nums[-3]))
            except ValueError:
                self.logger.warn('Could not get date. Ignoring %s', txt)
                continue

            if date != dateobj:
                continue

            metainfo = utils.MetaInfo()
            metainfo.set_gztype(gztype)
            metainfo.set_date(dateobj)
            metainfo.set_url(url)
            metainfo['gznum'] = gznum
            if partnum:
                metainfo['partnum'] = partnum
            minfos.append(metainfo)
    return minfos
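# Illustrative list-item text of the shape parse_listing_webpage() above
# expects; the sample string is invented:
#   'No. 23 dated 05-01-2015'
#     -> nums  = ['23', '05', '01', '2015']
#        gznum = '23' (all but the last three numbers, joined)
#        date  = datetime.date(2015, 1, 5)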
def process_row(self, tr, order, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            if order[i] in ['department', 'notification_num', 'subject',
                            'notification_type', 'gznum', 'gztype',
                            'category']:
                txt = utils.get_tag_contents(td)
                metainfo[order[i]] = txt
            elif order[i] == 'download':
                link = td.find('a')
                if link and link.get('onclick'):
                    onclick = link.get('onclick')
                    # The document id is the first number in the onclick
                    # handler
                    reobj = re.search(r'\d+', onclick)
                    if reobj:
                        metainfo['docid'] = onclick[reobj.start():reobj.end()]
        i += 1
    return metainfo
def process_result_row(self, tr, metainfos, dateobj, order):
    download = None
    for link in tr.find_all('a'):
        txt = utils.get_tag_contents(link)
        if txt and re.match(r'\s*select', txt, re.IGNORECASE):
            download = link.get('href')
            break

    if not download:
        return

    metainfo = utils.MetaInfo()
    metainfos.append(metainfo)
    metainfo.set_date(dateobj)
    metainfo['download'] = download
    i = 0
    for td in tr.find_all('td'):
        if len(order) > i:
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()
            else:
                continue

            if col == 'gztype':
                # Split a combined "<type> PART <n>" cell into gztype
                # and partnum
                pos = txt.find('PART')
                if pos > 0:
                    metainfo.set_gztype(txt[:pos])
                    metainfo['partnum'] = txt[pos:]
                else:
                    metainfo.set_gztype(txt)
            elif col in ['subject', 'department', 'issued_by', 'gznum']:
                metainfo[col] = txt
        i += 1
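# Sketch of the combined type/part cell split in process_result_row() above;
# the cell text is hypothetical:
#   'EXTRAORDINARY PART IV'
#     -> gztype  = 'EXTRAORDINARY ' (text before 'PART')
#        partnum = 'PART IV'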
def get_metainfo(self, order, tr, dateobj):
    metainfo = utils.MetaInfo()
    metainfo.set_date(dateobj)
    i = 0
    for td in tr.find_all('td'):
        # Guard against rows with more cells than the column order lists
        if i >= len(order):
            break
        txt = utils.get_tag_contents(td)
        txt = txt.strip() if txt else ''
        if order[i] == 'download':
            link = td.find('a')
            if link:
                metainfo['download'] = link
        elif order[i] == 'gznum':
            metainfo['gznum'] = txt
        elif order[i] == 'gztype':
            metainfo.set_gztype(txt)
        elif order[i] == 'num':
            metainfo['num'] = txt
        elif order[i] == 'series':
            metainfo['series'] = txt
        i += 1
    return metainfo
def download_oneday(self, relpath, dateobj):
    dls = []
    if dateobj >= self.flip_date1:
        if dateobj >= self.flip_date2:
            datestr = '%d-%d-%d' % (dateobj.day, dateobj.month, dateobj.year)
        else:
            datestr = '%s-%s-%d' % (utils.pad_zero(dateobj.day),
                                    utils.pad_zero(dateobj.month),
                                    dateobj.year)
        mainhref = 'Contents-(%s).pdf' % datestr
    else:
        datestr = utils.dateobj_to_str(dateobj, '', reverse=True)
        mainhref = 'Contents(%s-%s-%s).pdf' % (utils.pad_zero(dateobj.day),
                                               utils.pad_zero(dateobj.month),
                                               utils.pad_zero(dateobj.year % 100))

    dateurl = self.baseurl % datestr
    docurl = urllib.basejoin(dateurl, mainhref)

    mainmeta = utils.MetaInfo()
    mainmeta.set_date(dateobj)
    mainmeta.set_url(self.url_fix(docurl))

    response = self.download_url(docurl)
    if not response or not response.webpage or response.error:
        return dls

    mainrelurl = os.path.join(relpath, 'main')
    updated = False
    if self.storage_manager.save_rawdoc(self.name, mainrelurl,
                                        response.srvresponse,
                                        response.webpage):
        self.logger.info(u'Saved rawfile %s' % mainrelurl)
        updated = True

    page_type = self.get_file_extension(response.webpage)
    if page_type != 'pdf':
        self.logger.warn('Got a non-pdf page and we can\'t handle it for date %s',
                         dateobj)
        return dls

    links = []
    linknames = []
    hrefs = utils.extract_links_from_pdf(StringIO(response.webpage))
    for href in hrefs:
        reobj = re.search(r'(?P<num>Part-\w+)', href)
        if reobj:
            partnum = reobj.groupdict()['num']
        else:
            # Fall back to the href itself, minus any '.pdf' suffix
            partnum = '%s' % href
            reobj = re.search(r'\.pdf$', partnum)
            if reobj:
                partnum = partnum[:reobj.start()]

        relurl = os.path.join(relpath, partnum)
        docurl = urllib.basejoin(dateurl, href)
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)
        metainfo['partnum'] = partnum
        links.append(relurl)
        linknames.append(partnum)
        if self.save_gazette(relurl, docurl, metainfo):
            dls.append(relurl)

    mainmeta['links'] = links
    mainmeta['linknames'] = linknames
    if self.storage_manager.save_metainfo(self.name, mainrelurl, mainmeta):
        updated = True
        self.logger.info(u'Saved metainfo %s' % mainrelurl)

    if updated:
        dls.append(mainrelurl)
    return dls
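# Hypothetical hrefs extracted from the contents PDF and the partnum each
# would yield in download_oneday() above:
#   'Gazette-Part-II.pdf' -> 'Part-II'      (matches the Part-\w+ pattern)
#   'misc-notes.pdf'      -> 'misc-notes'   (fallback: href minus '.pdf')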