Example #1
0
    def parse_row(self, tr, order, dateobj):
        """Build a MetaInfo record for one table row.

        Each <td> is interpreted according to the column name at the
        same position in `order`; cells beyond `order` or without any
        text content are skipped.
        """
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        for idx, cell in enumerate(tr.find_all('td')):
            text = utils.get_tag_contents(cell)
            if idx >= len(order) or not text:
                continue

            text = text.strip()
            colname = order[idx]
            if colname == 'gztype':
                # The cell may pack type / part number / district into
                # one slash-separated field.
                pieces = text.split('/')
                metainfo['gztype'] = pieces[0].strip()
                if len(pieces) > 1:
                    metainfo['partnum'] = pieces[1].strip()
                if len(pieces) > 2:
                    metainfo['district'] = pieces[2].strip()
            elif colname == 'download':
                inp = cell.find('input')
                if inp and inp.get('onclick'):
                    metainfo['download'] = inp.get('onclick')
            elif colname in ['notification_num', 'gznum', 'department']:
                metainfo[colname] = text
            elif colname == 'subject':
                metainfo.set_subject(text)
        return metainfo
Example #2
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Scan one result row and append one MetaInfo per part column.

        The gazette number cell is remembered and copied into every
        part entry found later in the same row.
        """
        gznum = None
        for idx, cell in enumerate(tr.find_all('td')):
            if idx >= len(order):
                continue
            colname = order[idx]
            text = utils.get_tag_contents(cell)
            if text:
                text = text.strip()
            if colname == 'gznum':
                gznum = text
            elif colname.startswith('partnum'):
                # The column label is encoded as 'partnum|<part>'.
                _, partnum = colname.split('|')
                metainfo = utils.MetaInfo()
                metainfos.append(metainfo)
                metainfo.set_date(dateobj)
                metainfo.set_gztype(self.gazette_type)

                if gznum:
                    metainfo['gznum'] = gznum
                metainfo['partnum'] = partnum
                inp = cell.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo['download'] = name
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Parse one result row into a MetaInfo and append it.

        Rows without a download link are logged and dropped.
        """
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break

            txt = utils.get_tag_contents(td)
            if not txt:
                # BUG FIX: the old code `continue`d before the manual
                # `i += 1`, so an empty cell shifted every later cell
                # onto the wrong column name. enumerate() keeps the
                # cell/column alignment intact.
                continue
            txt = txt.strip()

            col = order[i]
            if col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'download':
                link = td.find('a')
                if link:
                    href = link.get('href')
                    if href:
                        metainfo['download'] = href
            elif col in ['partnum', 'division', 'subject']:
                metainfo[col] = txt

        if 'download' not in metainfo:
            self.logger.warn('No download link, ignoring: %s', tr)
        else:
            metainfos.append(metainfo)
Example #4
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Parse a result row whose cell count matches `order` exactly,
        appending one MetaInfo to `metainfos`.
        """
        tds = tr.find_all('td')
        # Reject rows whose shape does not match the expected columns.
        if len(tds) != len(order):
            return

        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)

        for i, td in enumerate(tds):
            col = order[i]
            txt = utils.get_tag_contents(td)
            if not txt:
                # BUG FIX: the old loop `continue`d without bumping the
                # manual index, so an empty cell mis-mapped every later
                # cell; enumerate() keeps the alignment.
                continue
            txt = txt.strip()

            if col == 'gztype':
                metainfo.set_gztype(txt)

            elif col == 'gznum':
                metainfo['gznum'] = txt
                link = td.find('a')
                if link and link.get('href'):
                    metainfo['download'] = link.get('href')
Example #5
0
    def process_row(self, tr, order, dateobj):
        """Extract a MetaInfo from a row.

        Returns the MetaInfo only when both a download link ('href')
        and a gazette number were found, otherwise None.
        """
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)
        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break
            txt = utils.get_tag_contents(td)
            # Robustness: get_tag_contents may return None, which used
            # to crash on .strip().
            txt = txt.strip() if txt else ''
            col = order[i]
            if col in ['gznum', 'department', 'notification_num', 'subject']:
                metainfo[col] = txt
            elif col == 'gzdate':
                # Dates appear as day/month/year digit groups.
                nums = re.findall(r'\d+', txt)
                if len(nums) == 3:
                    try:
                        d = datetime.date(int(nums[2]), int(nums[1]),
                                          int(nums[0]))
                        metainfo['gzdate'] = d
                    except ValueError:
                        # Narrowed from a bare `except:` so programming
                        # errors are no longer silently swallowed.
                        self.logger.warn('Unable to form date for %s', txt)
            elif col == 'download':
                link = td.find('a')
                if link and link.get('href'):
                    metainfo['href'] = link.get('href')

        if 'href' in metainfo and 'gznum' in metainfo:
            return metainfo
        return None
    def process_row(self, tr, order, dateobj):
        """Parse a row from the extraordinary-gazette list.

        Returns the MetaInfo only when the row's date matches
        `dateobj`, otherwise None.
        """
        metainfo = utils.MetaInfo()
        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break
            txt = utils.get_tag_contents(td)
            if txt and order[i] in ['subject', 'department', 'gznum']:
                txt = re.sub(r'\s+', ' ', txt)
                metainfo[order[i]] = txt.strip()
            elif txt and order[i] == 'date':
                nums = re.split('[./-]+', txt)
                if len(nums) < 3:
                    self.logger.warn(
                        'Couldn\'t get date from %s for extraordinary gazette list',
                        txt)
                    continue

                nums = [re.sub(r'\s+', '', n) for n in nums]
                nums = [n for n in nums if n]
                try:
                    # BUG FIX: the date used to be built *outside* the
                    # try block, so a malformed value raised instead of
                    # being logged; IndexError is also possible after
                    # the empty-string filter above.
                    d = datetime.date(int(nums[2]), int(nums[1]),
                                      int(nums[0]))
                    metainfo.set_date(d)
                except (ValueError, IndexError):
                    self.logger.warn('Could not parse date %s', txt)

        if metainfo.get_date() == dateobj:
            metainfo.set_gztype('Extraordinary')
            return metainfo

        return None
Example #7
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Append a MetaInfo for a row that carries a 'Download' link.

        Rows without such a link are ignored entirely.
        """
        download = None
        for link in tr.find_all('a'):
            txt = utils.get_tag_contents(link)
            if txt and re.match(r'\s*Download', txt, re.IGNORECASE):
                download = link.get('href')
                break

        if not download:
            return

        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)
        metainfo['download'] = download

        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break

            txt = utils.get_tag_contents(td)
            if not txt:
                # BUG FIX: `continue` previously skipped the manual
                # index increment, mis-mapping every cell after an
                # empty one; enumerate() keeps the alignment.
                continue
            txt = txt.strip()

            col = order[i]
            if col == 'gznum':
                # Keep only the first line of a multi-line cell.
                metainfo['gznum'] = txt.splitlines()[0]

            elif col in ['subject', 'department', 'notification_num',
                         'gztype']:
                metainfo[col] = txt
Example #8
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Append one MetaInfo built from the row's cells."""
        metainfo = utils.MetaInfo()
        metainfo.set_gztype(self.gazette_type)
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)
        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break
            col = order[i]
            txt = utils.get_tag_contents(td)
            if txt:
                txt = txt.strip()

            if col == 'download':
                # Does not depend on the cell text; note the input
                # element is looked up in the whole row, not the cell.
                inp = tr.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[col] = name
            elif not txt:
                # BUG FIX: text-bearing columns used to be processed
                # even for empty cells, crashing re.search(..., None)
                # for 'gznum' and storing None subjects.
                continue
            elif col == 'subject':
                metainfo.set_subject(txt)
            elif col == 'gznum':
                reobj = re.search(r'\w+', txt)
                if reobj:
                    metainfo['gznum'] = reobj.group(0)
            elif col == 'notification_date':
                d = utils.parse_datestr(txt)
                if d:
                    metainfo[col] = d
            elif col in ['department', 'notification_num']:
                metainfo[col] = txt
Example #9
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Build a MetaInfo from the row's cells and append it.

        Note: for the 'download' column the <a> tag itself (not its
        href) is stored when there is no <input> element in the cell.
        """
        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)

        for pos, cell in enumerate(tr.find_all('td')):
            if pos >= len(order):
                continue
            colname = order[pos]
            text = utils.get_tag_contents(cell)
            text = text.strip() if text else text

            if colname == 'ministry':
                metainfo.set_ministry(text)
            elif colname == 'subject':
                metainfo.set_subject(text)
            elif colname == 'gztype':
                metainfo.set_gztype(text)
            elif colname == 'download':
                inp = cell.find('input')
                if inp:
                    name = inp.get('name')
                    if name:
                        metainfo[colname] = name
                else:
                    anchor = cell.find('a')
                    if anchor:
                        metainfo[colname] = anchor
            elif colname in ['office', 'department', 'partnum', 'refnum']:
                metainfo[colname] = text
Example #10
0
    def get_metainfo(self, link, td):
        """Build a MetaInfo from a loadFullImg(...) onclick handler.

        Returns None when the link has no onclick attribute or the
        handler does not match the expected call signature.
        """
        onclick = link.get('onclick')
        if onclick is None:
            return None

        # Raw string: '\(' and '\s' are invalid escape sequences in a
        # normal string literal (DeprecationWarning today, SyntaxError
        # in future Python versions).
        reobj = re.search(
            r"loadFullImg\(\s*'(?P<gzyear>\w+)'\s*,\s*'(?P<month>\w+)'\s*,\s*'(?P<day>\w+)'\s*,\s*'(?P<accno>\w+)'\s*,\s*(?P<pdf_page>\w+)\s*,\s*(?P<gzpage>\w+)\)",
            onclick)
        if not reobj:
            return None

        groupdict = reobj.groupdict()
        gzyear = groupdict['gzyear']
        month = groupdict['month']
        day = groupdict['day']
        accno = groupdict['accno']
        page = int(groupdict['pdf_page'])
        gzpage = int(groupdict['gzpage'])

        # Zero-pad the page number to the digit-width of the total page
        # count, capped at 4 digits — exactly what the original if/elif
        # ladder computed for every gzpage range.
        width = min(len('%d' % gzpage), 4)
        pagenumber = ('%d' % page).zfill(width)

        month_num = utils.get_month_num(month, calendar.month_abbr)
        d = datetime.date(int(gzyear), month_num, int(day))

        gzurl = self.baseurl + self.gzurl_format % (gzyear, accno, accno,
                                                    pagenumber)

        metainfo = utils.MetaInfo()
        metainfo.set_url(gzurl)
        metainfo.set_date(d)
        metainfo['relurl'] = '%s_Page_%s' % (accno, pagenumber)

        txt = utils.get_tag_contents(link)
        self.populate_link_metainfo(txt, metainfo)

        return metainfo
Example #11
0
    def datepage_metainfos(self, url, dateobj):
        """Download the per-date listing page at `url` and return a list
        of MetaInfo entries, one per gazette pdf link found.

        The page lays out part headers (cells with bgcolor '#91BAE8'
        and no links), department names (<strong>) and document links
        (<a href="pdf...">); this walks the cells in document order,
        carrying the most recently seen part number and department
        forward onto each link.
        """
        minfos = []
        response = self.download_url(url)

        if not response or not response.webpage:
            self.logger.warn('Unable to download %s. Skipping', url)
            return minfos

        d = utils.parse_webpage(response.webpage, self.parser)
        if not d:
            self.logger.warn('Unable to parse %s. Skipping.', url)
            return minfos

        # Running state: the part number / department that subsequent
        # document links belong to.
        partnum = None
        dept    = None
        for td in d.find_all('td'):
            bgcolor = td.get('bgcolor')
            links   = td.find_all('a')
            if bgcolor == '#91BAE8' and len(links) == 0:
                # A highlighted, link-free cell starts a new part; the
                # department resets until the next <strong> is seen.
                partnum =  utils.get_tag_contents(td)
                partnum  = utils.remove_spaces(partnum)
                dept    = None
            elif len(links) > 0:
                reobj  = re.compile('^(strong|a)$')
                for x in td.find_all(reobj):
                    if x.name == 'strong':
                        dept = utils.get_tag_contents(x)
                        dept = utils.remove_spaces(dept)
                    elif x.name == 'a'  and partnum:
                        href  = x.get('href')
                        # Only relative 'pdf...' links are documents.
                        if not href.startswith('pdf'):
                            continue

                        title = utils.get_tag_contents(x)
                        title = utils.remove_spaces(title)

                        metainfo = utils.MetaInfo()
                        minfos.append(metainfo)

                        metainfo.set_title(title)
                        metainfo.set_date(dateobj)
                        metainfo['partnum'] = partnum
                        if dept:
                            metainfo['department']    = dept
                        gzurl = urllib.basejoin(url, href)
                        metainfo['url'] = gzurl

        return minfos
    def process_row(self, tr, order, dateobj):
        """Translate one table row into a MetaInfo record.

        Text columns come from the cells named in `order`; the download
        URL comes from the first anchor anywhere in the row.
        """
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        wanted = ['notification_num', 'subject', 'issued_by']
        for pos, cell in enumerate(tr.find_all('td')):
            if pos < len(order) and order[pos] in wanted:
                text = utils.get_tag_contents(cell)
                if text:
                    metainfo[order[pos]] = text

        anchor = tr.find('a')
        if anchor and anchor.get('href'):
            metainfo['url'] = urllib.basejoin(self.searchurl,
                                              anchor.get('href'))

        return metainfo
    def parse_listing_webpage(self, parturl, d, dateobj, partnum, gztype):
        """Collect MetaInfo entries for pdf links in a listing page.

        Each usable <li> text carries a gazette number followed by a
        day/month/year digit triple; entries whose date is not
        `dateobj` are skipped.
        """
        minfos = []
        for li in d.find_all('li'):
            link = li.find('a')
            if link is None:
                continue

            href = link.get('href')
            if href and href.startswith('pdf'):
                url = urllib.basejoin(parturl, href)
                txt = utils.get_tag_contents(li)
                txt = txt.strip()
                if not txt:
                    continue

                nums = re.findall(r'\d+', txt)
                if len(nums) < 4:
                    self.logger.warn('Not able to parse. Ignoring %s', link)
                    continue

                # The last three numbers are day, month, year;
                # everything before them forms the gazette number.
                gznum = ''.join(nums[:-3])
                try:
                    date = datetime.date(int(nums[-1]), int(nums[-2]),
                                         int(nums[-3]))
                except ValueError:
                    # Narrowed from a bare `except:` which also hid
                    # programming errors.
                    self.logger.warn('Could not get date. Ignoring %s', txt)
                    continue

                if date != dateobj:
                    continue

                metainfo = utils.MetaInfo()
                metainfo.set_gztype(gztype)
                metainfo.set_date(dateobj)
                metainfo.set_url(url)
                metainfo['gznum'] = gznum
                if partnum:
                    metainfo['partnum'] = partnum

                minfos.append(metainfo)

        return minfos
Example #14
0
 def process_row(self, tr, order, dateobj):
     """Return a MetaInfo for the row.

     Text columns are copied verbatim; the numeric document id is
     extracted from the download link's onclick handler.
     """
     metainfo = utils.MetaInfo()
     metainfo.set_date(dateobj)
     for i, td in enumerate(tr.find_all('td')):
         if i >= len(order):
             break
         if order[i] in ['department', 'notification_num', 'subject',
                         'notification_type', 'gznum', 'gztype', 'category']:
             txt = utils.get_tag_contents(td)
             metainfo[order[i]] = txt
         elif order[i] == 'download':
             link = td.find('a')
             if link and link.get('onclick'):
                 onclick = link.get('onclick')
                 # Raw string for the regex; group(0) replaces the
                 # manual start()/end() slicing.
                 reobj = re.search(r'\d+', onclick)
                 if reobj:
                     metainfo['docid'] = reobj.group(0)
     return metainfo
Example #15
0
    def process_result_row(self, tr, metainfos, dateobj, order):
        """Append a MetaInfo for rows that carry a 'select' link.

        The gztype cell may embed a part number after the word 'PART';
        when present the cell is split into gztype and partnum.
        """
        download = None
        for link in tr.find_all('a'):
            txt = utils.get_tag_contents(link)
            if txt and re.match(r'\s*select', txt, re.IGNORECASE):
                download = link.get('href')
                break

        if not download:
            return

        metainfo = utils.MetaInfo()
        metainfos.append(metainfo)
        metainfo.set_date(dateobj)
        metainfo['download'] = download

        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                break

            txt = utils.get_tag_contents(td)
            if not txt:
                # BUG FIX: the old loop `continue`d before incrementing
                # the manual column index, so an empty cell shifted all
                # later cells onto the wrong column.
                continue
            txt = txt.strip()

            col = order[i]
            if col == 'gztype':
                pos = txt.find('PART')
                if pos > 0:
                    metainfo.set_gztype(txt[:pos])
                    metainfo['partnum'] = txt[pos:]
                else:
                    metainfo.set_gztype(txt)

            elif col in ['subject', 'department', 'issued_by', 'gznum']:
                metainfo[col] = txt
Example #16
0
    def get_metainfo(self, order, tr, dateobj):
        """Build a MetaInfo from the row according to the column order.

        Note: for the 'download' column the <a> tag itself is stored.
        """
        metainfo = utils.MetaInfo()
        metainfo.set_date(dateobj)

        for i, td in enumerate(tr.find_all('td')):
            if i >= len(order):
                # BUG FIX: rows with more cells than `order` used to
                # raise IndexError on order[i].
                break
            txt = utils.get_tag_contents(td)
            # BUG FIX: get_tag_contents can return None, which used to
            # crash on the unconditional .strip().
            txt = txt.strip() if txt else ''
            col = order[i]
            if col == 'download':
                link = td.find('a')
                if link:
                    metainfo['download'] = link
            elif col == 'gznum':
                metainfo['gznum'] = txt
            elif col == 'gztype':
                metainfo.set_gztype(txt)
            elif col == 'num':
                metainfo['num'] = txt
            elif col == 'series':
                metainfo['series'] = txt

        return metainfo
Example #17
0
    def download_oneday(self, relpath, dateobj):
        """Download the day's contents PDF, extract the per-part links
        it references, save each part, and return the saved relurls.
        """
        dls = []
        # The site changed its date format twice; build the one that
        # was in effect on dateobj.
        if dateobj >= self.flip_date1:
            if dateobj >= self.flip_date2:
                datestr = '%d-%d-%d' % (dateobj.day, dateobj.month,
                                        dateobj.year)
            else:
                datestr = '%s-%s-%d' % (utils.pad_zero(
                    dateobj.day), utils.pad_zero(dateobj.month), dateobj.year)
            mainhref = 'Contents-(%s).pdf' % datestr
        else:
            datestr = utils.dateobj_to_str(dateobj, '', reverse=True)
            mainhref = 'Contents(%s-%s-%s).pdf' % (utils.pad_zero(
                dateobj.day), utils.pad_zero(
                    dateobj.month), utils.pad_zero(dateobj.year % 100))

        dateurl = self.baseurl % datestr
        docurl = urllib.basejoin(dateurl, mainhref)

        mainmeta = utils.MetaInfo()
        mainmeta.set_date(dateobj)
        mainmeta.set_url(self.url_fix(docurl))

        response = self.download_url(docurl)
        if not response or not response.webpage or response.error:
            return dls

        mainrelurl = os.path.join(relpath, 'main')
        updated = False
        if self.storage_manager.save_rawdoc(self.name, mainrelurl,
                                            response.srvresponse,
                                            response.webpage):
            self.logger.info(u'Saved rawfile %s' % mainrelurl)
            updated = True

        page_type = self.get_file_extension(response.webpage)
        if page_type != 'pdf':
            # Typo fixed in the log message ('datte' -> 'date').
            self.logger.warn(
                'Got a non-pdf page and we can\'t handle it for date %s',
                dateobj)
            return dls

        links = []
        linknames = []
        hrefs = utils.extract_links_from_pdf(StringIO(response.webpage))
        for href in hrefs:
            reobj = re.search(r'(?P<num>Part-\w+)', href)
            if reobj:
                partnum = reobj.groupdict()['num']
            else:
                partnum = '%s' % href
                reobj = re.search(r'\.pdf$', partnum)
                # BUG FIX: this used to test `if partnum:` (always
                # true), crashing with AttributeError on reobj.start()
                # whenever the name had no '.pdf' suffix; the '.' is
                # also escaped now so e.g. 'xpdf' no longer matches.
                if reobj:
                    partnum = partnum[:reobj.start()]

            relurl = os.path.join(relpath, partnum)
            docurl = urllib.basejoin(dateurl, href)

            metainfo = utils.MetaInfo()
            metainfo.set_date(dateobj)
            metainfo['partnum'] = partnum

            links.append(relurl)
            linknames.append(partnum)

            if self.save_gazette(relurl, docurl, metainfo):
                dls.append(relurl)

        mainmeta['links'] = links
        mainmeta['linknames'] = linknames
        if self.storage_manager.save_metainfo(self.name, mainrelurl, mainmeta):
            updated = True
            self.logger.info(u'Saved metainfo %s' % mainrelurl)

        if updated:
            dls.append(mainrelurl)

        return dls