Example 1
def get_links(content, session):
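    # pq is pyquery.PyQuery; object_storage and SITE_URL come from the
    # surrounding project. Collects absolute, de-duplicated links from the
    # HTML content, mirrors each linked file into object storage, and
    # returns [{'href': <stored URL>, 'title': <link text>}, ...].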
    links = []
    if '<a' in content:
        for link in pq(content)('a'):
            if 'href' not in link.attrib:
                continue
            href = link.attrib['href']
            if href.startswith('/'):
                href = SITE_URL + href
            if not href.startswith('http'):
                continue
            if href in links:
                continue
            filename = href.rpartition('/')[2]
            if filename == '' or filename.endswith(('.html', '.aspx')):
                continue

            s3_object_name = 'government_decisions/' + filename
            if not object_storage.exists(s3_object_name):
                try:
                    conn = session.get(href)
                    if conn.status_code != requests.codes.ok:
                        continue
                    href = object_storage.write(s3_object_name,
                                                data=conn.content,
                                                public_bucket=True,
                                                create_bucket=True)
                except Exception:
                    # Download or upload failed; skip this link.
                    continue
            else:
                href = object_storage.urlfor(s3_object_name)
            links.append(dict(href=href, title=pq(link).text()))
    return links
Example 2
def write_to_object_storage(self, object_name, data):
    logging.error('write_to_object_storage %s', object_name)
    # Write only if the object is missing; either way return its public URL.
    if not object_storage.exists(object_name):
        ret = object_storage.write(object_name, data=data, public_bucket=True, create_bucket=True)
    else:
        ret = object_storage.urlfor(object_name)
    return ret
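Example 3 below calls a variant of this helper (without the self parameter) to store the decoded documents and to cache their URLs.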
Example 3
def unsign_document_link(url):
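    # Resolve a signed document URL on mr.gov.il: fetch the signed XML
    # wrapper, base64-decode the embedded payload, store it in object
    # storage, and return the stored copy's URL (cached via a '.decoded'
    # indicator object).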
    url = url.replace("http://", "https://")
    if not url.startswith("https://www.mr.gov.il/Files_Michrazim/"):
        raise Exception("invalid url: {}".format(url))
    filename = url.replace("https://www.mr.gov.il/Files_Michrazim/",
                           "").replace(".signed", "")
    decoded_indicator = base_object_name + filename + '.decoded'
    if object_storage.exists(decoded_indicator):
        decoded_indicator_url = object_storage.urlfor(decoded_indicator)
        ret = requests.get(decoded_indicator_url)
        if ret.status_code == 200:
            return ret.text
    try:
        content = requests_get_content(url)
        page = pq(content)
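        # Drill through the wrapper's nested children to the element that
        # carries the base64-encoded document data.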
        data_elt = page(page(page.children()[1]).children()[0]).children()[0]
        assert b'The requested operation is not supported, and therefore can not be displayed' not in content
    except Exception as e:
        logging.error(
            'Failed to download from %s (%s), returning original url', url, e)
        return url
    try:
        if data_elt.attrib["DataEncodingType"] != "base64":
            raise Exception("unknown DataEncodingType: {}".format(
                data_elt.attrib["DataEncodingType"]))
    except KeyError:
        return None
    buffer = data_elt.text
    if buffer:
        buffer = base64.decodebytes(buffer.encode("ascii"))
    else:
        buffer = b''  # keep the payload as bytes for the checks below
    mime = data_elt.attrib["MimeType"]
    guessed_mime = None
    orig_filename = None
    try:
        page.remove_namespaces()
        orig_filename = next(page[0].iterdescendants('FileName')).text
        _, ext = os.path.splitext(orig_filename)
    except Exception:
        # No original FileName in the document; fall back to the declared MIME type.
        ext = mimetypes.guess_extension(mime, strict=False)
    if not ext:
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            guessed_mime = m.id_buffer(buffer)
            logging.info('Attempted to detect buffer type: %s', guessed_mime)
            if guessed_mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                ext = '.docx'
            else:
                ext = mimetypes.guess_extension(guessed_mime)
    assert ext, "Unknown file type mime:%s filename:%s guessed_mime:%s ext:%r buffer:%r" % (
        mime, orig_filename, guessed_mime, ext, buffer[:128])
    object_name = base_object_name + filename + (ext if ext else "")
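    # Store the decoded payload, then cache the resulting URL under the
    # '.decoded' indicator so future calls can return it immediately.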
    ret = write_to_object_storage(object_name, buffer)
    write_to_object_storage(decoded_indicator, ret)
    return ret
Example 4
def process_row(row, *_):
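    # Mirror the page at row['url'] into object storage (if not already
    # there), recording the detected charset in the stored Content-Type.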
    s3_object_name = row['s3_object_name']
    url = row['url']
    if not object_storage.exists(s3_object_name):
        conn = session.get(url)
        time.sleep(3)
        if conn.status_code != requests.codes.ok:
            return None

        charset = get_charset(conn.content)
        conn.encoding = charset  # requests uses .encoding when decoding .text
        object_storage.write(
            s3_object_name,
            data=conn.content,
            public_bucket=True,
            create_bucket=True,
            content_type="text/html; charset={}".format(charset))
    return row
Example 5
def process_row(row, *_):
#    if random.uniform(0, 1) >= 0.005:
#        return None
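    # Fetch the stored filing from object storage and extract header-field
    # statistics (element counts, proof values, positions, alias counts).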

    s3_object_name = row['s3_object_name']
    #url = os.path.join("https://ams3.digitaloceanspaces.com", "budgetkey-files", s3_object_name)
    url = object_storage.urlfor(s3_object_name)
    try:
        if object_storage.exists(s3_object_name):
            conn = session.get(url)
            txt = conn.text
            if needs_decoding(txt):
                txt = conn.content.decode('utf-8')
            pg = pq(txt, parser='html')

            row.update({
                'url': url,
                'HeaderEntityNameEB': len(pg.find('#HeaderEntityNameEB')),
                'HeaderProofValue': len(pg.find('#HeaderProofValue')),
                'HeaderProof': len(pg.find('#HeaderProof ~ span:first')),
                'HeaderProofValue_equals_HeaderProof': pg.find('#HeaderProof ~ span:first').text().strip() == pg.find('#HeaderProofValue').text().strip(),

                'HeaderFixtReport': len(pg.find('#HeaderFixtReport')),
                'HeaderProofFormat': len(pg.find("#HeaderProofFormat")),

                'notification_type': pg.find('#HeaderFormNumber').text().strip(),

                'positions': get_positions_array(pg),
                'alias_stats': collect_all_aliases(pg)
            })
        else:
            return None
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
    return row
Example 6
def process_row(row, *_):
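    # Like Example 5, but also counts the many transliteration variants of
    # the Hebrew field aliases (e.g. Tapkid/Tafkid) used across filings.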
    s3_object_name = row['s3_object_name']
    url = object_storage.urlfor(s3_object_name)
    try:
        if object_storage.exists(s3_object_name):
            conn = session.get(url)
            txt = conn.text
            if needs_decoding(txt):
                txt = conn.content.decode('utf-8')
            pg = pq(txt, parser='html')

            row.update({
                'url': url,
                'HeaderEntityNameEB': len(pg.find('#HeaderEntityNameEB')),
                'HeaderProofValue': len(pg.find('#HeaderProofValue')),
                'HeaderProof': len(pg.find('#HeaderProof ~ span:first')),
                'HeaderProofValue_equals_HeaderProof': pg.find('#HeaderProof ~ span:first').text().strip() == pg.find('#HeaderProofValue').text().strip(),
                'notification_type': pg.find('#HeaderFormNumber').text().strip(),

                'HeaderFixtReport': len(pg.find('#HeaderFixtReport')),
                'HeaderProofFormat': len(pg.find("#HeaderProofFormat")),

                'TaarichTchilatHaCehuna': len(pg.find("[fieldalias=TaarichTchilatHaCehuna]")),
                'TaarichTchilatCehuna': len(pg.find("[fieldalias=TaarichTchilatCehuna]")),
                'TaarichTehilatCehuna': len(pg.find("[fieldalias=TaarichTehilatCehuna]")),
                'TaarichTchilatHaKehuna': len(pg.find("[fieldalias=TaarichTchilatHaKehuna]")),
                'TaarichTchilatKehuna': len(pg.find("[fieldalias=TaarichTchilatKehuna]")),
                'TaarichTehilatKehuna': len(pg.find("[fieldalias=TaarichTehilatKehuna]")),

                'Gender': len(pg.find("[fieldalias=Gender]")),
                'gender': pg.find("[fieldalias=Gender]").text().strip(),

                'Shem': len(pg.find("[fieldalias=Shem]")),
                'ShemPratiVeMishpacha': len(pg.find("[fieldalias=ShemPratiVeMishpacha]")),
                'ShemPriatiVeMishpacha': len(pg.find("[fieldalias=ShemPriatiVeMishpacha]")),
                'ShemMishpahaVePrati': len(pg.find("[fieldalias=ShemMishpahaVePrati]")),
                'ShemRoeCheshbon': len(pg.find("[fieldalias=ShemRoeCheshbon]")),
                'ShemRoehHeshbon': len(pg.find("[fieldalias=ShemRoehHeshbon]")),
                'Accountant': len(pg.find("[fieldalias=Accountant]")),
                'Tapkid': len(pg.find("[fieldalias=Tapkid]")),
                'Tafkid': len(pg.find("[fieldalias=Tafkid]")),
                'HaTafkidLoMuna': len(pg.find("[fieldalias=HaTafkidLoMuna]")),
                'TeurTafkid': len(pg.find("[fieldalias=TeurTafkid]")),
                'LeloTeur': len(pg.find("[fieldalias=LeloTeur]")),
                'TeurHaTafkidLoMuna': len(pg.find("[fieldalias=TeurHaTafkidLoMuna]")),

                'full_name': all_aliases_as_string(pg, ['Shem', 'ShemPratiVeMishpacha', 'ShemPriatiVeMishpacha',
                                                        'ShemMishpahaVePrati']),

                'positions': get_positions_array(pg),


                #'is_nomination': False,
                #'positions':"",
                #'gender':"",
                #'name':""
            })
        else:
            return None
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
    return row
Example 7
report['revision'] = REVISION
time.sleep(1)
url_to_use = report_url
if url_to_use in url_to_fixed_file:
    url_to_use = url_to_fixed_file[url_to_use]
    logging.info("Using fixed file: %s", url_to_use)
if url_to_use.startswith('http'):
    # Build a stable object name from the report metadata plus a short
    # hash of the title, keeping the source file's extension.
    title_hash = hashlib.md5(
        report['report-title'].encode('utf8')).hexdigest()[:4]
    obj_name = "{report-year}-{report-period}-{report-publisher}-{report-subunit}-{report-date}".format(
        **report)
    obj_name += '-' + title_hash
    _, ext = os.path.splitext(url_to_use)
    obj_name += ext
    obj_name = os.path.join('spending-reports', obj_name)
    if not object_storage.exists(obj_name):
        tmp = tempfile.NamedTemporaryFile()
        try:
            stream = requests.get(url_to_use,
                                  stream=True,
                                  verify=False).raw
        except Exception:
            logging.exception('Failed to load data from %s',
                              url_to_use)
            raise
        # Read with decompression enabled so the stored copy is the
        # decoded payload, then upload the temporary file.
        stream.read = functools.partial(stream.read,
                                        decode_content=True)
        shutil.copyfileobj(stream, tmp)
        tmp.flush()
        url_to_use = object_storage.write(obj_name,
                                          file_name=tmp.name,
                                          create_bucket=False)