def process_row(row, *_):
    """Enrich *row* with fields parsed from its Maya form document.

    Fetches the object named by ``row['s3_object_name']`` from object
    storage, parses it with :class:`MayaForm`, and merges the extracted
    fields into *row* in place.  On ``ParseError`` the row is kept and
    flagged via ``is_parse_error``; any other failure is re-raised as
    ``RuntimeError``.  Returns the (mutated) row.
    """
    object_name = row['s3_object_name']
    url = object_storage.urlfor(object_name)
    response = session.get(url)
    maya_form = MayaForm(response.text)
    try:
        # Defaults assume a non-nomination form; overridden below if needed.
        row.update({
            'source': 'maya.tase.co.il',
            'is_parse_error': False,
            'organisation_name': maya_form.company,
            'id': maya_form.id,
            'notification_type': maya_form.type,
            'fix_for': maya_form.fix_for,
            'is_nomination': False,
            'start_date': None,
            'positions': "",
            'gender': "",
            'name': "",
        })
        if maya_form.is_nomination:
            row.update({
                'is_nomination': True,
                'start_date': maya_form.position_start_date,
                'positions': maya_form.positions,
                'gender': maya_form.gender,
                'name': maya_form.full_name,
            })
    except ParseError as err:
        cause = str(err.__cause__) if err.__cause__ else str(err)
        logging.info("Failed to parse Maya Form {} with err {}".format(url, cause))
        row.update({'is_parse_error': True})
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly') from err
    return row
Ejemplo n.º 2
0
def get_links(content, session):
    """Extract anchor links from HTML *content*, mirroring documents to S3.

    For each ``<a href>`` pointing at a non-page file, the file is
    downloaded (best effort) and stored under ``government_decisions/``
    unless it already exists; the returned ``href`` is then the object
    storage URL.  Returns a list of ``{'href': ..., 'title': ...}`` dicts.
    """
    links = []
    # BUG FIX: the original checked `href in links`, but `links` holds
    # dicts, so the membership test never matched and duplicates were
    # processed repeatedly.  Track raw hrefs in a set instead.
    seen = set()
    if '<a' in content:
        for link in pq(content)('a'):
            if 'href' not in link.attrib:
                continue
            href = link.attrib['href']
            if href.startswith('/'):
                href = SITE_URL + href
            if not href.startswith('http'):
                continue
            if href in seen:
                continue
            seen.add(href)
            filename = href.rpartition('/')[2]
            # Skip directory URLs and HTML/ASPX pages — only mirror files.
            if filename == '' or filename.endswith(
                    '.html') or filename.endswith('.aspx'):
                continue

            s3_object_name = 'government_decisions/' + filename
            if not object_storage.exists(s3_object_name):
                try:
                    conn = session.get(href)
                    if conn.status_code != requests.codes.ok:
                        continue
                    href = object_storage.write(s3_object_name,
                                                data=conn.content,
                                                public_bucket=True,
                                                create_bucket=True)
                except Exception:
                    # Best-effort mirroring: skip this link on any failure
                    # (previously a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit).
                    continue
            else:
                href = object_storage.urlfor(s3_object_name)
            links.append(dict(href=href, title=pq(link).text()))
    return links
Ejemplo n.º 3
0
def process_row(row, *_):
    """Parse the Maya form referenced by ``row['s3_object_name']``.

    Downloads the stored document, extracts its fields via
    :class:`MayaForm`, and merges them into *row* in place.  A parse
    failure is re-raised as ``ValueError`` tagged with the object URL.
    """
    object_name = row['s3_object_name']
    url = object_storage.urlfor(object_name)
    response = session.get(url)
    form = MayaForm(response.text)
    try:
        # First pass: common fields plus non-nomination defaults.
        row.update({
            'source': 'maya.tase.co.il',

            'organisation_name': form.company,
            'id': form.id,
            'notification_type': form.type,
            'fix_for': form.fix_for,
            'is_nomination': False,
            'start_date': None,
            'positions': "",
            'gender': "",
            'name': "",
        })

        # Second pass: nomination-specific fields override the defaults.
        if form.is_nomination:
            row.update({
                'is_nomination': True,
                'start_date': form.position_start_date,
                'positions': form.positions,
                'gender': form.gender,
                'name': form.full_name,
            })
    except ValueError as err:
        raise ValueError("Failed to parse object {}".format(url)) from err
    return row
Ejemplo n.º 4
0
 def write_to_object_storage(self, object_name, data):
     """Store *data* under *object_name* unless it already exists.

     Returns the object's URL whether it was freshly written or already
     present in the bucket.
     """
     # BUG FIX: this is routine flow tracing, not an error condition —
     # it was previously emitted at ERROR level and polluted error logs.
     logging.debug('write_to_object_storage %s', object_name)
     if not object_storage.exists(object_name):
         ret = object_storage.write(object_name, data=data, public_bucket=True, create_bucket=True)
     else:
         ret = object_storage.urlfor(object_name)
     return ret
Ejemplo n.º 5
0
def unsign_document_link(url):
    """Decode a ``.signed`` document from mr.gov.il and mirror it to object storage.

    Returns the object-storage URL of the decoded file.  Returns the
    original *url* if the download fails, and ``None`` when the signed
    wrapper carries no ``DataEncodingType`` attribute.
    """
    # Normalize scheme, then validate the URL belongs to the expected host/path.
    url = url.replace("http://", "https://")
    if not url.startswith("https://www.mr.gov.il/Files_Michrazim/"):
        raise Exception("invalid url: {}".format(url))
    filename = url.replace("https://www.mr.gov.il/Files_Michrazim/",
                           "").replace(".signed", "")
    # The '.decoded' indicator object caches the decoded file's URL so repeat
    # calls can short-circuit without re-downloading/re-decoding.
    decoded_indicator = base_object_name + filename + '.decoded'
    if object_storage.exists(decoded_indicator):
        decoded_indicator_url = object_storage.urlfor(decoded_indicator)
        ret = requests.get(decoded_indicator_url)
        if ret.status_code == 200:
            return ret.text
    try:
        content = requests_get_content(url)
        page = pq(content)
        # Navigate the signed XML wrapper: second child of root, then two
        # levels down, reaches the element holding the embedded payload.
        data_elt = page(page(page.children()[1]).children()[0]).children()[0]
        # The server sometimes returns an HTML error page with HTTP 200;
        # detect it by its message text.
        assert b'The requested operation is not supported, and therefore can not be displayed' not in content
    except Exception as e:
        logging.error(
            'Failed to download from %s (%s), returning original url', url, e)
        return url
    try:
        # Only base64 payloads are supported; a missing attribute means the
        # wrapper has an unexpected shape, so bail out with None.
        if data_elt.attrib["DataEncodingType"] != "base64":
            raise Exception("unknown DataEncodingType: {}".format(
                data_elt.attrib["DataEncodingType"]))
    except KeyError:
        return None
    buffer = data_elt.text
    if buffer:
        buffer = base64.decodebytes(buffer.encode("ascii"))
    else:
        # NOTE(review): empty payload yields str '' while the non-empty path
        # yields bytes — downstream write_to_object_storage appears to accept
        # both, but confirm.
        buffer = ''
    mime = data_elt.attrib["MimeType"]
    guessed_mime = None
    orig_filename = None
    try:
        # Prefer the extension of the original filename embedded in the XML.
        page.remove_namespaces()
        orig_filename = next(page[0].iterdescendants('FileName')).text
        _, ext = os.path.splitext(orig_filename)
    except:
        # No FileName element — fall back to the declared MIME type.
        ext = mimetypes.guess_extension(mime, strict=False)
    if not ext:
        # Last resort: sniff the decoded bytes with libmagic.  .docx needs a
        # special case because its MIME maps poorly through mimetypes.
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            guessed_mime = m.id_buffer(buffer)
            logging.info('Attempted to detect buffer type: %s', guessed_mime)
            if guessed_mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
                ext = '.docx'
            else:
                ext = mimetypes.guess_extension(guessed_mime)
    assert ext, "Unknown file type mime:%s filename:%s guessed_mime:%s ext:%r buffer:%r" % (
        mime, orig_filename, guessed_mime, ext, buffer[:128])
    object_name = base_object_name + filename + (ext if ext else "")
    # Store the decoded file, then store its URL under the indicator object
    # so the cache check above succeeds next time.
    ret = write_to_object_storage(object_name, buffer)
    write_to_object_storage(decoded_indicator, ret)
    return ret
def process_row(row, *_):
    """Collect header/selector statistics from a stored HTML document.

    Loads the object named by ``row['s3_object_name']``, parses it as
    HTML, and records element counts and extracted values into *row*.
    Returns ``None`` when the object is absent from storage; any failure
    is re-raised as ``RuntimeError`` tagged with the document URL.
    """
    object_name = row['s3_object_name']
    url = object_storage.urlfor(object_name)
    try:
        # Guard clause: nothing to do when the document was never stored.
        if not object_storage.exists(object_name):
            return None
        response = session.get(url)
        text = response.text
        if needs_decoding(text):
            text = response.content.decode('utf-8')
        doc = pq(text, parser='html')

        # Hoist the repeated sibling-span selector used twice below.
        proof_span = doc.find('#HeaderProof ~ span:first')
        row.update({
            'url': url,
            'HeaderEntityNameEB': len(doc.find('#HeaderEntityNameEB')),
            'HeaderProofValue': len(doc.find('#HeaderProofValue')),
            'HeaderProof': len(proof_span),
            'HeaderProofValue_equals_HeaderProof':
                proof_span.text().strip() == doc.find('#HeaderProofValue').text().strip(),

            'HeaderFixtReport': len(doc.find('#HeaderFixtReport')),
            'HeaderProofFormat': len(doc.find("#HeaderProofFormat")),

            'notification_type': doc.find('#HeaderFormNumber').text().strip(),

            'positions': get_positions_array(doc),

            'alias_stats': collect_all_aliases(doc),
        })
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
    return row
Ejemplo n.º 7
0
def process_row(row, *_):
    """Survey alias/selector usage across a stored Maya HTML document.

    Loads the object named by ``row['s3_object_name']``, parses it as HTML,
    and records per-selector element counts (to discover which spelling
    variants of field aliases each form uses) plus a few extracted values.
    Returns ``None`` when the object is absent from storage; any failure is
    re-raised as ``RuntimeError`` tagged with the document URL.
    """
    s3_object_name = row['s3_object_name']
    url = object_storage.urlfor(s3_object_name)
    try:
        if object_storage.exists(s3_object_name):
            conn = session.get(url)
            txt = conn.text
            # Some stored documents carry a mis-detected encoding; force a
            # UTF-8 decode of the raw bytes when the heuristic says so.
            if needs_decoding(txt):
                txt = conn.content.decode('utf-8')
            pg = pq(txt, parser='html')

            # Most entries are len(...) counts — presence indicators for each
            # alias spelling — rather than extracted values.
            row.update({
                'url': url,
                'HeaderEntityNameEB': len(pg.find('#HeaderEntityNameEB')),
                'HeaderProofValue': len(pg.find('#HeaderProofValue')),
                'HeaderProof': len(pg.find('#HeaderProof ~ span:first')),
                'HeaderProofValue_equals_HeaderProof': pg.find('#HeaderProof ~ span:first').text().strip() == pg.find('#HeaderProofValue').text().strip(),
                'notification_type': pg.find('#HeaderFormNumber').text().strip(),

                'HeaderFixtReport': len(pg.find('#HeaderFixtReport')),
                'HeaderProofFormat': len(pg.find("#HeaderProofFormat")),

                # Spelling variants of "position start date" aliases.
                'TaarichTchilatHaCehuna': len(pg.find("[fieldalias=TaarichTchilatHaCehuna]")),
                'TaarichTchilatCehuna': len(pg.find("[fieldalias=TaarichTchilatCehuna]")),
                'TaarichTehilatCehuna': len(pg.find("[fieldalias=TaarichTehilatCehuna]")),
                'TaarichTchilatHaKehuna': len(pg.find("[fieldalias=TaarichTchilatHaKehuna]")),
                'TaarichTchilatKehuna': len(pg.find("[fieldalias=TaarichTchilatKehuna]")),
                'TaarichTehilatKehuna': len(pg.find("[fieldalias=TaarichTehilatKehuna]")),

                'Gender': len(pg.find("[fieldalias=Gender]")),
                'gender': pg.find("[fieldalias=Gender]").text().strip(),

                # Spelling variants of name/role/accountant aliases.
                'Shem': len(pg.find("[fieldalias=Shem]")),
                'ShemPratiVeMishpacha': len(pg.find("[fieldalias=ShemPratiVeMishpacha]")),
                'ShemPriatiVeMishpacha': len(pg.find("[fieldalias=ShemPriatiVeMishpacha]")),
                'ShemMishpahaVePrati': len(pg.find("[fieldalias=ShemMishpahaVePrati]")),
                'ShemRoeCheshbon': len(pg.find("[fieldalias=ShemRoeCheshbon]")),
                'ShemRoehHeshbon': len(pg.find("[fieldalias=ShemRoehHeshbon]")),
                'Accountant': len(pg.find("[fieldalias=Accountant]")),
                'Tapkid':  len(pg.find("[fieldalias=Tapkid]")),
                'Tafkid':  len(pg.find("[fieldalias=Tafkid]")),
                'HaTafkidLoMuna':  len(pg.find("[fieldalias=HaTafkidLoMuna]")),
                'TeurTafkid':  len(pg.find("[fieldalias=TeurTafkid]")),
                'LeloTeur':  len(pg.find("[fieldalias=LeloTeur]")),
                'TeurHaTafkidLoMuna':  len(pg.find("[fieldalias=TeurHaTafkidLoMuna]")),

                # Concatenation of whichever name-alias variants are present.
                'full_name': all_aliases_as_string(pg, ['Shem', 'ShemPratiVeMishpacha', 'ShemPriatiVeMishpacha',
                                                        'ShemMishpahaVePrati']),

                'positions': get_positions_array(pg),


                #'is_nomination': False,
                #'positions':"",
                #'gender':"",
                #'name':""
            })
        else:
            return None
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
    return row
Ejemplo n.º 8
0
                                          stream=True,
                                          verify=False).raw
                except:
                    logging.exception('Failed to load data from %s',
                                      url_to_use)
                stream.read = functools.partial(stream.read,
                                                decode_content=True)
                shutil.copyfileobj(stream, tmp)
                tmp.flush()
                url_to_use = object_storage.write(obj_name,
                                                  file_name=tmp.name,
                                                  create_bucket=False)
                tmp.close()
                del tmp
            else:
                url_to_use = object_storage.urlfor(obj_name)

        report['report-sheets'] = 0
        report['report-headers-row'] = None
        report['report-rows'] = None
        report['report-bad-rows'] = None
        report['load-error'] = None

        with tempfile.NamedTemporaryFile(
                suffix=os.path.splitext(url_to_use)[1]) as tmp:
            if url_to_use.startswith('http'):
                time.sleep(1)
                stream = requests.get(url_to_use, stream=True).raw
                stream.read = functools.partial(stream.read,
                                                decode_content=True)
                shutil.copyfileobj(stream, tmp)