def test_attending_members_invalid_data(self):
     """Exercise find_attending_members with a nonexistent file and with no text."""
     missing_file_protocol = CommitteeMeetingProtocol.get_from_filename('/foo/bar/baz')
     with missing_file_protocol as protocol:
         # the path does not exist, so the lookup is expected to raise IOError
         with self.assertRaises(IOError):
             protocol.find_attending_members([])
     textless_protocol = CommitteeMeetingProtocol.get_from_text(None)
     with textless_protocol as protocol:
         # without any text the lookup is expected to return an empty list
         found = protocol.find_attending_members([])
         self.assertEqual([], found)
Example #2
0
 def test_attending_members_invalid_data(self):
     """Check attendee lookup on two degenerate inputs: a missing file and a None text."""
     # case 1: the source file does not exist -> IOError is expected
     from_missing_file = CommitteeMeetingProtocol.get_from_filename('/foo/bar/baz')
     with from_missing_file as protocol:
         with self.assertRaises(IOError):
             protocol.find_attending_members([])
     # case 2: there is no text -> an empty result is expected
     from_no_text = CommitteeMeetingProtocol.get_from_text(None)
     with from_no_text as protocol:
         attending = protocol.find_attending_members([])
         self.assertEqual([], attending)
 def test_docx_protocol_attendees(self):
     """Parse the 20_ptv_502208.doc fixture and verify the matched MK names and the attendees mapping."""
     fixture_path = os.path.join(os.path.dirname(__file__), '20_ptv_502208.doc')
     # names handed to the matcher
     candidate_names = [
         u"איתן כבל",
         u"יצחק וקנין",
         u"עבד אל חכים חאג' יחיא",
         u"איתן ברושי",
         u"שרן השכל",
     ]
     # names the matcher is expected to report back
     expected_members = [
         'איתן כבל',
         'יצחק וקנין',
         "עבד אל חכים חאג' יחיא",
         'איתן ברושי',
         'שרן השכל',
     ]
     expected_invitees = [
         {'name': 'צביקה כהן', 'role': 'סמנכ"ל בכיר למימון והשקעות, משרד החקלאות ופיתוח הכפר'},
         {'name': 'אורי צוק-בר', 'role': 'סמנכ"ל מחקר כלכלה ואסטרטגיה, משרד החקלאות ופיתוח הכפר'},
         {'name': 'אסף לוי', 'role': 'סמנכ"ל גורמי יצור, משרד החקלאות ופיתוח הכפר'},
         {'name': 'דפנה טיש'},
         {'name': 'עמרי איתן בן צבי'},
         {'name': 'עדי טל נוסבוים'},
         {'name': 'ליאורה עופרי'},
         {'name': 'עו"ד, משרד החקלאות ופיתוח הכפר'},
         {'name': 'עו"ד, מח\' יעוץ וחקיקה, משרד המשפטים'},
         {'name': 'יועמ"ש, המשרד לביטחון פנים'},
         {'name': 'עו"ד, המשרד להגנת הסביבה'},
         {'name': 'צבי אלון', 'role': 'מנכ"ל, מועצת הצמחים'},
         {'name': 'אמיר שניידר'},
         {'name': 'ירון סולומון'},
         {'name': 'יועמ"ש, התאחדות האיכרים והחקלאים בישראל'},
         {'name': 'מנהל המחלקה להתיישבות, האיחוד החקלאי'},
         {'name': 'אריאל ארליך', 'role': 'ראש מחלקת ליטיגציה, פורום קהלת'},
         {'name': 'מיכל זליקוביץ', 'role': 'נציגה, פורום קהלת'},
         {'name': 'יעל שביט', 'role': 'שדלן/ית'},
     ]
     expected_attendees = {
         'mks': ['איתן ברושי', 'שרן השכל', 'איתן כבל – היו"ר',
                 'יצחק וקנין', "עבד אל חכים חאג' יחיא"],
         'invitees': expected_invitees,
         'legal_advisors': ['איתי עצמון'],
         'manager': ['לאה ורון'],
     }
     with CommitteeMeetingProtocol.get_from_filename(fixture_path) as protocol:
         self.assertEqual(expected_members,
                          protocol.find_attending_members(candidate_names))
         self.assertEqual(expected_attendees, protocol.attendees)
 def test_missing_member_issue132(self):
     """Regression test for issue 132, using the 20_ptv_367393.doc fixture.

     The MK name list comes from get_all_mk_names() when the NO_MOCKS environment
     variable is "1", otherwise from the bundled mock response.
     """
     # TODO: switch to env_conditional_mock function when PR #9 is merged
     use_real_names = os.environ.get("NO_MOCKS", "") == "1"
     _, mk_names = (get_all_mk_names() if use_real_names
                    else MOCK_OPEN_KNESSET_GET_ALL_MK_NAMES_RESPONSE)
     fixture_path = os.path.join(os.path.dirname(__file__), '20_ptv_367393.doc')
     expected_members = [
         u"אוסאמה סעדי",
         u"אורי מקלב",
         u"זאב בנימין בגין",
         u"יוליה מלינובסקי",
         # this MK has extra space which caused him not to be found
         # now we search the stripped name
         # but the return value still has the extra space (as provided)
         u"מיכאל מלכיאלי ",
         u"רויטל סויד",
         u"בנימין בגין",
     ]
     with CommitteeMeetingProtocol.get_from_filename(fixture_path) as protocol:
         found_members = protocol.find_attending_members(mk_names)
         self.assertEqual(found_members, expected_members)
Example #5
0
def parse_protocol(row):
    """Make sure the parsed output for one protocol row exists and describe it.

    The file names are derived from ``row`` by ``get_filenames``. If the output
    file already exists it is only measured; otherwise the downloaded protocol
    is parsed into a temporary file (plain text when ``parse_type`` is "text",
    a header/body CSV otherwise) which is then copied to the output location.

    Returns a tuple ``(ext, output_filename, filesize, crc32c)``; when neither
    the output nor the downloaded file exists it is ``(None, None, 0, None)``.
    """
    (original_filename, ext, output_filename,
     full_output_filename, download_filename) = get_filenames(row)
    output_exists = os.path.exists(full_output_filename)
    if not output_exists and not os.path.exists(download_filename):
        logging.warning('missing document committee session file: {}'.format(
            download_filename))
        return None, None, 0, None
    if output_exists:
        # produced by an earlier run - just collect size and checksum
        logging.info('file exists: {}'.format(full_output_filename))
        stats["existing files"] += 1
        size = os.path.getsize(full_output_filename)
        checksum = get_crc32c(full_output_filename)
        logging.info('existing file: {}'.format(full_output_filename))
        return ext, output_filename, size, checksum
    with open(download_filename, "rb") as source, \
            CommitteeMeetingProtocol.get_from_file(source) as protocol:
        os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
        with utils.temp_file() as temp_filename:
            with open(temp_filename, "w") as target:
                if parse_type != "text":
                    writer = csv.writer(target)
                    writer.writerow(["header", "body"])
                    writer.writerows([part.header, part.body]
                                     for part in protocol.parts)
                else:
                    target.write(protocol.text)
            # copy only after the temporary file has been closed
            shutil.copy(temp_filename, full_output_filename)
    size = os.path.getsize(full_output_filename)
    checksum = get_crc32c(full_output_filename)
    logging.info('parsed file: {}'.format(full_output_filename))
    stats["parsed files"] += 1
    return ext, output_filename, size, checksum
Example #6
0
def redownload_protocol(committee_meeting):
    """Refresh the stored protocol of ``committee_meeting``.

    Plenum meetings are delegated to ``download_for_existing_meeting``; for any
    other committee type the protocol is fetched from ``src_url`` and its text
    and update timestamp are saved on the meeting.
    """
    if committee_meeting.committee.type == 'plenum':
        download_for_existing_meeting(committee_meeting)
        return
    fetched = CommitteeMeetingProtocol.get_from_url(committee_meeting.src_url)
    with fetched as protocol:
        committee_meeting.protocol_text = protocol.text
        committee_meeting.protocol_text_update_date = datetime.now()
        committee_meeting.save()
Example #7
0
 def _filter_row(self, meeting_protocol, **kwargs):
     """Parse one meeting protocol into text / parts files and yield a row describing them.

     Output paths are ``<committee_id>/<meeting_id>.csv`` (parts) and
     ``<committee_id>/<meeting_id>.txt`` (text), resolved through
     ``self._get_filename``. Parsing only happens when the parts file does not
     exist yet.

     Yields a single dict with the keys committee_id, meeting_id, protocol_file,
     text_file and parts_file. For an ``.rtf`` protocol that has no parts file
     yet, text_file and parts_file are None.

     Raises Exception when the parts file is missing and the protocol extension
     is neither ``.doc`` nor ``.rtf``.
     """
     parts_relpath = os.path.join(
         str(meeting_protocol["committee_id"]),
         "{}.csv".format(meeting_protocol["meeting_id"]))
     text_relpath = os.path.join(
         str(meeting_protocol["committee_id"]),
         "{}.txt".format(meeting_protocol["meeting_id"]))
     parts_filename = self._get_filename(parts_relpath)
     text_filename = self._get_filename(text_relpath)
     protocol_filename = meeting_protocol["protocol_file"]
     # the extension is taken as the last 4 characters of the stripped file name
     protocol_ext = protocol_filename.strip()[-4:]
     if protocol_ext == ".doc":
         # for now, only doc files are being parsed and should be added to all_filenames
         if parts_relpath not in self._all_filenames:
             self._all_filenames += [parts_relpath, text_relpath]
             os.makedirs(os.path.dirname(parts_filename), exist_ok=True)
     # an existing parts file means this protocol needs no further parsing
     if not os.path.exists(parts_filename):
         if protocol_ext == ".doc":
             with CommitteeMeetingProtocol.get_from_filename(
                     protocol_filename) as protocol:
                 with open(text_filename, "w") as f:
                     f.write(protocol.text)
                     logging.info(
                         "parsed doc to text -> {}".format(text_filename))
                 with open(parts_filename, "w") as f:
                     csv_writer = csv.writer(f)
                     csv_writer.writerow(["header", "body"])
                     for part in protocol.parts:
                         csv_writer.writerow([part.header, part.body])
                     logging.info(
                         "parsed parts file -> {}".format(parts_filename))
         elif protocol_ext == ".rtf":
             # rtf parsing proved difficult, skipping for now
             # (the yielded row reports no text / parts file for it)
             text_filename = None
             parts_filename = None
             # rtf_to_txt_filename = self._rtf_to_txt(protocol_filename)
             # shutil.copyfile(rtf_to_txt_filename, text_filename)
             # os.unlink(rtf_to_txt_filename)
             # logging.info("parsed rtf to text -> {}".format(text_filename))
             # with open(text_filename) as f:
             #     with CommitteeMeetingProtocol.get_from_text(f.read()) as protocol:
             #         with open(parts_filename, "w") as f:
             #             csv_writer = csv.writer(f)
             #             csv_writer.writerow(["header", "body"])
             #             for part in protocol.parts:
             #                 csv_writer.writerow([part.header, part.body])
             #             logging.info("parsed parts file -> {}".format(parts_filename))
         else:
             raise Exception("unknown extension: {}".format(protocol_ext))
     yield {
         "committee_id": meeting_protocol["committee_id"],
         "meeting_id": meeting_protocol["meeting_id"],
         "protocol_file": protocol_filename,
         "text_file": text_filename,
         "parts_file": parts_filename
     }
 def test_protocol_attendenace_strange_title(self):
     """Verify that all three supplied MK names are found in the 20_ptv_321195.doc fixture."""
     fixture_path = os.path.join(os.path.dirname(__file__), '20_ptv_321195.doc')
     expected_members = [u"קארין אלהרר", u"דוד אמסלם", u"אוסאמה סעדי"]
     with CommitteeMeetingProtocol.get_from_filename(fixture_path) as protocol:
         # hand the matcher its own copy of the names
         found_members = protocol.find_attending_members(list(expected_members))
         self.assertEqual(expected_members, found_members)
Example #9
0
def process_row(row, row_index, spec, resource_index, parameters, stats):
    """Add attendee columns to rows of the ``kns_committeesession`` resource.

    Rows of other resources are returned untouched. For committee session rows
    the columns ``mks``, ``invitees``, ``legal_advisors`` and ``manager`` are
    first reset to None; they are then filled from the parsed protocol text when
    the row passes the optional ``filter-meeting-id`` / ``filter-committee-id`` /
    ``filter-knesset-num`` parameters and has a ``text_parsed_filename``.

    When the KNESSET_PIPELINES_DATA_PATH environment variable is set, the text
    is read from that directory and the result is cached in a JSON file keyed by
    a hash of the row's text / parts checksums. Otherwise the text is downloaded
    from remote storage.

    :param row: the row dict; it is updated in place.
    :param spec: resource spec; only ``spec['name']`` is read.
    :param parameters: mapping holding the optional filter lists.
    :param row_index, resource_index, stats: accepted but not used here.
    :return: the same ``row`` object.
    """
    if spec['name'] != 'kns_committeesession':
        return row
    row.update(mks=None, invitees=None, legal_advisors=None, manager=None)
    selected = (
        (not parameters.get("filter-meeting-id") or int(row["CommitteeSessionID"]) in parameters["filter-meeting-id"])
        and (not parameters.get("filter-committee-id") or int(row["CommitteeID"]) in parameters["filter-committee-id"])
        and (not parameters.get("filter-knesset-num") or int(row["KnessetNum"]) in parameters["filter-knesset-num"])
    )
    if not selected or not row["text_parsed_filename"]:
        return row
    data_path = os.environ.get('KNESSET_PIPELINES_DATA_PATH')
    new_cache_hash, old_cache_hash, cache_hash_path, cache_hash_row = None, None, None, None
    if data_path:
        # the cache key is derived from the checksums of the parsed text and parts files
        m = BASE_HASH_OBJ.copy()
        m.update(str(row['text_crc32c']).encode())
        m.update(str(row['parts_crc32c']).encode())
        new_cache_hash = m.hexdigest()
        cache_hash_path = os.path.join(data_path,
                                       'people/committees/meeting-attendees/cache_hash/{}.json'.format(row["text_parsed_filename"]))
        if os.path.exists(cache_hash_path):
            with open(cache_hash_path) as f:
                cache_data = json.load(f)
            old_cache_hash = cache_data['hash']
            cache_hash_row = cache_data['row']
    if cache_hash_path and old_cache_hash and old_cache_hash == new_cache_hash:
        # inputs unchanged since the cached run - reuse the cached attendees
        row.update(**cache_hash_row)
        return row
    logging.info('getting attendees for meeting {}'.format(row['CommitteeSessionID']))
    text = None
    if data_path:
        protocol_text_path = os.path.join(data_path,
                                          'committees/meeting_protocols_text/{}'.format(row["text_parsed_filename"]))
        if os.path.exists(protocol_text_path) and os.path.getsize(protocol_text_path) > 0:
            with open(protocol_text_path) as f:
                text = f.read()
    else:
        protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/data/committees/" \
                            "meeting_protocols_text/{}".format(row["text_parsed_filename"])
        res = requests.get(protocol_text_url)
        if res.status_code == 200:
            # decode the response whose status was just checked instead of
            # issuing a second, unchecked request for the same URL
            text = res.content.decode("utf-8")
    update_row = dict(mks=None, invitees=None, legal_advisors=None, manager=None)
    if text:
        with CommitteeMeetingProtocol.get_from_text(text) as protocol:
            attendees = protocol.attendees
            if attendees:
                update_row = dict(mks=attendees['mks'],
                                  invitees=attendees['invitees'],
                                  legal_advisors=attendees['legal_advisors'],
                                  manager=attendees['manager'])
                row.update(**update_row)
    if cache_hash_path:
        # remember the outcome (also when nothing was found) for the next run
        os.makedirs(os.path.dirname(cache_hash_path), exist_ok=True)
        with open(cache_hash_path, 'w') as f:
            json.dump({'hash': new_cache_hash,
                       'row': update_row}, f)
    return row
Example #10
0
 def redownload_protocol(self):
     """Fetch this meeting's protocol again and store the refreshed text.

     Non-plenum meetings are read from ``self.src_url`` and saved together with
     the update timestamp; plenum meetings are handed to the plenum download
     command.
     """
     if self.committee.type != 'plenum':
         fetched = KnessetDataCommitteeMeetingProtocol.get_from_url(self.src_url)
         with fetched as protocol:
             self.protocol_text = protocol.text
             self.protocol_text_update_date = datetime.now()
             self.save()
         return
     # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
     from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
         download_for_existing_meeting
     download_for_existing_meeting(self)
Example #11
0
    def create_protocol_parts(self,
                              delete_existing=False,
                              mks=None,
                              mk_names=None):
        """ Create protocol parts from this instance's protocol_text
            Optionally, delete existing parts.
            If the meeting already has parts, and you don't ask to
            delete them, a ValidationError will be thrown, because
            it doesn't make sense to create the parts again.

            :param delete_existing: when true, existing parts and the
                annotations attached to them are deleted first.
            :param mks: passed through to create_plenum_protocol_parts
                (used for plenum meetings only).
            :param mk_names: passed through to create_plenum_protocol_parts
                (used for plenum meetings only).
            :raises ValidationError: if parts already exist and
                delete_existing is false.
        """
        logger.debug('create_protocol_parts %s' % delete_existing)
        if delete_existing:
            ppct = ContentType.objects.get_for_model(ProtocolPart)
            # all() has to be called: handing the bound method itself to the
            # lookup relied on Django's long-removed support for callable
            # filter arguments.
            annotations = Annotation.objects.filter(
                content_type=ppct, object_id__in=self.parts.all())
            logger.debug(
                'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d'
                % (annotations.count(), self.id))
            annotations.delete()
            self.parts.all().delete()
        elif self.parts.count():
            raise ValidationError(
                'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.'
            )
        if not self.protocol_text:  # sometimes there are empty protocols
            return  # then we don't need to do anything here.
        if self.committee.type == 'plenum':
            create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
            return
        with KnessetDataCommitteeMeetingProtocol.get_from_text(
                self.protocol_text) as protocol:
            # TODO: use bulk_create (I had a strange error when using it)
            # parts are numbered from 1 in the order they appear in the protocol
            for order, part in enumerate(protocol.parts, start=1):
                logger.debug('creating protocol part %s' % order)
                ProtocolPart(meeting=self,
                             order=order,
                             header=part.header,
                             body=part.body).save()
        self.protocol_parts_update_date = datetime.now()
        self.save()
Example #12
0
 def _parse_doc_protocol(self, committee_id, meeting_id, protocol_filename, parts_filename, text_filename):
     """Write the text and the parts of a .doc protocol to the given output files.

     Returns True on success. An AntiwordException or subprocess error is
     logged (with committee / meeting ids) and reported as False.
     """
     try:
         parsed = CommitteeMeetingProtocol.get_from_filename(protocol_filename)
         with parsed as protocol:
             with open(text_filename, "w") as text_file:
                 text_file.write(protocol.text)
                 logging.info("parsed doc to text -> {}".format(text_filename))
             self._parse_protocol_parts(parts_filename, protocol)
     except (AntiwordException, subprocess.SubprocessError):
         failure = "committee {} meeting {}: failed to parse doc file, skipping".format(committee_id,
                                                                                        meeting_id)
         logging.exception(failure)
         return False
     else:
         return True
Example #13
0
def get_kns_committeesession_resource():
    """Yield every committee session row, adding protocol attendees where possible.

    A row is enriched only when it passes the optional meeting / committee /
    knesset-number filters in ``parameters`` and has a ``text_object_name``;
    its protocol text is then downloaded and the parsed attendees are merged
    into the row. All rows are yielded, enriched or not.
    """
    # (parameter name, row column) pairs, checked in this order
    filter_columns = (
        ("filter-meeting-id", "CommitteeSessionID"),
        ("filter-committee-id", "CommitteeID"),
        ("filter-knesset-num", "KnessetNum"),
    )
    for session in kns_committeesession_resource:
        selected = all(
            not parameters.get(option) or int(session[column]) in parameters[option]
            for option, column in filter_columns
        )
        if selected and session["text_object_name"]:
            protocol_text_url = "https://minio.oknesset.org/committees/" + session["text_object_name"]
            response = requests.get(protocol_text_url)
            text = response.content.decode("utf-8")
            with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                session.update(protocol.attendees)
        yield session
def get_resource():
    """Yield the download rows, parsing each row's protocol on the way.

    For every row the source and output file names are derived from its ids.
    While the ``files_limit`` (if any) has not been reached, the protocol is
    loaded either from the local download directory or from remote storage and
    passed to ``parse_protocol``. Successful rows are yielded with
    ``protocol_extension`` and ``parsed_filename`` added. A row that raises is
    logged, tagged with an ``error`` message, appended to ``errors`` and not
    yielded.
    """
    for row_num, row in enumerate(download_rows):
        logging.info("{} / {}".format(row_num, len(download_rows)))
        try:
            group_dir = str(row["GroupTypeID"])
            document_id = str(row["DocumentCommitteeSessionID"])
            original_filename = os.path.join(
                "files", group_dir, document_id[0], document_id[1],
                document_id + "." + row["ApplicationDesc"])
            ext = os.path.splitext(original_filename)[1].lower()
            session_id = str(row["CommitteeSessionID"])
            output_suffix = "csv" if parse_type == "parts" else "txt"
            output_filename = "files/{}/{}/{}.{}".format(
                session_id[0], session_id[1], session_id, output_suffix)
            may_parse = not files_limit or stats["parsed files"] < files_limit
            if may_parse:
                if download_from_path:
                    download_filename = "../data/committees/download_document_committee_session/" + original_filename
                    if not os.path.exists(download_filename):
                        logging.warning("missing download_filename {}".format(download_filename))
                    else:
                        with open(download_filename, "rb") as source, \
                                CommitteeMeetingProtocol.get_from_file(source) as protocol:
                            parse_protocol(output_filename, protocol)
                elif download_from_remote_storage:
                    url = download_from_remote_storage + original_filename
                    with CommitteeMeetingProtocol.get_from_url(url) as protocol:
                        parse_protocol(output_filename, protocol)
                else:
                    raise Exception("no valid download option")
            row.update(protocol_extension=ext,
                       parsed_filename=output_filename)
            yield row
        except Exception:
            # there is a bug in knesset-data-python which prevents getting the error message from the exception
            # TODO: fix this bug
            error_message = "failed to parse CommitteeSessionID {}".format(row["CommitteeSessionID"])  # , str(e))
            logging.exception(error_message)
            row.update(error=error_message)
            errors.append(row)
    def extract_speakers_from_txt_file(self, file_object_path, committee_id, meeting_id):
        """Yield one ``{committee_id, meeting_id, name}`` dict per speaker of a protocol.

        The protocol text is read from the "committees" bucket at
        ``file_object_path``. Nothing is yielded when the parsed protocol
        reports ``speakers`` as None.
        """
        raw_text = object_storage.read(self.s3, "committees", file_object_path)
        with CommitteeMeetingProtocol.get_from_text(raw_text.decode()) as protocol:
            speakers = protocol.speakers
        if speakers is None:
            return
        for speaker_name in speakers:
            yield dict(committee_id=committee_id,
                       meeting_id=meeting_id,
                       name=speaker_name)
 def _parse_doc_protocol(self, committee_id, meeting_id, bucket, protocol_object_name, parts_object_name, text_object_name):
     """Parse a .doc protocol held in object storage into a text object and a parts object.

     The protocol is downloaded to a temporary file, its text is written to
     ``text_object_name`` and its parts are handled by ``_parse_protocol_parts``.
     Returns True on success; the known parsing failures listed below are
     logged and reported as False.
     """
     logging.info("parsing doc protocol {} --> {}, {}".format(protocol_object_name, parts_object_name, text_object_name))
     downloaded = object_storage.temp_download(self.s3, bucket, protocol_object_name)
     with downloaded as protocol_filename:
         try:
             parsed = CommitteeMeetingProtocol.get_from_filename(protocol_filename)
             with parsed as protocol:
                 object_storage.write(self.s3, bucket, text_object_name, protocol.text, public_bucket=True)
                 self._parse_protocol_parts(bucket, parts_object_name, protocol)
         except (
                 AntiwordException,  # see https://github.com/hasadna/knesset-data-pipelines/issues/15
                 subprocess.SubprocessError,
                 xml.etree.ElementTree.ParseError  # see https://github.com/hasadna/knesset-data-pipelines/issues/32
         ):
             failure = "committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id)
             logging.exception(failure)
             return False
     return True
Example #17
0
 def find_attending_members(self, mks=None, mk_names=None):
     """Add to ``self.mks_attended`` every MK the protocol text reports as attending.

     ``mks`` and ``mk_names`` are parallel sequences; when both are None they
     are loaded with ``get_all_mk_names()``. An MK is added only if
     ``party_at(self.date)`` is truthy for them.
     """
     logger.debug('find_attending_members')
     if mks is None and mk_names is None:
         logger.debug('get_all_mk_names')
         mks, mk_names = get_all_mk_names()
     with KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text) as protocol:
         for attended_name in protocol.find_attending_members(mk_names):
             # map the matched name back to its MK object by position
             member = mks[mk_names.index(attended_name)]
             if member.party_at(self.date):  # skip anyone who was not a member at the time of this meeting
                 self.mks_attended.add(member)
     attending_count = self.mks_attended.count()
     logger.debug('meeting %d now has %d attending members' % (self.id, attending_count))
Example #18
0
    def create_protocol_parts(self, delete_existing=False, mks=None,
                              mk_names=None):
        """ Create protocol parts from this instance's protocol_text
            Optionally, delete existing parts.
            If the meeting already has parts, and you don't ask to
            delete them, a ValidationError will be thrown, because
            it doesn't make sense to create the parts again.

            :param delete_existing: when true, existing parts and the
                annotations attached to them are deleted first.
            :param mks: passed through to create_plenum_protocol_parts
                (used for plenum meetings only).
            :param mk_names: passed through to create_plenum_protocol_parts
                (used for plenum meetings only).
            :raises ValidationError: if parts already exist and
                delete_existing is false.
        """
        logger.debug('create_protocol_parts %s' % delete_existing)
        if delete_existing:
            ppct = ContentType.objects.get_for_model(ProtocolPart)
            # all() has to be called: handing the bound method itself to the
            # lookup relied on Django's long-removed support for callable
            # filter arguments.
            annotations = Annotation.objects.filter(content_type=ppct,
                                                    object_id__in=self.parts.all())
            logger.debug(
                'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d' % (
                    annotations.count(), self.id))
            annotations.delete()
            self.parts.all().delete()
        elif self.parts.count():
            raise ValidationError(
                'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.')
        if not self.protocol_text:  # sometimes there are empty protocols
            return  # then we don't need to do anything here.
        if self.committee.type == 'plenum':
            create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
            return
        with KnessetDataCommitteeMeetingProtocol.get_from_text(
                self.protocol_text) as protocol:
            # TODO: use bulk_create (I had a strange error when using it)
            # parts are numbered from 1 in the order they appear in the protocol
            for order, part in enumerate(protocol.parts, start=1):
                logger.debug('creating protocol part %s' % order)
                ProtocolPart(meeting=self, order=order, header=part.header,
                             body=part.body).save()
        self.protocol_parts_update_date = datetime.now()
        self.save()
Example #19
0
    def extract_speakers_from_txt_file(self, file_object_path, committee_id,
                                       meeting_id):
        """Generate a record for each speaker found in a stored protocol text.

        The text is read from the "committees" bucket and parsed; every speaker
        becomes a dict with ``committee_id``, ``meeting_id`` and ``name``. If
        the protocol's ``speakers`` is None the generator yields nothing.
        """
        stored_bytes = object_storage.read(self.s3, "committees",
                                           file_object_path)
        text = stored_bytes.decode()
        parsed = CommitteeMeetingProtocol.get_from_text(text)
        with parsed as protocol:
            speakers = protocol.speakers
        if speakers is None:
            return
        for speaker_name in speakers:
            record = {
                "committee_id": committee_id,
                "meeting_id": meeting_id,
                "name": speaker_name,
            }
            yield record
Example #20
0
 def redownload_protocol(self):
     """Download this meeting's protocol again and save the new text.

     Plenum meetings go through the plenum download command. For other
     meetings the protocol is read from ``self.src_url``; an AntiwordException
     is logged together with its ``output`` and then raised again.
     """
     is_plenum = self.committee.type == 'plenum'
     if is_plenum:
         # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
         from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
             download_for_existing_meeting
         download_for_existing_meeting(self)
         return
     try:
         fetched = KnessetDataCommitteeMeetingProtocol.get_from_url(self.src_url)
         with fetched as protocol:
             self.protocol_text = protocol.text
             self.protocol_text_update_date = datetime.now()
             self.save()
     except AntiwordException as e:
         log_extra = {'output': e.output}
         logger.error(e.message, exc_info=True, extra=log_extra)
         raise e
Example #21
0
 def _parse_rtf_protocol(self, committee_id, meeting_id, protocol_filename, parts_filename, text_filename):
     """Convert an .rtf protocol to text with an external extractor, then parse its parts.

     The extractor command is taken from the RTF_EXTRACTOR_BIN environment
     variable and is invoked as ``<extractor> <protocol_filename> <text_filename>``.

     :return: True when extraction and parts parsing succeeded; False when the
         variable is not set or the extractor failed (the failure is logged).
     """
     # function-scope import so this method does not depend on the file's import block
     import shlex

     rtf_extractor = os.environ.get("RTF_EXTRACTOR_BIN")
     if not rtf_extractor:
         logging.warning("missing RTF_EXTRACTOR_BIN environment variable, skipping rtf parsing")
         return False
     # Pass an argument list instead of a shell string: file names containing
     # spaces or shell metacharacters are then handed to the extractor verbatim.
     # shlex.split keeps supporting a configured command that carries its own arguments.
     command = shlex.split(rtf_extractor) + [protocol_filename, text_filename]
     try:
         try:
             subprocess.check_output(command, stderr=subprocess.STDOUT)
         except OSError as error:
             # Without a shell a missing / non-executable extractor surfaces as
             # OSError; the shell used to report it as a failed command, so keep
             # treating it as an extraction failure.
             raise subprocess.SubprocessError(str(error)) from error
         with open(text_filename) as f:
             protocol_text = f.read()
         with CommitteeMeetingProtocol.get_from_text(protocol_text) as protocol:
             self._parse_protocol_parts(parts_filename, protocol)
     except subprocess.SubprocessError:
         logging.exception("committee {} meeting {}: failed to parse rtf file, skipping".format(committee_id,
                                                                                                meeting_id))
         return False
     return True
Example #22
0
 def redownload_protocol(self):
     """Download this meeting's protocol again and save the new text.

     Plenum meetings go through the plenum download command. For other
     meetings the protocol is read from ``self.src_url`` and the text plus the
     update timestamp are saved.

     :raises AntiwordException: re-raised after being logged together with
         its ``output``.
     """
     if self.committee.type == 'plenum':
         # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
         from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
             download_for_existing_meeting
         download_for_existing_meeting(self)
         return
     try:
         with KnessetDataCommitteeMeetingProtocol.get_from_url(self.src_url) as protocol:
             self.protocol_text = protocol.text
             self.protocol_text_update_date = datetime.now()
             self.save()
     # "except X as e" is valid on Python 2.6+ and Python 3; the former
     # "except X, e" form is a SyntaxError on Python 3.
     except AntiwordException as e:
         # NOTE(review): relies on AntiwordException exposing .message and .output - confirm
         logger.error(
             e.message,
             exc_info=True,
             extra={
                 'output': e.output
             }
         )
         # bare raise keeps the original traceback intact
         raise
Example #23
0
def get_kns_committeesession_resource():
    """Yield the committee session rows that pass the configured filters.

    A row is kept when, for each of the ``filter-meeting-id``,
    ``filter-committee-id`` and ``filter-knesset-num`` parameters, the parameter
    is unset/empty or contains the integer value of the matching row column.
    Kept rows that reference a non-empty protocol text file are updated in place
    with the attendees parsed from that text before being yielded.
    """
    # (parameter name, row column) pairs, checked in this order
    filters = (
        ("filter-meeting-id", "CommitteeSessionID"),
        ("filter-committee-id", "CommitteeID"),
        ("filter-knesset-num", "KnessetNum"),
    )

    def is_selected(session_row):
        return all(
            not parameters.get(option) or int(session_row[column]) in parameters[option]
            for option, column in filters
        )

    for session_row in kns_committeesession_resource:
        if not is_selected(session_row):
            continue
        # example values:
        #   text_file_name: data/committees/meeting_protocols_text/files/5/7/570611.txt
        #   text_file_size: 72817
        if session_row["text_file_name"]:
            text_file_size = session_row["text_file_size"]
            if text_file_size and text_file_size > 0:
                protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/{}".format(
                    session_row["text_file_name"])
                response = requests.get(protocol_text_url)
                text = response.content.decode("utf-8")
                with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                    session_row.update(protocol.attendees)
        yield session_row
Example #24
0
 def find_attending_members(self, mks=None, mk_names=None):
     """Add to ``self.mks_attended`` the MKs whose names are found in the protocol text.

     :param mks: MK objects, index-aligned with ``mk_names``
     :param mk_names: names to search for in ``self.protocol_text``

     When both arguments are None they are loaded with ``get_all_mk_names()``.
     Any exception raised while parsing or matching is logged at debug level
     and swallowed, so the method never raises from that stage.
     """
     logger.debug('find_attending_members')
     if mks is None and mk_names is None:
         logger.debug('get_all_mk_names')
         mks, mk_names = get_all_mk_names()
     try:
         with KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text) as protocol:
             for attended_name in protocol.find_attending_members(mk_names):
                 member = mks[mk_names.index(attended_name)]
                 # only MKs for whom party_at() is truthy on the meeting date are recorded
                 if member.party_at(self.date):
                     self.mks_attended.add(member)
     except Exception:
         # format_exc() renders the exception currently being handled
         logger.debug("%s%s",
                      traceback.format_exc(),
                      '\nCommitteeMeeting.id=' + str(self.id))
     attending_count = self.mks_attended.count()
     logger.debug('meeting %d now has %d attending members' % (self.id, attending_count))
Example #25
0
 def find_attending_members(self, mks=None, mk_names=None):
     """Record in ``self.mks_attended`` every MK named as attending in the protocol text.

     ``mks`` and ``mk_names`` are parallel sequences (object / name). If neither
     is supplied both come from ``get_all_mk_names()``. Failures during protocol
     parsing or name matching are written to the debug log and otherwise ignored.
     """
     logger.debug('find_attending_members')
     if mks is None and mk_names is None:
         logger.debug('get_all_mk_names')
         mks, mk_names = get_all_mk_names()
     try:
         protocol_context = KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text)
         with protocol_context as protocol:
             found_names = protocol.find_attending_members(mk_names)
             for found_name in found_names:
                 position = mk_names.index(found_name)
                 candidate = mks[position]
                 if candidate.party_at(self.date):
                     self.mks_attended.add(candidate)
                 # otherwise: party_at() was falsy for the meeting date, so the MK is skipped
     except Exception:
         trace_text = traceback.format_exc()
         logger.debug("%s%s", trace_text, '\nCommitteeMeeting.id=' + str(self.id))
     summary = 'meeting %d now has %d attending members' % (
         self.id,
         self.mks_attended.count())
     logger.debug(summary)
    def extract_attendees_from_txt_file(self, file_object_path, committee_id, meeting_id):
        """Yield one flat record per attendee found in a stored protocol text file.

        The text is read from the "committees" bucket via ``object_storage`` and
        parsed into ``protocol.attendees``, a mapping of group name to a list of
        attendees. Entries of the "invitees" group are dicts with a "name" and an
        optional "role"; entries of every other group are used as the name itself.

        Each yielded dict has the keys ``committee_id``, ``meeting_id``, ``name``,
        ``role`` (the group name) and ``additional_information`` (the invitee's
        role, or an empty string).
        """
        raw_text = object_storage.read(self.s3, "committees", file_object_path)
        with CommitteeMeetingProtocol.get_from_text(raw_text.decode()) as protocol:
            attendees = protocol.attendees

        if attendees is None:
            return

        for group, members in attendees.items():
            for member in members:
                if group == "invitees":
                    name = member["name"]
                    role = "invitees"
                    extra = member["role"] if "role" in member else ""
                else:
                    name = member
                    role = group
                    extra = ""
                yield {"committee_id": committee_id,
                       "meeting_id": meeting_id,
                       "name": name,
                       "role": role,
                       "additional_information": extra}
Example #27
0
def process_row(row, row_index, resource_descriptor, resource_index,
                parameters, stats):
    """Parse the downloaded protocol document of a row and record the result on the row.

    Only rows of the ``kns_documentcommitteesession`` resource are touched. For
    those, five ``<type>_*`` fields (``parameters['type']`` is the prefix) are
    reset, and DOC/DOCX protocol rows (``GroupTypeID == 23``) are then parsed into
    either the protocol text (type ``"text"``) or a ``header,body`` CSV of the
    protocol parts (any other type).

    A hash derived from the row's ``download_crc32c`` is stored next to the
    output; when it matches the stored one the existing output is reused.

    :param row: the row dict, updated in place
    :param row_index: unused
    :param resource_descriptor: dict with the resource ``name``
    :param resource_index: unused
    :param parameters: reads ``type`` and the optional ``files-limit``;
        also forwarded to ``get_filenames``
    :param stats: counters keyed by ``"<type>: ..."`` strings, incremented in place
    :return: the same ``row`` object
    """
    if resource_descriptor['name'] != 'kns_documentcommitteesession':
        return row

    t = parameters['type']
    row[t + "_protocol_extension"] = None
    row[t + "_parsed_filename"] = None
    row[t + "_filesize"] = 0
    row[t + "_crc32c"] = None
    row[t + "_error"] = None

    is_doc_protocol = (row['GroupTypeID'] == 23 and row['ApplicationDesc'] == 'DOC'
                       and row["FilePath"].lower().endswith(('.doc', '.docx')))
    if not is_doc_protocol:
        return row

    document_id = "{}-{}-{}".format(row["GroupTypeID"],
                                    row["DocumentCommitteeSessionID"],
                                    row["ApplicationDesc"])
    (original_filename, ext, output_filename, full_output_filename,
     download_filename, full_output_hash_filename) = get_filenames(row, parameters)

    def fill_output_fields():
        # describe the parsed output file on the row
        row[t + "_protocol_extension"] = ext
        row[t + "_parsed_filename"] = output_filename
        row[t + "_filesize"] = os.path.getsize(full_output_filename)
        row[t + "_crc32c"] = get_crc32c(full_output_filename)

    if not (os.path.exists(download_filename) and row.get('download_crc32c')):
        row[t + "_error"] = 'missing download file'
        stats[t + ': missing download files'] += 1
        return row

    m = BASE_HASH_OBJ.copy()
    m.update(row['download_crc32c'].encode())
    new_cache_hash = m.hexdigest()
    old_cache_hash = None
    if os.path.exists(full_output_filename) and os.path.exists(full_output_hash_filename):
        with open(full_output_hash_filename) as f:
            old_cache_hash = f.read()

    if old_cache_hash and new_cache_hash == old_cache_hash:
        # output was already produced from the same download - reuse it
        stats[t + ": existing files"] += 1
        fill_output_fields()
        return row

    if parameters.get('files-limit') and parameters['files-limit'] <= stats[t + ": parsed files"]:
        row[t + "_error"] = 'reached files-limit, skipping'
        stats[t + ": skipped files"] += 1
        return row

    # csv.writer emits its own line terminators, so the csv output must be opened
    # with newline=''; the plain text output keeps the default newline handling.
    # NOTE(review): no explicit encoding is passed, so the platform default is used - confirm.
    newline = None if t == "text" else ''
    try:
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
                # write to a temp file first so a failed parse never leaves a partial output
                with utils.temp_file() as temp_filename:
                    with open(temp_filename, "w", newline=newline) as of:
                        if t == "text":
                            of.write(protocol.text)
                        else:
                            csv_writer = csv.writer(of)
                            csv_writer.writerow(["header", "body"])
                            for part in protocol.parts:
                                csv_writer.writerow([part.header, part.body])
                    shutil.copy(temp_filename, full_output_filename)
    except Exception as e:
        logging.exception('exception parsing protocol for {}'.format(document_id))
        try:
            # an exception with an empty message must still count as an error,
            # otherwise the failed parse would be recorded as a success
            error_string = str(e) or repr(e)
        except Exception:
            error_string = 'unexpected exception'
        row[t + "_error"] = error_string
        stats[t + ': errored files'] += 1
    else:
        fill_output_fields()
        stats[t + ": parsed files"] += 1
        with open(full_output_hash_filename, 'w') as f:
            f.write(new_cache_hash)
    return row
Example #28
0
class CommitteeMeeting(BaseKnessetDataServiceFunctionObject):
    """Committee meeting as returned by the CommitteeAgendaSearch data-service function.

    ``ORDERED_FIELDS`` pairs each object field name with the data-service field
    it is read from; ``get`` queries the meetings of one committee by date range.
    """

    # NOTE(review): every field name from "location " onwards ends with a space
    # (and "topid_id " looks like a typo of "topic_id"). They are runtime strings
    # and are kept exactly as they were - confirm whether this is intentional.
    ORDERED_FIELDS = [
        ("id", KnessetDataServiceSimpleField(
            'Committee_Agenda_id', 'integer', "the primary key of committee meetings")),
        ("committee_id", KnessetDataServiceSimpleField(
            'Committee_Agenda_committee_id', 'integer', "id of the committee (linked to Committee object)")),
        ("datetime", KnessetDataServiceSimpleField(
            'committee_agenda_date', 'datetime', "date/time when the meeting started")),
        ("title", KnessetDataServiceSimpleField(
            'title', 'string', "title of the meeting")),
        ("session_content", KnessetDataServiceSimpleField(
            'committee_agenda_session_content', 'string',
            "seems like in some committee meetings, the title field is empty, in that case title can be taken from this field")),
        ("url", KnessetDataServiceSimpleField(
            'url', 'string', "url to download the protocol")),
        # "protocol" is a CommitteeMeetingProtocol obtained from the meeting url
        # (None when the meeting has no url). Parsing a protocol requires heavy IO
        # and processing, so it is provided as a generator;
        # see tests/test_meetings.py for a usage example.
        ("protocol", KnessetDataServiceLambdaField(
            lambda obj, entry: (CommitteeMeetingProtocol.get_from_url(obj.url, proxies=obj._proxies)
                                if obj.url else None))),
        ("location ", KnessetDataServiceSimpleField(
            'committee_location', 'string',
            "this seems like a shorter name of the place where meeting took place")),
        ("place ", KnessetDataServiceSimpleField(
            'Committee_Agenda_place', 'string',
            "this looks like a longer field with the specific details of where the meeting took place")),
        ("meeting_stop ", KnessetDataServiceSimpleField(
            'meeting_stop', 'string',
            "date/time when the meeting ended - this is not always available, in some meetings it's empty")),
        # --- the remaining fields seem less interesting ---
        ("agenda_canceled ", KnessetDataServiceSimpleField('Committee_Agenda_canceled')),
        ("agenda_sub ", KnessetDataServiceSimpleField('Committee_agenda_sub')),
        ("agenda_associated ", KnessetDataServiceSimpleField('Committee_agenda_associated')),
        ("agenda_associated_id ", KnessetDataServiceSimpleField('Committee_agenda_associated_id')),
        ("agenda_special ", KnessetDataServiceSimpleField('Committee_agenda_special')),
        ("agenda_invited1 ", KnessetDataServiceSimpleField('Committee_agenda_invited1')),
        ("agenda_invite ", KnessetDataServiceSimpleField('sd2committee_agenda_invite')),
        ("note ", KnessetDataServiceSimpleField('Committee_agenda_note')),
        ("start_datetime ", KnessetDataServiceSimpleField('StartDateTime')),
        ("topid_id ", KnessetDataServiceSimpleField('Topic_ID')),
        ("creation_date ", KnessetDataServiceSimpleField('Date_Creation')),
        ("streaming_url ", KnessetDataServiceSimpleField('streaming_url')),
        ("meeting_start ", KnessetDataServiceSimpleField('meeting_start')),
        ("is_paused ", KnessetDataServiceSimpleField('meeting_is_paused')),
        ("date_order ", KnessetDataServiceSimpleField('committee_date_order')),
        ("date ", KnessetDataServiceSimpleField('committee_date')),
        ("day ", KnessetDataServiceSimpleField('committee_day')),
        ("month ", KnessetDataServiceSimpleField('committee_month')),
        ("material_id ", KnessetDataServiceSimpleField('material_id')),
        ("material_committee_id ", KnessetDataServiceSimpleField('material_comittee_id')),
        ("material_expiration_date ", KnessetDataServiceSimpleField('material_expiration_date')),
        ("material_hour ", KnessetDataServiceSimpleField('committee_material_hour')),
        ("old_url ", KnessetDataServiceSimpleField('OldUrl')),
        ("background_page_link ", KnessetDataServiceSimpleField('CommitteeBackgroundPageLink')),
        ("agenda_invited ", KnessetDataServiceSimpleField('Committee_agenda_invited')),
    ]

    @classmethod
    def _get_url_base(cls):
        """Return the URL of the CommitteeAgendaSearch service function."""
        url_base = "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/CommitteeScheduleData.svc/CommitteeAgendaSearch"
        return url_base

    @classmethod
    def get(cls, committee_id, from_date, to_date=None, proxies=None):
        """Fetch the meetings of a committee within a date range.

        :param committee_id: id of the committee to query
        :param from_date: first day to include (needs ``strftime``)
        :param to_date: optional last day; omitted from the query when falsy
        :param proxies: passed through to the parent ``get``

        Example usage::

            from datetime import datetime
            # all meetings of committee 1 from Jan 01, 2016
            CommitteeMeeting.get(1, datetime(2016, 1, 1))
            # all meetings of committee 2 from Feb 01, 2015 to Feb 20, 2015
            CommitteeMeeting.get(2, datetime(2015, 2, 1), datetime(2015, 2, 20))
        """
        def quoted_day_start(date):
            # the service receives quoted timestamps at midnight of the given day
            return "'%sT00:00:00'" % date.strftime('%Y-%m-%d')

        params = {
            "CommitteeId": "'%s'" % committee_id,
            "FromDate": quoted_day_start(from_date),
        }
        if to_date:
            params["ToDate"] = quoted_day_start(to_date)
        return super(CommitteeMeeting, cls).get(params, proxies=proxies)
 def setUp(self):
     """Create the protocol generator for the 20_ptv_317899.doc file next to this module."""
     tests_dir = os.path.dirname(__file__)
     doc_path = os.path.join(tests_dir, '20_ptv_317899.doc')
     self.protocol_generator = CommitteeMeetingProtocol.get_from_filename(doc_path)
Example #30
0
 def setUp(self):
     """Build ``self.protocol_generator`` from the 20_ptv_317899.doc file located beside this test file."""
     here = os.path.dirname(__file__)
     self.protocol_generator = CommitteeMeetingProtocol.get_from_filename(
         os.path.join(here, '20_ptv_317899.doc'))
Example #31
0
 def test_protocol_attendenace_strange_title(self):
     """All three searched MK names are reported as attending in 20_ptv_321195.doc."""
     searched_names = [u"קארין אלהרר", u"דוד אמסלם", u"אוסאמה סעדי"]
     doc_path = os.path.join(os.path.dirname(__file__), '20_ptv_321195.doc')
     with CommitteeMeetingProtocol.get_from_filename(doc_path) as protocol:
         # pass a copy so the expected list stays independent of the call
         attending = protocol.find_attending_members(list(searched_names))
         self.assertEqual(searched_names, attending)
    def test_docx_protocol_parts(self):
        source_doc_file_name = os.path.join(os.path.dirname(__file__),
                                            '20_ptv_502208.doc')
        protocol_generator = CommitteeMeetingProtocol.get_from_filename(
            source_doc_file_name)
        with protocol_generator as protocol:
            self.assertProtocolPartEquals(
                protocol.parts[0], '', u"""פרוטוקול של ישיבת ועדה

הכנסת העשרים

הכנסת



12
ועדת הכלכלה
27/06/2018


מושב רביעי



פרוטוקול מס' 800
מישיבת ועדת הכלכלה
יום רביעי, י"ד בתמוז התשע"ח (27 ביוני 2018), שעה 9:00""")
            self.assertProtocolPartEquals(
                protocol.parts[1], u"""סדר היום""",
                u"""הצעת חוק מועצת הצמחים (ייצור ושיווק) (תיקון מס' 10), התשע"ד-2014"""
            )
            self.assertProtocolPartEquals(protocol.parts[2], u"""נכחו""",
                                          u"""""")
            self.assertProtocolPartEquals(
                protocol.parts[3], u"""חברי הוועדה:""", u"""איתן כבל – היו"ר
יצחק וקנין
עבד אל חכים חאג' יחיא""")
            self.assertProtocolPartEquals(protocol.parts[4], u"""חברי הכנסת""",
                                          u"""איתן ברושי
שרן השכל""")
            self.assertProtocolPartEquals(protocol.parts[5], u"""נכחו:""",
                                          u"""""")
            self.assertProtocolPartEquals(
                protocol.parts[6], u"""מוזמנים:""",
                u"""צביקה כהן - סמנכ"ל בכיר למימון והשקעות, משרד החקלאות ופיתוח הכפר

אורי צוק-בר - סמנכ"ל מחקר כלכלה ואסטרטגיה, משרד החקלאות ופיתוח הכפר

אסף לוי - סמנכ"ל גורמי יצור, משרד החקלאות ופיתוח הכפר

דפנה טיש
עמרי איתן בן צבי
עדי טל נוסבוים
ליאורה עופרי
	–
–
–
–
	עו"ד, משרד החקלאות ופיתוח הכפר
עו"ד, מח' יעוץ וחקיקה, משרד המשפטים
יועמ"ש, המשרד לביטחון פנים
עו"ד, המשרד להגנת הסביבה

צבי אלון - מנכ"ל, מועצת הצמחים

אמיר שניידר
ירון סולומון
	–
–
	יועמ"ש, התאחדות האיכרים והחקלאים בישראל
מנהל המחלקה להתיישבות, האיחוד החקלאי

אריאל ארליך - ראש מחלקת ליטיגציה, פורום קהלת

מיכל זליקוביץ - נציגה, פורום קהלת

יעל שביט - שדלן/ית""")