def test_attending_members_invalid_data(self):
    """find_attending_members should fail loudly on a missing file and
    degrade to an empty result when the protocol has no text."""
    # file does not exist
    with CommitteeMeetingProtocol.get_from_filename('/foo/bar/baz') as protocol:
        with self.assertRaises(IOError):
            protocol.find_attending_members([])
    # no text
    with CommitteeMeetingProtocol.get_from_text(None) as protocol:
        self.assertEqual([], protocol.find_attending_members([]))
def test_docx_protocol_attendees(self):
    """Check attendee extraction from a real .doc protocol fixture:
    both the filtered find_attending_members() result and the full
    structured `attendees` dict (mks / invitees / legal_advisors / manager)."""
    source_doc_file_name = os.path.join(os.path.dirname(__file__), '20_ptv_502208.doc')
    protocol_generator = CommitteeMeetingProtocol.get_from_filename(source_doc_file_name)
    with protocol_generator as protocol:
        # the MK names passed in are matched against the protocol text
        self.assertEqual(['איתן כבל', 'יצחק וקנין', "עבד אל חכים חאג' יחיא", 'איתן ברושי', 'שרן השכל'],
                         protocol.find_attending_members([u"איתן כבל", u"יצחק וקנין",
                                                          u"עבד אל חכים חאג' יחיא",
                                                          u"איתן ברושי", u"שרן השכל"]))
        # full structured attendees - invitees may carry an optional 'role' key
        self.assertEqual({'mks': ['איתן ברושי', 'שרן השכל', 'איתן כבל – היו"ר', 'יצחק וקנין',
                                  "עבד אל חכים חאג' יחיא"],
                          'invitees': [
                              {'name': 'צביקה כהן', 'role': 'סמנכ"ל בכיר למימון והשקעות, משרד החקלאות ופיתוח הכפר'},
                              {'name': 'אורי צוק-בר', 'role': 'סמנכ"ל מחקר כלכלה ואסטרטגיה, משרד החקלאות ופיתוח הכפר'},
                              {'name': 'אסף לוי', 'role': 'סמנכ"ל גורמי יצור, משרד החקלאות ופיתוח הכפר'},
                              {'name': 'דפנה טיש'},
                              {'name': 'עמרי איתן בן צבי'},
                              {'name': 'עדי טל נוסבוים'},
                              {'name': 'ליאורה עופרי'},
                              {'name': 'עו"ד, משרד החקלאות ופיתוח הכפר'},
                              {'name': 'עו"ד, מח\' יעוץ וחקיקה, משרד המשפטים'},
                              {'name': 'יועמ"ש, המשרד לביטחון פנים'},
                              {'name': 'עו"ד, המשרד להגנת הסביבה'},
                              {'name': 'צבי אלון', 'role': 'מנכ"ל, מועצת הצמחים'},
                              {'name': 'אמיר שניידר'},
                              {'name': 'ירון סולומון'},
                              {'name': 'יועמ"ש, התאחדות האיכרים והחקלאים בישראל'},
                              {'name': 'מנהל המחלקה להתיישבות, האיחוד החקלאי'},
                              {'name': 'אריאל ארליך', 'role': 'ראש מחלקת ליטיגציה, פורום קהלת'},
                              {'name': 'מיכל זליקוביץ', 'role': 'נציגה, פורום קהלת'},
                              {'name': 'יעל שביט', 'role': 'שדלן/ית'}],
                          'legal_advisors': ['איתי עצמון'],
                          'manager': ['לאה ורון']},
                         protocol.attendees)
def test_missing_member_issue132(self):
    """Regression test for issue #132: an MK name stored with a trailing
    space was not matched; matching is now done on the stripped name,
    while the returned value keeps the name exactly as provided."""
    # TODO: switch to env_conditional_mock function when PR #9 is merged
    if os.environ.get("NO_MOCKS", "") == "1":
        # hit the live Open Knesset API
        all_mk_names = get_all_mk_names()
    else:
        all_mk_names = MOCK_OPEN_KNESSET_GET_ALL_MK_NAMES_RESPONSE
    mks, mk_names = all_mk_names
    with CommitteeMeetingProtocol.get_from_filename(
            os.path.join(os.path.dirname(__file__), '20_ptv_367393.doc')) as protocol:
        attending_members = protocol.find_attending_members(mk_names)
        self.assertEqual(
            attending_members,
            [
                u"אוסאמה סעדי",
                u"אורי מקלב",
                u"זאב בנימין בגין",
                u"יוליה מלינובסקי",
                # this MK has extra space which caused him not to be found
                # now we search the stripped name
                # but the return value still has the extra space (as provided)
                u"מיכאל מלכיאלי ",
                u"רויטל סויד",
                u"בנימין בגין",
            ])
def parse_protocol(row):
    """Parse one committee-session document row into a text/CSV output file.

    Skips work when the output already exists, parses from the downloaded
    file when available, and records a warning otherwise.
    Relies on module-level globals: get_filenames, stats, parse_type,
    utils, get_crc32c.  Returns (ext, output_filename, filesize, crc32c).
    """
    original_filename, ext, output_filename, full_output_filename, download_filename = get_filenames(
        row)
    if os.path.exists(full_output_filename):
        # already parsed on a previous run - just collect metadata
        logging.info('file exists: {}'.format(full_output_filename))
        stats["existing files"] += 1
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('existing file: {}'.format(full_output_filename))
    elif os.path.exists(download_filename):
        with open(download_filename, "rb") as f:
            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                os.makedirs(os.path.dirname(full_output_filename), exist_ok=True)
                # write to a temp file first so a partial parse never leaves
                # a corrupt output file behind
                with utils.temp_file() as temp_filename:
                    with open(temp_filename, "w") as of:
                        if parse_type == "text":
                            of.write(protocol.text)
                        else:
                            # "parts" mode: one CSV row per protocol part
                            csv_writer = csv.writer(of)
                            csv_writer.writerow(["header", "body"])
                            for part in protocol.parts:
                                csv_writer.writerow([part.header, part.body])
                    shutil.copy(temp_filename, full_output_filename)
        filesize = os.path.getsize(full_output_filename)
        crc32c = get_crc32c(full_output_filename)
        logging.info('parsed file: {}'.format(full_output_filename))
        stats["parsed files"] += 1
    else:
        # source document was never downloaded - return empty metadata
        logging.warning('missing document committee session file: {}'.format(
            download_filename))
        ext, output_filename, filesize, crc32c = None, None, 0, None
    return ext, output_filename, filesize, crc32c
def redownload_protocol(committee_meeting):
    """Re-fetch the protocol for the given meeting and persist its text.

    Plenum meetings are delegated to the plenum download flow; all other
    meetings are fetched from the meeting's source URL.
    """
    if committee_meeting.committee.type == 'plenum':
        download_for_existing_meeting(committee_meeting)
        return
    with CommitteeMeetingProtocol.get_from_url(
            committee_meeting.src_url) as protocol:
        committee_meeting.protocol_text = protocol.text
        committee_meeting.protocol_text_update_date = datetime.now()
        committee_meeting.save()
def _filter_row(self, meeting_protocol, **kwargs):
    """Parse a single meeting protocol file into text + parts CSV files
    and yield a row describing the produced files.

    Only .doc files are parsed; .rtf files are skipped (text/parts set to
    None); any other extension raises.  Generator - yields exactly one row.
    """
    parts_relpath = os.path.join(
        str(meeting_protocol["committee_id"]),
        "{}.csv".format(meeting_protocol["meeting_id"]))
    text_relpath = os.path.join(
        str(meeting_protocol["committee_id"]),
        "{}.txt".format(meeting_protocol["meeting_id"]))
    parts_filename = self._get_filename(parts_relpath)
    text_filename = self._get_filename(text_relpath)
    protocol_filename = meeting_protocol["protocol_file"]
    # crude extension detection: last 4 characters of the stripped path
    protocol_ext = protocol_filename.strip()[-4:]
    if protocol_ext == ".doc":
        # for now, only doc files are being parsed and should be added to all_filenames
        if parts_relpath not in self._all_filenames:
            self._all_filenames += [parts_relpath, text_relpath]
    os.makedirs(os.path.dirname(parts_filename), exist_ok=True)
    if not os.path.exists(parts_filename):
        if protocol_ext == ".doc":
            with CommitteeMeetingProtocol.get_from_filename(
                    protocol_filename) as protocol:
                with open(text_filename, "w") as f:
                    f.write(protocol.text)
                logging.info(
                    "parsed doc to text -> {}".format(text_filename))
                with open(parts_filename, "w") as f:
                    csv_writer = csv.writer(f)
                    csv_writer.writerow(["header", "body"])
                    for part in protocol.parts:
                        csv_writer.writerow([part.header, part.body])
                logging.info(
                    "parsed parts file -> {}".format(parts_filename))
        elif protocol_ext == ".rtf":
            # rtf parsing proved difficult, skipping for now
            text_filename = None
            parts_filename = None
            # rtf_to_txt_filename = self._rtf_to_txt(protocol_filename)
            # shutil.copyfile(rtf_to_txt_filename, text_filename)
            # os.unlink(rtf_to_txt_filename)
            # logging.info("parsed rtf to text -> {}".format(text_filename))
            # with open(text_filename) as f:
            #     with CommitteeMeetingProtocol.get_from_text(f.read()) as protocol:
            #         with open(parts_filename, "w") as f:
            #             csv_writer = csv.writer(f)
            #             csv_writer.writerow(["header", "body"])
            #             for part in protocol.parts:
            #                 csv_writer.writerow([part.header, part.body])
            #         logging.info("parsed parts file -> {}".format(parts_filename))
        else:
            raise Exception("unknown extension: {}".format(protocol_ext))
    yield {
        "committee_id": meeting_protocol["committee_id"],
        "meeting_id": meeting_protocol["meeting_id"],
        "protocol_file": protocol_filename,
        "text_file": text_filename,
        "parts_file": parts_filename
    }
def test_protocol_attendenace_strange_title(self):
    """Attendance extraction still works for a protocol whose attendee
    section uses an unusual title (fixture 20_ptv_321195.doc)."""
    fixture_path = os.path.join(os.path.dirname(__file__), '20_ptv_321195.doc')
    expected_names = [u"קארין אלהרר", u"דוד אמסלם", u"אוסאמה סעדי"]
    with CommitteeMeetingProtocol.get_from_filename(fixture_path) as protocol:
        self.assertEqual(expected_names,
                         protocol.find_attending_members(list(expected_names)))
def process_row(row, row_index, spec, resource_index, parameters, stats):
    """Enrich a kns_committeesession row with attendee data parsed from the
    meeting's protocol text.

    Uses an on-disk cache keyed by the text/parts crc32c hashes to avoid
    re-parsing unchanged protocols.  The protocol text is read from the
    local data path when present, otherwise fetched over HTTP.
    Returns the (mutated) row.
    """
    if spec['name'] == 'kns_committeesession':
        row.update(mks=None, invitees=None, legal_advisors=None, manager=None)
        if (
                (not parameters.get("filter-meeting-id")
                 or int(row["CommitteeSessionID"]) in parameters["filter-meeting-id"])
                and (not parameters.get("filter-committee-id")
                     or int(row["CommitteeID"]) in parameters["filter-committee-id"])
                and (not parameters.get("filter-knesset-num")
                     or int(row["KnessetNum"]) in parameters["filter-knesset-num"])
        ):
            if row["text_parsed_filename"]:
                new_cache_hash, old_cache_hash, cache_hash_path, cache_hash_row = None, None, None, None
                if os.environ.get('KNESSET_PIPELINES_DATA_PATH'):
                    # hash of the parsed text/parts checksums identifies whether
                    # the cached attendees are still valid
                    m = BASE_HASH_OBJ.copy()
                    m.update(str(row['text_crc32c']).encode())
                    m.update(str(row['parts_crc32c']).encode())
                    new_cache_hash = m.hexdigest()
                    cache_hash_path = os.path.join(
                        os.environ['KNESSET_PIPELINES_DATA_PATH'],
                        'people/committees/meeting-attendees/cache_hash/{}.json'.format(
                            row["text_parsed_filename"]))
                    if os.path.exists(cache_hash_path):
                        with open(cache_hash_path) as f:
                            cache_data = json.load(f)
                        old_cache_hash = cache_data['hash']
                        cache_hash_row = cache_data['row']
                if cache_hash_path and old_cache_hash and old_cache_hash == new_cache_hash:
                    # cache hit - reuse previously extracted attendees
                    row.update(**cache_hash_row)
                else:
                    logging.info('getting attendees for meeting {}'.format(row['CommitteeSessionID']))
                    text = None
                    if os.environ.get('KNESSET_PIPELINES_DATA_PATH'):
                        protocol_text_path = os.path.join(
                            os.environ['KNESSET_PIPELINES_DATA_PATH'],
                            'committees/meeting_protocols_text/{}'.format(row["text_parsed_filename"]))
                        if os.path.exists(protocol_text_path) and os.path.getsize(protocol_text_path) > 0:
                            with open(protocol_text_path) as f:
                                text = f.read()
                    else:
                        protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/data/committees/" \
                                            "meeting_protocols_text/{}".format(row["text_parsed_filename"])
                        res = requests.get(protocol_text_url)
                        if res.status_code == 200:
                            # BUGFIX: reuse the response already fetched above
                            # instead of issuing a second identical HTTP request
                            text = res.content.decode("utf-8")
                    update_row = dict(mks=None, invitees=None, legal_advisors=None, manager=None)
                    if text:
                        with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                            attendees = protocol.attendees
                            if attendees:
                                update_row = dict(mks=attendees['mks'],
                                                  invitees=attendees['invitees'],
                                                  legal_advisors=attendees['legal_advisors'],
                                                  manager=attendees['manager'])
                    row.update(**update_row)
                    if cache_hash_path:
                        # persist the result so the next run can skip parsing
                        os.makedirs(os.path.dirname(cache_hash_path), exist_ok=True)
                        with open(cache_hash_path, 'w') as f:
                            json.dump({'hash': new_cache_hash, 'row': update_row}, f)
    return row
def redownload_protocol(self):
    """Re-download this meeting's protocol and store its text on the model."""
    if self.committee.type == 'plenum':
        # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
        from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
            download_for_existing_meeting
        download_for_existing_meeting(self)
    else:
        with KnessetDataCommitteeMeetingProtocol.get_from_url(self.src_url) as protocol:
            self.protocol_text = protocol.text
            self.protocol_text_update_date = datetime.now()
            self.save()
def create_protocol_parts(self, delete_existing=False, mks=None, mk_names=None):
    """
    Create protocol parts from this instance's protocol_text
    Optionally, delete existing parts.
    If the meeting already has parts, and you don't ask to delete them,
    a ValidationError will be thrown, because it doesn't make sense
    to create the parts again.
    """
    logger.debug('create_protocol_parts %s' % delete_existing)
    if delete_existing:
        # remove annotations attached to the parts before deleting the parts
        ppct = ContentType.objects.get_for_model(ProtocolPart)
        annotations = Annotation.objects.filter(
            content_type=ppct, object_id__in=self.parts.all)
        logger.debug(
            'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d'
            % (annotations.count(), self.id))
        annotations.delete()
        self.parts.all().delete()
    else:
        if self.parts.count():
            raise ValidationError(
                'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.'
            )
    if not self.protocol_text:  # sometimes there are empty protocols
        return  # then we don't need to do anything here.
    if self.committee.type == 'plenum':
        # plenum meetings use a dedicated parser
        create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
        return
    else:
        def get_protocol_part(i, part):
            # build (unsaved) ProtocolPart for the i-th parsed part
            logger.debug('creating protocol part %s' % i)
            return ProtocolPart(meeting=self,
                                order=i,
                                header=part.header,
                                body=part.body)

        with KnessetDataCommitteeMeetingProtocol.get_from_text(
                self.protocol_text) as protocol:
            # TODO: use bulk_create (I had a strange error when using it)
            # ProtocolPart.objects.bulk_create(
            # for testing, you could just save one part:
            # get_protocol_part(1, protocol.parts[0]).save()
            list([
                get_protocol_part(i, part).save()
                for i, part in zip(range(1, len(protocol.parts) + 1), protocol.parts)
            ])
        self.protocol_parts_update_date = datetime.now()
        self.save()
def _parse_doc_protocol(self, committee_id, meeting_id, protocol_filename,
                        parts_filename, text_filename):
    """Parse a .doc protocol file into a text file and a parts file.

    Returns True on success, False if the document could not be parsed
    (parse failures are logged and swallowed so the pipeline continues).
    """
    try:
        with CommitteeMeetingProtocol.get_from_filename(protocol_filename) as protocol:
            with open(text_filename, "w") as f:
                f.write(protocol.text)
            logging.info("parsed doc to text -> {}".format(text_filename))
            self._parse_protocol_parts(parts_filename, protocol)
    except (AntiwordException, subprocess.SubprocessError):
        logging.exception("committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id))
        return False
    else:
        return True
def get_kns_committeesession_resource():
    """Generator: yield committee-session rows (filtered by the pipeline
    parameters) enriched with attendee data parsed from the protocol text.

    Closes over module-level kns_committeesession_resource and parameters.
    NOTE(review): the HTTP response status is not checked before decoding -
    a 404 body would be parsed as protocol text; verify upstream guarantees.
    """
    for committeesession_row in kns_committeesession_resource:
        if (
                (not parameters.get("filter-meeting-id")
                 or int(committeesession_row["CommitteeSessionID"]) in parameters["filter-meeting-id"])
                and (not parameters.get("filter-committee-id")
                     or int(committeesession_row["CommitteeID"]) in parameters["filter-committee-id"])
                and (not parameters.get("filter-knesset-num")
                     or int(committeesession_row["KnessetNum"]) in parameters["filter-knesset-num"])
        ):
            if committeesession_row["text_object_name"]:
                protocol_text_url = "https://minio.oknesset.org/committees/" + committeesession_row["text_object_name"]
                text = requests.get(protocol_text_url).content.decode("utf-8")
                with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                    # merge mks/invitees/legal_advisors/manager into the row
                    committeesession_row.update(protocol.attendees)
        yield committeesession_row
def get_resource():
    """Generator: parse each downloaded committee-session document and yield
    the row enriched with the parsed file's extension and output filename.

    Closes over module-level: download_rows, files_limit, stats,
    download_from_path, download_from_remote_storage, parse_type,
    parse_protocol, errors.  Failed rows are appended to `errors`
    instead of being yielded.
    """
    for row_num, row in enumerate(download_rows):
        logging.info("{} / {}".format(row_num, len(download_rows)))
        try:
            # source path layout: files/<group>/<id[0]>/<id[1]>/<id>.<ext>
            original_filename = os.path.join(
                "files", str(row["GroupTypeID"]),
                str(row["DocumentCommitteeSessionID"])[0],
                str(row["DocumentCommitteeSessionID"])[1],
                str(row["DocumentCommitteeSessionID"]) + "." + row["ApplicationDesc"])
            ext = os.path.splitext(original_filename)[1].lower()
            output_filename = "files/{}/{}/{}.{}".format(
                str(row["CommitteeSessionID"])[0],
                str(row["CommitteeSessionID"])[1],
                str(row["CommitteeSessionID"]),
                "csv" if parse_type == "parts" else "txt")
            if not files_limit or stats["parsed files"] < files_limit:
                if download_from_path:
                    download_filename = "../data/committees/download_document_committee_session/" + original_filename
                    if os.path.exists(download_filename):
                        with open(download_filename, "rb") as f:
                            with CommitteeMeetingProtocol.get_from_file(f) as protocol:
                                parse_protocol(output_filename, protocol)
                    else:
                        logging.warning("missing download_filename {}".format(download_filename))
                elif download_from_remote_storage:
                    url = download_from_remote_storage + original_filename
                    with CommitteeMeetingProtocol.get_from_url(url) as protocol:
                        parse_protocol(output_filename, protocol)
                else:
                    raise Exception("no valid download option")
            row.update(protocol_extension=ext, parsed_filename=output_filename)
            yield row
        except Exception as e:
            # there is a bug in knesset-data-python which prevents getting the error message from the exception
            # TODO: fix this bug
            error_message = "failed to parse CommitteeSessionID {}".format(row["CommitteeSessionID"])  # , str(e))
            logging.exception(error_message)
            row.update(error=error_message)
            errors.append(row)
def extract_speakers_from_txt_file(self, file_object_path, committee_id, meeting_id):
    """Yield one row per speaker found in the protocol text stored in
    object storage under the "committees" bucket."""
    raw_text = object_storage.read(self.s3, "committees", file_object_path).decode()
    with CommitteeMeetingProtocol.get_from_text(raw_text) as protocol:
        found_speakers = protocol.speakers
        if found_speakers is not None:
            for speaker_name in found_speakers:
                yield {
                    "committee_id": committee_id,
                    "meeting_id": meeting_id,
                    "name": speaker_name,
                }
def _parse_doc_protocol(self, committee_id, meeting_id, bucket,
                        protocol_object_name, parts_object_name, text_object_name):
    """Download a .doc protocol from object storage, parse it, and write the
    resulting text and parts objects back.  Returns True on success,
    False when parsing failed (failure is logged and swallowed)."""
    logging.info("parsing doc protocol {} --> {}, {}".format(protocol_object_name, parts_object_name, text_object_name))
    with object_storage.temp_download(self.s3, bucket, protocol_object_name) as protocol_filename:
        try:
            with CommitteeMeetingProtocol.get_from_filename(protocol_filename) as protocol:
                object_storage.write(self.s3, bucket, text_object_name, protocol.text, public_bucket=True)
                self._parse_protocol_parts(bucket, parts_object_name, protocol)
        except (
                AntiwordException,  # see https://github.com/hasadna/knesset-data-pipelines/issues/15
                subprocess.SubprocessError,
                xml.etree.ElementTree.ParseError  # see https://github.com/hasadna/knesset-data-pipelines/issues/32
        ):
            logging.exception("committee {} meeting {}: failed to parse doc file, skipping".format(committee_id, meeting_id))
            return False
    return True
def find_attending_members(self, mks=None, mk_names=None):
    """Match MK names against this meeting's protocol text and record the
    attendees on self.mks_attended.

    When both mks and mk_names are None they are fetched via
    get_all_mk_names(); the two sequences are expected to be parallel
    (same index -> same MK).
    """
    logger.debug('find_attending_members')
    if mks is None and mk_names is None:
        logger.debug('get_all_mk_names')
        mks, mk_names = get_all_mk_names()
    with KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text) as protocol:
        attended_mk_names = protocol.find_attending_members(mk_names)
        for name in attended_mk_names:
            i = mk_names.index(name)
            if not mks[i].party_at(self.date):  # not a member at time of this meeting?
                continue  # then don't search for this MK.
            self.mks_attended.add(mks[i])
    logger.debug('meeting %d now has %d attending members' % (
        self.id, self.mks_attended.count()))
def create_protocol_parts(self, delete_existing=False, mks=None, mk_names=None):
    """
    Create protocol parts from this instance's protocol_text
    Optionally, delete existing parts.
    If the meeting already has parts, and you don't ask to delete them,
    a ValidationError will be thrown, because it doesn't make sense
    to create the parts again.
    """
    logger.debug('create_protocol_parts %s' % delete_existing)
    if delete_existing:
        # delete annotations attached to the parts before deleting the parts
        ppct = ContentType.objects.get_for_model(ProtocolPart)
        annotations = Annotation.objects.filter(content_type=ppct,
                                                object_id__in=self.parts.all)
        logger.debug(
            'deleting %d annotations, because I was asked to delete the relevant protocol parts on cm.id=%d'
            % (annotations.count(), self.id))
        annotations.delete()
        self.parts.all().delete()
    else:
        if self.parts.count():
            raise ValidationError(
                'CommitteeMeeting already has parts. delete them if you want to run create_protocol_parts again.')
    if not self.protocol_text:  # sometimes there are empty protocols
        return  # then we don't need to do anything here.
    if self.committee.type == 'plenum':
        # plenum meetings have a dedicated parser
        create_plenum_protocol_parts(self, mks=mks, mk_names=mk_names)
        return
    else:
        def get_protocol_part(i, part):
            # build (unsaved) ProtocolPart for the i-th parsed part
            logger.debug('creating protocol part %s' % i)
            return ProtocolPart(meeting=self,
                                order=i,
                                header=part.header,
                                body=part.body)

        with KnessetDataCommitteeMeetingProtocol.get_from_text(
                self.protocol_text) as protocol:
            # TODO: use bulk_create (I had a strange error when using it)
            # ProtocolPart.objects.bulk_create(
            # for testing, you could just save one part:
            # get_protocol_part(1, protocol.parts[0]).save()
            list([
                get_protocol_part(i, part).save()
                for i, part in zip(range(1, len(protocol.parts) + 1), protocol.parts)
            ])
        self.protocol_parts_update_date = datetime.now()
        self.save()
def extract_speakers_from_txt_file(self, file_object_path, committee_id, meeting_id):
    """Generator: yield a {committee_id, meeting_id, name} row for every
    speaker detected in the protocol text read from object storage."""
    text = object_storage.read(self.s3, "committees", file_object_path).decode()
    with CommitteeMeetingProtocol.get_from_text(text) as protocol:
        speakers = protocol.speakers
        if speakers is not None:
            for speaker in speakers:
                yield {
                    "committee_id": committee_id,
                    "meeting_id": meeting_id,
                    "name": speaker
                }
def redownload_protocol(self):
    """Re-download this meeting's protocol and store its text on the model.

    Raises AntiwordException (after logging it with the antiword output)
    if the downloaded document could not be converted.
    """
    if self.committee.type == 'plenum':
        # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
        from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
            download_for_existing_meeting
        download_for_existing_meeting(self)
    else:
        try:
            with KnessetDataCommitteeMeetingProtocol.get_from_url(
                    self.src_url) as protocol:
                self.protocol_text = protocol.text
                self.protocol_text_update_date = datetime.now()
                self.save()
        except AntiwordException as e:
            logger.error(e.message, exc_info=True, extra={'output': e.output})
            raise e
def _parse_rtf_protocol(self, committee_id, meeting_id, protocol_filename,
                        parts_filename, text_filename):
    """Convert an .rtf protocol to text via the external RTF_EXTRACTOR_BIN
    tool, then parse the text into protocol parts.

    Returns True on success, False when the extractor is not configured
    or the conversion/parsing failed (failures are logged).
    """
    rtf_extractor = os.environ.get("RTF_EXTRACTOR_BIN")
    if rtf_extractor:
        # NOTE(review): shell=True with string concatenation is shell-injection
        # prone if filenames ever contain shell metacharacters; prefer
        # subprocess.run([rtf_extractor, protocol_filename, text_filename])
        cmd = rtf_extractor + ' ' + protocol_filename + ' ' + text_filename
        try:
            subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True)
            with open(text_filename) as f:
                protocol_text = f.read()
            with CommitteeMeetingProtocol.get_from_text(protocol_text) as protocol:
                self._parse_protocol_parts(parts_filename, protocol)
        except subprocess.SubprocessError:
            logging.exception("committee {} meeting {}: failed to parse rtf file, skipping".format(committee_id, meeting_id))
            return False
        return True
    else:
        logging.warning("missing RTF_EXTRACTOR_BIN environment variable, skipping rtf parsing")
        return False
def redownload_protocol(self):
    """Re-download this meeting's protocol and store its text on the model.

    Raises AntiwordException (after logging it with the antiword output)
    if the downloaded document could not be converted.
    """
    if self.committee.type == 'plenum':
        # TODO: Using managment command this way is an antipattern, a common service should be extracted and used
        from plenum.management.commands.parse_plenum_protocols_subcommands.download import \
            download_for_existing_meeting
        download_for_existing_meeting(self)
    else:
        try:
            with KnessetDataCommitteeMeetingProtocol.get_from_url(self.src_url) as protocol:
                self.protocol_text = protocol.text
                self.protocol_text_update_date = datetime.now()
                self.save()
        # BUGFIX: `except AntiwordException, e:` is Python 2-only syntax and is
        # a SyntaxError on Python 3 - use the `as` form, valid in both.
        except AntiwordException as e:
            logger.error(
                e.message,
                exc_info=True,
                extra={
                    'output': e.output
                }
            )
            # bare `raise` preserves the original traceback (unlike `raise e`)
            raise
def get_kns_committeesession_resource():
    """Generator: yield committee-session rows (filtered by the pipeline
    parameters) enriched with attendee data parsed from the protocol text
    stored on Google Cloud Storage.

    Closes over module-level kns_committeesession_resource and parameters.
    """
    for committeesession_row in kns_committeesession_resource:
        if ((not parameters.get("filter-meeting-id")
             or int(committeesession_row["CommitteeSessionID"]) in parameters["filter-meeting-id"])
                and (not parameters.get("filter-committee-id")
                     or int(committeesession_row["CommitteeID"]) in parameters["filter-committee-id"])
                and (not parameters.get("filter-knesset-num")
                     or int(committeesession_row["KnessetNum"]) in parameters["filter-knesset-num"])):
            # text_file_name                                            text_file_size
            # data/committees/meeting_protocols_text/files/5/7/570611.txt  72817
            if (committeesession_row["text_file_name"]
                    and committeesession_row["text_file_size"]
                    and committeesession_row["text_file_size"] > 0):
                protocol_text_url = "https://storage.googleapis.com/knesset-data-pipelines/{}".format(
                    committeesession_row["text_file_name"])
                text = requests.get(protocol_text_url).content.decode("utf-8")
                with CommitteeMeetingProtocol.get_from_text(text) as protocol:
                    # merge mks/invitees/legal_advisors/manager into the row
                    committeesession_row.update(protocol.attendees)
        yield committeesession_row
def find_attending_members(self, mks=None, mk_names=None):
    """Match MK names against this meeting's protocol text and record the
    attendees on self.mks_attended.

    When both mks and mk_names are None they are fetched via
    get_all_mk_names(); the two sequences are expected to be parallel
    (same index -> same MK).  Parse errors are logged at debug level and
    swallowed (best-effort behavior kept from the original).
    """
    logger.debug('find_attending_members')
    if mks is None and mk_names is None:
        logger.debug('get_all_mk_names')
        mks, mk_names = get_all_mk_names()
    try:
        with KnessetDataCommitteeMeetingProtocol.get_from_text(self.protocol_text) as protocol:
            attended_mk_names = protocol.find_attending_members(mk_names)
            for name in attended_mk_names:
                i = mk_names.index(name)
                if not mks[i].party_at(self.date):  # not a member at time of this meeting?
                    continue  # then don't search for this MK.
                self.mks_attended.add(mks[i])
    except Exception:
        # traceback.format_exc() replaces the manual
        # sys.exc_info() + traceback.format_exception() dance with the
        # stdlib one-liner that produces the same string
        logger.debug("%s%s", traceback.format_exc(),
                     '\nCommitteeMeeting.id=' + str(self.id))
    logger.debug('meeting %d now has %d attending members' % (
        self.id, self.mks_attended.count()))
def extract_attendees_from_txt_file(self, file_object_path, committee_id, meeting_id):
    """Generator: yield one row per attendee found in the protocol text.

    protocol.attendees maps a category key (e.g. "mks", "invitees",
    "legal_advisors", "manager") to a list of attendees; invitees are dicts
    with a "name" and an optional "role", other categories are plain names.
    """
    text = object_storage.read(self.s3, "committees", file_object_path).decode()
    with CommitteeMeetingProtocol.get_from_text(text) as protocol:
        attendees = protocol.attendees
        if attendees is not None:
            # iterate items() instead of keys + repeated indexing
            for key, group in attendees.items():
                for attendee in group:
                    if key == "invitees":
                        yield {"committee_id": committee_id,
                               "meeting_id": meeting_id,
                               "name": attendee["name"],
                               "role": "invitees",
                               # dict.get replaces the verbose
                               # `x["role"] if "role" in x.keys() else ""`
                               "additional_information": attendee.get("role", "")}
                    else:
                        yield {"committee_id": committee_id,
                               "meeting_id": meeting_id,
                               "name": attendee,
                               "role": key,
                               "additional_information": ""}
def process_row(row, row_index, resource_descriptor, resource_index, parameters, stats):
    """Parse the protocol document referenced by a kns_documentcommitteesession
    row into a text or parts file, with an on-disk hash cache keyed by the
    download's crc32c to skip unchanged documents.

    Mutates and returns the row, setting <type>_protocol_extension,
    <type>_parsed_filename, <type>_filesize, <type>_crc32c and <type>_error.
    Relies on module-level globals: get_filenames, BASE_HASH_OBJ, get_crc32c,
    utils, CommitteeMeetingProtocol.
    """
    if resource_descriptor['name'] == 'kns_documentcommitteesession':
        t = parameters['type']
        # initialize all output columns so every row has the full schema
        row[t + "_protocol_extension"] = None
        row[t + "_parsed_filename"] = None
        row[t + "_filesize"] = 0
        row[t + "_crc32c"] = None
        row[t + "_error"] = None
        # GroupTypeID 23 + DOC application = committee protocol documents
        if (row['GroupTypeID'] == 23 and row['ApplicationDesc'] == 'DOC'
                and (row["FilePath"].lower().endswith('.doc')
                     or row["FilePath"].lower().endswith('.docx'))):
            document_id = "{}-{}-{}".format(row["GroupTypeID"],
                                            row["DocumentCommitteeSessionID"],
                                            row["ApplicationDesc"])
            original_filename, ext, output_filename, full_output_filename, download_filename, full_output_hash_filename = get_filenames(
                row, parameters)
            if os.path.exists(download_filename) and row.get('download_crc32c'):
                # cache key is derived from the downloaded file's checksum
                m = BASE_HASH_OBJ.copy()
                m.update(row['download_crc32c'].encode())
                new_cache_hash = m.hexdigest()
                if os.path.exists(full_output_filename) and os.path.exists(
                        full_output_hash_filename):
                    with open(full_output_hash_filename) as f:
                        old_cache_hash = f.read()
                else:
                    old_cache_hash = None
                if old_cache_hash and new_cache_hash and new_cache_hash == old_cache_hash:
                    # cache hit - reuse the existing parsed output
                    stats[t + ": existing files"] += 1
                    row[t + "_protocol_extension"] = ext
                    row[t + "_parsed_filename"] = output_filename
                    row[t + "_filesize"] = os.path.getsize(full_output_filename)
                    row[t + "_crc32c"] = get_crc32c(full_output_filename)
                elif parameters.get('files-limit') and parameters[
                        'files-limit'] <= stats[t + ": parsed files"]:
                    row[t + "_error"] = 'reached files-limit, skipping'
                    stats[t + ": skipped files"] += 1
                else:
                    error_string = None
                    try:
                        with open(download_filename, "rb") as f:
                            with CommitteeMeetingProtocol.get_from_file(
                                    f) as protocol:
                                os.makedirs(
                                    os.path.dirname(full_output_filename),
                                    exist_ok=True)
                                # write via temp file so a failed parse never
                                # leaves a partial output behind
                                with utils.temp_file() as temp_filename:
                                    with open(temp_filename, "w") as of:
                                        if parameters['type'] == "text":
                                            of.write(protocol.text)
                                        else:
                                            csv_writer = csv.writer(of)
                                            csv_writer.writerow(
                                                ["header", "body"])
                                            for part in protocol.parts:
                                                csv_writer.writerow(
                                                    [part.header, part.body])
                                    shutil.copy(temp_filename,
                                                full_output_filename)
                    except Exception as e:
                        logging.exception(
                            'exception parsing protocol for {}'.format(
                                document_id))
                        try:
                            error_string = str(e)
                        except Exception:
                            error_string = 'unexpected exception'
                    if error_string:
                        row[t + "_error"] = error_string
                        stats[t + ': errored files'] += 1
                    else:
                        row[t + "_protocol_extension"] = ext
                        row[t + "_parsed_filename"] = output_filename
                        row[t + "_filesize"] = os.path.getsize(
                            full_output_filename)
                        row[t + "_crc32c"] = get_crc32c(full_output_filename)
                        stats[t + ": parsed files"] += 1
                        # record the cache key only after a successful parse
                        with open(full_output_hash_filename, 'w') as f:
                            f.write(new_cache_hash)
            else:
                row[t + "_error"] = 'missing download file'
                stats[t + ': missing download files'] += 1
    return row
class CommitteeMeeting(BaseKnessetDataServiceFunctionObject):
    """A committee meeting record from the Knesset OData service
    (CommitteeAgendaSearch function).  Field declarations map local
    attribute names to remote service field names."""

    # NOTE: some keys intentionally(?) carry a trailing space
    # ("location ", "place ", ...) - preserved as-is since consumers may
    # rely on the exact key strings.
    ORDERED_FIELDS = [
        ("id",
         KnessetDataServiceSimpleField(
             'Committee_Agenda_id', 'integer',
             "the primary key of committee meetings")),
        ("committee_id",
         KnessetDataServiceSimpleField(
             'Committee_Agenda_committee_id', 'integer',
             "id of the committee (linked to Committee object)")),
        ("datetime",
         KnessetDataServiceSimpleField('committee_agenda_date', 'datetime',
                                       "date/time when the meeting started")),
        ("title",
         KnessetDataServiceSimpleField('title', 'string',
                                       "title of the meeting")),
        ("session_content",
         KnessetDataServiceSimpleField(
             'committee_agenda_session_content', 'string',
             "seems like in some committee meetings, the title field is empty, in that case title can be taken from this field"
         )),
        ("url",
         KnessetDataServiceSimpleField('url', 'string',
                                       "url to download the protocol")),
        # a CommitteeMeetingProtocol object which allows to get data from the protocol
        # because parsing the protocol requires heavy IO and processing - we provide it as a generator
        # see tests/test_meetings.py for usage example
        ("protocol",
         KnessetDataServiceLambdaField(
             lambda obj, entry: CommitteeMeetingProtocol.get_from_url(
                 obj.url, proxies=obj._proxies) if obj.url else None)),
        ("location ",
         KnessetDataServiceSimpleField(
             'committee_location', 'string',
             "this seems like a shorter name of the place where meeting took place"
         )),
        ("place ",
         KnessetDataServiceSimpleField(
             'Committee_Agenda_place', 'string',
             "this looks like a longer field with the specific details of where the meeting took place"
         )),
        ("meeting_stop ",
         KnessetDataServiceSimpleField(
             'meeting_stop', 'string',
             "date/time when the meeting ended - this is not always available, in some meetings it's empty"
         )),
        ### following fields seem less interesting ###
        ("agenda_canceled ",
         KnessetDataServiceSimpleField('Committee_Agenda_canceled')),
        ("agenda_sub ",
         KnessetDataServiceSimpleField('Committee_agenda_sub')),
        ("agenda_associated ",
         KnessetDataServiceSimpleField('Committee_agenda_associated')),
        ("agenda_associated_id ",
         KnessetDataServiceSimpleField('Committee_agenda_associated_id')),
        ("agenda_special ",
         KnessetDataServiceSimpleField('Committee_agenda_special')),
        ("agenda_invited1 ",
         KnessetDataServiceSimpleField('Committee_agenda_invited1')),
        ("agenda_invite ",
         KnessetDataServiceSimpleField('sd2committee_agenda_invite')),
        ("note ",
         KnessetDataServiceSimpleField('Committee_agenda_note')),
        ("start_datetime ",
         KnessetDataServiceSimpleField('StartDateTime')),
        ("topid_id ",
         KnessetDataServiceSimpleField('Topic_ID')),
        ("creation_date ",
         KnessetDataServiceSimpleField('Date_Creation')),
        ("streaming_url ",
         KnessetDataServiceSimpleField('streaming_url')),
        ("meeting_start ",
         KnessetDataServiceSimpleField('meeting_start')),
        ("is_paused ",
         KnessetDataServiceSimpleField('meeting_is_paused')),
        ("date_order ",
         KnessetDataServiceSimpleField('committee_date_order')),
        ("date ",
         KnessetDataServiceSimpleField('committee_date')),
        ("day ",
         KnessetDataServiceSimpleField('committee_day')),
        ("month ",
         KnessetDataServiceSimpleField('committee_month')),
        ("material_id ",
         KnessetDataServiceSimpleField('material_id')),
        ("material_committee_id ",
         KnessetDataServiceSimpleField('material_comittee_id')),
        ("material_expiration_date ",
         KnessetDataServiceSimpleField('material_expiration_date')),
        ("material_hour ",
         KnessetDataServiceSimpleField('committee_material_hour')),
        ("old_url ",
         KnessetDataServiceSimpleField('OldUrl')),
        ("background_page_link ",
         KnessetDataServiceSimpleField('CommitteeBackgroundPageLink')),
        ("agenda_invited ",
         KnessetDataServiceSimpleField('Committee_agenda_invited')),
    ]

    @classmethod
    def _get_url_base(cls):
        # base endpoint of the CommitteeAgendaSearch OData function
        return "http://online.knesset.gov.il/WsinternetSps/KnessetDataService/CommitteeScheduleData.svc/CommitteeAgendaSearch"

    @classmethod
    def get(cls, committee_id, from_date, to_date=None, proxies=None):
        """
        # example usage:
        >>> from datetime import datetime

        # get all meetings of committee 1 from Jan 01, 2016
        >>> CommitteeMeeting.get(1, datetime(2016, 1, 1))

        # get all meetings of committee 2 from Feb 01, 2015 to Feb 20, 2015
        >>> CommitteeMeeting.get(2, datetime(2015, 2, 1), datetime(2015, 2, 20))
        """
        # the service expects quoted datetime literals, e.g. '2016-01-01T00:00:00'
        params = {
            "CommitteeId": "'%s'" % committee_id,
            "FromDate": "'%sT00:00:00'" % from_date.strftime('%Y-%m-%d')
        }
        if to_date:
            params["ToDate"] = "'%sT00:00:00'" % to_date.strftime('%Y-%m-%d')
        return super(CommitteeMeeting, cls).get(params, proxies=proxies)
def setUp(self):
    """Prepare a protocol generator for the 20_ptv_317899.doc fixture."""
    fixture_path = os.path.join(os.path.dirname(__file__), '20_ptv_317899.doc')
    self.protocol_generator = CommitteeMeetingProtocol.get_from_filename(fixture_path)
def setUp(self):
    """Prepare a protocol generator for the 20_ptv_317899.doc fixture."""
    source_doc_file_name = os.path.join(os.path.dirname(__file__), '20_ptv_317899.doc')
    self.protocol_generator = CommitteeMeetingProtocol.get_from_filename(source_doc_file_name)
def test_protocol_attendenace_strange_title(self):
    """Attendance extraction still works for a protocol whose attendee
    section uses an unusual title (fixture 20_ptv_321195.doc)."""
    source_doc_file_name = os.path.join(os.path.dirname(__file__), '20_ptv_321195.doc')
    protocol_generator = CommitteeMeetingProtocol.get_from_filename(source_doc_file_name)
    with protocol_generator as protocol:
        self.assertEqual([u"קארין אלהרר", u"דוד אמסלם", u"אוסאמה סעדי"],
                         protocol.find_attending_members([u"קארין אלהרר",
                                                          u"דוד אמסלם",
                                                          u"אוסאמה סעדי"]))
def test_docx_protocol_parts(self):
    """Check header/body splitting of the 20_ptv_502208.doc fixture into
    the first seven protocol parts."""
    source_doc_file_name = os.path.join(os.path.dirname(__file__), '20_ptv_502208.doc')
    protocol_generator = CommitteeMeetingProtocol.get_from_filename(
        source_doc_file_name)
    with protocol_generator as protocol:
        # part 0: preamble before the first header
        self.assertProtocolPartEquals(
            protocol.parts[0], '',
            u"""פרוטוקול של ישיבת ועדה הכנסת העשרים הכנסת 12 ועדת הכלכלה 27/06/2018 מושב רביעי פרוטוקול מס' 800 מישיבת ועדת הכלכלה יום רביעי, י"ד בתמוז התשע"ח (27 ביוני 2018), שעה 9:00""")
        self.assertProtocolPartEquals(
            protocol.parts[1], u"""סדר היום""",
            u"""הצעת חוק מועצת הצמחים (ייצור ושיווק) (תיקון מס' 10), התשע"ד-2014"""
        )
        # header-only parts have an empty body
        self.assertProtocolPartEquals(protocol.parts[2], u"""נכחו""", u"""""")
        self.assertProtocolPartEquals(
            protocol.parts[3], u"""חברי הוועדה:""",
            u"""איתן כבל – היו"ר יצחק וקנין עבד אל חכים חאג' יחיא""")
        self.assertProtocolPartEquals(protocol.parts[4], u"""חברי הכנסת""",
                                      u"""איתן ברושי שרן השכל""")
        self.assertProtocolPartEquals(protocol.parts[5], u"""נכחו:""", u"""""")
        self.assertProtocolPartEquals(
            protocol.parts[6], u"""מוזמנים:""",
            u"""צביקה כהן - סמנכ"ל בכיר למימון והשקעות, משרד החקלאות ופיתוח הכפר אורי צוק-בר - סמנכ"ל מחקר כלכלה ואסטרטגיה, משרד החקלאות ופיתוח הכפר אסף לוי - סמנכ"ל גורמי יצור, משרד החקלאות ופיתוח הכפר דפנה טיש עמרי איתן בן צבי עדי טל נוסבוים ליאורה עופרי – – – – עו"ד, משרד החקלאות ופיתוח הכפר עו"ד, מח' יעוץ וחקיקה, משרד המשפטים יועמ"ש, המשרד לביטחון פנים עו"ד, המשרד להגנת הסביבה צבי אלון - מנכ"ל, מועצת הצמחים אמיר שניידר ירון סולומון – – יועמ"ש, התאחדות האיכרים והחקלאים בישראל מנהל המחלקה להתיישבות, האיחוד החקלאי אריאל ארליך - ראש מחלקת ליטיגציה, פורום קהלת מיכל זליקוביץ - נציגה, פורום קהלת יעל שביט - שדלן/ית""")