def _create_new_object(self, dataservice_vote):
    """Create and return a ``Vote`` built from a dataservice vote record.

    The new vote gets ``importance`` 1, a title combining the item and
    session descriptions, and a Hebrew time string derived from the
    record's ``datetime``.

    :param dataservice_vote: record exposing ``id``, ``item_dscr``,
        ``sess_item_dscr``, ``datetime``, ``session_num`` and ``nbr_in_sess``.
    :return: the object returned by ``Vote.objects.create``.
    """
    vote_src_id = dataservice_vote.id
    title = u'{vote} - {sess}'.format(
        vote=dataservice_vote.item_dscr,
        sess=dataservice_vote.sess_item_dscr,
    )
    # src_url used to be filled with the bare vote id, which is not a URL.
    # Build the Knesset vote page address from the id instead, matching how
    # the other vote scrapers in this file populate the same field.
    src_url = 'http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' % vote_src_id
    return Vote.objects.create(
        src_id=vote_src_id,
        title=title,
        time_string=u'יום ' + hebrew_strftime(dataservice_vote.datetime),
        importance=1,
        time=dataservice_vote.datetime,
        meeting_number=dataservice_vote.session_num,
        vote_number=dataservice_vote.nbr_in_sess,
        src_url=src_url,
    )
def _create_new_object(self, dataservice_meeting):
    """Create a ``CommitteeMeeting`` from a dataservice meeting record.

    After the row is created its protocol is re-parsed using the ``mks``
    and ``mk_names`` held on this command instance.

    :param dataservice_meeting: record exposing ``committee_id``,
        ``datetime``, ``date``, ``agendum1``, ``id`` and ``link``.
    :return: the newly created meeting object.
    """
    # Field values are gathered first so the create call stays short; the
    # evaluation order matches the order the fields are listed in.
    meeting_fields = {
        'committee': get_committee(dataservice_meeting.committee_id),
        'date_string': hebrew_strftime(dataservice_meeting.datetime, u'%d/%m/%Y'),
        'date': dataservice_meeting.date,
        'topics': dataservice_meeting.agendum1,
        'datetime': dataservice_meeting.datetime,
        'knesset_id': dataservice_meeting.id,
        'src_url': dataservice_meeting.link,
    }
    new_meeting = CommitteeMeeting.objects.create(**meeting_fields)
    new_meeting.reparse_protocol(mks=self.mks, mk_names=self.mk_names)
    return new_meeting
class Command(BaseKnessetDataserviceCollectionCommand):
    """Command that scrapes votes data from the knesset (see ``help``).

    Votes are read through ``DataserviceVote`` and stored as ``Vote`` rows,
    together with one ``VoteAction`` per member vote and a ``Link`` pointing
    at the vote page on the Knesset site.
    """

    DATASERVICE_CLASS = DataserviceVote

    # model attribute name -> dataservice attribute name, or a callable that
    # receives the dataservice vote and returns the value
    DATASERVICE_MODEL_MAP = {
        'src_id': 'id',
        'title': lambda vote: u'{vote} - {sess}'.format(vote=vote.item_dscr, sess=vote.sess_item_dscr),
        'time_string': lambda vote: u'יום %s' % hebrew_strftime(vote.datetime),
        'importance': lambda vote: 1,
        'time': 'datetime',
        'meeting_number': "session_num",
        'vote_number': 'nbr_in_sess',
        'src_url': lambda vote: "http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s" % vote.id
    }

    VALIDATE_FIELDS_TO_AUTOFIX = ['title', 'src_url']

    option_list = BaseKnessetDataserviceCollectionCommand.option_list + (
        make_option(
            '--validate-votes-pages',
            dest='validatevotepages',
            help="validate votes between (and including) given page range\npages in this case are based on vote id ascending, so you'll have the same page number each time"
        ),
        make_option(
            '--validate-skip-to',
            dest='validateskipto',
            help="skip to the given vote id (for use with --validate-votes-pages)"
        ),
        make_option(
            '--create-vote-src-id',
            dest='createvotesrcid',
            help="create the given vote/s from the comma-separated src ids (assuming they don't already exist in DB)"
        ),
        make_option(
            '--validate-output-file',
            dest='validateoutputfile',
            help="where to write the validation results to (defaults to stdout)"
        ),
        make_option(
            '--validate-fix',
            dest='validatefix',
            action='store_true',
            help="try to fix some problems directly in DB which are safe to automatically fix"
        ),
    )

    help = "Scrape votes data from the knesset"

    def _update_or_create_vote(self, dataservice_vote, oknesset_vote=None):
        """Create a ``Vote`` (or refresh the given one) from a dataservice vote.

        :param dataservice_vote: source record for the vote.
        :param oknesset_vote: existing ``Vote`` to overwrite; when falsy a new
            row is created instead.
        :return: the created or updated vote, after its vote actions, vote
            properties, synced protocol lookup and Knesset link were handled.
        """
        # _get_dataservice_model_kwargs is not defined in this class;
        # presumably inherited from the base command - verify.
        vote_kwargs = self._get_dataservice_model_kwargs(dataservice_vote)
        if oknesset_vote:
            # plain loop: the assignments are side effects, not a list to keep
            for attr_name, value in vote_kwargs.items():
                setattr(oknesset_vote, attr_name, value)
            oknesset_vote.save()
        else:
            oknesset_vote = Vote.objects.create(**vote_kwargs)
        self._add_vote_actions(dataservice_vote, oknesset_vote)
        oknesset_vote.update_vote_properties()
        SyncdataCommand().find_synced_protocol(oknesset_vote)
        Link.objects.create(
            title=u'ההצבעה באתר הכנסת',
            url='http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' % oknesset_vote.src_id,
            content_type=ContentType.objects.get_for_model(oknesset_vote),
            object_pk=str(oknesset_vote.id))
        # NOTE: a disabled snippet used to follow here; it created a second
        # Link (the bill document on the Knesset site) from v.full_text_url.
        return oknesset_vote

    def _add_vote_actions(self, dataservice_vote, oknesset_vote):
        """Ensure a ``VoteAction`` exists for every member vote of the vote.

        Member votes are read from ``HtmlVote.get_from_vote_id``.

        :raises VoteScraperException: when a member id has no ``Member`` row.
        """
        member_votes = HtmlVote.get_from_vote_id(dataservice_vote.id).member_votes
        for member_id, vote_result_code in member_votes:
            # one query instead of exists() followed by first()
            member = Member.objects.filter(pk=member_id).first()
            if member is None:
                raise VoteScraperException(
                    'vote %s: could not find member id %s' % (dataservice_vote.id, member_id))
            vote_type = self._resolve_vote_type(vote_result_code)
            # get_or_create already saves a newly created row, so no extra
            # save() call is needed afterwards.
            VoteAction.objects.get_or_create(
                vote=oknesset_vote,
                member=member,
                defaults={
                    'type': vote_type,
                    'party': member.current_party
                })

    def _has_existing_object(self, dataservice_vote):
        """Return True when a ``Vote`` with this dataservice id as ``src_id`` exists."""
        return Vote.objects.filter(src_id=dataservice_vote.id).exists()

    def _create_new_object(self, dataservice_vote):
        """Create and return a new ``Vote`` for the dataservice vote."""
        return self._update_or_create_vote(dataservice_vote)

    @classmethod
    def _resolve_vote_type(cls, vote_result_code):
        """Translate a scraped vote result code into the stored vote type.

        :raises KeyError: for a result code that is not one of the four known codes.
        """
        return {
            'voted for': u'for',
            'voted against': u'against',
            'abstain': u'abstain',
            'did not vote': u'no-vote',
        }[vote_result_code]

    def recreate_objects(self, vote_ids):
        """Rebuild the given votes from the dataservice.

        For each id the existing vote actions and links of the vote are
        deleted, then the vote is refreshed via ``_update_or_create_vote``.

        :param vote_ids: iterable of ``Vote`` primary keys (anything ``int()`` accepts).
        :return: list of the refreshed votes, in input order.
        """
        recreated_votes = []
        for vote_id in vote_ids:
            oknesset_vote = Vote.objects.get(id=int(vote_id))
            dataservice_vote = self.DATASERVICE_CLASS.get(oknesset_vote.src_id)
            VoteAction.objects.filter(vote=oknesset_vote).delete()
            # object_pk is written as str(id) when the link is created, so
            # the same representation is used for the lookup.
            Link.objects.filter(
                content_type=ContentType.objects.get_for_model(oknesset_vote),
                object_pk=str(oknesset_vote.id)).delete()
            recreated_votes.append(
                self._update_or_create_vote(dataservice_vote, oknesset_vote))
        return recreated_votes

    def _get_validate_header_row(self):
        """Return the column titles of the validation report."""
        return ['knesset vote id', 'open knesset vote id', 'error']

    def _get_validate_error_row(self, dataservice_object, oknesset_object, error):
        """Return one validation report row for the given pair of objects."""
        return [dataservice_object.id, oknesset_object.id, error]

    def _get_validate_first_object_title(self, dataservice_object):
        """Return a short title for the object, based on its datetime."""
        return 'date: %s' % dataservice_object.datetime

    def _validate_attr_actual_expected(self, attr_name, actual_value, expected_value):
        """Normalise known harmless differences, then defer to the base check."""
        if attr_name == 'time_string':
            # remove some unprintable artifacts which for some reason are in
            # the old scraper's votes
            actual_value = actual_value.replace(u"\u200f", "").replace(u"\xa0", " ")
        elif attr_name == 'title' and actual_value != expected_value:
            # try a slightly different format which exists in DB in some cases
            actual_value = actual_value.replace(u" - הצעת חוק", u" - חוק")
        return super(Command, self)._validate_attr_actual_expected(
            attr_name, actual_value, expected_value)
class Command(BaseKnessetDataserviceCollectionCommand):
    """Command that scrapes votes data from the knesset (see ``help``).

    Besides creating and recreating ``Vote`` rows from ``DataserviceVote``
    records, it can validate stored votes against the dataservice and
    optionally fix some mismatches.
    """

    DATASERVICE_CLASS = DataserviceVote

    option_list = BaseKnessetDataserviceCollectionCommand.option_list + (
        make_option(
            '--validate-votes-pages',
            dest='validatevotepages',
            help="validate votes between (and including) given page range\npages in this case are based on vote id ascending, so you'll have the same page number each time"
        ),
        make_option(
            '--validate-skip-to',
            dest='validateskipto',
            help="skip to the given vote id (for use with --validate-votes-pages)"
        ),
        make_option(
            '--create-vote-src-id',
            dest='createvotesrcid',
            help="create the given vote/s from the comma-separated src ids (assuming they don't already exist in DB)"
        ),
        make_option(
            '--validate-output-file',
            dest='validateoutputfile',
            help="where to write the validation results to (defaults to stdout)"
        ),
        make_option(
            '--validate-fix',
            dest='validatefix',
            action='store_true',
            help="try to fix some problems directly in DB which are safe to automatically fix"
        ),
    )

    help = "Scrape votes data from the knesset"

    # model attribute name -> dataservice attribute name, or a callable that
    # receives the dataservice vote and returns the value
    dataservice_model_map = {
        'src_id': 'id',
        'title': lambda vote: u'{vote} - {sess}'.format(vote=vote.item_dscr, sess=vote.sess_item_dscr),
        'time_string': lambda vote: u'יום %s' % hebrew_strftime(vote.datetime),
        'importance': lambda vote: 1,
        'time': 'datetime',
        'meeting_number': "session_num",
        'vote_number': 'nbr_in_sess',
        'src_url': lambda vote: "http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s" % vote.id
    }

    def _get_dataservice_model_kwargs(self, dataservice_vote):
        """Return the ``Vote`` field values derived from a dataservice vote.

        Each entry of ``dataservice_model_map`` is either called with the
        dataservice vote or used as an attribute name on it.
        """
        # Dispatch on callable() rather than isinstance(..., str) so that an
        # attribute name given as a unicode string is still read with getattr.
        return {
            model_attr: source(dataservice_vote) if callable(source) else getattr(dataservice_vote, source)
            for model_attr, source in self.dataservice_model_map.items()
        }

    def _update_or_create_vote(self, dataservice_vote, oknesset_vote=None):
        """Create a ``Vote`` (or refresh the given one) from a dataservice vote.

        :param dataservice_vote: source record for the vote.
        :param oknesset_vote: existing ``Vote`` to overwrite; when falsy a new
            row is created instead.
        :return: the created or updated vote, after its vote actions, vote
            properties, synced protocol lookup and Knesset link were handled.
        """
        vote_kwargs = self._get_dataservice_model_kwargs(dataservice_vote)
        if oknesset_vote:
            # plain loop: the assignments are side effects, not a list to keep
            for attr_name, value in vote_kwargs.items():
                setattr(oknesset_vote, attr_name, value)
            oknesset_vote.save()
        else:
            oknesset_vote = Vote.objects.create(**vote_kwargs)
        self._add_vote_actions(dataservice_vote, oknesset_vote)
        oknesset_vote.update_vote_properties()
        SyncdataCommand().find_synced_protocol(oknesset_vote)
        Link.objects.create(
            title=u'ההצבעה באתר הכנסת',
            url='http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' % oknesset_vote.src_id,
            content_type=ContentType.objects.get_for_model(oknesset_vote),
            object_pk=str(oknesset_vote.id))
        # NOTE: a disabled snippet used to follow here; it created a second
        # Link (the bill document on the Knesset site) from v.full_text_url.
        return oknesset_vote

    def _add_vote_actions(self, dataservice_vote, oknesset_vote):
        """Ensure a ``VoteAction`` exists for every member vote of the vote.

        Member votes are read from ``HtmlVote.get_from_vote_id``.

        :raises VoteScraperException: when a member id has no ``Member`` row.
        """
        member_votes = HtmlVote.get_from_vote_id(dataservice_vote.id).member_votes
        for member_id, vote_result_code in member_votes:
            # one query instead of exists() followed by first()
            member = Member.objects.filter(pk=member_id).first()
            if member is None:
                raise VoteScraperException(
                    'vote %s: could not find member id %s' % (dataservice_vote.id, member_id))
            vote_type = self._resolve_vote_type(vote_result_code)
            # get_or_create already saves a newly created row, so no extra
            # save() call is needed afterwards.
            VoteAction.objects.get_or_create(
                vote=oknesset_vote,
                member=member,
                defaults={
                    'type': vote_type,
                    'party': member.current_party
                })

    def _has_existing_object(self, dataservice_vote):
        """Return True when a ``Vote`` with this dataservice id as ``src_id`` exists."""
        return Vote.objects.filter(src_id=dataservice_vote.id).exists()

    def _create_new_object(self, dataservice_vote):
        """Create and return a new ``Vote`` for the dataservice vote."""
        return self._update_or_create_vote(dataservice_vote)

    @classmethod
    def _resolve_vote_type(cls, vote_result_code):
        """Translate a scraped vote result code into the stored vote type.

        :raises KeyError: for a result code that is not one of the four known codes.
        """
        return {
            'voted for': u'for',
            'voted against': u'against',
            'abstain': u'abstain',
            'did not vote': u'no-vote',
        }[vote_result_code]

    def recreate_objects(self, vote_ids):
        """Rebuild the given votes from the dataservice.

        For each id the existing vote actions and links of the vote are
        deleted, then the vote is refreshed via ``_update_or_create_vote``.

        :param vote_ids: iterable of ``Vote`` primary keys (anything ``int()`` accepts).
        :return: list of the refreshed votes, in input order.
        """
        recreated_votes = []
        for vote_id in vote_ids:
            oknesset_vote = Vote.objects.get(id=int(vote_id))
            dataservice_vote = self.DATASERVICE_CLASS.get(oknesset_vote.src_id)
            VoteAction.objects.filter(vote=oknesset_vote).delete()
            # object_pk is written as str(id) when the link is created, so
            # the same representation is used for the lookup.
            Link.objects.filter(
                content_type=ContentType.objects.get_for_model(oknesset_vote),
                object_pk=str(oknesset_vote.id)).delete()
            recreated_votes.append(
                self._update_or_create_vote(dataservice_vote, oknesset_vote))
        return recreated_votes

    def _report_vote_validation_error(self, csv_writer, dataservice_vote, oknesset_vote_id, error):
        """Log a validation error as a warning and append it to the CSV report."""
        self._log_warn(error)
        csv_writer.writerow(
            [dataservice_vote.id, oknesset_vote_id, error.encode('utf-8')])

    def _validate_vote(self, dataservice_vote, csv_writer, fix=False):
        """Compare the stored vote with the dataservice vote and report mismatches.

        Checks that exactly one ``Vote`` has the dataservice id as ``src_id``,
        that every mapped attribute matches, and that the for / against /
        abstain action counts equal the dataservice totals.

        :param dataservice_vote: the dataservice record to validate against.
        :param csv_writer: writer receiving one row per detected problem.
        :param fix: when true, a missing vote is created and mismatching
            ``title`` / ``src_url`` values are overwritten instead of reported.
        """
        # check the basic metadata
        qs = Vote.objects.filter(src_id=dataservice_vote.id)
        matching_votes = qs.count()
        if matching_votes != 1:
            # Only create the vote when it is truly missing. With several
            # matching rows, creating one more would add another duplicate,
            # so that case is always reported.
            if fix and matching_votes == 0:
                self._log_info(
                    'could not find corresponding vote in DB, creating it now')
                self._create_new_object(dataservice_vote)
            else:
                error = 'could not find corresponding vote in DB (qs.count=%s)' % (
                    matching_votes, )
                self._report_vote_validation_error(
                    csv_writer, dataservice_vote, '', error)
            return

        oknesset_vote = qs.first()
        expected_values = self._get_dataservice_model_kwargs(dataservice_vote)
        for attr_name, expected_value in expected_values.items():
            actual_value = getattr(oknesset_vote, attr_name)
            if attr_name == 'time_string':
                # remove some unprintable artifacts which for some reason are
                # in the old scraper's votes
                actual_value = actual_value.replace(u"\u200f", "").replace(
                    u"\xa0", " ")
            if attr_name == 'title' and actual_value != expected_value:
                # try a slightly different format which exists in DB in some cases
                actual_value = actual_value.replace(
                    u" - הצעת חוק", u" - חוק")
            if actual_value == expected_value:
                continue
            if fix and attr_name in ['title', 'src_url']:
                self._log_info('fixing mismatch in %s attribute' % (attr_name, ))
                setattr(oknesset_vote, attr_name, expected_value)
                oknesset_vote.save()
            else:
                error = 'value mismatch for %s (expected="%s", actual="%s")' % (
                    attr_name, expected_value, actual_value)
                self._report_vote_validation_error(
                    csv_writer, dataservice_vote, oknesset_vote.id, error)

        # validate the vote counts
        for type_title in ('for', 'against', 'abstain'):
            oknesset_count = oknesset_vote.actions.filter(type=type_title).count()
            dataservice_count = int(getattr(dataservice_vote, 'total_' + type_title))
            if oknesset_count != dataservice_count:
                error = 'mismatch in %s count (expected=%s, actual=%s)' % (
                    type_title, dataservice_count, oknesset_count)
                self._report_vote_validation_error(
                    csv_writer, dataservice_vote, oknesset_vote.id, error)
class Command(BaseKnessetDataserviceCollectionCommand):
    """Command that scrapes votes data from the knesset (see ``help``).

    Votes are read through ``DataserviceVote`` and stored as ``Vote`` rows,
    together with one ``VoteAction`` per member vote and a ``Link`` pointing
    at the vote page on the Knesset site.
    """

    DATASERVICE_CLASS = DataserviceVote

    # model attribute name -> dataservice attribute name, or a callable that
    # receives the dataservice vote and returns the value
    DATASERVICE_MODEL_MAP = {
        'src_id': 'id',
        'title': lambda vote: u'{vote} - {sess}'.format(vote=vote.item_dscr, sess=vote.sess_item_dscr),
        'time_string': lambda vote: u'יום %s' % hebrew_strftime(vote.datetime),
        'importance': lambda vote: 1,
        'time': 'datetime',
        'meeting_number': "session_num",
        'vote_number': 'nbr_in_sess',
        'src_url': lambda vote: "http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s" % vote.id
    }

    VALIDATE_FIELDS_TO_AUTOFIX = ['title', 'src_url']

    help = "Scrape votes data from the knesset"

    def _update_or_create_vote(self, dataservice_vote, oknesset_vote=None):
        """Create a ``Vote`` (or refresh the given one) from a dataservice vote.

        :param dataservice_vote: source record for the vote.
        :param oknesset_vote: existing ``Vote`` to overwrite; when falsy a new
            row is created instead.
        :return: the created or updated vote, after its vote actions, vote
            properties, synced protocol lookup and Knesset link were handled.
        """
        # _get_dataservice_model_kwargs is not defined in this class;
        # presumably inherited from the base command - verify.
        vote_kwargs = self._get_dataservice_model_kwargs(dataservice_vote)
        if oknesset_vote:
            # plain loop: the assignments are side effects, not a list to keep
            for attr_name, value in vote_kwargs.items():
                setattr(oknesset_vote, attr_name, value)
            oknesset_vote.save()
        else:
            oknesset_vote = Vote.objects.create(**vote_kwargs)
        self._add_vote_actions(dataservice_vote, oknesset_vote)
        oknesset_vote.update_vote_properties()
        SyncdataCommand().find_synced_protocol(oknesset_vote)
        Link.objects.create(
            title=u'ההצבעה באתר הכנסת',
            url='http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' % oknesset_vote.src_id,
            content_type=ContentType.objects.get_for_model(oknesset_vote),
            object_pk=str(oknesset_vote.id))
        # NOTE: a disabled snippet used to follow here; it created a second
        # Link (the bill document on the Knesset site) from v.full_text_url.
        return oknesset_vote

    def _add_vote_actions(self, dataservice_vote, oknesset_vote):
        """Ensure a ``VoteAction`` exists for every member vote of the vote.

        Member votes are read from ``HtmlVote.get_from_vote_id``.

        :raises VoteScraperException: when a member id has no ``Member`` row.
        """
        member_votes = HtmlVote.get_from_vote_id(dataservice_vote.id).member_votes
        for member_id, vote_result_code in member_votes:
            # one query instead of exists() followed by first()
            member = Member.objects.filter(pk=member_id).first()
            if member is None:
                raise VoteScraperException(
                    'vote %s: could not find member id %s' % (dataservice_vote.id, member_id))
            vote_type = self._resolve_vote_type(vote_result_code)
            # get_or_create already saves a newly created row, so no extra
            # save() call is needed afterwards.
            VoteAction.objects.get_or_create(
                vote=oknesset_vote,
                member=member,
                defaults={
                    'type': vote_type,
                    'party': member.current_party
                })

    def _has_existing_object(self, dataservice_vote):
        """Return True when a ``Vote`` with this dataservice id as ``src_id`` exists."""
        return Vote.objects.filter(src_id=dataservice_vote.id).exists()

    def _get_existing_object(self, dataservice_object):
        """Return the ``Vote`` whose ``src_id`` equals the dataservice object's id."""
        return Vote.objects.get(src_id=dataservice_object.id)

    def _create_new_object(self, dataservice_vote):
        """Create and return a new ``Vote`` for the dataservice vote."""
        return self._update_or_create_vote(dataservice_vote)

    @classmethod
    def _resolve_vote_type(cls, vote_result_code):
        """Translate a scraped vote result code into the stored vote type.

        :raises KeyError: for a result code that is not one of the four known codes.
        """
        return {
            'voted for': u'for',
            'voted against': u'against',
            'abstain': u'abstain',
            'did not vote': u'no-vote',
        }[vote_result_code]

    def recreate_objects(self, vote_ids):
        """Rebuild the given votes from the dataservice.

        For each id the existing vote actions and links of the vote are
        deleted, then the vote is refreshed via ``_update_or_create_vote``.

        :param vote_ids: iterable of ``Vote`` primary keys (anything ``int()`` accepts).
        :return: list of the refreshed votes, in input order.
        """
        recreated_votes = []
        for vote_id in vote_ids:
            oknesset_vote = Vote.objects.get(id=int(vote_id))
            dataservice_vote = self.DATASERVICE_CLASS.get(oknesset_vote.src_id)
            VoteAction.objects.filter(vote=oknesset_vote).delete()
            # object_pk is written as str(id) when the link is created, so
            # the same representation is used for the lookup.
            Link.objects.filter(
                content_type=ContentType.objects.get_for_model(oknesset_vote),
                object_pk=str(oknesset_vote.id)).delete()
            recreated_votes.append(
                self._update_or_create_vote(dataservice_vote, oknesset_vote))
        return recreated_votes

    def _get_validate_first_object_title(self, dataservice_object):
        """Return a short title for the object, based on its datetime."""
        return 'date: %s' % dataservice_object.datetime

    def _validate_attr_actual_expected(self, attr_name, actual_value, expected_value):
        """Normalise known harmless differences, then defer to the base check."""
        if attr_name == 'time_string':
            # remove some unprintable artifacts which for some reason are in
            # the old scraper's votes
            actual_value = actual_value.replace(u"\u200f", "").replace(u"\xa0", " ")
        elif attr_name == 'title' and actual_value != expected_value:
            # try a slightly different format which exists in DB in some cases
            actual_value = actual_value.replace(u" - הצעת חוק", u" - חוק")
        return super(Command, self)._validate_attr_actual_expected(
            attr_name, actual_value, expected_value)

    def _validate_dataservice_oknesset_object(self, dataservice_object, oknesset_object, writer, fix):
        """Refresh the stored object from knesset data; always returns None.

        ``dataservice_object``, ``writer`` and ``fix`` are accepted but not
        used by this implementation.
        """
        oknesset_object.update_from_knesset_data()
        return None