def get_meeting(self, page):
    """Bundle the items and the info collected from one meeting page.

    :param page: page object passed unchanged to ``get_meeting_items``
        and ``get_meeting_info``.
    :returns: ``{'items': ..., 'info': ...}``.  The info mapping is
        extended in place with ``id``, ``guid``, ``link`` and ``dept_id``.
    """
    meeting_items = self.get_meeting_items(page)
    meeting_info = self.get_meeting_info(page)

    # FIXME: THIS DEPENDS ON USING collect()
    # (self.url is read below; presumably collect() is what sets it.)
    meeting_id, meeting_guid = legistar_id_guid(self.url)
    meeting_info['id'] = meeting_id
    meeting_info['guid'] = meeting_guid
    meeting_info['link'] = self.url

    # Only the id half of the department link is kept.
    dept_id, _dept_guid = legistar_id_guid(meeting_info['dept'])
    meeting_info['dept_id'] = dept_id

    return {'items': meeting_items, 'info': meeting_info}
def get_people_ids(self):
    """Return a dict mapping each person's id to their displayed name.

    The anchors come from ``self._get_people_anchors()``; the id is the
    first value ``legistar_id_guid`` yields for the anchor's ``href`` and
    the name is the anchor text with surrounding whitespace removed.
    A later anchor with the same id overwrites an earlier one.
    """
    people = {}
    for link_tag in self._get_people_anchors():
        display_name = link_tag.text.strip()
        person_id, _guid = legistar_id_guid(link_tag['href'])
        people[person_id] = display_name
    return people
def add_collected_action(self, item_id, action):
    """Store one collected action, link it to its item and record votes.

    :param item_id: id of the item the action belongs to; used to build
        the ``ItemAction`` row.
    :param action: dict of collected action data.  Keys read here:
        ``ftype``, ``roll_call``, ``id``, ``guid``, ``votes`` and, for
        non roll-call actions, ``action``, ``action_text``, ``result``,
        ``agenda_note``, ``minutes_note``, ``mover``, ``seconder`` and
        ``file_id``.  NOTE: the dict is modified -- ``filetype`` is set
        to the value of ``ftype``.
    :returns: None.  Objects are added to ``self.session``; nothing is
        committed here.
    """
    record = Action()
    action['filetype'] = action['ftype']
    is_roll_call = action['roll_call']

    copied_fields = ['id', 'guid']
    if not is_roll_call:
        copied_fields.extend(['action', 'action_text', 'result', 'filetype'])
    for field in copied_fields:
        setattr(record, field, action[field])

    if is_roll_call:
        record.action = 'Roll Call'
    else:
        for field in ('agenda_note', 'minutes_note'):
            # Notes that are blank after stripping are stored as None.
            setattr(record, field, action[field].strip() or None)
        for role in ('mover', 'seconder'):
            person_name, person_link = action[role]
            if person_name:
                person_id, _guid = legistar_id_guid(person_link)
                setattr(record, '%s_id' % role, person_id)
        file_id, _file_link = action['file_id']
        record.file_id = file_id

    self.session.add(record)
    # flush here so the action can be referred by
    # foreign keys in 'item_action' and 'action_vote'
    self.session.flush()

    # make item_action object
    self.session.add(ItemAction(item_id, record.id))

    # handle votes: all votes go into a single ActionVote row, in
    # columns ward1/ward1_person_id, ward2/..., numbered by position.
    ballot = {'action_id': record.id}
    for ward, (_name, voter_link, vote) in enumerate(action['votes'], 1):
        voter_id, _guid = legistar_id_guid(voter_link)
        ballot['ward{}_person_id'.format(ward)] = voter_id
        ballot['ward{}'.format(ward)] = vote
    vote_row = ActionVote()
    for column, value in ballot.items():
        setattr(vote_row, column, value)
    self.session.add(vote_row)
def _get_depts(self, page):
    """List the departments linked from ``page``.

    Every ``<a>`` whose ``id`` attribute matches ``.+_hypBody$`` is taken
    as a department link.

    :returns: list of ``(id, guid, name)`` tuples in document order,
        where id and guid come from ``legistar_id_guid(href)`` and name
        is the stripped anchor text.
    """
    found = []
    for link_tag in page.find_all('a', id=re.compile('.+_hypBody$')):
        dept_id, dept_guid = legistar_id_guid(link_tag['href'])
        found.append((dept_id, dept_guid, link_tag.text.strip()))
    return found
def _add_meeting_from_rss(self, entry):
    """Create a ``Meeting`` from an RSS entry and return the merged object.

    The meeting is built and added to ``self.session`` inside a
    ``transaction.manager`` block; afterwards the instance is merged
    back into the session and that merged instance is returned.

    :param entry: RSS entry; ``entry.title`` and ``entry.link`` are read
        and the entry itself is stored on the ``rss`` attribute.
    """
    with transaction.manager:
        meeting = Meeting()
        meeting.title = entry.title
        meeting.link = entry.link
        meeting.rss = entry
        meeting_id, meeting_guid = legistar_id_guid(entry.link)
        meeting.id = meeting_id
        meeting.guid = meeting_guid
        meeting.updated = datetime.now()
        self.session.add(meeting)
    return self.session.merge(meeting)
def add_meeting_from_rss(self, entry):
    """Persist a ``Meeting`` built from an RSS entry.

    A transaction is begun explicitly, the meeting is added and flushed,
    then the transaction is committed.  Any exception raised in between
    propagates to the caller; no abort is issued here.

    :param entry: RSS entry; ``entry.title`` and ``entry.link`` are read
        and the entry itself is stored on the ``rss`` attribute.
    :returns: None
    """
    transaction.begin()

    record = Meeting()
    record.title = entry.title
    record.link = entry.link
    record.rss = entry
    record_id, record_guid = legistar_id_guid(entry.link)
    record.id = record_id
    record.guid = record_guid
    record.updated = datetime.now()

    self.session.add(record)
    self.session.flush()
    transaction.commit()
def add(self, item_id, actiondata):
    """Store an action, its item link and its votes; return the action.

    All database work happens inside a ``transaction.manager`` block.
    Afterwards the ``Action`` is merged back into ``self.session`` and
    the merged instance is returned.

    :param item_id: id of the owning item, used for the ``ItemAction``.
    :param actiondata: dict of collected action data.  Keys read here:
        ``ftype``, ``roll_call``, ``id``, ``guid``, ``votes`` and, for
        non roll-call actions, ``action``, ``action_text``, ``result``,
        ``agenda_note``, ``minutes_note``, ``mover``, ``seconder`` and
        ``file_id``.  NOTE: the dict is modified -- ``filetype`` is set
        to the value of ``ftype``.
    """
    with transaction.manager:
        record = Action()
        actiondata['filetype'] = actiondata['ftype']
        is_roll_call = actiondata['roll_call']

        copied_fields = ['id', 'guid']
        if not is_roll_call:
            copied_fields.extend(
                ['action', 'action_text', 'result', 'filetype'])
        for field in copied_fields:
            setattr(record, field, actiondata[field])

        if is_roll_call:
            record.action = 'Roll Call'
        else:
            for field in ('agenda_note', 'minutes_note'):
                # Notes that are blank after stripping become None.
                setattr(record, field, actiondata[field].strip() or None)
            for role in ('mover', 'seconder'):
                person_name, person_link = actiondata[role]
                if person_name:
                    person_id, _guid = legistar_id_guid(person_link)
                    setattr(record, '%s_id' % role, person_id)
            file_id, _file_link = actiondata['file_id']
            record.file_id = file_id

        self.session.add(record)
        # flush here so the action can be referred by
        # foreign keys in 'item_action' and 'action_vote'
        self.session.flush()

        # make item_action object
        self.session.add(ItemAction(item_id, record.id))

        # handle votes: one ActionVote per (name, link, vote) triple
        for _name, voter_link, vote in actiondata['votes']:
            voter_id, _guid = legistar_id_guid(voter_link)
            self.session.add(ActionVote(record.id, voter_id, vote))
    return self.session.merge(record)
def remote_legislation_item(self, link):
    """Fetch a legislation item and normalise a few of its fields.

    The item returned by ``self._remote_legislation_item(link)`` is
    updated in place:

    * ``id`` / ``guid`` are set from ``legistar_id_guid(link)``;
    * ``introduced``, ``on_agenda`` and ``passed`` are passed through
      ``make_true_date`` when present and truthy, otherwise untouched;
    * ``acted_on`` is True when ``action_details`` is non-empty.

    :returns: the same item mapping.
    """
    item = self._remote_legislation_item(link)

    # add id, guid to item
    item_id, item_guid = legistar_id_guid(link)
    item['id'] = item_id
    item['guid'] = item_guid

    for date_key in ('introduced', 'on_agenda', 'passed'):
        if date_key not in item or not item[date_key]:
            continue
        item[date_key] = make_true_date(item[date_key])

    item['acted_on'] = bool(len(item['action_details']))
    return item
def add_rss_meetings(self, url, rss=None):
    """Add every meeting in an RSS feed that is not yet in the database.

    :param url: feed location, used only when ``rss`` is not supplied
        (the feed is then obtained from ``self.manager.get_rss(url)``).
    :param rss: optional already-parsed feed exposing ``entries``.
    :returns: None

    A meeting whose id is already present is reported and skipped.  An
    ``IntegrityError`` raised while adding or committing a new meeting
    is answered with a session rollback, after which processing
    continues with the next entry.
    """
    feed = self.manager.get_rss(url) if rss is None else rss
    for entry in feed.entries:
        meeting_id, _guid = legistar_id_guid(entry.link)
        if self.session.query(Meeting).get(meeting_id) is not None:
            print("Meeting {} already present.".format(meeting_id))
            continue
        print("adding meeting {} from rss".format(meeting_id))
        try:
            self.manager.add_meeting_from_rss(entry)
            self.session.commit()
        except IntegrityError:
            self.session.rollback()
def make_cache_object(self, type, link=None):
    """Build a ``MainCache`` row from a pickle file already on disk.

    :param type: kind of object; for ``'meeting'``, ``'item'`` and
        ``'action'`` the id is derived from ``link``, for anything else
        the id passed to the name helpers is None.
    :param link: link handed to ``legistar_id_guid`` for the three
        types above.
    :returns: a new ``MainCache`` with ``name``, ``retrieved``,
        ``updated`` and ``content`` set.  It is not added to a session
        here.
    :raises RuntimeError: when the expected pickle file does not exist.
    """
    from hubby.database import MainCache

    object_id = None
    if type in ['meeting', 'item', 'action']:
        object_id, _guid = legistar_id_guid(link)
    filename = self._filename(type, object_id)
    dbname = self._dbname(type, object_id)

    if not os.path.isfile(filename):
        raise RuntimeError("No file present %s" % filename)

    # Read inside a ``with`` block so the file handle is always closed.
    # NOTE(review): unpickling runs code embedded in the file; this is
    # only safe if the cache files are produced by this project.
    with open(filename, 'rb') as infile:
        content = Pickle.load(infile, encoding='utf-8')

    now = datetime.now()
    mc = MainCache()
    mc.name = dbname
    mc.retrieved = now
    mc.updated = now
    mc.content = content
    return mc
def _get_person(self, page):
    """Scrape one person's fields from ``page``.

    For every key in ``DATA_IDENTIFIERS`` the page is searched for a tag
    whose ``id`` ends with that key's marker, trying ``span``, then
    ``a``, then ``img`` and stopping at the first tag type that matches.

    * ``photo_link`` takes the ``src`` of the first match, or None when
      nothing matched at all.
    * every other key takes the stripped text of the first match; a key
      with no match raises.

    :returns: dict with one entry per identifier key plus ``id`` and
        ``guid`` from ``legistar_id_guid(self.url)``.
    :raises RuntimeError: when a key other than ``photo_link`` has no
        matching tag.
    """
    markers = DATA_IDENTIFIERS
    fields = list(markers.keys())
    person = dict.fromkeys(fields)

    for field in fields:
        pattern = re.compile('.+%s$' % markers[field])

        # Fall through the candidate tag types until one yields matches.
        for tag_name in ('span', 'a', 'img'):
            found = page.find_all(tag_name, id=pattern)
            if found:
                break

        is_photo = field == 'photo_link'
        if not found and not is_photo:
            raise RuntimeError("no tags found for %s" % field)
        if len(found) > 1:
            print("len(%s) == %d" % (field, len(found)))

        if is_photo:
            # A missing photo is tolerated and recorded as None.
            person[field] = found[0]['src'] if found else None
            continue

        person[field] = found[0].text.strip()
        # 'website' is stored like any other text field, just not echoed.
        if field != 'website':
            print(field, person[field])

    person['id'], person['guid'] = legistar_id_guid(self.url)
    return person
def add(self, itemdata):
    """Store an item with its attachments and return the merged item.

    Inside a ``transaction.manager`` block every key of ``itemdata``
    except ``attachments`` is copied onto a new ``Item``; each
    ``(name, link)`` pair in ``itemdata['attachments']`` (when that
    value is not None) becomes an ``Attachment`` pointing at the item.

    :param itemdata: dict of item fields; must contain ``attachments``.
    :returns: the ``Item`` after ``self.session.merge``.
    :raises RuntimeError: when an attachment id already exists.
    """
    with transaction.manager:
        record = Item()
        for field in itemdata:
            if field != 'attachments':
                setattr(record, field, itemdata[field])
        self.session.add(record)

        attachments = itemdata['attachments']
        if attachments is not None:
            for title, href in attachments:
                attachment_id, attachment_guid = legistar_id_guid(href)
                existing = self.session.query(Attachment).get(attachment_id)
                if existing is not None:
                    raise RuntimeError(
                        "Duplicate attachment %d" % attachment_id)
                attachment = Attachment()
                attachment.id = attachment_id
                attachment.guid = attachment_guid
                attachment.name = title
                attachment.link = href
                attachment.item_id = record.id
                self.session.add(attachment)
    return self.session.merge(record)
def collect(self, type, link=None):
    """Return collected data for ``type``/``link``, cached as a pickle.

    When the cache file named by ``self._filename(type, id)`` is missing,
    the matching collector is run and its ``result`` and ``content`` are
    pickled to that file.  The file is then (re)loaded and its
    ``'result'`` entry returned.

    :param type: kind of object; for ``'meeting'``, ``'item'`` and
        ``'action'`` the id is derived from ``link``, otherwise it is
        None.
    :param link: optional link.  When given and not starting with
        ``'http'`` it is prefixed with the collector's ``url_prefix``
        before being set on the collector.
    :returns: the ``'result'`` value stored in the pickle.
    """
    object_id = None
    if type in ['meeting', 'item', 'action']:
        object_id, _guid = legistar_id_guid(link)
    filename = self._filename(type, object_id)

    if not os.path.isfile(filename):
        print("Retrieving %s from legistar..." % filename)
        collector = self._collector(type)
        if link is not None:
            print('link is', link, type)
            if not link.startswith('http'):
                link = collector.url_prefix + link
            print("Retrieving", link)
            collector.set_url(link)
        collector.collect()
        data = dict(result=collector.result, content=collector.content)
        # Close (and therefore flush) the file before it is read back
        # below instead of relying on garbage collection to do it.
        with open(filename, 'wb') as outfile:
            Pickle.dump(data, outfile)

    # NOTE(review): unpickling runs code embedded in the file; this is
    # only safe if the cache files are produced by this project.
    try:
        with open(filename, 'rb') as infile:
            data = Pickle.load(infile)
    except UnicodeDecodeError:
        # Retry with byte strings for pickles the default text decoding
        # cannot handle.
        with open(filename, 'rb') as infile:
            data = Pickle.load(infile, encoding='bytes')
    return data['result']
def collect(self):
    """Retrieve ``self.url`` and store the parsed item on the collector.

    On return ``self.item`` and ``self.result`` refer to the same dict.

    * If the page content contains ``b'Invalid parameters!'`` the result
      is a stub with an empty ``action_details`` list and ``bad_url``
      set to the requested URL, and nothing else is parsed.
    * Otherwise the item comes from ``self._get_item(self.soup)``;
      ``passed``, ``introduced`` and ``on_agenda`` are converted with
      ``make_true_date`` (None when empty or absent), ``acted_on``
      reflects whether ``action_details`` is non-empty, and ``id`` /
      ``guid`` are taken from ``legistar_id_guid(self.url)``.

    :returns: None
    """
    self.retrieve_page(self.url)

    if b'Invalid parameters!' in self.content:
        # Record the URL that was requested.  There is no local ``url``
        # in this method, so it has to come from ``self.url``.
        self.item = {'action_details': [], 'bad_url': self.url}
        self.result = self.item
        print("Invalid parameters found", self.result)
        return

    self.item = self._get_item(self.soup)
    for key in ['passed', 'introduced', 'on_agenda']:
        # An absent key is treated like an empty value, so the
        # conversion never indexes a key that is not there.
        value = self.item[key] if key in self.item else None
        self.item[key] = make_true_date(value) if value else None

    self.item['acted_on'] = len(self.item['action_details']) > 0
    self.item['id'], self.item['guid'] = legistar_id_guid(self.url)
    self.result = self.item
def _merge_pickled_meeting_items(self, meeting_id, collected):
    """Create or update the ``MeetingItem`` rows of one meeting.

    :param meeting_id: id of the meeting the items belong to.
    :param collected: collected meeting data; ``collected['items']`` is
        iterated and each entry's ``item_page``, ``agenda_num`` and
        ``version`` are read.
    :returns: None

    The whole merge runs between an explicit ``transaction.begin()``
    and ``transaction.commit()``.  Each row's ``item_order`` is the
    1-based position of the entry in ``collected['items']``.
    """
    transaction.begin()
    for position, entry in enumerate(collected['items'], 1):
        item_id, _guid = legistar_id_guid(entry['item_page'])
        lookup = (self.session.query(MeetingItem)
                  .filter_by(meeting_id=meeting_id)
                  .filter_by(item_id=item_id))
        try:
            row = lookup.one()
        except NoResultFound:
            row = MeetingItem(meeting_id, item_id)

        number = entry['agenda_num']
        # ---- work around irregular entries ----
        if number == '2011-0229':
            number = None
        if number == '1.':
            number = '1'
        # ---- end of irregular entries ----

        # first agenda item is missing from meeting details
        if meeting_id == 302621:
            number = '2'

        if number is not None:
            row.agenda_num = number
            row.type, row.order = convert_agenda_number(number)
        row.item_order = position
        row.version = int(entry['version'])
        self.session.merge(row)
    self.session.flush()
    transaction.commit()
def _add_collected_legislation_item(self, item):
    """Persist a collected legislation item and its attachments.

    Every key of ``item`` except ``attachments`` is copied onto a new
    ``Item``.  Each ``(name, link)`` pair in ``item['attachments']``
    (when that value is not None) becomes an ``Attachment`` pointing at
    the item.  The work runs between an explicit ``transaction.begin()``
    and ``transaction.commit()``.

    :param item: dict of item fields; must contain ``attachments``.
    :returns: None
    :raises RuntimeError: when an attachment id already exists; the
        commit is not reached in that case.
    """
    transaction.begin()

    record = Item()
    for field in item:
        if field != 'attachments':
            setattr(record, field, item[field])
    self.session.add(record)

    attachments = item['attachments']
    if attachments is not None:
        for title, href in attachments:
            attachment_id, attachment_guid = legistar_id_guid(href)
            if self.session.query(Attachment).get(attachment_id) is not None:
                raise RuntimeError('Duplicate attachment %d' % attachment_id)
            attachment = Attachment()
            attachment.id = attachment_id
            attachment.guid = attachment_guid
            attachment.name = title
            attachment.link = href
            attachment.item_id = record.id
            self.session.add(attachment)

    transaction.commit()
def collect(self):
    """Retrieve ``self.url`` and store the parsed action on the collector.

    The action comes from ``self._get_action(self.soup)`` and gains
    ``id`` and ``guid`` entries from ``legistar_id_guid(self.url)``.
    On return ``self.action`` and ``self.result`` refer to the same
    object.

    :returns: None
    """
    self.retrieve_page(self.url)
    self.action = self._get_action(self.soup)
    action_id, action_guid = legistar_id_guid(self.url)
    self.action['id'] = action_id
    self.action['guid'] = action_guid
    self.result = self.action