Code example #1
    def scrape_membership(self):
        # Returns the chamber membership list with basic information
        # for each member of every chamber of Armenia's parliament.
        print "\n\tScraping membership data from Armenia's parliament...\n"
        mps = self.members_list()
        memberships = []
        roles = self.membership_correction()
        chambers = {}
        all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in all_chambers:
            chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']

        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['name']] = member['id']

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), ' ', FileTransferSpeed(), '             ']
        pbar = ProgressBar(widgets=widgets)
        for member in pbar(mps):
            p_id = members[member['name']]
            o_id = chambers[member['term']]
            role = ""
            membership_label = member['membership']
            if member['membership'].encode('utf-8') in roles:
                role = roles[member['membership'].encode('utf-8')]
            url = "http://www.parliament.am/deputies.php?lang=arm&sel=full&ord=alpha&show_session=" + member['term']
            membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url)
            memberships.append(membership_json)

        print "\n\tScraping completed! \n\tScraped " + str(len(memberships)) + " members"
        return memberships
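The `build_memberships_doc` helper these snippets call is not shown anywhere in the listing. Judging from the fields read back elsewhere (`person_id` and `organization_id` in example #5; `role`, `start_date` and `sources` in example #7), it plausibly assembles a Popolo-style membership document. A minimal sketch, with all field names assumed rather than taken from the project:

    def build_memberships_doc(self, p_id, o_id, label, role, url):
        # Hypothetical sketch: field names are guessed from how other
        # examples read memberships back out of the Visegrad+ API.
        return {
            "person_id": p_id,          # API id of the person
            "organization_id": o_id,    # API id of the chamber/group/committee
            "label": label,             # membership label as scraped
            "role": role,               # normalized role string
            "sources": [{"url": url}],  # page the membership was scraped from
        }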
Code example #2
    def scrape_membership(self):
        # Iterates over the chamber membership JSON documents and
        # returns a list in the JSON structure that the Visegrad+ API accepts
        print "\n\tScraping chamber membership data from Belarus Upperhouse parliament...\n"
        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['name']] = member['id']

        chambers = {}
        all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in all_chambers:
            chambers[chamber['identifiers'][0]['identifier']] = chamber['id']
        terms = parser.terms
        mps_list = parser.members_list()
        chambers_membership = []

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)
        for member in pbar(mps_list):
            p_id = members[member['name']]
            o_id = chambers[member['term']]
            url = terms[member['term']]['url']
            membership_label = member['membership']
            role = member['role']
            chamber_membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url)
            chambers_membership.append(chamber_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(chambers_membership)) + " members"
        return chambers_membership
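All of these scrapers build their progress bars the same way; the widget names come from the `progressbar` package. A minimal, self-contained sketch of that setup (the import line is an assumption, since the snippets omit it):

    from progressbar import ProgressBar, Percentage, Bar, ETA, Counter, FileTransferSpeed

    widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA(), ' - Processed: ', Counter(), ' items']
    pbar = ProgressBar(widgets=widgets)
    for item in pbar(range(1000)):
        pass  # iterating the wrapped iterable advances the bar one step per item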
Code example #3
    def scrape_parliamentary_group_membership(self):
        # Returns the parliamentary group membership list with basic information
        # for each member of every parliamentary group of Armenia's parliament.
        print "\n\tScraping parliamentary groups membership from Armenia's parliament...\n"
        chambers = {}
        groups = {}
        members = {}
        memberships = self.membership_correction()

        all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in all_chambers:
            chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']

        all_groups = vpapi.getall('organizations', where={"classification": "parliamentary group"})
        for group in all_groups:
            groups[group['sources'][0]['url']] = group['id']

        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['name']] = member['id']

        parties_membership = []

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)
        for term in pbar(list(reversed(sorted(self.terms.keys())))):
            url = "http://www.parliament.am/deputies.php?lang=arm&sel=factions&SubscribeEmail=&show_session=" + str(term)
            soup = scrape.download_html_file(url)
            for each_div in soup.findAll('div', {"class": "content"}):
                party_name = each_div.find("center").find("b").get_text()
                party_name_ordered = party_name.replace("  ", " ")
                exist = vpapi.getfirst("organizations", where={'name': party_name_ordered,
                                                               "parent_id": chambers[str(term)]})
                if exist:
                    o_id = exist['id']
                else:
                    continue  # group not found in the API; skip to avoid reusing a stale o_id
                for each_tr in each_div.find('table', {"style": "margin-top:10px; margin-bottom:10px;"}).findAll('tr'):
                    if each_tr.has_attr('bgcolor'):
                        continue
                    else:
                        td_array = each_tr.findAll('td')
                        names = td_array[0].find('a').get_text().split(' ')
                        first_name = names[1]
                        last_name = names[0]
                        middle_name = names[2]
                        name_ordered = "%s %s %s" % (first_name, middle_name, last_name)
                        membership = each_tr.find('span', {'class': "news_date"}).get_text()

                        if membership == "":
                            membership = "անդամ".decode('utf-8')
                        else:
                            membership = membership[1:len(membership)-1]

                        role = memberships[membership.encode('utf-8')]
                        if name_ordered not in members:
                            continue  # unknown person; skip to avoid reusing a stale p_id
                        p_id = members[name_ordered]
                        party_membership_json = self.build_memberships_doc(p_id, o_id, membership, role, url)
                        parties_membership.append(party_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(parties_membership)) + " members of parliamentary groups"
        return parties_membership
Code example #4
    def scrape_committee_members(self):
        # Returns the committee membership list with basic information
        # for each member of every committee of Armenia's parliament.
        print "\n\tScraping committee groups membership from Armenia's parliament...\n"
        committees = self.committee_list()
        committee_membership = []
        chambers = {}
        groups = {}
        members = {}
        memberships = self.membership_correction()
        all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in all_chambers:
            chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']

        all_groups = vpapi.getall('organizations', where={"classification": "committe"})
        for group in all_groups:
            groups[group['sources'][0]['url']] = group['id']

        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['name']] = member['id']
        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)
        for committee in pbar(committees):
            url = committee['url'].replace('show', "members")
            soup = scrape.download_html_file(url)
            for each_tr in soup.find('table', {"style": "margin-top:10px; margin-bottom:10px;"}).findAll('tr'):
                if each_tr.has_attr('bgcolor'):
                    continue
                else:
                    td_array = each_tr.findAll('td')
                    if td_array:
                        names = td_array[0].find('a').get_text().split(' ')
                        first_name = names[1]
                        last_name = names[0]
                        middle_name = names[2]
                        name_ordered = "%s %s %s" % (first_name, middle_name, last_name)
                        membership = each_tr.find('span', {'class': "news_date"}).get_text()

                        if url not in groups:
                            continue  # committee not found in the API; skip to avoid reusing a stale o_id
                        o_id = groups[url]

                        if membership == "":
                            membership = "անդամ".decode('utf-8')
                        else:
                            membership = membership[1:len(membership)-1]

                        role = memberships[membership.encode('utf-8')]
                        if name_ordered not in members:
                            continue  # unknown person; skip to avoid reusing a stale p_id
                        p_id = members[name_ordered]
                        committee_membership_json = self.build_memberships_doc(p_id, o_id, membership, role, url)
                        committee_membership.append(committee_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership)) + " members of committee groups"
        return committee_membership
Code example #5
    def get_group_id(self):
        # Returns a dict mapping each person's id to their parliamentary group id (or None)
        groups = {}
        parties_ids = []
        all_groups = vpapi.getall("organizations", where={"classification": "parliamentary group"})
        for group in all_groups:
            parties_ids.append(group['id'])

        memberships = vpapi.getall("memberships")
        for member in memberships:
            if member['organization_id'] in parties_ids:
                groups[member['person_id']] = member['organization_id']
            else:
                groups[member['person_id']] = None
        return groups
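A short usage sketch (the scraper instance and person id below are hypothetical): the returned dict resolves a person's API id to the id of their parliamentary group, or None for MPs without one.

    groups = scraper.get_group_id()  # `scraper` is a hypothetical instance
    group_id = groups.get("54a1b2c3d4e5f6a7b8c9d0e1")  # org id or None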
Code example #6
    def committe_list(self):
        # Returns the list of committee groups with basic information for each
        committee_list = []
        chambers_list = {}
        chambers_api = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in chambers_api:
            chambers_list[chamber['identifiers'][0]['identifier']] = chamber['id']

        chambers = self.chambers_list()
        for term in chambers:
            soup = scrape.download_html_file(chambers[term]['url'])
            for each_h2 in soup.find("div", {"id": "committee_bm_info"}).findAll("h2"):
                name = each_h2.find("a").get_text()
                url = each_h2.find("a").get("href")
                start_date = chambers[term]['start_date']
                if term != "5":
                    end_date = chambers[term]['end_date']
                else:
                    end_date = None
                identifiers = re.findall(r'\d+', url)
                if len(identifiers) > 2:
                    identifier = identifiers[1]
                else:
                    identifier = identifiers[0]
                chamber_id = chambers_list[term]
                committee_json = {
                    "identifier": identifier,
                    "parent_id": chamber_id,
                    "name": name,
                    "url": url,
                    "start_date": start_date,
                    "end_date": end_date
                }
                committee_list.append(committee_json)
        return committee_list
Code example #7
	def save(self, update_only=False):
		"""If a compatible membership already exists, update it. Otherwise,
		create a new one. If `update_only` is True, only existing memberships
		are updated, no new one is created.
		Memberships are compatible if their fields `start_date`, `role` and `post`
		are compatible. Field 'end_date' is not checked to allow for later corrections
		of guessed end dates used when a member disappears from a group profile.
		"""
		memberships = vpapi.getall('memberships',
			where={'person_id': self.person_id, 'organization_id': self.organization_id},
			sort='-start_date')
		to_save = self.__dict__.copy()

		id = None
		for existing in memberships:
			if self._merge_values('start_date', to_save, existing) \
					and to_save.get('end_date', '9999-12-31') >= existing.get('start_date', '0001-01-01') \
					and self._merge_values('role', to_save, existing) \
					and self._merge_values('post', to_save, existing):
				id = existing['id']
				self._merge_values('end_date', to_save, existing)
				break
			else:
				to_save = self.__dict__.copy()

		if id:
			resp = vpapi.put('memberships', id, to_save)
		else:
			if update_only: return
			resp = vpapi.post('memberships', self.__dict__)

		if resp['_status'] != 'OK':
			raise Exception(self.name, resp)
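The `_merge_values` helper is not included in this snippet. From its use above it must both test whether a field is compatible between the membership being saved and an existing one, and merge a value into `to_save` when only the existing document has it (so the full-document PUT does not erase it). A plausible sketch, assumed rather than the project's actual implementation:

	def _merge_values(self, field, to_save, existing):
		# Compatible when either side lacks the field or both values agree.
		if field in to_save and field in existing:
			return to_save[field] == existing[field]
		# Copy a value present only in the existing document into `to_save`
		# so it survives the full-document PUT.
		if field in existing:
			to_save[field] = existing[field]
		return True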
Code example #8
 def update_motion_url(self):
     print "\n\tUpdating url of motions"
     motions = vpapi.getall("motions")
     counter = 0
     widgets = [
         "        Progress: ",
         Percentage(),
         " ",
         Bar(marker="#", left="[", right="]"),
         " ",
         ETA(),
         " - Processed: ",
         Counter(),
         " events             ",
     ]
     pbar = ProgressBar(widgets=widgets)
     for motion in motions:
         counter += 1
         sources = motion["sources"]
         url = sources[0]["url"]
         print (str(counter))
         if "http://w1.c1.rada.gov.ua" not in url:
             motion_id = motion["id"]
             motion["sources"][0]["url"] = "http://w1.c1.rada.gov.ua" + url
             items_to_delete = ["created_at", "updated_at", "_links", "id"]
             for item_delete in items_to_delete:
                 del motion[item_delete]
             vpapi.put("motions", motion_id, motion, effective_date=self.effective_date())
     print "\n\tFinished updating motions url"
Code example #9
    def scrape_membership(self):
        # Returns the chamber membership list with basic information
        # for each member of every chamber of Moldova's parliament.
        chamber_membership = []
        print "\n\tScraping chambers membership from Moldova's parliament..."
        mps_list = self.mps_list()
        members = {}
        membership_correction = self.membership_correction()
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['identifiers'][0]['identifier']] = member['id']
        chamber_id = vpapi.getfirst("organizations",
                                    where={"identifiers": {
                                        "$elemMatch": {
                                            "identifier": "20", "scheme": "parlament.md"
                                        }
                                    }})
        deputy_list_url = "http://www.parlament.md/StructuraParlamentului/" \
                          "Deputies/tabid/87/language/ro-RO/Default.aspx"

        for member in mps_list:
            p_id = members[member['identifier']]
            role = membership_correction[member['membership'].encode('utf-8')]
            chamber_membership_json = self.build_memberships_doc(p_id, chamber_id['id'], member['membership'],
                                                                 role, deputy_list_url)
            chamber_membership.append(chamber_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(chamber_membership)) + " members of chambers \n"
        return chamber_membership
Code example #10
    def get_all_member_ids_for_votes(self):
        members = {}
        api_members = vpapi.getall("people")

        for member in api_members:
            members[member['identifiers'][0]['identifier']] = member['id']

        return members
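A usage sketch (the vote fields and identifier are illustrative): the mapping resolves the source-site identifier carried by each scraped vote to the API person id expected in `voter_id`, mirroring how votes are assembled in example #25.

    members = scraper.get_all_member_ids_for_votes()  # `scraper` is a hypothetical instance
    vote = {"option": "yes", "voter_id": members.get("mp-123")}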
Code example #11
 def test1(self):
     motions = vpapi.getall("motions")
     counter = 0
     for motion in motions:
         counter += 1
         print counter
         print motion["id"]
         print "------------------------------------------------>"
Code example #12
    def scrape_parliamentary_groups(self):
        # Scrapes parliamentary groups and returns the list of
        # parliamentary groups with all the information needed for each
        parties_list = []
        terms_ids = {}

        all_terms = vpapi.getall("organizations", where={"classification": "chamber"})

        for term in all_terms:
            terms_ids[term['identifiers'][0]['identifier']] = term['id']

        parties_doc = self.parliamentary_groups()

        print "\n\tScraping parliamentary groups from Armenia's parliament...\n"
        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)
        for term in pbar(parties_doc):
            url = "http://www.parliament.am/deputies.php?lang=arm&sel=factions&SubscribeEmail=&show_session=" + term
            soup = scrape.download_html_file(url)
            all_divs = soup.findAll('div', {"class": "content"})
            for each_div in all_divs:
                name = each_div.find("center").find("b").get_text()
                name_ordered = name.replace("  ", " ")
                if name_ordered in parties_doc[term]:
                    identifier = parties_doc[term][name_ordered]['identifier']
                    url_faction = parties_doc[term][name_ordered]['url']
                    founding_date = self.terms[term]["start_date"]
                    parent_id = terms_ids[str(term)]

                    if each_div.find("center").find("a"):
                        email = each_div.find("center").find("a").get_text()

                    if term != "5":
                        dissolution_date = self.terms[term]["end_date"]
                    else:
                        dissolution_date = None

                    party_json = self.build_organization_doc("parliamentary group", name_ordered, identifier,
                                                             founding_date, dissolution_date, url_faction, email, parent_id)

                    if not dissolution_date:
                        del party_json['dissolution_date']

                    if not email:
                        del party_json['contact_details']

                    if not identifier:
                        del party_json['identifiers']

                    parties_list.append(party_json)
                else:
                    print "term: %s \nname: %s" % (term, name_ordered)
        print "\n\tScraping completed! \n\tScraped " + str(len(parties_list)) + " parliametary groups"
        return parties_list
Code example #13
    def scrape_committee_members(self):
        # Returns the committee membership list with basic information
        # for each member of every committee of Moldova's parliament.
        print "\n\tScraping committees membership from Moldova's parliament..."
        committees_list = self.committee_list()
        membership_correction = self.membership_correction()
        committees = {}
        all_committees = vpapi.getall("organizations", where={"classification": "committe"})
        for committee in all_committees:
            committees[committee['identifiers'][0]['identifier']] = committee['id']

        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['identifiers'][0]['identifier']] = member['id']

        committees_membership = []
        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed members from: ", Counter(), ' committees             ']
        pbar = ProgressBar(widgets=widgets)
        for committee in pbar(committees_list):
            committee_identifier = committee['identifier']
            soup_party = scrape.download_html_file(committee['url'])
            for each_tr in soup_party.find("fieldset", {"id": "dnn_ctr486_ViewCommissionPermanent_ctrlViewCommissionType_fsMembers"}).findAll('tr'):
                td_array = each_tr.findAll('td')
                link = td_array[1].find('a').get('href')
                index_start = link.index('/Id/') + 4
                index_end = link.index('/la')
                member_identifier = link[index_start:index_end]
                membership = td_array[2].get_text().strip()
                member_id = members[member_identifier]
                o_id = committees[committee_identifier]
                if membership == "":
                    membership = "Membru"
                role = membership_correction[membership.encode('utf-8')]
                committees_membership_json = self.build_memberships_doc(member_id, o_id, membership,
                                                                        role, committee['url'])
                committees_membership.append(committees_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(committees_membership)) + " members of committee groups\n"
        return committees_membership
Code example #14
    def scrape_parliamentary_group_membership(self):
        # Returns the parliamentary group membership list with basic information
        # for each member of every parliamentary group of Moldova's parliament.
        print "\n\tScraping parliamentary groups membership from Moldova's parliament..."
        parties_list = self.parliamentary_group_list()
        membership_correction = self.membership_correction()
        parties = {}
        all_parties = vpapi.getall("organizations", where={'classification': "parliamentary group"})
        for party in all_parties:
            parties[party['identifiers'][0]['identifier']] = party['id']

        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['identifiers'][0]['identifier']] = member['id']

        parties_membership = []
        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed members from: ", Counter(), ' parliamentary groups             ']
        pbar = ProgressBar(widgets=widgets)
        for party in pbar(parties_list):
            party_identifier = party['identifier']
            soup_party = scrape.download_html_file(party['url'])
            for each_tr in soup_party.find("fieldset", {"id": "dnn_ctr482_ViewFraction_fsMembers"}).findAll('tr'):
                td_array = each_tr.findAll('td')
                link = td_array[1].find('a').get('href')
                index_start = link.index('/Id/') + 4
                index_end = link.index('/la')
                member_identifier = link[index_start:index_end]
                membership = td_array[2].get_text().strip()
                member_id = members[member_identifier]
                o_id = parties[party_identifier]
                if membership == "":
                    membership = "Membru"
                role = membership_correction[membership.encode('utf-8')]
                party_membership_json = self.build_memberships_doc(member_id, o_id, membership, role, party['url'])
                parties_membership.append(party_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(parties_membership)) + " members of parties \n"
        return parties_membership
Code example #15
    def scrape_membership(self):
        print "\n\tScraping chambers membership's data from Ukraine's parliament..."
        print "\tPlease wait. This may take a few moments...\n"
        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member["name"]] = member["id"]

        chambers = {}
        all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
        for chamber in all_chambers:
            chambers[chamber["identifiers"][0]["identifier"]] = chamber["id"]
        terms = parser.chambers()
        mps_list = parser.mps_list()
        chambers_membership = []
        widgets = [
            "        Progress: ",
            Percentage(),
            " ",
            Bar(marker="#", left="[", right="]"),
            " ",
            ETA(),
            " - Processed: ",
            Counter(),
            " items             ",
        ]
        pbar = ProgressBar(widgets=widgets)
        for member in pbar(mps_list):
            if member["name"] in members:
                p_id = members[member["name"]]
                o_id = chambers[member["term"]]
                url = terms[member["term"]]["url"]
                membership_label = member["membership"]
                role = member["role"]
                chamber_membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url)
                chambers_membership.append(chamber_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(chambers_membership)) + " members"
        return chambers_membership
Code example #16
    def committee_membership(self):
        # Returns the committee membership list with all information needed
        # for each member of every committee of the Belarus Lower house parliament.
        committee_list = self.committee_list()
        element_positions = self.committee_membership_list()
        committee_members = {}
        members = {}
        all_members = vpapi.getall("people")
        for member in all_members:
            members[member['sources'][0]['url']] = member['id']

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' committees             ']
        pbar = ProgressBar(widgets=widgets)
        for committee in pbar(committee_list):
            identifier = int(committee['identifier']) + 2
            url = committee['url'].replace(committee['identifier'], str(identifier))
            soup = scrape.download_html_file(url)
            all_tr_elements = soup.find("table", {"cellpadding": "2"}).findAll('tr')
            all_tr = all_tr_elements[:len(all_tr_elements) - 2]
            committee_members[committee['identifier']] = {}
            committee_members[committee['identifier']]["Старшыня"] = []
            committee_members[committee['identifier']]["Намеснікі старшыні"] = []
            committee_members[committee['identifier']]["Члены камісіі"] = []

            if committee['identifier'] in element_positions:
                index_start_first = element_positions[committee['identifier']][0]
                index_start_middle = element_positions[committee['identifier']][1]
                index_penultimate = element_positions[committee['identifier']][2]
                index_start_last = element_positions[committee['identifier']][3]
                for each_tr in all_tr[index_start_first:index_start_middle]:
                    if each_tr.find("a"):
                        url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                        member_id = members[url]
                        committee_members[committee['identifier']]["Старшыня"].append(member_id)

                for each_tr in all_tr[index_start_middle:index_penultimate]:
                    if each_tr.find("a"):
                        url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                        member_id = members[url]
                        committee_members[committee['identifier']]["Намеснікі старшыні"].append(member_id)

                for each_tr in all_tr[index_penultimate:index_start_last]:
                    if each_tr.find("a"):
                        url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                        member_id = members[url]
                        committee_members[committee['identifier']]["Члены камісіі"].append(member_id)

        return committee_members
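The returned structure, which example #18 iterates over, maps each committee identifier to role buckets of API person ids. Its shape, with invented ids, looks like:

    # {'5': {'Старшыня': ['54f1...'],
    #        'Намеснікі старшыні': ['54e2...', '54d3...'],
    #        'Члены камісіі': ['54c4...', '54b5...']}}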
Code example #17
 def scrape_membership(self):
     # Iterates over the chamber membership JSON documents and
     # returns a list in the JSON structure that the Visegrad+ API accepts
     print "\n\tScraping membership data from Belarus Lowerhouse parliament..."
     mp_list = parser.mps_list()
     chamber_membership_list = []
     members = {}
     url = "http://house.gov.by/index.php/,17041,,,,2,,,0.html"
     all_members = vpapi.getall("people")
     for person in all_members:
         members[person['identifiers'][0]['identifier']] = person['id']
     chamber = vpapi.getfirst("organizations", where={"identifiers": {"$elemMatch": {"identifier": "2", "scheme": "house.by"}}})
     for member in mp_list:
         p_id = members[member['identifier']]
         o_id = chamber['id']
         chamber_membership_json = self.build_memberships_doc(p_id, o_id, member['membership'], member['role'], url)
         chamber_membership_list.append(chamber_membership_json)
     print "\n\tScraping completed! \n\tScraped " + str(len(chamber_membership_list)) + " members"
     return chamber_membership_list
Code example #18
 def scrape_committee_members(self):
     # Iterates over every committee membership JSON doc and returns the
     # list in the JSON structure that the Visegrad+ API accepts
     print "\n\tScraping committee groups membership from Belarus Lowerhouse parliament..."
     committee_membership_list = []
     committee_list = parser.committee_membership()
     groups = {}
     all_groups = vpapi.getall("organizations", where={"classification": "committe"})
     for group in all_groups:
         groups[group['identifiers'][0]['identifier']] = group['id']
     roles = parser.membership_correction()
     for committee in committee_list:
         identifier = int(committee) + 2
         url = "http://house.gov.by/index.php/,17230,,,,2,,,0.html".replace("17230", str(identifier))
         for membership in committee_list[committee]:
             for members in committee_list[committee][membership]:
                 role = roles[membership]
                 membership_json = self.build_memberships_doc(members, groups[committee], membership, role, url)
                 committee_membership_list.append(membership_json)
     print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership_list)) + " members of committee groups"
     return committee_membership_list
Code example #19
    def scrape_committee_members(self):
        # Iterates over every committee membership JSON doc and returns the
        # list in the JSON structure that the Visegrad+ API accepts
        print "\n\tScraping committee groups from Belarus Upperhouse parliament...\n"
        members = {}
        committee_membership = []
        all_members = vpapi.getall("people")
        for member in all_members:
            if member['identifiers'][0]['identifier'] not in members:
                members[member['identifiers'][0]['identifier']] = member['id']

        committee_membership_list = parser.committee_membership()
        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)

        print "\n\tProcessing members of committee groups from Belarus Upperhouse parliament...\n"
        for member in pbar(committee_membership_list):

            if member['identifier'] in members:
                p_id = members[member['identifier']]
            else:
                p_id = None
            existing = vpapi.getfirst("organizations", where={"name": member['committee_name'], "parent_id": member['committee_parent_id']})
            if existing:
                o_id = existing['id']
            else:
                o_id = None

            if p_id and o_id:
                committee_membership_json = self.build_memberships_doc(p_id, o_id, member['membership'],
                                                                       member['role'], member['url'])
                committee_membership.append(committee_membership_json)
        print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership)) + " members"
        return committee_membership
Code example #20
 def committee_list(self):
     # Returns the list of committee groups with basic information for each
     committee_list = []
     chambers = {}
     all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
     for chamber in all_chambers:
         chambers[chamber['identifiers'][0]['identifier']] = chamber['id']
     for i in range(3, 6):
         url = "http://www.parliament.am/committees.php?lang=arm&show_session=" + str(i)
         soup = scrape.download_html_file(url)
         for each_tr in soup.find("table", {"class": "com-table"}).findAll('tr', {"valign": "top"}):
             for each_td in each_tr.findAll('td'):
                 name = each_td.find('a', {"class": "blue_mid_b"}).get_text()
                 url = "http://www.parliament.am" + each_td.find('a', {"class": "blue_mid_b"}).get("href")
                 identifier = re.findall(r'\d+', url)
                 committee_json = {
                     "name": name,
                     "url": url,
                     "identifier": identifier[0],
                     "parent_id": chambers[str(i)]
                 }
                 committee_list.append(committee_json)
     return committee_list
Code example #21
File: answers.py  Project: PetrFlegr/vaa2012-2
    },
    {
        "code_api" : "cz/senat",
        "code": "senat",
        "code_csv": "upper",
        "name": "Senát"
    }
]

for p in parliaments:
    vpapi.parliament(p['code_api'])
    for ve in ves:
        if ve[p['code_csv'] + '_vote_event_id'] != '':
            votes = vpapi.getall("votes",where={"vote_event_id":ve[p['code_csv'] + '_vote_event_id']})
            print(ve[p['code_csv'] + '_vote_event_id'])
            for v in votes:
                try:
                    data[p['code'] + '_' + v['voter_id']]
                except KeyError:
                    data[p['code'] + '_' + v['voter_id']] = {}
                    data[p['code'] + '_' + v['voter_id']]['votes'] = {}
                    data[p['code'] + '_' + v['voter_id']]['chamber'] = p['code']
                    data[p['code'] + '_' + v['voter_id']]['chamber_name'] = p['name']
                    data[p['code'] + '_' + v['voter_id']]['id'] = v['voter_id']
                data[p['code'] + '_' + v['voter_id']]['votes'][ve['id']] = o2o[v['option']] * int(ve[p['code_csv'] + '_polarity'])
                data[p['code'] + '_' + v['voter_id']]['group_id'] = v['group_id']
    
    os = {}   
    for k in data:
Code example #22
    try:
        out = m.group(1)
    except AttributeError:  # m is None when the pattern did not match
        out = ""
    return out


organizations = {}
people = {}
vote_events = {}
motions = {}
votes = []



orgs = vpapi.getall("organizations")
for org in orgs:
    organizations[org['id']] = org
print("organizations downloaded: ",len(organizations))

peop = vpapi.getall("people")
for person in peop:
    people[person['id']] = person
print("people downloaded: ",len(people))

for term in terms:

    sli = term["since"].split('-')
    uli = term["until"].split('-')
    directory = "sk-nrsr-" + term["id"] + "-" + sli[0] + "-" + uli[0] + "-roll-call-votes"
Code example #23
def scrape_new_debates(term):
	"""Scrape and save speeches from debates of the given term, one
	of those newer terms where transcripts of debates are published
	in parts assigned to individual speakers.

	Returns number of scraped speeches.
	"""
	debate_part_kinds = {
		'Uvádzajúci uvádza bod': 'speech',
		'Vstup predsedajúceho': 'speech',
		'Vystúpenie spoločného spravodajcu': 'speech',
		'Vystúpenie': 'speech',
		'Vystúpenie v rozprave': 'speech',
		'Vystúpenie s faktickou poznámkou': 'speech',
		'Vystúpenie s procedurálnym návrhom': 'speech',
		'Prednesenie otázky': 'question',
		'Zodpovedanie otázky': 'answer',
		'Doplňujúca otázka / reakcia zadávajúceho': 'question',
		'Prednesenie interpelácie': 'question',
		'Odpoveď na interpeláciu': 'answer',
		'scene': 'scene'
	}

	def insert_speech(kind):
		"""Insert a speech entity for the given debate part kind
		and data from parent scope variables and update end date
		of the corresponding session and sitting. Delete `text`
		variable."""
		nonlocal text, last_speech_enddatetime
		if not text: return
		speech = {
			'text': text.strip().replace('[', '(').replace(']', ')'),
			'date': start_datetime,
			'type': debate_part_kinds.get(kind, 'speech'),
			'position': len(speeches) + 1,
			'event_id': sitting_id,
			'sources' : [{
				'url': dpart_url,
				'note': 'Prepis časti debaty na webe NRSR'
			}]
		}
		if dpart_video:
			speech['video'] = dpart_video
		if kind != 'scene':
			speech['creator_id'] = speaker_id
			speech['attribution_text'] = attribution.strip()
		speeches.append(speech)
		text = ''

		if end_datetime > session_end_date:
			vpapi.patch('events', session_id, {'end_date': end_datetime})
		if end_datetime > sitting_end_date:
			vpapi.patch('events', sitting_id, {'end_date': end_datetime})
		last_speech_enddatetime = datetime.strptime(end_datetime, '%Y-%m-%dT%H:%M:%S')

	logging.info('Scraping debates of term `%s`' % term)
	chamber_id = get_chamber_id(term)

	# prepare mapping from MP's name to id
	people = vpapi.getall('people', projection={'name': 1})
	mps = {mp['name']: mp['id'] for mp in people}

	# load name corrections
	with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
		name_corrections = json.load(f)

	# scraping will start since the most recent sitting start date
	last_sitting = vpapi.getfirst('events',
		where={'type': 'sitting', 'organization_id': chamber_id},
		sort='-start_date')
	since_date = last_sitting['start_date'][:10] if last_sitting else None

	# scrape list of debate parts
	debate_parts = parse.new_debates_list(term, since_date)

	speech_count = 0
	session_name = ''
	speeches = []
	for dp in debate_parts['_items']:
		# stop at very recent debate parts (may be incomplete)
		start_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['od']))
		sd = datetime.strptime(start_datetime, '%Y-%m-%dT%H:%M:%S')
		if datetime.utcnow() - sd < timedelta(days=5):
			break

		# skip already scraped debate parts
		existing = vpapi.getfirst('speeches', where={'sources.url': dp['prepis']['url']})
		if existing: continue

		logging.info('Scraping debate part %s %s-%s (id=%s)' %
			(dp['dátum'], dp['trvanie']['od'], dp['trvanie']['do'], dp['prepis']['id']))
		dpart = parse.debate_of_terms56(dp['prepis']['id'])
		if not dpart['riadky']: continue

		end_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['do']))
		dpart_kind = dp['druh']
		dpart_url = dp['prepis']['url']
		dpart_video = dp['video']['url'] if 'video' in dp else None

		if not session_name.startswith('%s. ' % dp['schôdza']):
			# create new session event
			session_name = '%s. schôdza' % dp['schôdza']
			session = {
				'name': session_name,
				'identifier': dp['schôdza'],
				'organization_id': chamber_id,
				'type': 'session',
				'start_date': start_datetime,
				'end_date': end_datetime,
			}
			key = ('organization_id', 'type', 'identifier')
			session_id, _ = get_or_create('events', session, key)
			session_end_date = end_datetime

			# find the last moment of the last sitting of this session
			session_last_sitting = vpapi.getfirst('events',
				where={'type': 'sitting', 'parent_id': session_id},
				sort='-start_date')
			if session_last_sitting:
				last_speech_enddatetime = datetime.strptime(session_last_sitting['end_date'], '%Y-%m-%dT%H:%M:%S')
				sitting_identifier = session_last_sitting['identifier']
				sitting_id = session_last_sitting['id']
				sitting_end_date = session_last_sitting['end_date']
			else:
				last_speech_enddatetime = datetime.min
				sitting_identifier = '0'

		if sd - last_speech_enddatetime > timedelta(hours=5):
			# create new sitting event
			sitting_identifier = str(int(sitting_identifier) + 1)
			sitting_name = '%s. deň rokovania, %s' % (sitting_identifier, dp['dátum'])
			sitting = {
				'name': sitting_name,
				'identifier': sitting_identifier,
				'organization_id': chamber_id,
				'type': 'sitting',
				'start_date': start_datetime,
				'end_date': end_datetime,
				'parent_id': session_id,
			}
			key = ('parent_id', 'type', 'identifier')
			sitting_id, _ = get_or_create('events', sitting, key)
			sitting_end_date = end_datetime

			# save speeches of the previous sitting
			if len(speeches) > 0:
				vpapi.post('speeches', speeches)
				speech_count += len(speeches)
			if dp != debate_parts['_items'][0]:
				logging.info('Scraped %s speeches from previous sitting' % len(speeches))
			speeches = []

		# add the first speaker name that is sometimes missing
		first_speaker = '<strong>%s, %s</strong>' % (dp['osoba']['meno'], dp['osoba']['funkcia'])
		dpart['riadky'].insert(0, first_speaker)

		# extract speeches from the debate part
		text = ''
		within_scene = False
		for par in dpart['riadky']:
			if not par: continue
			par = par.replace('\n', ' ').strip()

			# skip eventual speech number
			if re.match(r'^(\d+)\.$', par): continue

			# convert brackets to parentheses
			par = re.sub(r'\[(.*?)\]', r'(\1)', par)
			# convert all inner nested parentheses to brackets
			n = 1
			while n >= 1:
				(par, n) = re.subn(r'\((.*?)\((.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

			# process eventual multiparagraph scene
			if par.startswith('(') and par.count('(') > par.count(')'):
				# save eventual previous speech
				insert_speech(dpart_kind)

				text = '<p>%s</p>' % lxml.html.fromstring(par[1:]).text_content()
				within_scene = True
				continue
			if within_scene:
				if par.endswith(')') and par.count(')') > par.count('('):
					text += '\n\n<p>%s</p>' % lxml.html.fromstring(par[:-1]).text_content()
					insert_speech('scene')
					within_scene = False
				else:
					text += '\n\n<p>%s</p>' % lxml.html.fromstring(par).text_content()
				continue

			# process eventual new speaker
			# format `Doe, John, foreign minister`
			speech_start_pattern = r'<strong>(\w+), (\w+\.?)( (\w+\.?))?, (.*)</strong>'
			sp = re.match(speech_start_pattern, par, re.DOTALL)
			if sp:
				# save eventual previous speech
				insert_speech(dpart_kind)

				# identify speaker
				name = '%s %s' % (sp.group(2), sp.group(1))
				if (sp.group(4)):
					name = name.replace(' ', ' %s ' % sp.group(4))
				attribution = sp.group(5)
				if name in name_corrections:
					name = name_corrections[name]
				if len(name) == 0: continue
				speaker_id = mps.get(name)

				# create unknown speakers
				if not speaker_id:
					logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
					name_parts = re.match(r'(\w+\.?)( (\w+\.?))? (\w+)', name)
					person = {
						'name': name,
						'family_name': name_parts.group(4),
						'given_name': name_parts.group(1)
					}
					person['sort_name'] = '%s, %s' % (person['family_name'], person['given_name'])
					if name_parts.group(3):
						person['additional_name'] = name_parts.group(3)
						person['sort_name'] += ' %s' % person['additional_name']
					resp = vpapi.post('people', person)
					speaker_id = resp['id']
					mps[name] = speaker_id
				continue

			# remove HTML tags
			par = lxml.html.fromstring(par).text_content()

			# process eventual scene in this paragraph
			scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
			while True:
				scene = re.match(scene_pattern, par, re.DOTALL)
				if not scene: break
				if scene.group(1):
					text += '\n\n<p>%s</p>' % scene.group(1).strip()
				insert_speech(dpart_kind)
				text = '<p>%s</p>' % scene.group(2).strip()
				insert_speech('scene')
				par = scene.group(3)

			if par:
				text += '\n\n<p>%s</p>' % par

		insert_speech(dpart_kind)

	if len(speeches) > 0:
		vpapi.post('speeches', speeches)
	logging.info('Scraped %s speeches' % len(speeches))
	speech_count += len(speeches)

	logging.info('Scraped %s speeches in total' % speech_count)
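The `get_or_create` helper used above for sessions and sittings (and for motions in example #25) is not included in these snippets. A minimal sketch built only on the `vpapi.getfirst`/`vpapi.post` calls already seen in this listing; the project's real helper may differ, e.g. by also patching changed fields:

	def get_or_create(resource, doc, key):
		"""Return (id, created) for the document matching `doc` on the
		fields named in `key`, posting a new document when none matches.
		Sketch only, under the assumptions stated above."""
		existing = vpapi.getfirst(resource, where={k: doc[k] for k in key})
		if existing:
			return existing['id'], False
		resp = vpapi.post(resource, doc)
		return resp['id'], True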
Code example #24
def scrape_old_debates(term):
	"""Scrape and save speeches from debates of the given term, one
	of those older terms where transcripts of debates are stored in
	RTF files.

	Returns number of scraped speeches.
	"""

	def insert_speech(type):
		"""Insert a speech entity with the given type and data
		from parent scope variables and update end date of the
		corresponding session and sitting. Delete `text`
		variable."""
		nonlocal text, position
		if not text: return
		position = position + 1
		speech = {
			'text': text.strip().replace('[', '(').replace(']', ')'),
			'type': type,
			'position': position,
			'event_id': sitting_id,
			'sources' : [{
				'url': debate['url'],
				'note': 'Prepis debaty v Digitálnej knižnici na webe NRSR'
			}]
		}
		if type != 'scene':
			speech['creator_id'] = speaker_id
			speech['attribution_text'] = attribution.strip()
		speeches.append(speech)
		text = ''

		if date > session_end_date:
			vpapi.patch('events', session_id, {'end_date': date})
		if date > sitting_end_date:
			vpapi.patch('events', sitting_id, {'end_date': date})

	logging.info('Scraping debates of term `%s`' % term)
	chamber_id = get_chamber_id(term)

	# prepare mapping from MP's name to id
	people = vpapi.getall('people', projection={'given_name': 1, 'additional_name': 1, 'family_name': 1})
	mps = {}
	for mp in people:
		if 'additional_name' in mp:
			name = '%s. %s. %s' % (mp['given_name'][0], mp['additional_name'][0], mp['family_name'])
		else:
			name = '%s. %s' % (mp['given_name'][0], mp['family_name'])
		mps[name] = mp['id']

	# load name corrections
	with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
		name_corrections = json.load(f)

	# scrape list of debates
	debates = parse.old_debates_list(term)

	# add the debate missing in the list
	if term == '4':
		debates['_items'].append({
			'názov': 'Autorizovaná rozprava, 48. schôdza NR SR, 3. 2. 2010',
			'id': '2010_02_03',
			'url': 'http://www.nrsr.sk/dl/Browser/DsDocument?documentId=391413'
		})

	speech_count = 0
	session_identifier = None
	for debate in debates['_items']:
		# skip obsolete debates in the list
		if term == '1':
			if (debate['názov'] == 'Stenozáznam' and debate['id'] != '198550' or
					debate['id'] in ('65890', '65945', '65949')):
				continue
		elif term == '2':
			if debate['názov'].startswith('Stenografická') and debate['id'] != '92098':
				continue
		elif term == '3':
			if debate['id'] == '181047':
				continue

		logging.info('Scraping debate `%s` (id=%s)' % (debate['názov'], debate['id']))
		if term == '1':
			paragraphs = parse.debate_of_term1(debate['id'])
		else:
			paragraphs = parse.debate_of_terms234(debate['id'])

		# normalize header of the debate transcript
		if term == '2':
			# join first 4 paragraphs and add trailing underscores to mark the header
			paragraphs = ['%s %s %s %s\n___' % (paragraphs[0], paragraphs[1], paragraphs[2],
				paragraphs[3])] + paragraphs[4:]
		elif term in ('3', '4'):
			# join first paragraphs until " hodine" ending is found
			# and add trailing underscores to mark the header
			p = ''
			while True:
				p += ' ' + paragraphs.pop(0)
				if p.endswith('hodine'): break
			if paragraphs[0].startswith('___'):
				paragraphs.pop(0)
			paragraphs.insert(0, p + '\n___')

		# extract speeches from the debate
		speeches = []
		text = ''
		within_scene = False
		for par in paragraphs:
			par = par.replace('\n', ' ').strip()
			if not par: continue

			# fix last scene
			if re.search(r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.', par):
				if not par[0] in ('(', '[', '/'):
					par = '(%s)' % par

			# convert brackets to parentheses
			par = re.sub(r'\[(.*?)\]', r'(\1)', par)
			# slash pairs are converted to parentheses too in term 1
			if term == '1':
				par = re.sub(r'(^|\s)/(.*?)/(\s|$)', r'\1(\2)\3', par)
			# convert all inner nested parentheses to brackets
			n = 1
			while n >= 1:
				(par, n) = re.subn(r'\((.*?)\((.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

			# process eventual multiparagraph scene
			if par.startswith('(') and par.count('(') > par.count(')'):
				# save eventual previous speech
				insert_speech('speech')

				text = '<p>%s</p>' % par[1:]
				within_scene = True
				continue
			if within_scene:
				if par.endswith(')') and par.count(')') > par.count('('):
					text += '\n\n<p>%s</p>' % par[:-1]
					insert_speech('scene')
					within_scene = False
				else:
					text += '\n\n<p>%s</p>' % par
				continue

			# process eventual header
			header_pattern = r'((\(?(\d+)\.\)?\s+schôdz)|slávnostn).*?(\d+)\..*\b(\w{3,})\s+(\d{4})(.*?)_{3,}$'
			hd = re.search(header_pattern, par, re.DOTALL)
			if hd:
				# save eventual previous speech
				insert_speech('speech')

				sk_date = '%s. %s %s' % (hd.group(4), hd.group(5), hd.group(6))
				initial_time = re.search(r'\s+o\s+(.*?)\s+hodine', hd.group(7), re.DOTALL)
				if initial_time and initial_time.group(1) != '??':
					h, m = initial_time.group(1).strip('.').split('.')
					date = sk_to_utc(sk_date + ' %s:%s:00' % (h.strip().zfill(2), m.strip().zfill(2)))
				else:
					date = sk_to_utc(sk_date) + 'T00:00:00'

				if hd.group(1).startswith('sláv'):
					new_session_name = 'Mimoriadna schôdza'
					if term == '1':
						new_session_identifier = debate['časť']
					elif term == '2':
						new_session_identifier = '1000'
					else:
						sl = parse.session_list(term)
						d = '%s. %s. %s' % (int(date[8:10]), int(date[5:7]), int(date[0:4]))
						new_session_identifier = next((s['číslo'] for s in sl['_items'] if s['trvanie'] == d))
				else:
					new_session_name = '%s. schôdza' % hd.group(3)
					new_session_identifier = hd.group(3)

				if new_session_identifier != session_identifier:
					# create new session event
					session = {
						'name': new_session_name,
						'identifier': new_session_identifier,
						'organization_id': chamber_id,
						'type': 'session',
						'start_date': date,
					}
					key = ('organization_id', 'type', 'identifier')
					session_id, _ = get_or_create('events', session, key)
					session_identifier = new_session_identifier
					session_end_date = date
					sitting_count = 0

				# create new sitting event
				sitting_count += 1
				sitting = {
					'name': '%s. deň rokovania, %s' % (sitting_count, sk_date),
					'identifier': str(sitting_count),
					'organization_id': chamber_id,
					'type': 'sitting',
					'start_date': date,
					'parent_id': session_id,
				}
				key = ('parent_id', 'type', 'identifier')
				sitting_id, created = get_or_create('events', sitting, key)
				sitting_end_date = date
				position = 0

				# delete existing speeches of the sitting
				if not created:
					obsolete = vpapi.getall('speeches', where={'event_id': sitting_id})
					for speech in obsolete:
						vpapi.delete('speeches', speech['id'])
				continue

			# process eventual start of a speech
			if date < '2001-09-04':
				# format `Foreign minister J. Doe:`
				speech_start_pattern = r'(.*?)\b([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+):$'
			else:
				# format `J. Doe, foreign minister: speech`
				speech_start_pattern = r'([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+),\s+(.+?):(.+)$'
			sp = re.match(speech_start_pattern, par, re.DOTALL)
			if sp:
				# save eventual previous speech
				insert_speech('speech')

				# identify speaker
				if date < '2001-09-04':
					name = '%s. %s' % (sp.group(2), sp.group(5))
					if (sp.group(4)):
						name = name.replace(' ', ' %s. ' % sp.group(4))
					attribution = sp.group(1)
					par = ''
				else:
					name = '%s. %s' % (sp.group(1), sp.group(4))
					if (sp.group(3)):
						name = name.replace(' ', ' %s. ' % sp.group(3))
					attribution = sp.group(5)
					par = sp.group(6)

				if name in name_corrections:
					name = name_corrections[name]
				attribution = attribution[0].lower() + attribution[1:].strip()
				speaker_id = mps.get(name)

				# create unknown speakers
				if not speaker_id:
					logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
					name_parts = re.match(r'(\w)\. ((\w)\. )?(\w+)', name)
					person = {
						'name': name,
						'family_name': name_parts.group(4),
						'given_name': name_parts.group(1)
					}
					person['sort_name'] = '%s, %s.' % (person['family_name'], person['given_name'])
					if name_parts.group(3):
						person['additional_name'] = name_parts.group(3)
						person['sort_name'] += ' %s.' % person['additional_name']
					resp = vpapi.post('people', person)
					speaker_id = resp['id']
					mps[name] = speaker_id

			# recognize date(-time) stamps in transcripts
			ds = re.match(r'^\s*(\d+\.\s\w+\s\d{4})(.*hodine)?\s*$', par)
			if ds:
				dt = ds.group(1).strip()
				tm = re.search(r'o\s+(.*?)\s+', ds.group(2) or '')
				try:
					if tm:
						h, m = tm.group(1).strip('.').split('.')
						date = sk_to_utc('%s %s:%s:00' % (dt, h.strip().zfill(2), m.strip().zfill(2)))
					else:
						date = sk_to_utc(dt) + 'T00:00:00'
					continue
				except ValueError:
					pass

			# process eventual scene in this paragraph
			scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
			while True:
				scene = re.match(scene_pattern, par, re.DOTALL)
				if not scene: break
				if scene.group(1):
					text += '\n\n<p>%s</p>' % scene.group(1).strip()
				insert_speech('speech')
				text = '<p>%s</p>' % scene.group(2).strip()
				insert_speech('scene')
				par = scene.group(3)

			if par:
				text += '\n\n<p>%s</p>' % par.strip()

		insert_speech('speech')

		# extract end time of the session
		final_time = re.search(
			r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.',
			speeches[-1]['text'])
		if final_time:
			tm = final_time.group(2)
			tm = tm.replace('O', '0').replace(',', '.')
			h, m = tm.strip('.').split('.')
			final_date = '%s.%s.%s %s:%s:00' % (date[8:10], date[5:7], date[0:4], h.strip().zfill(2), m.strip().zfill(2))
			final_date = sk_to_utc(final_date)
			vpapi.patch('events', session_id, {'end_date': final_date})
			vpapi.patch('events', sitting_id, {'end_date': final_date})

		vpapi.post('speeches', speeches)
		logging.info('Scraped %s speeches' % len(speeches))
		speech_count += len(speeches)

	logging.info('Scraped %s speeches in total' % speech_count)
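Both debate scrapers lean on a `sk_to_utc` helper that is not shown. From its call sites it parses Slovak-formatted timestamps (`3. 2. 2010`, optionally followed by `HH:MM:SS`) and returns an ISO 8601 string, date-only when no time part is given. A sketch under those assumptions, using `pytz` for the Europe/Bratislava zone:

	from datetime import datetime
	import pytz

	def sk_to_utc(dt_str):
		# Assumed behaviour: date-only inputs are reformatted as-is,
		# date-time inputs are converted from local time to UTC.
		if ':' in dt_str:
			local = pytz.timezone('Europe/Bratislava').localize(
				datetime.strptime(dt_str, '%d. %m. %Y %H:%M:%S'))
			return local.astimezone(pytz.utc).strftime('%Y-%m-%dT%H:%M:%S')
		return datetime.strptime(dt_str, '%d. %m. %Y').strftime('%Y-%m-%d')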
Code example #25
def scrape_motions(term):
	"""Scrape and save motions from the given term that are not scraped
	yet starting from the oldest ones. One Motion item, one VoteEvent
	item and many Vote items are created for each scraped motion detail
	page.

	Returns number of scraped motions.
	"""
	logging.info('Scraping motions of term `%s`' % term)

	# prepare mappings from source identifier to id for MPs and parliamentary groups
	chamber_id = get_chamber_id(term)

	people = vpapi.getall('people', projection={'identifiers': 1})
	mps = {mp['identifiers'][0]['identifier']: mp['id'] for mp in people if 'identifiers' in mp}

	orgs = vpapi.getall('organizations', where={'classification': 'parliamentary group', 'parent_id': chamber_id})
	parl_groups = {c['name']: c['id'] for c in orgs}

	# add differently spelled parliamentary groups
	group_corrections = {
		'2': {
			'Klub HZDS': 'Klub ĽS-HZDS',
			'Klub SMK': 'Klub SMK-MKP',
			'Klub Nezávislí': 'Klub Nezávislý',
		},
		'3': {
			'Klub HZDS': 'Klub ĽS-HZDS',
			'Klub SDKÚ': 'Klub SDKÚ-DS',
			'Klub Smer': 'Klub SMER-SD',
			'Klub Smer-SD': 'Klub SMER-SD',
			'Klub KNP': 'Klub nezávislých poslancov NR SR',
			'Klub Nezávislý': 'Klub nezávislých poslancov NR SR',
		},
	}
	for k, v in group_corrections.get(term, {}).items():
		parl_groups[k] = parl_groups[v]

	# prepare list of sessions that are not completely scraped yet
	sessions_to_scrape = []
	session_list = parse.session_list(term)
	for session in session_list['_items']:
		motions = parse.session(session['číslo'], term)
		if len(motions['_items']) == 0: continue
		last_motion_id = motions['_items'][-1]['id']
		m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % last_motion_id
		existing = vpapi.getfirst('motions', where={'sources.url': m_url})
		if existing: break
		sessions_to_scrape.append((session, motions))

	# scrape motions from those sessions
	scraped_motions_count = 0
	for s, motions in reversed(sessions_to_scrape):
		logging.info('Scraping session `%s`' % s['názov'])

		# insert the session event unless it already exists
		session = {
			'name': s['názov'],
			'identifier': s['číslo'],
			'organization_id': chamber_id,
			'type': 'session',
		}
		try:
			session['start_date'] = sk_to_utc(s['trvanie']) + 'T00:00:00'
			session['end_date'] = session['start_date']
		except ValueError:
			# a multi-day session; its start and end dates are set by the debates scraping
			pass
		key = ('organization_id', 'type', 'identifier')
		session_id, _ = get_or_create('events', session, key)

		for i, m in enumerate(motions['_items']):
			# check if the motion is already present
			m_id = re.search(r'ID=(\d+)', m['url']['výsledok']).group(1)
			# do not use m['url']['kluby'] directly because it is not always present
			m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % m_id
			existing = vpapi.getfirst('motions', where={'sources.url': m_url})
			if existing: continue

			try:
				motion_id = None
				vote_event_id = None

				# insert motion
				logging.info('Scraping motion %s of %s (voted at %s)' % (i+1, len(motions['_items']), m['dátum']))
				parsed_motion = parse.motion(m['id'])
				motion = {
					'organization_id': chamber_id,
					'legislative_session_id': session_id,
					'identifier': parsed_motion['číslo'],
					'text': parsed_motion['názov'],
					'date': sk_to_utc(m['dátum']),
					'sources': [{
						'url': parsed_motion['url'],
						'note': 'Hlasovanie na webe NRSR'
					}],
				}
				if 'výsledok' in parsed_motion:
					motion['result'] = 'pass' if parsed_motion['výsledok'] == 'Návrh prešiel' else 'fail'
				resp = vpapi.post('motions', motion)
				motion_id = resp['id']

				# insert vote event
				vote_event = {
					'motion_id': motion_id,
					'organization_id': chamber_id,
					'legislative_session_id': session_id,
					'identifier': parsed_motion['číslo'],
					'start_date': motion['date'],
					'sources': [{
						'url': parsed_motion['url'],
						'note': 'Hlasovanie na webe NRSR'
					}],
				}
				if 'výsledok' in parsed_motion:
					vote_event['result'] = motion['result']
				if 'súčty' in parsed_motion:
					options = {
						'yes': '[z] za',
						'no': '[p] proti',
						'abstain': '[?] zdržalo sa',
						'absent': '[0] neprítomní',
						'not voting': '[n] nehlasovalo'
					}
					vote_event['counts'] = [
						{'option': o, 'value': int(parsed_motion['súčty'][key])}
						for o, key in options.items() if parsed_motion['súčty'][key] != ''
					]
					if len(vote_event['counts']) == 0:
						del vote_event['counts']
				resp = vpapi.post('vote-events', vote_event)
				vote_event_id = resp['id']

				# insert votes
				if 'hlasy' in parsed_motion and len(parsed_motion['hlasy']) > 0:
					vote_options = {
						'z': 'yes',
						'p': 'no',
						'?': 'abstain',
						'n': 'not voting',
						'0': 'absent'
					}
					votes = []
					for v in parsed_motion['hlasy']:
						# skip MPs not applying their mandate
						if v['hlas'] == '-': continue
						pg = normalize_parlgroup_name(v['klub'])
						votes.append({
							'vote_event_id': vote_event_id,
							'option': vote_options[v['hlas']],
							'voter_id': mps.get(v['id']),
							'group_id': parl_groups.get(pg),
						})
					if len(votes) > 0:
						resp = vpapi.post('votes', votes)

			# delete incomplete data if insertion of the motion, vote event or votes failed
			except:
				if motion_id:
					vpapi.delete('motions', motion_id)
				if vote_event_id:
					vpapi.delete('vote-events', vote_event_id)
				raise

			scraped_motions_count += 1

	logging.info('Scraped %s motions of term `%s`' % (scraped_motions_count, term))
	return scraped_motions_count
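A minimal driver sketch for the function above. The 'sk/nrsr' parliament code and the vpapi.authorize/vpapi.timezone calls appear in other examples on this page; the authentication module and the term values are assumptions:

import logging
import vpapi
import authentication  # assumed to provide username and password, as in the test.py example below

vpapi.parliament('sk/nrsr')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Bratislava')  # assumption: the scraper works in Slovak local time

logging.basicConfig(level=logging.INFO)
for term in ('2', '3'):  # the terms listed in group_corrections above
    scrape_motions(term)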
Code example #26
    def test_ids(self):
        committees_ids = {}
        # the classification string "committe" is kept as spelled in the source data
        all_committees = vpapi.getall("organizations", where={"classification": "committe"})
        for committee in all_committees:
            committees_ids[committee["identifiers"][0]["identifier"]] = committee["id"]
        print len(committees_ids)
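All of these snippets build lookup tables keyed on identifiers[0]. The documents returned by the Visegrad+ API follow Popolo conventions, roughly the shape below (field values are illustrative, not real data; the 'nrsr.sk' scheme is taken from the snippet that follows):

example_organization = {
    'id': 'o-123',  # internal API id (made up for illustration)
    'name': 'Some committee',
    'classification': 'committee',
    'identifiers': [
        {'identifier': '87', 'scheme': 'nrsr.sk'},  # id of the body on the source site
    ],
}

# the mapping idiom used throughout these examples
ids = {o['identifiers'][0]['identifier']: o['id'] for o in [example_organization]}
assert ids == {'87': 'o-123'}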
Code example #27
	def scrape_from_group_and_save(group_type, id, term):
		"""Scrape memberships in a given group and save (or update) them.
		If group or MP referred by the membership does not exist, scrape
		and save it/him/her.
		"""
		group = parse.group(group_type, id)

		# if group is not scraped yet, scrape and save it
		g = vpapi.getfirst('organizations',
			where={
				'classification': group_type,
				'identifiers': {'$elemMatch': {'identifier': id, 'scheme': 'nrsr.sk'}}},
			projection={'id': 1})
		if g:
			oid = g['id']
		else:
			o = Organization.scrape(group_type, id)
			oid = o.save()

		roles = {
			'člen': 'member',
			'členka': 'member',
			'predseda': 'chairman',
			'predsedníčka': 'chairwoman',
			'podpredseda': 'vice-chairman',
			'podpredsedníčka': 'vice-chairwoman',
			'vedúci': 'chairman',
			'vedúca': 'chairwoman',
			'náhradník': 'substitute',
			'náhradníčka': 'substitute',
			'overovateľ': 'verifier',
			'overovateľka': 'verifier',
			'poverený vedením klubu': 'chairman',
			'podpredseda poverený vedením výboru': 'vice-chairman',
			'náhradný člen': 'substitute',
			'náhradná členka': 'substitute',
		}

		for member in group['členovia']:
			logging.info('Scraping membership of `%s`' % member['meno'])

			# if member MP is not scraped yet, scrape and save him
			existing = vpapi.getfirst('people',
				where={'identifiers': {'$elemMatch': {'identifier': member['id'], 'scheme': 'nrsr.sk'}}},
				projection={'id': 1})
			if existing:
				pid = existing['id']
			else:
				p = Person.scrape(member['id'], term)
				pid = p.save()

			m = Membership()
			m.person_id = pid
			m.organization_id = oid
			m.sources = [{
				'url': group['url'],
				'note': 'Profil na webe NRSR'
			}]
			# create or update all periods of the membership
			for period in member['obdobia']:
				if period.get('rola'):
					m.label = period['rola'].capitalize() + ' v skupine ' + group['názov']
					m.role = roles[period['rola'].lower()]
				else:
					m.label = 'V skupine ' + group['názov']
				if period.get('od'):
					m.start_date = sk_to_utc(period.get('od'))
				if period.get('do'):
					m.end_date = sk_to_utc(period.get('do'))
				m.save()
				for attr in ('role', 'start_date', 'end_date'):
					if hasattr(m, attr):
						delattr(m, attr)
		logging.info('Scraped %s memberships' % len(group['členovia']))

		# close all open memberships in this group that were not updated
		logging.info('Closing open memberships that were not updated')
		present = datetime.utcnow() - timedelta(minutes=10)
		query = {
			'organization_id': oid,
			'$or': [{'end_date': {'$exists': False}}, {'end_date': {'$in': [None, '']}}],
			'updated_at': {'$lt': present.isoformat()}
		}
		to_close = vpapi.getall('memberships', where=query)
		for m in to_close:
			# effective_date and datestring_add come from the surrounding module (not shown in this snippet)
			vpapi.patch('memberships', m['id'], {'end_date': datestring_add(effective_date, -1)})
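A hedged usage sketch for the function above; the group type string and both ids are hypothetical, since the values accepted by parse.group are not shown in this snippet:

# scrape and save all memberships of the group with source id '116' in term '6'
scrape_from_group_and_save('committee', '116', '6')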
Code example #28
# the head of this helper was truncated in the source; the first branches
# below are a plausible reconstruction (an assumption, not the original code)
def vote2vote(vote):
  if vote == 'yes':
    return 1
  if vote == 'no':
    return -1
  if vote == 'abstain':
    return -1
  else:
    return 0

answers = {}
groups = {}
mps = {}

vpapi.parliament('sk/nrsr')
# ves is a list of vote-event source ids, defined earlier in the original script (not shown)
for ve in ves:
    print(ve)
    vedb = vpapi.get("vote-events", where={"sources.url": {"$regex": "ID=" + ve + "$"}})
    idd = vedb['_items'][0]['id']
    r = vpapi.getall("votes", where={"vote_event_id": idd})
    for row in r:
        if row['voter_id'] not in answers:
            answers[row['voter_id']] = {"vote": {}}
        answers[row['voter_id']]['vote'][ve] = vote2vote(row['option'])
        if row['group_id'] is not None:
            if row['group_id'] not in groups:
                group = vpapi.get("organizations/" + row['group_id'])
                groups[row['group_id']] = {"name": group['name'].replace('Klub ', '')}
                groups[row['group_id']]['slug'] = slugify.slugify(groups[row['group_id']]['name'])
                print(groups[row['group_id']]['slug'])
            answers[row['voter_id']]['friendly_name'] = groups[row['group_id']]['slug']
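After the loop, answers maps a voter's API id to one numeric answer per vote event plus the slug of the voter's parliamentary group, roughly (ids and values made up for illustration):

answers = {
    'p-42': {
        'vote': {'31415': -1, '27182': 0},  # vote-event source id -> vote2vote() value
        'friendly_name': 'smer-sd',         # slug of the MP's group, 'Klub ' prefix stripped
    },
}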
Code example #29
    def export_speeches(self):
        speeches = self.load_json('speeches')
        people = {}
        prefix_regex = re.compile(
            ur'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|(generalni sekretar )',
            re.U)

        for p in vpapi.getall('people'):
            name = self.normalize_name(p['name'])
            people[name] = p['id']

        for speech in speeches:
            session_id = speech.get('event_id')
            speech['event_id'] = self.events_ids[session_id]
            url = speech['sources'][0]['url']
            if url.endswith('.pdf'):
                parsed_speeches = self.download_pdf(url)
                for n, s in enumerate(parsed_speeches):
                    text_speech = speech.copy()
                    text_speech['text'] = s['text']
                    text_speech['position'] = n + 1
                    text_speech['type'] = 'speech'

                    creator = self.normalize_name(s['creator'])
                    creator = prefix_regex.sub('', creator)

                    if creator in people:
                        text_speech['creator_id'] = people[creator]
                    else:
                        creator_id = None

                        for name in people:
                            if name in creator:
                                creator_id = people[name]
                                break

                        if creator_id is None:
                            resp = vpapi.getfirst('people',
                                                  where={
                                                      'name': {
                                                          '$regex':
                                                          s['creator'],
                                                          'options': 'i'
                                                      }
                                                  })
                            if resp is None:
                                self.log(
                                    'Person "%(creator)s" not found. Creating one' % s,
                                    WARNING)
                                item = {
                                    'name': s['creator'],
                                    'sources': text_speech['sources']
                                }
                                resp = vpapi.post('people', item)
                            creator_id = resp['id']

                        people[creator] = creator_id
                        text_speech['creator_id'] = creator_id

                    self.get_or_create('speeches',
                                       text_speech,
                                       where_keys=['event_id', 'position'])
            else:
                self.get_or_create('speeches', speech)
Code example #31
File: votes.py Project: KohoVolit/scraper-psp.cz
            votesli = []
            existingvotes = {}
#            terms = {}
            for rowp in hl_poslanec:
#                print(rowp)

                identifier = rowp[1].strip()
                if identifier not in voteevents:
                    voteevents[identifier] = vpapi.get('vote-events', where={'identifier': identifier})
                r_voteevent = voteevents[identifier]

                vote_event_id = r_voteevent["_items"][0]["id"]
                if vote_event_id not in existingvotes:
                    rex = vpapi.getall('votes', where={"vote_event_id": vote_event_id})
                    ids = []
                    for rowx in rex:
                        ids.append(rowx['id'])
                    if len(ids) > 0:
                        existingvotes[vote_event_id] = True
                        if len(ids) < 200:
                            print(vote_event_id + ": " + str(len(ids)))
                    else:
                        existingvotes[vote_event_id] = False
                    print(vote_event_id + ": " + str(len(ids)))

#                print(existingvotes)
                if not existingvotes[vote_event_id]:

                    try:
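The snippet above memoizes vpapi lookups in plain dicts so that each identifier is fetched only once. The same pattern, factored into a reusable helper (a sketch, not code from the original project):

def cached_get(cache, key, fetch):
    # memoize fetch(key) in the given dict cache
    if key not in cache:
        cache[key] = fetch(key)
    return cache[key]

# e.g.:
# r_voteevent = cached_get(voteevents, rowp[1].strip(),
#                          lambda ident: vpapi.get('vote-events', where={'identifier': ident}))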
Code example #32
File: test.py Project: KohoVolit/scraper-psp.cz
import scrapeutils
import vpapi
import authentication

vpapi.parliament('cz/psp')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

votes = {}
for vote_event in vpapi.getall("vote-events"):
    votes[vote_event['id']] = []
print(len(votes))

i = 0
for vote in vpapi.getall("votes"):
    if i % 100 == 0:
        print(i)
    # group each vote under the vote event it belongs to
    votes[vote['vote_event_id']].append(vote)
    i += 1
print(len(votes))

# sanity check: every vote event should hold a complete set of votes
for ve_id in votes:
    if len(votes[ve_id]) not in (200, 400):
        print(ve_id + ": " + str(len(votes[ve_id])))

Code example #34
                    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)

logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2')
db_log = vpapi.post('logs', {
    'status': 'running',
    'file': logname,
    'params': []
})

vpapi.parliament('cz/senat')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

o2id = {}
organizations = vpapi.getall("organizations")
for org in organizations:
    o2id[org['name']] = org['id']

p2id = {}
persons = vpapi.getall('people')
for p in persons:
    p2id[p['name']] = p['id']


def pp2id(name, date, p2id):
    if name == 'Jiří Dienstbier':
        if date < '2011-01-08':
            return '218'
        else:
            return '253'
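The pp2id helper disambiguates people who served under the same name; the branch shown maps Jiří Dienstbier to two different person ids depending on the date (the rest of the function is cut off in the source). Usage, following directly from the code above:

assert pp2id('Jiří Dienstbier', '2010-06-18', p2id) == '218'
assert pp2id('Jiří Dienstbier', '2011-03-20', p2id) == '253'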