Example 1
0
 def download_get_basefiles(self, depts):
     """Query the remote search form once per department and yield
     ``(basefile, document_url)`` tuples for every search hit.

     :param depts: iterable of department names (str), sent to the
                   search form latin-1 encoded
     """
     # Compile once, outside both loops (the original recompiled a
     # non-raw "(\\d+):(\\d+)" pattern per hit, which is both slower
     # and an invalid-escape-sequence warning on modern Python).
     basefile_rx = re.compile(r"(\d+):(\d+)")
     for dept in depts:
         resp = requests.post(urljoin(self.start_url, 'sql_search_rsp.asp'),
                              {'departement': dept.encode('latin-1'),
                               'kom_nr': '',
                               'title': '',
                               'ACTION': '  SÖK  '.encode('latin-1')})
         soup = BeautifulSoup(resp.text, "lxml")
         hits = list(soup.find_all(True, text=re.compile(r'(\d{4}:\d+)')))
         # lazy %-args: the message is only formatted if DEBUG is enabled
         self.log.debug("Searching for dept %s, %d results", dept, len(hits))
         for hit in hits:
             link = hit.find_parent("a")
             # convert 2006:02 to 2006:2 for consistency
             segments = basefile_rx.search(link.text).groups()
             basefile = ":".join([str(int(x)) for x in segments])
             # we use link.absolute_url rather than relying on our
             # own basefile -> url code in remote_url. It seems
             # that in least one case the URL formatting rule is
             # not followed by the system...
             yield basefile, urljoin(self.start_url, link['href'])
Example 2
0
    def extract_metadata(self, rawhead, basefile):
        """Extract document metadata (title, identifier, issue date,
        responsible department, summary and related documents) from the
        parsed page head.

        :param rawhead: BeautifulSoup element for the page header
        :param basefile: the document's basefile, e.g. "2008:50"
        :returns: dict of metadata keyed by CURIE (dcterms:/rpubl:/rdfs:)
        """
        content = rawhead
        title = content.find("h1").string
        identifier_node = content.find("p", "lead")
        if identifier_node:
            identifier = identifier_node.text
        else:
            identifier = ""  # infer_metadata calls infer_identifier
                             # if this is falsy, which will be good
                             # enough. No need to warn.
        definitions = content.find("dl", "definitions")
        ansvarig = None
        # FIX: utgiven was previously only bound inside the loop below;
        # a page without an "Utgiven:"/"Publication date:" entry (or
        # without a definitions <dl> at all) raised NameError at the
        # a.update(...) call at the bottom of this method.
        utgiven = None
        if definitions:
            for dt in definitions.find_all("dt"):
                key = dt.get_text(strip=True)
                dd = dt.find_next_sibling("dd")
                if dd is None:
                    # malformed <dl>: a <dt> with no following <dd>
                    # previously crashed with AttributeError -- skip it
                    continue
                value = dd.get_text(strip=True)
                if key in ("Utgiven:", "Publication date:"):
                    utgiven = self.parse_swedish_date(value)
                elif key in ("Avsändare:",):
                    ansvarig = value

        sammanfattning = None
        if content.find("h2", text="Sammanfattning"):
            sums = content.find("h2", text="Sammanfattning").find_next_siblings("p")
            # "\n\n" doesn't seem to survive being stuffed in a rdfa
            # content attribute. Replace with simple space.
            sammanfattning = " ".join([x.get_text(strip=True) for x in sums])

        # find related documents
        re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
        # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss,
        # 4=proposition. Assume that relationships between documents
        # are reciprocal (ie if the page for a Kommittedirektiv
        # references a Proposition, the page for that Proposition
        # references the Kommittedirektiv.
        elements = {self.KOMMITTEDIREKTIV: [],
                    self.DS: ["legStep1"],
                    self.PROPOSITION: ["legStep1", "legStep2"],
                    self.SOU: ["legStep1"]}[self.document_type]
        utgarFran = []
        for elementid in elements:
            box = content.find(id=elementid)
            if not box:
                continue
            for listitem in box.find_all("li"):
                if not listitem.find("span", "info"):
                    continue
                infospans = [x.text.strip()
                             for x in listitem.find_all("span", "info")]

                rel_basefile = None
                rel_identifier = None

                for infospan in infospans:
                    # search once and reuse the match (was searched twice)
                    m = re_basefile.search(infospan)
                    if m:
                        # scrub rel_identifier ("Dir. 2008:50" -> "2008:50" etc)
                        rel_basefile = m.group()
                        rel_identifier = infospan

                if not rel_basefile:
                    # this often means that a non-standard document
                    # type is used as preparatory work for this
                    # document (eg department memos not published in
                    # Ds, like "S2013/8074/PBB" -- seems to be common
                    # in Socialdepartementet and Finansdepartementet)
                    self.log.warning(
                        "%s: Couldn't find rel_basefile (elementid #%s) among %r" % (basefile, elementid, infospans))
                    continue

                attribs = {"rpubl:arsutgava": basefile.split(":")[0],
                           "rpubl:lopnummer": basefile.split(":")[1]}
                if elementid == "legStep1":
                    attribs["rdf:type"] = RPUBL.Kommittedirektiv
                elif elementid == "legStep2":
                    attribs["rdf:type"] = RPUBL.Utredningsbetankande
                    if rel_identifier.startswith("SOU"):
                        altlabel = "SOU"
                    elif rel_identifier.startswith(("Ds", "DS")):
                        altlabel = "Ds"
                    else:
                        self.log.warning(
                            "%s: Cannot find out what type of document the linked %s is (#%s)" % (basefile, rel_identifier, elementid))
                        continue
                    attribs["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
                elif elementid == "legStep3":
                    attribs["rdf:type"] = RPUBL.Proposition
                uri = self.minter.space.coin_uri(self.attributes_to_resource(attribs))
                utgarFran.append(uri)

        # find related pages
        related = content.find("h2", text="Relaterat")
        seealso = []
        if related:
            for link in related.findParent("div").find_all("a"):
                r = urljoin("http://www.regeringen.se/", link["href"])
                seealso.append(URIRef(r))

        a = self.metadata_from_basefile(basefile)
        a.update({'dcterms:title': title,
                  'dcterms:identifier': identifier,
                  'dcterms:issued': utgiven,
                  'rpubl:utgarFran': utgarFran
        })
        if ansvarig:
            a["rpubl:departement"] = ansvarig
        if seealso:
            a["rdfs:seeAlso"] = seealso
        if sammanfattning:
            a['dcterms:abstract'] = sammanfattning
        return a
Example 3
0
    def extract_metadata(self, rawhead, basefile):
        """Extract document metadata (title, identifier, issue date,
        responsible department, summary and related documents) from the
        parsed page head.

        :param rawhead: BeautifulSoup element for the page header
        :param basefile: the document's basefile, e.g. "2008:50"
        :returns: dict of metadata keyed by CURIE (dcterms:/rpubl:/rdfs:)
        """
        content = rawhead
        title = content.find("h1").string
        identifier_node = content.find("p", "lead")
        if identifier_node:
            identifier = identifier_node.text
        else:
            identifier = ""  # infer_metadata calls infer_identifier
                             # if this is falsy, which will be good
                             # enough. No need to warn.
        definitions = content.find("dl", "definitions")
        ansvarig = None
        # FIX: utgiven was previously only bound inside the loop below;
        # a page without an "Utgiven:"/"Publication date:" entry (or
        # without a definitions <dl> at all) raised NameError at the
        # a.update(...) call at the bottom of this method.
        utgiven = None
        if definitions:
            for dt in definitions.find_all("dt"):
                key = dt.get_text(strip=True)
                dd = dt.find_next_sibling("dd")
                if dd is None:
                    # malformed <dl>: a <dt> with no following <dd>
                    # previously crashed with AttributeError -- skip it
                    continue
                value = dd.get_text(strip=True)
                if key in ("Utgiven:", "Publication date:"):
                    utgiven = self.parse_swedish_date(value)
                elif key in ("Avsändare:",):
                    ansvarig = value

        sammanfattning = None
        if content.find("h2", text="Sammanfattning"):
            sums = content.find("h2", text="Sammanfattning").find_next_siblings("p")
            # "\n\n" doesn't seem to survive being stuffed in a rdfa
            # content attribute. Replace with simple space.
            sammanfattning = " ".join([x.get_text(strip=True) for x in sums])

        # find related documents
        re_basefile = re.compile(r'\d{4}(|/\d{2,4}):\d+')
        # legStep1=Kommittedirektiv, 2=Utredning, 3=lagrådsremiss,
        # 4=proposition. Assume that relationships between documents
        # are reciprocal (ie if the page for a Kommittedirektiv
        # references a Proposition, the page for that Proposition
        # references the Kommittedirektiv.
        elements = {self.KOMMITTEDIREKTIV: [],
                    self.DS: ["legStep1"],
                    self.PROPOSITION: ["legStep1", "legStep2"],
                    self.SOU: ["legStep1"]}[self.document_type]
        utgarFran = []
        for elementid in elements:
            box = content.find(id=elementid)
            if not box:
                continue
            for listitem in box.find_all("li"):
                if not listitem.find("span", "info"):
                    continue
                infospans = [x.text.strip()
                             for x in listitem.find_all("span", "info")]

                rel_basefile = None
                rel_identifier = None

                for infospan in infospans:
                    # search once and reuse the match (was searched twice)
                    m = re_basefile.search(infospan)
                    if m:
                        # scrub rel_identifier ("Dir. 2008:50" -> "2008:50" etc)
                        rel_basefile = m.group()
                        rel_identifier = infospan

                if not rel_basefile:
                    # this often means that a non-standard document
                    # type is used as preparatory work for this
                    # document (eg department memos not published in
                    # Ds, like "S2013/8074/PBB" -- seems to be common
                    # in Socialdepartementet and Finansdepartementet)
                    self.log.warning(
                        "%s: Couldn't find rel_basefile (elementid #%s) among %r" % (basefile, elementid, infospans))
                    continue

                attribs = {"rpubl:arsutgava": basefile.split(":")[0],
                           "rpubl:lopnummer": basefile.split(":")[1]}
                if elementid == "legStep1":
                    attribs["rdf:type"] = RPUBL.Kommittedirektiv
                elif elementid == "legStep2":
                    attribs["rdf:type"] = RPUBL.Utredningsbetankande
                    if rel_identifier.startswith("SOU"):
                        altlabel = "SOU"
                    elif rel_identifier.startswith(("Ds", "DS")):
                        altlabel = "Ds"
                    else:
                        self.log.warning(
                            "%s: Cannot find out what type of document the linked %s is (#%s)" % (basefile, rel_identifier, elementid))
                        continue
                    attribs["rpubl:utrSerie"] = self.lookup_resource(altlabel, SKOS.altLabel)
                elif elementid == "legStep3":
                    attribs["rdf:type"] = RPUBL.Proposition
                uri = self.minter.space.coin_uri(self.attributes_to_resource(attribs))
                utgarFran.append(uri)

        # find related pages
        related = content.find("h2", text="Relaterat")
        seealso = []
        if related:
            for link in related.findParent("div").find_all("a"):
                r = urljoin("http://www.regeringen.se/", link["href"])
                seealso.append(URIRef(r))

        a = self.metadata_from_basefile(basefile)
        a.update({'dcterms:title': title,
                  'dcterms:identifier': identifier,
                  'dcterms:issued': utgiven,
                  'rpubl:utgarFran': utgarFran
        })
        if ansvarig:
            a["rpubl:departement"] = ansvarig
        if seealso:
            a["rdfs:seeAlso"] = seealso
        if sammanfattning:
            a['dcterms:abstract'] = sammanfattning
        return a