Ejemplo n.º 1
0
def parse_index(local_name):
    """Parse a saved index page and collect the biography links it contains.

    Args:
        local_name: Path of the locally stored index HTML file.

    Returns:
        A list with one entry per anchor whose ``href`` contains
        "/abgeordnete/biografien/daten", each passed through
        ``get_complete_url(base_url, href)``; ``False`` if the file cannot
        be opened.
    """
    try:
        f = open(local_name, 'rt')
    except IOError:
        # A single-argument print(...) produces the same output whether it
        # is parsed as a statement (Python 2) or a function call (Python 3).
        print("ERROR opening " + local_name)
        return False

    # Make sure the handle is closed even when reading fails.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)

    urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        # Only anchors without an href are skipped.  The former exact-type
        # test (``type(href) is str``) also rejected unicode attribute
        # values on Python 2.
        if href is not None and "/abgeordnete/biografien/daten" in href:
            urls.append(get_complete_url(base_url, href))

    return urls
Ejemplo n.º 2
0
def parse_bio(url):
    """Parse a downloaded Bundestag biography page into an Abgeordneter.

    Args:
        url: Biography URL.  ``get_bio_filename(url)`` names the local file
            that is read; the URL itself is stored on the result after
            being completed against ``base_url``.

    Returns:
        A populated ``Abgeordneter`` instance, or ``False`` when the local
        filename is unavailable, the file cannot be opened, or the
        biography box / name heading is missing from the page.

    Side effects:
        Every membership heading encountered is added to the module-level
        ``member_types`` collection.
    """
    local_name = get_bio_filename(url)
    # The helper apparently reports failure by returning False.
    if local_name == False:
        return False

    try:
        f = open(local_name, 'rt')
    except IOError:
        print("ERROR opening " + local_name)
        return False

    # Make sure the handle is closed even when reading fails.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)

    bio = soup.find("div", "standardBox")
    if not isinstance(bio, Tag):
        print("ERROR: bio-box not found")
        return False

    # name
    h1 = bio.find("h1")
    if not isinstance(h1, Tag):
        print("ERROR: h1 name not found")
        return False

    # create a structure
    A = Abgeordneter()
    A.url = get_complete_url(base_url, url)
    A.gremium = u"Bundestag"
    A.period = u"16"

    # The heading is split at its last comma: the part before it is the
    # name, the part after it the party.  Without a comma only the name
    # is set.
    texts = h1.text
    idx = texts.rfind(",")
    if idx < 0:
        A.name = texts.strip()
    else:
        A.name = texts[:idx].strip()
        A.party = texts[idx + 1:].strip()

    # image url
    div = soup.find("div", "bildDivPortrait")
    if isinstance(div, Tag):
        img = div.find("img")
        if isinstance(img, Tag):
            src = img.get("src")
            if src is not None:
                A.img_url = get_complete_url(base_url, src)

    # Wahlkreis: a link into "/wahlkreise" whose text contains a colon.
    # If several links match, the last one wins.
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and "/wahlkreise" in href and ":" in a.text:
            A.wahlkreis = a.text.strip()

    # Non-empty tags following the name: the first is the occupation, the
    # second the birth line, all later ones are free-form statements.
    idx = 0
    for i in h1.next_siblings:
        if not isinstance(i, Tag):
            continue
        tex = i.text.strip()
        if not tex:
            continue
        if idx == 0:
            A.occupation = tex
        elif idx == 1:
            A.birth = tex
        else:
            A.addStatement(tex)
        idx = idx + 1

    # proceed with memberships
    for i in bio.next_siblings:
        if not isinstance(i, Tag):
            continue
        # membership box
        if "Mitglied" not in i.text:
            continue
        for div in i.find_all("div"):
            for h3 in div.find_all("h3"):
                tex = h3.text.strip()
                member_types.add(tex)
                if "Ordentlich" in tex:
                    # ordentliches Mitglied
                    memb = A.member[0]
                elif "Stellver" in tex:
                    # stellvertretendes Mitglied
                    memb = A.member[1]
                else:
                    # Funktion in Firmen, Verbaenden, etc.: collect the
                    # following <p> siblings, skipping those carrying the
                    # "kleinAbstand" class.
                    memb = []
                    for j in h3.next_siblings:
                        if isinstance(j, Tag) and j.name == "p":
                            classes = j.get("class")
                            if classes is None or "kleinAbstand" not in classes:
                                memb.append(j.text.strip())
                    if "vor der Mitgliedschaft" in tex:
                        A.member[3] += A.concat(memb)
                    else:
                        A.member[2] += A.concat(memb)
                    # Stop looking at further headings of this div.
                    break

                # scan ordentlich & stellvertretend: list items of every
                # <ul> sibling up to the next <h3>
                for j in h3.next_siblings:
                    if not isinstance(j, Tag):
                        continue
                    if j.name == "h3":
                        break
                    if j.name == "ul":
                        for k in j.find_all("li"):
                            memb.append(k.text.strip())

    return A
Ejemplo n.º 3
0
# Collect the biography URLs from all 26 index pages; a page that fails
# to parse is reported and skipped.
urls = []
for i in range(26):
    urlss = parse_index(get_index_url(i), get_index_filename(i))
    if urlss == False:
        print("parsing failed")
        continue
    urls.extend(urlss)

# Optionally fetch every biography into its local file; the script quits
# as soon as no local filename can be determined for a URL.
if do_download == True:
    for url in urls:
        fn = get_bio_filename(url)
        if fn == False:
            quit()
        download_file(get_complete_url(base_url, url), fn)

# Parse each biography, keeping the successful results and counting the
# failures.
errors = 0
people = []
for url in urls:
    print("parsing " + url)
    a = parse_bio(url)
    if a is not False:
        people.append(a)
        continue
    print("in " + get_bio_filename(url))
    errors += 1
Ejemplo n.º 4
0
def parse_bio(url):
    """Parse a downloaded Landtag Thueringen biography page.

    Args:
        url: Biography URL.  ``get_filename(url)`` names the local file
            that is read; the URL is stored unchanged on the result.

    Returns:
        A populated ``Abgeordneter`` instance, or ``False`` when the file
        cannot be opened, the content div or the main heading is missing,
        or no "Fraktion:" paragraph was found.
    """
    local_name = get_filename(url)

    try:
        f = open(local_name, 'rt')
    except IOError:
        print("ERROR opening " + local_name)
        return False

    # Make sure the handle is closed even when reading fails.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)

    cont = soup.find("div", id="content")
    if not isinstance(cont, Tag):
        print("ERROR missing content div")
        return False

    A = Abgeordneter()
    A.url = url
    A.gremium = u"Landtag Thüringen"
    A.period = u"6"

    # main entry
    h2 = cont.find("h2", "alternativFont")
    if not isinstance(h2, Tag):
        print("ERROR missing main h2")
        return False
    A.name = h2.text

    # image url -- the guard has to test the <img> lookup itself (it used
    # to test ``cont``, which is always a Tag here), otherwise a page
    # without a picture crashes on ``None.get``.  A missing src attribute
    # is skipped as well.
    img = cont.find("img")
    if isinstance(img, Tag):
        src = img.get("src")
        if src is not None:
            A.img_url = get_complete_url(base_url, src)

    # info below name: walk the non-empty <p> siblings after the heading
    for i in h2.next_siblings:
        if not isinstance(i, Tag):
            continue
        if i.name != "p" or not len(i.text):
            continue
        if "Fraktion:" in i.text:
            # split into party and Wahlkreis
            fraktion = i.text[10:]  # remove "Fraktion: "
            idx = fraktion.find(",")
            if idx < 0:
                A.party = fraktion
            else:
                A.party = fraktion[:idx]
                A.wahlkreis = fraktion[idx + 1:].strip()
        elif not len(A.occupation):
            # TODO: They use <br/>s to split different statements
            A.occupation = i.text
        elif len(A.party):
            # Occupation and party are both known; nothing more to read.
            break

    # verify -- the occupation is optional, only a missing Fraktion is
    # treated as an error
    if not len(A.party):
        print("ERROR missing Fraktion")
        return False

    # general data
    for heading in cont.find_all("h2"):
        # birth: one line per <li> of the first <ul> after the heading
        # (the substring presumably matches "Persoenliche Daten" while
        # avoiding the umlaut -- confirm against the pages)
        if "nliche Daten" in heading.text:
            for i in heading.next_siblings:
                if isinstance(i, Tag) and i.name == "ul":
                    for li in i.find_all("li"):
                        if len(A.birth):
                            A.birth += "\n"
                        A.birth += li.text
                    break
        # memberships: items of every <ul> sibling after the heading
        if "Funktionen" in heading.text:
            for i in heading.next_siblings:
                if isinstance(i, Tag) and i.name == "ul":
                    for li in i.find_all("li"):
                        A.member[2].append(li.text)

    # biography: every <dd> becomes one statement, prefixed with the text
    # of the <dt> preceding it (if any)
    dl = cont.find("dl", "vita")
    if isinstance(dl, Tag):
        first = ""
        for i in dl.children:
            if not isinstance(i, Tag):
                continue
            if i.name == "dt":
                first = i.text
            if i.name == "dd":
                A.statements.append(first + " " + i.text)
                first = ""

    return A