Beispiel #1
1
def all_titles():
    """Print every h1/h2 heading of every party document and its lookup result.

    For each party in ``PARTIES`` the document returned by ``load_doc`` is
    scanned for ``h1`` and ``h2`` elements.  Each heading is printed as
    ``[party, tag, text]`` and then looked up through ``nomenklatura.lookup``
    using a ``'[party:tag] text'`` fingerprint.  On success
    ``[text, entity.name]`` is printed; on failure the exception is printed
    and the scan continues.  Nothing is returned.
    """
    for party in PARTIES:
        doc = load_doc(party)

        for h in doc.findall('.//*'):
            if h.tag not in ('h1', 'h2'):
                continue
            # NOTE: persisting the headings (a ``titles.upsert`` keyed on
            # party and text) used to sit here as commented-out code; this
            # function only reports, it does not store anything.
            print([party, h.tag, h.text])
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                print([h.text, entity.name])
            except Exception as e:
                # Deliberately best-effort: a failed lookup is reported and
                # must not abort the scan of the remaining headings.
                print(e)
Beispiel #2
0
def analyze():
    """
    Re-run the whole analysis pipeline and regenerate the visualizations.

    Loads the documents through ``common.load_doc``, runs ``kmeans.analyze``,
    then renders the doc/topic pie, the LSA scatter plot and the topic word
    cloud.  Afterwards the module-level ``common.doc_titles`` and
    ``common.doc_texts`` lists are reset to empty lists.

    :return: the tuple ``('OK', 200)`` once every step has finished
    """
    # Load the input, then analyze it.
    common.load_doc()
    kmeans.analyze()

    # Render the three visualizations, in the same order as before.
    visualize.doc_topic_pie()
    visualize.lsa_scatter()
    visualize.topic_word_cloud()

    # Replace the cached titles and texts with fresh empty lists.
    common.doc_titles = []
    common.doc_texts = []

    return ('OK', 200)
Beispiel #3
0
def extract_sections(party):
    """Yield the sections of a party's document, split at h1/h2 headings.

    The document returned by ``load_doc(party)`` is walked element by
    element.  Every ``h1``/``h2`` closes the section collected so far (it is
    yielded if its ``valid`` attribute is truthy) and opens a new ``Section``
    whose ``title`` is the heading text and whose ``level`` is the digit part
    of the tag name.  The heading is looked up in the ``btw13-titles``
    dataset with a ``'[party:tag] text'`` fingerprint; on success the
    entity name becomes the section's ``topic``, on failure the fingerprint
    and the exception are printed and the section keeps its default topic.

    The ``text`` of every visited element, headings included, is appended
    to the current section's ``texts``.

    :param party: party identifier passed to ``load_doc`` and ``Section``
    :return: generator of ``Section`` objects in document order
    """
    doc = load_doc(party)

    nomenklatura = Dataset('btw13-titles',
                           api_key=os.environ.get('NOMENKLATURA_API_KEY'))

    current = Section(party)
    # './/*' visits all descendants, not only direct children of the root.
    for h in doc.findall('.//*'):
        if h.tag in ('h1', 'h2'):
            if current.valid:
                yield current
            current = Section(party)
            current.title = h.text
            current.level = h.tag[1:]
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                current.topic = entity.name
            except Exception as e:
                # Best-effort: an unmatched heading is reported, not fatal.
                print([fp])
                print(e)
        current.texts.append(h.text)

    # Flush the section still being collected when the document ends;
    # without this the last section of every document would be dropped.
    if current.valid:
        yield current
Beispiel #4
0
def party_html(party, sections):
    """Wrap a party document's top-level elements into per-section ``div``s.

    The document returned by ``load_doc(party)`` is annotated with the party
    key and name, then each direct child of the root is moved into a
    ``<div class="section">`` group.  A new group is opened for the very
    first element, and afterwards whenever an ``h1``/``h2`` element's text
    equals the title of the next unused entry in ``sections``.  Each group
    carries the section's key, title, topic and level as ``data-*``
    attributes and starts with a named anchor.  The result is written to
    ``data/html/<party>.html``.

    :param party: party identifier; also a key into ``PARTIES``
    :param sections: indexable sequence of section objects providing
        ``key``, ``title``, ``topic`` and ``level``, in document order
    :raises IndexError: if ``sections`` is empty while the document has
        at least one top-level element
    """
    doc = load_doc(party)
    next_section = 0
    section = None
    # Called as a function so this also works on Python 3, where
    # ``print[...]`` would try to subscript the builtin and raise TypeError.
    print([party, len(sections)])
    group = None

    doc.attrib["data-party"] = party
    doc.attrib["data-party-name"] = PARTIES[party]["name"]
    doc.attrib["class"] = "platform"

    # findall() returns a list, so moving elements while looping is safe.
    for i, el in enumerate(doc.findall("./*")):
        assert el.tag != "li"
        p = el.getparent()
        # Only look at the next section while one is left: a heading met
        # after the last section was opened must stay in the current group
        # instead of raising IndexError.
        opens_next = (
            el.tag in ("h1", "h2")
            and next_section < len(sections)
            and sections[next_section].title == el.text
        )
        if opens_next or section is None:
            section = sections[next_section]
            group = html.Element("div")
            group.attrib["data-key"] = section.key
            group.attrib["data-title"] = section.title or "Einleitung"
            group.attrib["data-topic"] = section.topic
            group.attrib["data-level"] = section.level or "1"
            group.attrib["data-topic-name"] = TOPICS[section.topic]["name"]
            group.attrib["class"] = "section"
            p.append(group)
            a = html.Element("a")
            a.attrib["name"] = section.key
            group.append(a)
            next_section += 1

        el.attrib["data-idx"] = str(i)
        p.remove(el)
        group.append(el)

    with open("data/html/%s.html" % party, "wb") as fh:
        fh.write(html.tostring(doc))