def all_titles():
    """Print every h1/h2 heading of every party document and its lookup result.

    For each key in ``PARTIES`` the document returned by ``load_doc(party)``
    is scanned for ``h1``/``h2`` elements. For each heading this prints
    ``[party, tag, text]``, builds the fingerprint ``'[party:tag] text'`` and
    passes it to ``nomenklatura.lookup``. On success ``[text, entity.name]``
    is printed; on any exception the exception itself is printed and the
    scan continues (best-effort diagnostics, nothing is returned).

    NOTE(review): ``nomenklatura`` is not defined inside this function; it is
    presumably a module-level dataset client -- confirm it exists at module
    scope.
    """
    for party in PARTIES:
        doc = load_doc(party)
        for h in doc.findall('.//*'):
            if h.tag not in ('h1', 'h2'):
                continue
            # Persisting the headings is currently disabled:
            #titles.upsert({
            #    'party': party,
            #    'element': h.tag,
            #    'text': h.text
            #}, ['party', 'text'])
            # Parenthesised single-argument print behaves the same on
            # Python 2 and Python 3.
            print([party, h.tag, h.text])
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                print([h.text, entity.name])
            except Exception as e:
                # Deliberately broad: a failed lookup must not abort the
                # listing of the remaining headings.
                print(e)
def analyze():
    """
    Run the analysis steps and regenerate the visualizations.

    Calls, in order: ``common.load_doc()``, ``kmeans.analyze()``,
    ``visualize.doc_topic_pie()``, ``visualize.lsa_scatter()`` and
    ``visualize.topic_word_cloud()``. Afterwards ``common.doc_titles`` and
    ``common.doc_texts`` are reset to fresh empty lists.

    :return: the tuple ``('OK', 200)`` when done (presumably a
             body/status pair for a web framework -- confirm with caller)
    """
    # Load the documents, then cluster them.
    common.load_doc()
    kmeans.analyze()

    # Rebuild each visualization.
    visualize.doc_topic_pie()
    visualize.lsa_scatter()
    visualize.topic_word_cloud()

    # Reset the module-level document lists.
    common.doc_titles = []
    common.doc_texts = []

    status = ('OK', 200)
    return status
def extract_sections(party):
    """Yield the sections of a party's document, split at h1/h2 headings.

    The document returned by ``load_doc(party)`` is walked over all
    descendant elements. Every ``h1``/``h2`` closes the section collected so
    far (yielded only if its ``valid`` attribute is truthy) and opens a new
    ``Section(party)`` whose ``title`` is the heading text and whose
    ``level`` is the heading tag without the leading ``h`` (``'1'`` or
    ``'2'``). The heading fingerprint ``'[party:tag] text'`` is looked up in
    the ``btw13-titles`` dataset and, on success, the entity name becomes the
    section ``topic``. The ``text`` of every visited element (headings
    included) is appended to the current section's ``texts``.

    The section still open when the document ends is yielded as well (again
    only if valid); previously it was silently dropped.

    :param party: party key passed to ``load_doc`` and ``Section``.
    :return: generator of ``Section`` objects in document order.
    """
    doc = load_doc(party)
    nomenklatura = Dataset(
        'btw13-titles',
        api_key=os.environ.get('NOMENKLATURA_API_KEY')
    )
    current = Section(party)
    for h in doc.findall('.//*'):
        if h.tag in ('h1', 'h2'):
            if current.valid:
                yield current
            current = Section(party)
            current.title = h.text
            current.level = h.tag[1:]
            fp = '[%s:%s] %s' % (party, h.tag, h.text)
            try:
                entity = nomenklatura.lookup(fp)
                current.topic = entity.name
            except Exception as e:
                # Best effort: a failed lookup leaves the topic at whatever
                # Section() initialised it to and only reports the problem.
                print([fp])
                print(e)
        # Text is collected from every descendant element; a former
        # "direct child of the document root" check is disabled:
        #if h.getparent() == doc:
        current.texts.append(h.text)
    # Flush the trailing section, which no later heading would close.
    if current.valid:
        yield current
def party_html(party, sections):
    """Group a party's document into section ``div`` elements and write it out.

    The document from ``load_doc(party)`` is tagged with ``data-party``,
    ``data-party-name`` (from ``PARTIES[party]["name"]``) and
    ``class="platform"``. Its direct children are then moved, in order, into
    ``<div class="section">`` wrappers appended to the document. A new
    wrapper is started for the very first child, and afterwards whenever an
    ``h1``/``h2`` has the same text as the title of the next unused entry in
    ``sections``. Each wrapper carries the section's key, title (default
    ``"Einleitung"``), topic, level (default ``"1"``) and topic name as
    ``data-*`` attributes and begins with an ``<a name=key>`` anchor. Every
    moved child gets a ``data-idx`` attribute with its original position.
    The result is serialised to ``data/html/<party>.html``.

    Headings that appear after all ``sections`` have been used stay in the
    current wrapper instead of raising ``IndexError``. If ``sections`` is
    empty no wrappers are created and the children are left in place.

    :param party: party key; must exist in ``PARTIES``.
    :param sections: indexable sequence of section objects providing
        ``key``, ``title``, ``topic`` and ``level``.
    :raises KeyError: if ``party`` is not in ``PARTIES`` or a section topic
        is not in ``TOPICS``.
    """
    doc = load_doc(party)
    total = len(sections)
    next_section = 0
    section = None
    group = None
    # Parenthesised so it is a valid print on both Python 2 and Python 3.
    print([party, total])

    doc.attrib["data-party"] = party
    doc.attrib["data-party-name"] = PARTIES[party]["name"]
    doc.attrib["class"] = "platform"

    # findall() returns a list, so moving children around while looping is
    # safe and the wrappers appended below are not visited themselves.
    for i, el in enumerate(doc.findall("./*")):
        # Sanity check on the input document: list items must be nested.
        assert el.tag != "li"
        p = el.getparent()

        # Only consult sections[next_section] while it exists.
        starts_section = next_section < total and (
            section is None
            or (
                el.tag in ("h1", "h2")
                and sections[next_section].title == el.text
            )
        )
        if starts_section:
            section = sections[next_section]
            group = html.Element("div")
            group.attrib["data-key"] = section.key
            group.attrib["data-title"] = section.title or "Einleitung"
            group.attrib["data-topic"] = section.topic
            group.attrib["data-level"] = section.level or "1"
            group.attrib["data-topic-name"] = TOPICS[section.topic]["name"]
            # group.attrib['data-topic-color'] = TOPICS[section.topic]['color']
            group.attrib["class"] = "section"
            p.append(group)
            a = html.Element("a")
            a.attrib["name"] = section.key
            group.append(a)
            next_section += 1

        el.attrib["data-idx"] = str(i)
        if group is None:
            # No sections at all: nothing to move the element into.
            continue
        p.remove(el)
        group.append(el)

    with open("data/html/%s.html" % party, "wb") as fh:
        fh.write(html.tostring(doc))