def generate_idf_model():
    """Build per-party IDF tables from the platforms and write them as JSON.

    Every section returned by ``load_platforms()`` is treated as one
    document. For each party, and for the ``ALL`` key that pools the
    sections of every party, the number of sections containing each token
    is counted and converted into a smoothed inverse document frequency::

        idf = log(N / (1 + df))

    where ``N`` is the number of sections and ``df`` is the number of
    sections in which the term occurs. The resulting nested mapping
    ``{party: {term: idf}}`` is dumped to ``data/idf.json``.
    """
    print("Extracting...")
    doc_freq = {ALL: defaultdict(int)}
    doc_count = {ALL: 0}
    for party, sections in load_platforms().items():
        doc_count[party] = 0
        doc_freq[party] = defaultdict(int)
        for section in sections:
            doc_count[party] += 1
            doc_count[ALL] += 1
            # A section contributes at most once per term: document
            # frequency counts documents, not occurrences. Counting every
            # occurrence would let df exceed N and push the IDF negative.
            for token in set(section.tokens):
                doc_freq[party][token] += 1
                doc_freq[ALL][token] += 1

    print("Calculating IDF...")
    data = {}
    for party, freqs in doc_freq.items():
        # float() keeps the ratio from being truncated by integer division.
        num = float(doc_count[party])
        party_idf = {}
        for term, df in freqs.items():
            # The +1 smooths the ratio so df == N does not yield log(1) == 0
            # for every ubiquitous term and df can never divide by zero.
            party_idf[term] = math.log(num / (1 + df))
        data[party] = party_idf

    with open("data/idf.json", "wb") as fh:
        json.dump(data, fh)
def classify():
    """Score every platform section against two word lists and save the result.

    Phrases are read from ``decisive.txt`` (one per line, each passed
    through ``norm``) and tokens from ``loriot.txt`` (decoded as UTF-8 and
    passed through ``tokenize``). For every section of every party the
    following is recorded under ``scores[party][section.key]``:

    * ``tokens``   -- ``len(section)``
    * ``decisive`` -- number of phrases for which ``phrase in text`` holds,
      divided by ``len(section)``
    * ``loriot``   -- number of loriot tokens for which ``token in text``
      holds, divided by ``len(section)``

    where ``text`` is ``normalize(section.text)``. A section of length zero
    gets a ratio of ``0.0`` instead of raising ``ZeroDivisionError``. The
    scores are written to ``data/language.json``.
    """
    # Context managers close both word-list files even if norm/decode fails.
    with open('decisive.txt', 'rb') as fh:
        decisive = [norm(line) for line in fh]
    with open('loriot.txt', 'rb') as fh:
        loriot_text = fh.read().decode('utf-8')
    loriot = list(tokenize(loriot_text))

    platforms = load_platforms()
    scores = defaultdict(dict)
    for party, sections in platforms.items():
        for section in sections:
            size = len(section)
            text = normalize(section.text)
            # Duplicate entries in either word list are counted each time,
            # as they were before.
            n_decisive = float(sum(1 for phrase in decisive if phrase in text))
            n_loriot = float(sum(1 for token in loriot if token in text))
            scores[party][section.key] = {
                'tokens': size,
                # Guard the ratios: an empty section must not abort the run.
                'decisive': n_decisive / size if size else 0.0,
                'loriot': n_loriot / size if size else 0.0,
            }

    with open('data/language.json', 'wb') as fh:
        json.dump(dict(scores), fh, indent=2)
def test():
    """Print the text of every parsed sentence in every platform section.

    Each section's text is run through ``parse`` and then ``split``; every
    resulting sentence is printed as a one-element list. A sentence whose
    printing raises ``UnicodeEncodeError`` is skipped without any message.
    """
    for _party, sections in load_platforms().items():
        for section in sections:
            for sentence in split(parse(section.text)):
                try:
                    # Printed inside a list so the repr of the text is shown.
                    print([sentence.text])
                except UnicodeEncodeError:
                    # Best-effort debug output: unprintable sentences are
                    # simply left out.
                    continue
def save_sentences():
    """Upsert every platform sentence into ``sentences`` and print averages.

    Starts by calling ``sentences.update({'valid': False}, {})``
    (presumably this clears the ``valid`` flag on all existing rows --
    confirm against the table API). Each sentence yielded by
    ``split_sentences(section)`` is then upserted, keyed on
    ``('num', 'section')``, with its position in the section, a SHA-1 hash,
    its text, the party, the section key and the result of
    ``check_valid``. Finally the average sentence length in
    whitespace-separated words is printed per party.
    """
    platforms = load_platforms()
    lengths = defaultdict(list)
    sentences.update({'valid': False}, {})
    for party, sections in platforms.items():
        for section in sections:
            for num, sentence in enumerate(split_sentences(section)):
                lengths[party].append(len(sentence.split()))
                # Non-ASCII characters are replaced with '?' before hashing.
                # NOTE(review): sentences that differ only in such
                # characters therefore share a hash; left unchanged so
                # stored hashes stay comparable.
                digest = hashlib.sha1(sentence.encode('ascii', 'replace'))
                sentences.upsert({
                    'num': num,
                    'hash': digest.hexdigest(),
                    'text': sentence,
                    'party': party,
                    'section': section.key,
                    'valid': check_valid(sentence),
                }, ['num', 'section'])

    for party, word_counts in lengths.items():
        # Divide by a float: plain int / int truncates under Python 2 and
        # would report e.g. 17 for a true mean of 17.9.
        avg = sum(word_counts) / float(len(word_counts))
        print("PARTY %s AVG %s" % (party, avg))
values = {} with open('data/values.txt', 'rb') as fh: for line in fh: if line.strip().startswith('#'): continue label, terms = line.split(':') terms = [t.strip() for t in terms.split(',')] for term in terms: values[term] = label pprint(values) return values if __name__ == '__main__': #model = load_idf_model() values = ['gerecht*', '*gerechtigkeit*', 'chancen*'] platforms = load_platforms() lengths = {} scores = defaultdict(lambda: defaultdict(int)) for party, sections in platforms.items(): lengths[party] = sum([len(s) for s in sections]) token_offset = 0 for section in sections: for token in section.tokens: for value in values: vr = value.replace('*', '') if value.endswith('*') and token.startswith(vr): scores[party][value] += 1 elif value.startswith('*') and token.endswith(vr): scores[party][value] += 1 elif vr == token: scores[party][value] += 1
from sections import load_platforms
from collections import defaultdict
from pprint import pprint
import json

# Sections with this topic are left out of both the totals and the shares.
SKIP_TOPIC = "intro"

if __name__ == "__main__":
    # Maps party -> {topic: percentage of that party's counted tokens}.
    shares_by_party = {}
    for party, sections in load_platforms().items():
        # Denominator: all tokens of the party outside the skipped topic.
        total_tokens = sum(
            len(s.tokens) for s in sections if s.topic != SKIP_TOPIC
        )
        topics = defaultdict(float)
        for section in sections:
            if section.topic != SKIP_TOPIC:
                share = (len(section.tokens) / float(total_tokens)) * 100
                topics[section.topic] += share
        pprint(dict(topics))
        # Sanity check on the console: the shares should add up to ~100.
        print("TOTAL %s" % sum(topics.values()))
        shares_by_party[party] = dict(topics)

    with open("data/topic_shares.json", "wb") as fh:
        json.dump(shares_by_party, fh)