def make_docs(docs):
    """Turn raw (lines, filename) entries into quanteda.Document objects.

    Every entry of ``docs`` is read positionally: index 0 is a sequence of
    lines and index 1 is the file name.  The first line is handed to the
    Document as its third positional argument (presumably the document's
    attributes/variables -- confirm against quanteda.Document), and the
    remaining lines are joined with newlines to form the document text.

    Returns a new list with one Document per entry, in input order.
    """
    def build(entry):
        # Index rather than unpack, so entries longer than two items work.
        raw_lines, fname = entry[0], entry[1]
        header = raw_lines[0]
        body = '\n'.join(raw_lines[1:])
        return quanteda.Document(body, fname, header)

    return [build(entry) for entry in docs]
def get_docs_names(path):
    """Load every file in a directory into a quanteda.Corpus.

    Document variables are taken from the file name, which is split on
    underscores and read positionally as
    ``<country>_<level>_<year>_<lang>_<party>.txt``.  The country code is
    upper-cased and ``.txt`` is removed from the party field.

    Args:
        path: Directory containing the files.  A trailing separator is
            accepted but no longer required.

    Returns:
        A quanteda.Corpus with one Document per directory entry;
        ``preprocess()`` is called on each Document before it is added.

    Raises:
        IndexError: If a file name has fewer than five underscore-separated
            fields.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    manifs = quanteda.Corpus()
    for f in os.listdir(path):
        print(f)
        # Read raw bytes and decode explicitly: this behaves the same on
        # Python 2 and 3, and the with-block closes the handle, which the
        # bare open(...).read() never did.
        with open(os.path.join(path, f), 'rb') as handle:
            text = handle.read().decode('utf-8')
        bits = f.split('_')
        country = bits[0]
        level = bits[1]
        year = bits[2]
        lang = bits[3]
        party = bits[4].replace('.txt', '')
        d = quanteda.Document(text, fname=f, variables={
            "year": year,
            "country": country.upper(),
            "party": party,
            "lang": lang,
            "level": level,
        })
        d.preprocess()
        # NOTE(review): other call sites in this file pass a list
        # (add_docs([d])); confirm add_docs also accepts a bare Document.
        manifs.add_docs(d)
    return manifs
def get_docs_folders(path):
    """Load a ``<country>/<year>/<file>`` directory tree into a quanteda.Corpus.

    The country and year variables come from the directory names; the party
    is the part of the file name before the first underscore.  Each file's
    encoding is guessed with chardet before decoding.

    Args:
        path: Root directory whose sub-directories are country codes.  A
            trailing separator is accepted but no longer required.

    Returns:
        A quanteda.Corpus with one Document per file; ``preprocess()`` is
        called on each Document before it is added.
    """
    manifs = quanteda.Corpus()
    for ctrcode in os.listdir(path):
        print(ctrcode)
        country_dir = os.path.join(path, ctrcode)
        for year in os.listdir(country_dir):
            year_dir = os.path.join(country_dir, year)
            for manif in os.listdir(year_dir):
                # Binary mode: chardet works on raw bytes, and the
                # with-block guarantees the handle is closed.
                with open(os.path.join(year_dir, manif), 'rb') as handle:
                    raw = handle.read()
                res = chardet.detect(raw)
                # chardet reports encoding=None when it cannot decide
                # (e.g. an empty file); decode(None) would raise TypeError.
                # latin1 maps every byte, so the fallback cannot fail.
                encoding = res['encoding'] or 'latin1'
                text = raw.decode(encoding)
                party = manif.split('_')[0]
                d = quanteda.Document(text, fname=manif, variables={
                    "year": year,
                    "country": ctrcode,
                    "party": party,
                })
                d.preprocess()
                # NOTE(review): other call sites in this file pass a list
                # (add_docs([d])); confirm add_docs also accepts a bare
                # Document.
                manifs.add_docs(d)
    return manifs
import os
import random
import zipfile

import quanteda
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# Party labels classified as "Left"; every other party is labelled "Right".
leftParties = [
    "Laba", "Lab", "Lib", "Comm", "LibSDP",
    "SF", "SEP", "TW", "Gr", "Resp",
]

# Build a corpus with one Document per member of the manifesto archive and
# attach variables parsed from the underscore-separated member name.
ukMan = quanteda.Corpus()
with zipfile.ZipFile("/home/paul/UK_Manifestos.zip") as myzip:
    for n in myzip.namelist():
        # The Document keeps the untouched member name; only the copy used
        # for parsing below is normalised.
        d = quanteda.Document(myzip.read(n), n)
        ukMan.documents.append(d)

        # Fuse "Con_a"/"Lab_a" into a single token so the extra underscore
        # does not split the party label, then drop the extension.
        n = n.replace('Con_a', 'Cona').replace('Lab_a', 'Laba').replace('.txt', '')
        v = n.split('_')
        wing = "Left" if v[4] in leftParties else "Right"
        d.add_variables({
            "elecType": v[1],
            "year": v[2],
            "lang": v[3],
            "party": v[4],
            "wing": wing,
        })
path = "/home/paul/Dropbox/QUANTESS/corpora/UK Manifestos/" files = os.listdir(path) for fname in files: f = open(path + fname, 'r') text = f.read() text = text.decode('latin1') temp = fname.split('_') country = temp[0] year = temp[2] party = temp[4].replace('.txt', '') d = quanteda.Document(text, fname=fname, variables={ "year": year, "country": country, "party": party }) d.preprocess() manifs.add_docs([d]) print manifs popwords = read_dictionary('/home/paul/Dropbox/populism/dictionary.txt') for d in manifs.documents: popcounts = 0 nonpopcounts = 0 words = d.text.split() for w in words:
dictionary[label[2]] = words return (dictionary) manifs = quanteda.Corpus() path = "/home/paul/Dropbox/populism/" for ctrcode in os.listdir(path + '/txt/'): for year in os.listdir(path + '/txt/' + ctrcode): for manif in os.listdir(path + '/txt/' + ctrcode + '/' + year): text = open(path + '/txt/' + ctrcode + '/' + year + '/' + manif).read() d = quanteda.Document(text, fname=manif, variables={ "year": year, "country": ctrcode }) if d.variables['country'] == "IRL": d.preprocess() manifs.add_docs([d]) popwords = read_dictionary('/home/paul/Dropbox/populism/dictionary.txt') for d in manifs.documents: popcounts = 0 nonpopcounts = 0 words = d.text.split() for w in words: match = False for pw in popwords[d.variables['country']]: