        if not os.path.exists(sourcedir + docid + '.tsv'):
            continue
        docs.append(row['volid'])
        logistic.append(float(row['logistic']))
        dates.append(float(row['dateused']))

logistic = np.array(logistic)
dates = np.array(dates)

numdocs = len(docs)

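# One array per category in fields: entry i will hold the normalized
# frequency of that category in docs[i].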
categories = dict()
for field in fields:
    categories[field] = np.zeros(numdocs)

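# Per-document word counts, read from the .tsv files in sourcedir.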
wordcounts = filecab.get_wordcounts(sourcedir, '.tsv', docs)

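# For each document, tally how many tokens belong to each Inquirer category,
# then normalize by the document's total token count (+1 guards against
# division by zero for empty documents).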
for i, doc in enumerate(docs):
    ctcat = Counter()
    allcats = 0
    for word, count in wordcounts[doc].items():
        allcats += count
        for field in fields:
            if word in inquirer[field]:
                ctcat[field] += count
    for field in fields:
        categories[field][i] = ctcat[field] / (allcats + 1)

logresults = []
dateresults = []
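
Both examples rely on filecab.get_wordcounts, a helper from the same repository that is not shown here. Judging from the call sites, it takes a source directory, a file extension, and a collection of document ids, and returns a dict mapping each id to its word counts. A minimal sketch under those assumptions (the tab-separated word/count layout of the .tsv files is also an assumption):

import os
from collections import Counter

def get_wordcounts(sourcedir, suffix, docids):
    # Hypothetical sketch of filecab.get_wordcounts: read "<word>\t<count>"
    # lines from sourcedir + docid + suffix and return {docid: Counter}
    # for every docid whose file exists.
    wordcounts = dict()
    for docid in docids:
        path = sourcedir + docid + suffix
        if not os.path.isfile(path):
            continue
        counts = Counter()
        with open(path, encoding='utf-8') as f:
            for line in f:
                parts = line.rstrip('\n').split('\t')
                if len(parts) != 2:
                    continue
                word, count = parts
                counts[word] += int(count)
        wordcounts[docid] = counts
    return wordcounts
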
Example #2
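
# Assumed imports for this example: csv from the standard library and the
# repository's filecab helper module (the exact import name is an assumption).
import csv
import filecab
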
logistic = dict()
realclass = dict()
titles = dict()
dates = dict()

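# Read metadata for the prestige set: logistic score, true prestige class,
# title, and date for each volume id.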
with open('../metadata/prestigeset.csv', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        logistic[row['volid']] = float(row['logistic'])
        realclass[row['volid']] = row['prestige']
        titles[row['volid']] = row['title']
        dates[row['volid']] = int(row['dateused'])

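# Load word counts for every volume that appears in the metadata.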
sourcedir = '../sourcefiles/'
documents = filecab.get_wordcounts(sourcedir, '.tsv', set(logistic))

outrows = []

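# Count tokens that appear in the colors lexicon (defined elsewhere);
# allwords starts at 1, presumably so a later ratio cannot divide by zero.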
for docid, doc in documents.items():
    if docid not in logistic:
        continue
    else:
        allwords = 1
        colorct = 0

        for word, count in doc.items():
            allwords += count
            if word in colors:
                colorct += count