"""
This file creates vocabulary for the dataset,
length vector and file list so that it could be used to calculate complexity
"""

from collections import Counter

import numpy as np
from nltk import word_tokenize, pos_tag
from sklearn.feature_extraction.text import CountVectorizer

import file_reader

fs, cat = file_reader.readfile()
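# Inferred from usage (assumption): `fs` maps each genre folder to its list
# of file paths, and `cat` maps a folder name to a numeric class label.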

# word_list = set()
len_list = []
filename_list = []
labels = []
x = []  # one top-noun "document" per file, consumed by CountVectorizer below
for folder in fs:
    for file in fs[folder]:
        with open(file) as f:
            text = f.read().replace(u'\ufeff', '').replace(u'\n', ' ')
        filename_list.append(file)
        if len(text) == 0:
            len_list.append(0)
            continue
        tokenized_text = word_tokenize(text)
        len_list.append(len(tokenized_text))

        # Reconstructed from the commented-out pipeline that originally
        # produced `temp`: keep noun tokens longer than one character and
        # take the 10 most common. Assumptions: `temp` was the join of those
        # tokens, and the per-file label comes from cat[folder].
        tmp = pos_tag(tokenized_text)
        filtered_tok_text = [(tok, tag) for (tok, tag) in tmp
                             if 'NN' in tag and len(tok) > 1]
        if len(filtered_tok_text) == 0:
            continue
        (tok, tag) = zip(*filtered_tok_text)
        tok_count = Counter(tok)
        tok_list, count = zip(*tok_count.most_common(10))
        x.append(' '.join(tok_list))
        labels.append(cat[folder])

# Binary bag-of-words over each file's top nouns, with the class label
# prepended as the first column. Runs once, after all folders are processed.
cts = CountVectorizer(input='content', binary=True)

A = cts.fit_transform(x)
r = A.toarray()

r = np.insert(r, 0, labels, axis=1)
np.save("Topic_feat", r)
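# Minimal usage sketch (assumed downstream consumption, not part of the
# original pipeline): reload the saved matrix and split labels from features.
loaded = np.load("Topic_feat.npy")
y, X_topics = loaded[:, 0], loaded[:, 1:]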


if __name__ == '__main__':
    file_structure, cat = file_reader.readfile(300)
    label = []
    unreadable_files = []
    text = {}
    for folder in file_structure.keys():
        for file in file_structure[folder]:
            try:
                with open(file) as f:
                    text[file] = f.read().lower().replace(
                        u'\ufeff', '').replace(u'\n', ' ')
                label.append(cat[folder])
            except (OSError, UnicodeDecodeError):
                unreadable_files.append(file)
                print("Couldn't read ", file)
                continue
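    # Hedged sketch of a plausible next step (assumption; the original block
    # ends here): vectorize the collected documents with the class labels as
    # the first column, mirroring the Topic_feat pipeline above.
    docs = [text[f] for f in text]
    feats = CountVectorizer(binary=True).fit_transform(docs).toarray()
    feats = np.insert(feats, 0, label, axis=1)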
import numpy as np
import nltk
import file_reader
from pprint import pprint

X = np.array([])

categories = {}
flag = False

unreadable_files = []

mean_sen_len_cat = []

file_structure, categories = file_reader.readfile(num=30, random=False)

for folder in file_structure.keys():
    avg_sen_len_vec = []
    for file in file_structure[folder]:
        try:
            with open(file) as f:
                text = f.read().lower().replace(u'\ufeff',
                                                '').replace(u'\n', ' ')
        except (OSError, UnicodeDecodeError):
            unreadable_files.append(file)
            # print("Couldn't read ", file)
            continue

        if len(text) == 0:
            unreadable_files.append(file)
            continue
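        # Hedged sketch of the presumably intended computation (the original
        # snippet is truncated here): mean sentence length, in tokens, for
        # this file, accumulated per category via avg_sen_len_vec.
        sents = nltk.sent_tokenize(text)
        if sents:
            avg_sen_len_vec.append(
                sum(len(nltk.word_tokenize(s)) for s in sents) / len(sents))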
import os
import file_reader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, pos_tag_sents, BigramTagger
from scipy.spatial import distance
from sklearn import neighbors, datasets
import knn
#
# taglist = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
# pairlist =[]
#
# for i in taglist:
#     for j in taglist:
#         pairlist.append((i,j))
#
filelist, D = file_reader.readfile(30)

# def label(genre):
#     return D[genre]
#
# def featureCreate(bi,lab):
#     global pairlist
#     feature = np.array([lab])
#
#     for i in pairlist:
#
#         if i in bi.keys():
#             feature = np.append(feature,bi[i])
#         else:
#             feature = np.append(feature,0)
#     return feature.reshape(1,-1)
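# Hedged sketch (assumption, not from the original): how the tag-bigram
# frequency dict `bi` consumed by the commented-out featureCreate above
# could be built for one document.
from collections import Counter
from nltk import bigrams

def tag_bigram_counts(text):
    # POS-tag the tokens, then count adjacent (tag, tag) pairs.
    tags = [tag for _, tag in pos_tag(word_tokenize(text))]
    return Counter(bigrams(tags))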
import numpy as np
import file_reader
from pprint import pprint
from textstat.textstat import textstat

X = np.array([])

categories = {}
flag = False

unreadable_files = []

mean_sen_len_cat = []

file_structure, categories = file_reader.readfile(num=30)

for folder in file_structure.keys():
    avg_sen_len_vec = []
    for file in file_structure[folder]:
        try:
            with open(file) as f:
                text = f.read().replace(u'\ufeff', '').replace(u'\n', ' ')
        except (OSError, UnicodeDecodeError):
            unreadable_files.append(file)
            continue

        if len(text) == 0:
            unreadable_files.append(file)
            continue

        complexity = textstat.dale_chall_readability_score(text)