""" This file creates vocabulary for the dataset, length vector and file list so that it could be used to calculate complexity """ import file_reader from nltk import word_tokenize, pos_tag, wordnet import numpy as np fs, cat = file_reader.readfile() # word_list = set() len_list = [] filename_list = [] for folder in fs.keys(): for file in fs[folder]: text = open(file).read().replace(u'\ufeff', '').replace(u'\n', ' ') if len(text) > 0: tokenized_text = word_tokenize(text) len_list.append(len(tokenized_text)) filename_list.append(file) else: len_list.append(0) filename_list.append(file) # tmp = pos_tag(tokenized_text) # filtered_tok_text = list(filter(lambda x: ('NN' in x[1]) and len(x[0]) > 1, tmp)) # if len(filtered_tok_text) > 0: # (tok, tag) = zip(*filtered_tok_text) # tok_count = Counter(tok) # tok_list, count = zip(*tok_count.most_common(10))
# NOTE: this excerpt begins mid-function; the file's opening (its imports and
# the start of the feature-building routine that defines x, temp, and label)
# precedes it. The imports it relies on would be:
#   import numpy as np
#   from sklearn.feature_extraction.text import CountVectorizer
#   import file_reader

    if len(x) == 0:
        x = [temp]
    else:
        x.append(temp)
    cts = CountVectorizer(input='content', binary=True)
    A = cts.fit_transform(x)
    r = A.toarray()
    r = np.insert(r, 0, label, axis=1)  # prepend the genre label as column 0
    np.save("Topic_feat", r)


if __name__ == '__main__':
    file_structure, cat = file_reader.readfile(300)
    label = []
    unreadable_files = []
    text = {}
    for folder in file_structure.keys():
        for file in file_structure[folder]:
            try:
                text[file] = open(file).read().lower().replace(u'\ufeff', '').replace(u'\n', ' ')
                label.append(cat[folder])
            except (OSError, UnicodeDecodeError):
                unreadable_files.append(file)
                print("Couldn't read ", file)
                continue
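# --- Illustrative sketch (not part of the original file) ---
# What the CountVectorizer step above produces: with binary=True each row is a
# 0/1 bag-of-words vector, and np.insert(..., 0, label, axis=1) prepends the
# genre label as column 0. A tiny self-contained demo with made-up documents:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the dog sat sat"]   # hypothetical documents
labels = [0, 1]                             # hypothetical genre labels
vec = CountVectorizer(input='content', binary=True)
mat = vec.fit_transform(docs).toarray()     # shape (2, vocab_size), entries 0/1
mat = np.insert(mat, 0, labels, axis=1)     # label becomes column 0
print(vec.get_feature_names_out())          # (sklearn >= 1.0) ['cat' 'dog' 'sat' 'the']
print(mat)                                  # [[0 1 0 1 1]
                                            #  [1 0 1 1 1]]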
import numpy as np
import nltk
import file_reader
from pprint import pprint

X = np.array([])
categories = {}
flag = False
unreadable_files = []
mean_sen_len_cat = []

file_structure, categories = file_reader.readfile(num=30, random=False)
for folder in file_structure.keys():
    avg_sen_len_vec = []
    for file in file_structure[folder]:
        try:
            text = open(file).read().lower().replace(u'\ufeff', '').replace(u'\n', ' ')
        except (OSError, UnicodeDecodeError):
            unreadable_files.append(file)
            # print("Couldn't read ", file)
            continue
        if len(text) == 0:
            unreadable_files.append(file)
            continue
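# --- Illustrative sketch (not part of the original file) ---
# The loop above is accumulating per-file values into avg_sen_len_vec to build
# per-category means (mean_sen_len_cat). One way the per-file statistic could
# be computed, assuming NLTK's punkt tokenizer data is installed:
from nltk.tokenize import sent_tokenize, word_tokenize


def avg_sentence_length(text):
    """Mean number of word tokens per sentence (0.0 for empty text)."""
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    return sum(len(word_tokenize(s)) for s in sentences) / len(sentences)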
import os
import file_reader
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, pos_tag_sents, BigramTagger
from scipy.spatial import distance
from sklearn import neighbors, datasets
import knn
# import numpy as np  # needed if the commented-out featureCreate below is enabled

# taglist = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD',
#            'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR',
#            'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
#            'WDT', 'WP', 'WP$', 'WRB']
# pairlist = []
#
# for i in taglist:
#     for j in taglist:
#         pairlist.append((i, j))
#
# filelist, D = file_reader.readfile(30)
#
# def label(genre):
#     return D[genre]
#
# def featureCreate(bi, lab):
#     """Turn a {(tag1, tag2): count} bigram dict into a feature row:
#     the genre label in column 0, then one column per tag pair."""
#     global pairlist
#     feature = np.array([lab])
#     for i in pairlist:
#         if i in bi.keys():
#             feature = np.append(feature, bi[i])
#         else:
#             feature = np.append(feature, 0)
#     return feature.reshape(1, -1)
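# --- Illustrative sketch (not part of the original file) ---
# featureCreate above expects `bi`, a {(tag1, tag2): count} mapping over
# adjacent POS-tag pairs. One way such a mapping could be built with NLTK,
# assuming its default tagger and tokenizer data:
from collections import Counter

from nltk import bigrams, pos_tag
from nltk.tokenize import word_tokenize


def pos_bigram_counts(text):
    """Count adjacent POS-tag pairs, e.g. {('DT', 'NN'): 3, ...}."""
    tags = [tag for _, tag in pos_tag(word_tokenize(text))]
    return Counter(bigrams(tags))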
import numpy as np
import file_reader
from pprint import pprint
from textstat.textstat import textstat

X = np.array([])
categories = {}
flag = False
unreadable_files = []
mean_sen_len_cat = []

file_structure, categories = file_reader.readfile(num=30)
for folder in file_structure.keys():
    avg_sen_len_vec = []
    for file in file_structure[folder]:
        try:
            text = open(file).read().replace(u'\ufeff', '').replace(u'\n', ' ')
        except (OSError, UnicodeDecodeError):
            unreadable_files.append(file)
            continue
        if len(text) == 0:
            unreadable_files.append(file)
            continue
        complexity = textstat.dale_chall_readability_score(text)
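# --- Illustrative sketch (not part of the original file) ---
# textstat's Dale-Chall score maps roughly onto US school grade levels
# (about 4.9 and below ~ easily read by a 4th grader; 9.0-9.9 ~ college level),
# so the per-file `complexity` values above can be averaged per folder to
# compare genres. Minimal usage, matching the import style used above:
from textstat.textstat import textstat

sample = "The quick brown fox jumps over the lazy dog."  # hypothetical input
print(textstat.dale_chall_readability_score(sample))     # prints a float grade-level score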