def setUp(self):
    """Build the ``self.mapper`` fixture from four raw category labels.

    The first label carries a trailing newline and the second is mixed
    case; both are passed to ``mapper`` unchanged.
    """
    raw_labels = ['red\n', 'YelLOw', 'green', 'blue']
    self.mapper = mapper(raw_labels)
def setUp(self):
    """Create ``self.mapper`` from a fixed list of sample category strings."""
    # The labels are handed over exactly as written: 'red' keeps its
    # trailing newline and 'YelLOw' keeps its mixed case.
    self.mapper = mapper(['red\n', 'YelLOw', 'green', 'blue'])
# CSV writers for the train and test label files.
# NOTE(review): the underlying file objects are not closed anywhere in this
# fragment -- confirm they are flushed/closed further down the script.
ytrain = csv.writer(open('data/ytrain.csv', 'w'))
ytest = csv.writer(open('data/ytest.csv', 'w'))

# 1% and 99% of the training-set size, handed to words.counter below
# (presumably document-count cut-offs -- confirm against words.counter).
n_train = len(s['train'])
mindocs = round(0.01 * n_train)
maxdocs = round(0.99 * n_train)

# Subject mapping: every training doc is reduced to a single label, namely
# the first entry (in sorted order) among the top-level components -- the
# text before the first '/' -- of its 'subject2_hierarchy' values.
# The single-argument print form produces the same output as the
# comma-separated print statement.
print('%s %s' % (datetime.now(), 'creating subject mapping'))
subjects = [
    sorted([sub.split('/')[0] for sub in f['subject2_hierarchy']])[0]
    for f in s['train']
]
mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# One word counter per text field, each with its own dictionary file.
wordcounters = {}
for textfield in text_fields:
    print('%s %s' % (datetime.now(),
                     'creating dictionary for %s' % (textfield)))
    field_texts = [f[textfield] for f in s['train']]
    wordcounters[textfield] = words.counter(
        field_texts,
        mindocs=mindocs,
        maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# Process the sample and write vectors.
print('%s %s' % (datetime.now(),
                 'converting texts to vectors and storing to csv'))
for doc in s['train']:
    # Same label derivation as used for the subject mapping above.
    top_levels = [sub.split('/')[0] for sub in doc['subject2_hierarchy']]
    top_levels.sort()
    subject = top_levels[0]
ytrain = csv.writer(open('data/ytrain.csv', 'w')) ytest = csv.writer(open('data/ytest.csv', 'w')) mindocs = round(0.01 * len(s['train'])) maxdocs = round(0.99 * len(s['train'])) #create the subject mapping print datetime.now(), 'creating subject mapping' #get the subjects subjects = [f['subject2_hierarchy'] for f in s['train']] #take the top-level element of each subject for each doc subjects = [[sub.split('/')[0] for sub in f] for f in subjects] #sort and take the first one subjects = [sorted(sub)[0] for sub in subjects] mapper = words.mapper(subjects, subjectFile='data/subjects.txt') #setup word counters wordcounters = {} for textfield in text_fields: print datetime.now(), 'creating dictionary for %s' % (textfield) wordcounters[textfield] = words.counter( [f[textfield] for f in s['train']], mindocs=mindocs, maxdocs=maxdocs, dictionaryFile='data/dictionary-%s.txt' % (textfield)) #process the sample and write vectors print datetime.now(), 'converting texts to vectors and storing to csv' for doc in s['train']: