def setUp(self):
        """Prepare the two-text fixture and the counter built from it.

        Sets, in this order: ``self.texts`` (the raw documents),
        ``self.words`` (the expected word list), ``self.counts`` (one row of
        expected counts per text, one column per entry of ``self.words``),
        ``self.idf_values`` (one expected IDF value per word) and
        ``self.counter`` (the result of ``counter(self.texts)``).
        """
        first_text = u'I visited S. F and visited the Conservatory of Flowers and saw the older "Wicked" plants!'
        second_text = u'There needs to be a third option because getting older or dying aren\'t working for me.'
        self.texts = [first_text, second_text]

        self.words = [
            u'conservatori',
            u'die',
            u'flower',
            u'get',
            u'need',
            u'older',
            u'option',
            u'plant',
            u'saw',
            u'third',
            u'visit',
            u'wick',
            u'work',
        ]

        first_text_counts = [1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 0]
        second_text_counts = [0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1]
        self.counts = [first_text_counts, second_text_counts]

        # Every word starts with the same expected value (numerically ln 2) ...
        single_text_idf = 0.6931471805599453
        self.idf_values = [single_text_idf] * len(self.words)
        # ... except 'older', the only word that is in both texts.
        self.idf_values[self.words.index(u'older')] = 0.0

        self.counter = counter(self.texts)
# Example #2
# 0
    def setUp(self):
        """Create the fixture: two texts plus the values expected from them.

        ``self.words``, ``self.counts`` and ``self.idf_values`` are all
        derived from one table so that the word, its count in each text and
        its expected IDF value stay side by side. ``self.counter`` is then
        built by calling ``counter`` on the texts.
        """
        self.texts = [
            u'I visited S. F and visited the Conservatory of Flowers and saw the older "Wicked" plants!',
            u'There needs to be a third option because getting older or dying aren\'t working for me.',
        ]

        # One row per expected word:
        # (word, count in the first text, count in the second text).
        expected = [
            (u'conservatori', 1, 0),
            (u'die', 0, 1),
            (u'flower', 1, 0),
            (u'get', 0, 1),
            (u'need', 0, 1),
            (u'older', 1, 1),
            (u'option', 0, 1),
            (u'plant', 1, 0),
            (u'saw', 1, 0),
            (u'third', 0, 1),
            (u'visit', 2, 0),
            (u'wick', 1, 0),
            (u'work', 0, 1),
        ]

        self.words = [row[0] for row in expected]
        self.counts = [
            [row[1] for row in expected],  # counts from the first text
            [row[2] for row in expected],  # counts from the second text
        ]
        # A word found in both texts (only 'older') is expected to have an
        # IDF of 0.0; every other word gets 0.6931471805599453.
        self.idf_values = [
            0.0 if row[1] and row[2] else 0.6931471805599453
            for row in expected
        ]

        self.counter = counter(self.texts)
# Reduce each training document to a single subject: split every entry of
# 'subject2_hierarchy' on '/', keep the leading component, and take the one
# that sorts first.
subjects = [
    sorted(path.split('/')[0] for path in record['subject2_hierarchy'])[0]
    for record in s['train']
]

mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# Build one word counter per text field from the training documents.
wordcounters = {}
for textfield in text_fields:
    # Single-argument print: emits the same "<timestamp> <message>" line as
    # the two-item print statement would.
    print('%s %s' % (datetime.now(), 'creating dictionary for %s' % (textfield)))

    field_texts = [record[textfield] for record in s['train']]
    wordcounters[textfield] = words.counter(
        field_texts,
        mindocs=mindocs,
        maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# Write one feature row to `train` and one subject row to `ytrain` for every
# training document.
print('%s %s' % (datetime.now(), 'converting texts to vectors and storing to csv'))
for doc in s['train']:
    subject = sorted(path.split('/')[0] for path in doc['subject2_hierarchy'])[0]
    # The feature row is the tf-idf vectors of all text fields, concatenated
    # in text_fields order.
    x = []
    for textfield in text_fields:
        x.extend(wordcounters[textfield].tfidf_vector(doc[textfield]))
    train.writerow(x)
    ytrain.writerow(mapper.vector(subject))

for doc in s['test']:
    subject = sorted([sub.split('/')[0] for sub in doc['subject2_hierarchy']])[0]
    x = []
# Example #4
# 0
# Pick one subject per training document: the first, in sorted order, of the
# leading '/'-separated components of its 'subject2_hierarchy' entries.
subjects = []
for record in s['train']:
    top_levels = [entry.split('/')[0] for entry in record['subject2_hierarchy']]
    top_levels.sort()
    subjects.append(top_levels[0])

mapper = words.mapper(subjects, subjectFile='data/subjects.txt')

# Set up the word counters, keyed by text field name.
wordcounters = {}
for textfield in text_fields:
    stamp = datetime.now()
    print('%s %s' % (stamp, 'creating dictionary for %s' % (textfield)))

    training_texts = []
    for record in s['train']:
        training_texts.append(record[textfield])

    wordcounters[textfield] = words.counter(
        training_texts, mindocs=mindocs, maxdocs=maxdocs,
        dictionaryFile='data/dictionary-%s.txt' % (textfield))

# Process the training sample and write the vectors.
stamp = datetime.now()
print('%s %s' % (stamp, 'converting texts to vectors and storing to csv'))
for doc in s['train']:
    top_levels = [entry.split('/')[0] for entry in doc['subject2_hierarchy']]
    top_levels.sort()
    subject = top_levels[0]

    # Concatenate the tf-idf vector of each text field, in text_fields order.
    x = []
    for textfield in text_fields:
        field_counter = wordcounters[textfield]
        x.extend(field_counter.tfidf_vector(doc[textfield]))

    train.writerow(x)
    ytrain.writerow(mapper.vector(subject))

for doc in s['test']: