import os

import metakit


def test_db():
    gl_vlist = VocabList()
    log('searching directory: %s' % FEED_DIR)
    for fname in os.listdir(FEED_DIR):
        if fname.endswith('.mk4'):
            log('found database: %s' % fname)
            # open database
            db = metakit.storage(os.path.join(FEED_DIR, fname), 0)
            data = read_database(db)
            if len(data) > 0:
                # feed content in database
                log('create library')
                lib = Library()
                for feed in data:
                    lib.add_document(read_data(feed))
                vlist = lib.gen_vocablist()
                vlist.clean(5)
                gl_vlist.merge(vlist)
            # close database
            db = None
    print gl_vlist
def gen_feature_vector(mask, text):
    '''
    Generates a feature vector by applying the given mask to the specified
    document. The feature vector is built by mapping each word in the mask
    list to its relative frequency in the document (number of occurrences
    divided by the total word count). Therefore, the length of the mask
    determines the length of the output list.

    See also: 'Bag-of-words model'

    @param mask: a list of words that should be used as the mask
    @param text: the text for which the feature vector should be generated
    @return: a list of floats representing the relative frequencies of the
             mask words in the document
    '''
    processed = preprocess(text)
    vlist = VocabList()
    vlist.expand_with(processed)
    fvector = []
    for word in mask:
        fvector.append(vlist.quantity_of(word) / float(vlist.get_total_word_count()))
    return fvector
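# A minimal usage sketch for gen_feature_vector. It assumes that preprocess()
# lowercases and tokenizes the text and that VocabList counts every resulting
# token; the values shown are illustrative under that assumption, not verified
# output.
#
#   mask = ['spam', 'ham']
#   text = 'Spam spam spam and some ham'
#   gen_feature_vector(mask, text)
#   # -> [0.5, 0.1666...] if preprocessing yields the six tokens
#   #    ['spam', 'spam', 'spam', 'and', 'some', 'ham']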