def features_gen(config):
    """Yield ``(features, label)`` training pairs from the configured data set.

    The reader class is resolved with ``_dynamic_get_class("mpdata", ...)``
    using ``config['training']['type']`` and is opened on the file located at
    ``config['data_path']`` joined with ``config['training']['path']``.

    Each entry that has a ``'rating'`` key is labelled ``'pos'`` when its
    rating is greater than or equal to the value returned by
    ``get_ratingsmean(config)``, and ``'neg'`` otherwise.  Entries without a
    ``'rating'`` key are skipped.  Once the reader is exhausted, its
    ``impossible_line`` attribute is logged as the number of bad lines.
    """
    threshold = get_ratingsmean(config)
    logging.debug("middle rating value used for training = %d" % threshold)

    training_cfg = config['training']
    source_type = training_cfg['type']
    data_file = join(config['data_path'], training_cfg['path'])
    reader_cls = _dynamic_get_class("mpdata", source_type)
    reader = reader_cls(data_file, dialect="excel")

    for comment in reader:
        # Unrated entries cannot be labelled, so they are not yielded.
        if 'rating' not in comment:
            continue
        label = 'pos' if comment['rating'] >= threshold else 'neg'
        yield (word_feats(mptokenize.tokenize(comment['body'])), label)

    logging.debug("total of bad lines : %d" % reader.impossible_line)
def features_gen(config):
    """Stream labelled feature sets built from the training data file.

    Looks up the reader class named by ``config['training']['type']`` through
    ``_dynamic_get_class("mpdata", ...)`` and instantiates it on
    ``join(config['data_path'], config['training']['path'])`` with
    ``dialect="excel"``.

    Yields:
        ``(word_feats(tokens), label)`` tuples, where ``tokens`` is the
        tokenized ``'body'`` of a record and ``label`` is ``'pos'`` if the
        record's ``'rating'`` is at least ``get_ratingsmean(config)``, else
        ``'neg'``.  Records lacking a ``'rating'`` key produce nothing.

    After the last record, the reader's ``impossible_line`` value is written
    to the debug log as the total of bad lines.
    """
    cutoff = get_ratingsmean(config)
    logging.debug("middle rating value used for training = %d" % cutoff)

    training = config['training']
    kind = training['type']
    location = join(config['data_path'], training['path'])
    records = _dynamic_get_class("mpdata", kind)(location, dialect="excel")

    for record in records:
        if 'rating' in record:
            # Decide the class first, then build the features once.
            is_positive = record['rating'] >= cutoff
            tokens = mptokenize.tokenize(record['body'])
            yield (word_feats(tokens), 'pos' if is_positive else 'neg')

    logging.debug("total of bad lines : %d" % records.impossible_line)
def articles_gen(config, dbconnect):
    """Yield ``(id, tokens)`` for every item produced by the database.

    Args:
        config: Unused by this function; kept so the signature matches the
            other generators' call convention.
        dbconnect: Object whose ``get_contents()`` returns an iterator of
            ``(id, contents)`` pairs, ``contents`` being an iterable of
            fields.

    Yields:
        ``(id, alltokens)`` where ``alltokens`` is the concatenation of the
        tokens of every field, each field being passed through
        ``nltk.clean_html`` before tokenization.

    When the iterator is exhausted the number of yielded articles is logged
    at debug level.
    """
    # NOTE(review): nltk.clean_html raises NotImplementedError in NLTK 3.x --
    # confirm the NLTK version this project pins.
    content_gen = dbconnect.get_contents()
    total = 0
    while True:
        # The built-in next() works on Python 2.6+ and Python 3, whereas the
        # ``.next()`` method only exists on Python 2 iterators.  Only this
        # call is guarded, so a StopIteration leaking from the tokenizing
        # code is not mistaken for the end of the stream.
        try:
            article_id, contents = next(content_gen)
        except StopIteration:
            break

        # Tokenize all selected fields from the DB.
        alltokens = []
        for field in contents:
            alltokens += mptokenize.tokenize(nltk.clean_html(field))
        yield (article_id, alltokens)
        total += 1

    logging.debug("analyzed %d articles", total)
def articles_gen(config, dbconnect):
    """Generate ``(id, tokens)`` pairs from ``dbconnect.get_contents()``.

    Args:
        config: Not read by this function; retained for interface
            compatibility with existing callers.
        dbconnect: Provides ``get_contents()``, which must return an iterator
            over ``(id, contents)`` pairs; ``contents`` is iterated field by
            field.

    Yields:
        A tuple ``(id, alltokens)`` per item, ``alltokens`` being the list of
        tokens gathered from all of the item's fields after each field has
        gone through ``nltk.clean_html``.

    The count of yielded articles is logged at debug level once the source
    iterator is exhausted.
    """
    # NOTE(review): nltk.clean_html is not implemented in NLTK 3.x (it raises
    # NotImplementedError) -- verify against the installed NLTK version.
    content_gen = dbconnect.get_contents()
    total = 0
    while True:
        try:
            # Built-in next() instead of the Python 2-only ``.next()`` method
            # so the generator also runs on Python 3.
            article_id, contents = next(content_gen)
        except StopIteration:
            # Normal end of the stream; report and stop.
            logging.debug("analyzed %d articles", total)
            return
        else:
            # Tokenize all selected fields from the DB.
            alltokens = []
            for field in contents:
                alltokens += mptokenize.tokenize(nltk.clean_html(field))
            yield (article_id, alltokens)
            total += 1