Beispiel #1
0
def features_gen(config):
    """
    Yield ``(features, label)`` pairs built from the training data set.

    The training source class is looked up with
    ``_dynamic_get_class("mpdata", config['training']['type'])`` and
    instantiated on the path formed by joining ``config['data_path']`` and
    ``config['training']['path']``.

    Entries without a ``'rating'`` key are skipped. An entry is labelled
    ``'pos'`` when its rating is greater than or equal to the value returned
    by ``get_ratingsmean(config)``, and ``'neg'`` otherwise. The features are
    ``word_feats`` applied to the tokenized ``'body'`` of the entry.
    """
    threshold = get_ratingsmean(config)
    logging.debug("middle rating value used for training = %d" % threshold)

    training_cfg = config['training']
    sourcetype = training_cfg['type']
    path = join(config['data_path'], training_cfg['path'])
    source_cls = _dynamic_get_class("mpdata", sourcetype)
    data = source_cls(path, dialect="excel")

    for entry in data:
        # Unrated entries cannot be labelled, so they are not yielded.
        if 'rating' not in entry:
            continue
        label = 'pos' if entry['rating'] >= threshold else 'neg'
        yield (word_feats(mptokenize.tokenize(entry['body'])), label)

    # Only reached once the whole data set has been consumed.
    logging.debug("total of bad lines : %d" % data.impossible_line)
Beispiel #2
0
def features_gen(config):
    """
    Read the training data set and yield labelled feature sets.

    Each yielded item is a ``(features, label)`` tuple where ``features`` is
    ``word_feats`` of the tokenized ``'body'`` field and ``label`` is
    ``'pos'`` or ``'neg'``, depending on whether the entry's ``'rating'``
    reaches the value returned by ``get_ratingsmean(config)``. Entries that
    have no ``'rating'`` key produce nothing.
    """
    middle = get_ratingsmean(config)
    logging.debug("middle rating value used for training = %d" % middle)

    kind = config['training']['type']
    location = join(config['data_path'], config['training']['path'])
    reader = _dynamic_get_class("mpdata", kind)(location, dialect="excel")

    for row in reader:
        if 'rating' in row:
            # Compare first, then tokenize, in the same order as before.
            is_positive = row['rating'] >= middle
            feats = word_feats(mptokenize.tokenize(row['body']))
            if is_positive:
                yield (feats, 'pos')
            else:
                yield (feats, 'neg')

    # Logged after the reader is exhausted, using its own bad-line counter.
    logging.debug("total of bad lines : %d" % reader.impossible_line)
Beispiel #3
0
def articles_gen(config, dbconnect):
    """
    Yield ``(id, tokens)`` for every item of ``dbconnect.get_contents()``.

    Each item produced by ``get_contents()`` is unpacked as
    ``(id, contents)`` and ``contents`` is iterated field by field. Every
    field is passed through ``nltk.clean_html`` and then
    ``mptokenize.tokenize``; the tokens of all fields are concatenated into
    a single list.

    ``config`` is accepted but not used by this function. Once the source
    is exhausted, the number of articles yielded is logged at debug level.
    """
    content_gen = dbconnect.get_contents()
    total = 0
    while True:
        # Use the built-in next(): the ``.next()`` method exists only on
        # Python 2 iterators, while next() works on Python 2.6+ and 3.
        # The try covers only this call so that a StopIteration escaping
        # from the tokenizing code is not mistaken for end of input.
        try:
            article_id, contents = next(content_gen)
        except StopIteration:
            break
        alltokens = []
        # tokenizes all selected fields from DB
        for field in contents:
            alltokens += mptokenize.tokenize(nltk.clean_html(field))
        yield (article_id, alltokens)
        total += 1
    logging.debug("analyzed %d articles" % total)
Beispiel #4
0
def articles_gen(config, dbconnect):
    """
    Generate ``(id, tokens)`` pairs from ``dbconnect.get_contents()``.

    ``get_contents()`` must return an iterator whose items unpack into
    ``(id, contents)``, with ``contents`` an iterable of fields. Each field
    is cleaned with ``nltk.clean_html`` and tokenized with
    ``mptokenize.tokenize``; all resulting tokens for one article are
    gathered in one list.

    The ``config`` argument is not read here. After the last article has
    been yielded, the total count is logged at debug level.
    """
    content_gen = dbconnect.get_contents()
    total = 0
    while True:
        try:
            # Built-in next() instead of the Python 2-only ``.next()``
            # method, so this runs on both Python 2.6+ and Python 3.
            # Only this call is guarded: StopIteration raised elsewhere
            # should not silently end the stream.
            article_id, contents = next(content_gen)
        except StopIteration:
            break
        alltokens = []
        # tokenizes all selected fields from DB
        for field in contents:
            alltokens.extend(mptokenize.tokenize(nltk.clean_html(field)))
        yield (article_id, alltokens)
        total += 1
    logging.debug("analyzed %d articles" % total)