def db_init(db, create=True):
    """
    Initializes the sqlite3 database and returns an open connection.

    Arguments:
    db (str) -- Name of the database file to use (stored under config.SYNT_PATH).

    Keyword arguments:
    create (bool) -- If True and the database file does not already exist,
                     create the 'item' table in the new database.

    Returns:
    sqlite3.Connection -- An open connection to the database file.
    """
    if not os.path.exists(config.SYNT_PATH):
        os.makedirs(config.SYNT_PATH)

    fp = os.path.join(config.SYNT_PATH, db)

    # Check existence BEFORE connecting: sqlite3.connect creates the file
    # if it is missing, which would make db_exists() always report True.
    existed = db_exists(db)
    conn = sqlite3.connect(fp)

    if not existed and create:
        cursor = conn.cursor()
        cursor.execute(
            '''CREATE TABLE item (id integer primary key, text text unique, sentiment text)'''
        )

    return conn
def db_init(db, create=True):
    """
    Open (and optionally bootstrap) the sqlite3 database.

    Arguments:
    db (str) -- Database file name, resolved under config.SYNT_PATH.

    Keyword arguments:
    create (bool) -- When the database is new, whether to create the
                     'item' table.

    Returns an open sqlite3 connection.
    """
    # Make sure the storage directory is present before touching the file.
    if not os.path.exists(config.SYNT_PATH):
        os.makedirs(config.SYNT_PATH)

    db_path = os.path.join(config.SYNT_PATH, db)
    is_new = not db_exists(db)

    if is_new:
        connection = sqlite3.connect(db_path)
        cur = connection.cursor()
        if create:
            cur.execute('''CREATE TABLE item (id integer primary key, text text unique, sentiment text)''')
    else:
        connection = sqlite3.connect(db_path)

    return connection
def train(db_name, samples=200000, classifier_type='naivebayes', extractor_type='words',
          best_features=10000, processes=8, purge=False, redis_db=5):
    """
    Train with samples from sqlite database and stores the resulting
    classifier in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use stored in ~/.synt

    Keyword arguments:
    samples (int) -- Amount of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Amount of highly informative features to store.
    processes (int) -- The amount of processes to be used for counting features in parallel.
    purge (bool) -- If true will flush the redis database.
    redis_db (int) -- Redis database to use for Redis Manager.
    """
    m = RedisManager(db=redis_db, purge=purge)
    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if classifier_type in m.r.keys():
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier:  # classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    # retrieve training samples from database
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_freqdists()
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # retrieve the actual amount of samples processed per label
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get('positive_processed')
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    conditional_fd = m.pickle_load('label_fd')
    labels = conditional_fd.conditions()

    # feature extraction
    feat_ex = extractor()
    # NOTE(review): [0] keeps only the extraction for the FIRST label;
    # presumably intentional (shared vocabulary), but verify against extractor.
    extracted_set = set([feat_ex.extract(conditional_fd[label].keys(), as_list=True) for label in labels][0])

    # Increment the amount of times a given feature for label occurred and
    # fill in the missing occurrences with Falses. Use a distinct local name
    # so the 'samples' argument is not shadowed (it is stored in Redis below).
    for label in labels:
        label_sample_count = label_freqdist[label]
        for fname in extracted_set:
            trues = conditional_fd[label][fname]
            falses = label_sample_count - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    # create the P(label) distribution
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    # create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label, fname] = probdist

    # TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    # BUGFIX: previously the loop above shadowed 'samples', so this stored
    # the last label's frequency instead of the requested sample count.
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)
def train(db_name, samples=200000, classifier_type='naivebayes', extractor_type='words',
          best_features=10000, processes=8, purge=False):
    """
    Train with samples from sqlite database and stores the resulting
    classifier in Redis.

    Arguments:
    db_name (str) -- Name of the training database to use stored in ~/.synt

    Keyword arguments:
    samples (int) -- Amount of samples to train on.
    classifier_type (str) -- Type of classifier to use. Available classifiers are 'naivebayes'.
    extractor_type (str) -- Type of extractor to use. Available extractors are 'words', 'stopwords', 'bestwords'.
    best_features (int) -- Amount of highly informative features to store.
    processes (int) -- The amount of processes to be used for counting features in parallel.
    purge (bool) -- If true will flush the redis database.
    """
    m = RedisManager(purge=purge)
    extractor = get_extractor(extractor_type)

    if not db_exists(db_name):
        raise ValueError("Database '%s' does not exist." % db_name)

    if classifier_type in m.r.keys():
        print("Classifier exists in Redis. Purge to re-train.")
        return

    classifier = config.CLASSIFIERS.get(classifier_type)
    if not classifier:  # classifier not supported
        raise ValueError("Classifier '%s' not supported." % classifier_type)

    # retrieve training samples from database
    train_samples = get_samples(db_name, samples)

    m.store_feature_counts(train_samples, processes=processes)
    m.store_feature_scores()

    if best_features and best_features > 1:
        m.store_best_features(best_features)

    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)

    # retrieve the actual amount of samples processed per label
    neg_processed, pos_processed = m.r.get('negative_processed'), m.r.get('positive_processed')
    label_freqdist.inc('negative', int(neg_processed))
    label_freqdist.inc('positive', int(pos_processed))

    labeled_feature_freqs = m.pickle_load('labeled_feature_freqs')
    labels = labeled_feature_freqs.keys()

    # feature extraction
    feat_ex = extractor()
    # NOTE(review): [0] keeps only the extraction for the FIRST label;
    # presumably intentional (shared vocabulary), but verify against extractor.
    extracted_set = set([
        feat_ex.extract(labeled_feature_freqs[label].keys(), as_list=True)
        for label in labels
    ][0])

    # Increment the amount of times a given feature for label occurred and
    # fill in the missing occurrences with Falses. Use a distinct local name
    # so the 'samples' argument is not shadowed (it is stored in Redis below).
    for label in labels:
        label_sample_count = label_freqdist[label]
        for fname in extracted_set:
            trues = labeled_feature_freqs[label].get(fname, 0)
            falses = label_sample_count - trues
            feature_freqdist[label, fname].inc(True, trues)
            feature_freqdist[label, fname].inc(False, falses)

    # create the P(label) distribution
    estimator = ELEProbDist
    label_probdist = estimator(label_freqdist)

    # create the P(fval|label, fname) distribution
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=2)
        feature_probdist[label, fname] = probdist

    # TODO: naivebayes supports this prototype, future classifiers will most likely not
    trained_classifier = classifier(label_probdist, feature_probdist)

    m.pickle_store(classifier_type, trained_classifier)
    # BUGFIX: previously the loop above shadowed 'samples', so this stored
    # the last label's frequency instead of the requested sample count.
    m.r.set('trained_to', samples)
    m.r.set('trained_db', db_name)
    m.r.set('trained_classifier', classifier_type)
    m.r.set('trained_extractor', extractor_type)