def train_classifiers(user):
    # Train one RedditPPM classifier per subreddit on this user's training documents.
    global args
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    cls = {}
    for sr in args.subreddits:
        document = corpus.get_train_documents(args.type, user['username'],
                                              sr, args.c[0]).encode('utf-8')
        cl = RedditPPM()
        cl.train(document)
        cls[sr] = cl
    del corpus
    return cls
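# Illustrative only (not part of the original script): a minimal sketch of how
# train_classifiers might be driven over a whole user list, assuming `userlist`
# is the list of user dicts returned by corpus.get_user_list(...) as in the
# __main__ blocks further down.
def _train_all_users_sketch(userlist):
    # one {subreddit: RedditPPM} mapping per user, keyed by username
    return {u['username']: train_classifiers(u) for u in userlist}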
def train_classifiers(user):
    # Variant of the function above; here `subreddits` is presumably a
    # module-level list (e.g. the hard-coded [sr1, sr2] pair used by the
    # two-subreddit script) rather than a command-line argument.
    global args
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    cls = {}
    for sr in subreddits:
        document = corpus.get_train_documents(args.type, user['username'],
                                              sr, args.c[0]).encode('utf-8')
        cl = RedditPPM()
        cl.train(document)
        cls[sr] = cl
    del corpus
    return cls
def feature_to_numeric(features):
    # Map {(type, feature): count} keys to the numeric ids stored in `feature_map_test`.
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    where_list = []
    for k in features:
        where_list.append(' (`type` = \'%s\' AND `feature` = \'%s\') '
                          % (k[0], k[1].replace('\\', '\\\\')
                                       .replace('\n', '\\n')
                                       .replace('\'', '\\\'')))
    numeric = {}
    # look features up in batches of 100 to keep each WHERE clause manageable
    for x in range(0, len(features), 100):
        where_clause = 'WHERE ' + '\n OR'.join(where_list[x:x+100])
        #print(where_clause)
        rows = corpus.run_sql('SELECT `id`, `type`, `feature` FROM `feature_map_test` '
                              + where_clause, None)
        for row in rows:
            if (row['type'], row['feature']) not in features:
                continue
            numeric[row['id']] = features[(row['type'], row['feature'])]
    return numeric
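# Illustrative only: a self-contained sketch of the shapes feature_to_numeric
# deals in. The input is a {(type, feature): count} dict; the output keys are the
# numeric ids that `feature_map_test` assigns to those pairs. The ids below are
# made up; the real ones come from the database lookup above.
def _feature_vector_sketch():
    features = {('w', 'hello'): 3, ('l', '!'): 1, ('pos', 'NN'): 2}
    # pretend feature_map_test resolved these pairs to ids 5, 17 and 42
    fake_ids = {('w', 'hello'): 5, ('l', '!'): 17, ('pos', 'NN'): 42}
    numeric = {fake_ids[k]: features[k] for k in features}
    # same "id:count" layout the vector-generation script below produces
    return ' '.join('%d:%d' % (i, numeric[i]) for i in sorted(numeric))
    # -> '5:3 17:1 42:2'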
            featurize[('nwc%d' % n, ' '.join(k))] = wngram['ngram_word_clean'][n][k]
    words = ngram.get_word_ngram(text, n=1, clean=False)
    words = {k[0]: words[k] for k in words}
    for word in words:
        featurize[('w', word)] = words[word]
    lex = lexical.get_symbol_dist(text)
    for k in lex['lex']:
        featurize[('l', k)] = lex['lex'][k]
    # emit an "id:count" feature vector, sorted by numeric feature id
    featurize = feature_to_numeric(featurize)
    featurize = [(k, featurize[k]) for k in featurize]
    featurize = sorted(featurize, key=itemgetter(0))
    vector = ' '.join(['%d:%d' % (i, j) for i, j in featurize])
    return (atuple['id'], vector)


if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    corpus.create()
    pool = multiprocessing.Pool(16)
    print('set up pool')
    chunk = 100
    for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo']:
        j = 0
        i = 0
        if reddit == 'worldnews':
            j = 275000
        while True:
            print('j=%d' % j)
            rows = corpus.run_sql('SELECT `comment`.`id` AS `id`, `body` AS `text` FROM `comment` '
                                  'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) '
def gen_feature(atuple):
    # reduce URLs to their hostname so full links do not flood the feature set
    text = re.sub(r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
                  '\\1', atuple['text'], flags=re.MULTILINE)
    aset = set()
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            aset.add(('npos%d' % n, ' '.join(k)))
    words, clean_words = ngram.get_words(text)
    for word in words:
        aset.add(('pos', word))
    return set(aset)


if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))
    corpus.create()
    pool = multiprocessing.Pool(2)
    print('set up pool')
    chunk = 200
    feature_set = set()
    for reddit in ['worldnews', 'quantum', 'netsec', 'uwaterloo',
                   'gaming', 'news', 'AskReddit']:
        j = 0
        while True:
            print('j=%d' % j)
            rows = corpus.run_sql('SELECT `body_pos` AS `text` FROM `comment_pos` '
                                  'LEFT JOIN `comment` ON (`comment`.`id`=`comment_pos`.`id`) '
                                  'LEFT JOIN `submission` ON (`comment`.`submission_id`=`submission`.`id`) '
                                  'LEFT JOIN `reddit` ON (`submission`.`reddit_id`=`reddit`.`id`) '
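# Illustrative only: what the URL substitution in gen_feature does to a comment
# body. It keeps just the hostname and drops the scheme, path and query string.
def _strip_urls_sketch():
    import re
    sample = 'see https://example.com/path?q=1 for details'
    return re.sub(r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
                  '\\1', sample, flags=re.MULTILINE)
    # -> 'see example.com for details'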
        ranklist.append(rank)
    return ranklist


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', choices=['submission', 'comment'], required=True)
    parser.add_argument('n', type=int, nargs=1)
    parser.add_argument('c', type=int, nargs=1)
    parser.add_argument('subreddits', type=str, nargs='+')
    args = parser.parse_args(sys.argv[1:])

    corpus = RedditMySQLCorpus(multiprocessing.cpu_count())
    corpus.setup(**(cred.kwargs))
    corpus.create()
    print(args.n, args.c, args.subreddits)

    userlist = corpus.get_user_list(args.type, args.c[0], args.subreddits)
    userlist = userlist[:args.n[0]]
    print('Got users')
    pprint.pprint(userlist)

    print('Downloading document list')
    corpora = {}
    for sr in args.subreddits:
        corpora[sr] = corpus.get_test_grouped_documents(args.type, sr)
        print('Downloaded %s' % sr)
    del corpus

    pool = multiprocessing.Pool(multiprocessing.cpu_count())
                rank = i
                break
        ranklist.append(rank)
    return ranklist


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', choices=['submission', 'comment'], required=True)
    parser.add_argument('n', type=int, nargs=1)
    parser.add_argument('c', type=int, nargs=1)
    args = parser.parse_args(sys.argv[1:])

    sr1 = 'worldnews'
    sr2 = 'gaming'
    subreddits = [sr1, sr2]

    corpus = RedditMySQLCorpus(multiprocessing.cpu_count())
    corpus.setup(**(cred.kwargs))
    corpus.create()
    print(args.n, args.c, subreddits)

    userlist = corpus.get_user_list(args.type, args.c[0], subreddits)
    userlist = userlist[:args.n[0]]
    print('Got users')

    print('Downloading document list')
    corpora = {}
    for sr in subreddits:
        corpora[sr] = corpus.get_test_normalized_documents(args.type, sr, args.c[0])
        print('Downloaded %s (%d)' % (sr, len(corpora[sr])))
    del corpus

    maybe_users = list(corpora[sr2].keys())
    corpora_users = []
    for mu in maybe_users:
from main import cred

# (other imports - time, feature.simple, RedditMySQLCorpus, DataGetThread -
#  are assumed to be pulled in elsewhere in this file)


def fn(arg):
    # compute the readability indices for one comment body
    st = time.clock()
    read = feature.simple.get_read_stats(arg['text'].encode('ascii', 'ignore'))
    et = time.clock()
    #tprint(('stats took %s' % (et - st)))
    return (arg['id'], read['ari'], read['flesch_reading_ease'],
            read['flesch_kincaid_grade_level'], read['gunning_fog_index'],
            read['smog_index'], read['coleman_liau_index'],
            read['lix'], read['rix'])


corpus1 = RedditMySQLCorpus()
corpus1.setup(**(cred.kwargs))
corpus1.create()
corpus2 = RedditMySQLCorpus()
corpus2.setup(**(cred.kwargs))
corpus2.create()

fp = open('data/results.csv', 'w')  # assumed: opened for writing the timing results
for i in range(1000, 100000, 1000):
    st = time.clock()
    dgt = DataGetThread(corpus1, 'SELECT id, body AS text FROM comment',
                        None, limit=i)
    tt = []
__author__ = 'sharvey'

import multiprocessing

from corpus.mysql.reddit import RedditMySQLCorpus

import cred

if __name__ == '__main__':
    corpus = RedditMySQLCorpus(8)
    corpus.setup(**(cred.kwargs))
    corpus.create()
    corpus.gen_test_read()
import graph

from corpus.mysql.reddit import RedditMySQLCorpus

import cred

if __name__ == '__main__':
    corpus = RedditMySQLCorpus()
    corpus.setup(**(cred.kwargs))

    # expected value range for each readability index
    limits = {
        'ari': (-20, 100),
        'flesch_reading_ease': (-150, 200),
        'flesch_kincaid_grade_level': (-20, 50),
        'gunning_fog_index': (0, 40),
        'smog_index': (0, 20),
        'coleman_liau_index': (-30, 30),
        'lix': (0, 100),
        'rix': (0, 10)
    }
    indices = ['ari', 'flesch_reading_ease', 'flesch_kincaid_grade_level',
               'gunning_fog_index', 'smog_index', 'coleman_liau_index',
               'lix', 'rix']

    for i in indices:
        # split each index's range into roughly 20 bins, at least 1 unit wide
        bin_width = (limits[i][1] - limits[i][0]) / 20
        if bin_width <= 1:
            bin_width = 1
        result = corpus.run_sql('SELECT COUNT(*) AS count, FLOOR(FLOOR(%s)/%s)*%s AS bin '
                                'FROM comment_feature_read '
                                'LEFT JOIN comment ON (comment.id=comment_feature_read.id) '
                                'LEFT JOIN submission ON (submission.id=comment.submission_id) '
                                'LEFT JOIN reddit ON (reddit.id=submission.reddit_id) '
                                'WHERE reddit.name = \'netsec\' '
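# Illustrative only: the binning rule the SQL above applies, FLOOR(FLOOR(x)/w)*w,
# written out in Python. A value is truncated to an integer and then snapped down
# to the nearest multiple of the bin width, so e.g. a flesch_kincaid_grade_level
# of 11.7 with the 3.5-wide bins computed above lands in bin 10.5.
import math

def _bin_value_sketch(value, bin_width):
    return math.floor(math.floor(value) / bin_width) * bin_width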