def main(): args = docopt(__doc__) feature_name = args['<feature_name>'] assert feature_name == 'words' assert args['<experimentset_name>'] in EXPERIMENT_SETS, '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys()) c = get_config() experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](feature_name=feature_name) print "Computing foreground group sums using %d cores..." % c.num_cores pool = Pool(c.num_cores, init_worker) fg_groups = experiment_set.list_foreground_groups() cache = {} try: for group_name, sum_vector in progress.bar(pool.imap_unordered(ComputeForegroundGroupSumCallable(experiment_set), fg_groups), label="Progress ", expected_size=len(fg_groups)): cache[group_name] = sum_vector except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Computing background sums..." bg_groups = experiment_set.list_background_groups() for g in bg_groups: sum_vector = experiment_set.compute_background_group_sum(g, cache) cache[g] = sum_vector print "Saving sums to ZODB..." zodb_root = open_zodb(read_only=False) if getattr(zodb_root, 'group_sums', None) is None: zodb_root.group_sums = BTrees.OOBTree.OOBTree() transaction.commit() if feature_name not in zodb_root.group_sums: zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree() transaction.commit() for k, v in cache.iteritems(): zodb_root.group_sums[feature_name][k] = v transaction.commit() print "Creating output db tables..." create_db(c.resultsdb_url) session_out = open_db(c.resultsdb_url) print "Computing overrepresentation using %d cores..." 
% c.num_cores exps = experiment_set.list_experiments() cls = experiment_set.result_table_class() try: for fg, bg, results in progress.bar(pool.imap_unordered(ComputeOverrepresentedWordsCallable(experiment_set), exps), label="Progress ", expected_size=len(exps)): for w, odds, pval in results: c = cls(foreground_group_name=fg, background_group_name=bg, word=w, odds=odds, pval=pval) session_out.add(c) except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Committing..." session_out.commit() print "Done"
def country_words(country_code):
    """Return the set of all 'words' feature keys appearing in any English
    speech of the given country.

    Runs on worker processes (used via pool.map); opens its own DB/ZODB
    connections.
    """
    session = open_db()
    zodb_root = open_zodb(read_only=True)
    ids = session.query(Speech.id).filter(Speech.lang == 'en').filter(Speech.country == country_code).all()
    all_words = set()
    for (speech_id,) in ids:
        # BUGFIX/idiom: update() mutates in place; the original rebuilt the
        # whole set with union() on every iteration (quadratic copying).
        # Loop variable renamed to avoid shadowing the builtin `id`.
        all_words.update(zodb_root.features['words'][speech_id].keys())
    return all_words
def compute_foreground_group_sum(self, group_name):
    """Sum the feature vectors of all English speeches for country
    `group_name`, counting only words present in the global word set
    (zodb.all_words). Returns a Counter mapping word -> total count.
    """
    session = open_db(self.config.db_url)
    zodb = open_zodb(self.config, read_only=True)
    allowed_words = zodb.all_words
    totals = Counter()
    speech_ids = session.query(Speech.id).filter(Speech.country == group_name).filter(Speech.lang == 'en')
    for (speech_id,) in speech_ids:
        vector = zodb.features[self.feature_name][speech_id]
        for word in vector:
            if word in allowed_words:
                totals[word] += vector[word]
    return totals
def compute_foreground_group_sum(self, group_name):
    """Sum the feature vectors of all English speeches dated within the year
    `group_name` (a year as a string/int), counting only words present in
    the global word set (zodb.all_words). Returns a Counter.
    """
    session = open_db(self.config.db_url)
    zodb = open_zodb(self.config, read_only=True)
    allowed_words = zodb.all_words
    # Half-open date range covering the whole year: [Jan 1 y, Jan 1 y+1).
    year = int(group_name)
    range_start = datetime.date(year, 1, 1)
    range_end = datetime.date(year + 1, 1, 1)
    totals = Counter()
    speech_ids = session.query(Speech.id).filter(Speech.date >= range_start).filter(Speech.date < range_end).filter(Speech.lang == 'en')
    for (speech_id,) in speech_ids:
        vector = zodb.features[self.feature_name][speech_id]
        for word in vector:
            if word in allowed_words:
                totals[word] += vector[word]
    return totals
def extract_features(self, foreground_group_name, background_group_name):
    """Find words overrepresented in the foreground group relative to the
    background group.

    Runs on worker nodes. Uses zodb.group_sums[self.feature_name] data
    precomputed in previous steps. Returns a list of (word, odds, pval)
    tuples for words passing a one-sided Fisher exact test at a
    Bonferroni-style threshold of 0.01 / #foreground-words.

    NOTE(review): the contingency table assumes the background counts
    include the foreground counts (fg is subtracted out of bg) — confirm
    against how background sums are computed.
    """
    z = open_zodb(self.config, read_only=True)
    fg_counts = z.group_sums[self.feature_name][foreground_group_name]
    bg_counts = z.group_sums[self.feature_name][background_group_name]
    total_fg_count = sum(fg_counts.values())
    total_bg_count = sum(bg_counts.values())
    cutoff = 0.01 / max(1, len(fg_counts))
    result = []
    for w in fg_counts:
        # 2x2 table: rows = (this word, all other words),
        # cols = (foreground, background-without-foreground).
        in_fg = fg_counts[w]
        in_bg_only = bg_counts[w] - in_fg
        rest_fg = total_fg_count - in_fg
        rest_bg_only = (total_bg_count - total_fg_count) - in_bg_only
        odds, pval = fisher_exact([[in_fg, in_bg_only], [rest_fg, rest_bg_only]], alternative='greater')
        if pval < cutoff:
            result.append((w, odds, pval))
    return result
def main(): args = docopt(__doc__) extractor_name = args['<feature_name>'] extractor = getattr(talkofeuropewords.extract, extractor_name, None) if extractor is None: print "Unknown extractor name" sys.exit(1) c = get_config() s = open_db() print "Preparing ZODB" zodb_root = open_zodb(read_only=False) if getattr(zodb_root, 'features', None) is None: zodb_root.features = BTrees.OOBTree.OOBTree() transaction.commit() if extractor_name not in zodb_root.features: zodb_root.features[extractor_name] = BTrees.OOBTree.OOBTree() transaction.commit() runner = TaskRunner(extractor) print "Querying database..." speeches = s.query(Speech).filter(Speech.lang == 'en').all() total_speeches = len(speeches) print "Computing using %d cores..." % c.num_cores pool = Pool(c.num_cores, init_worker) try: for i, (id, result) in enumerate(progress.bar(pool.imap_unordered(runner, speeches), label='Progress ', expected_size=total_speeches, every=1000), 1): zodb_root.features[extractor_name][id] = result if i % 1000 == 0: transaction.commit() transaction.commit() except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Done"
def main(): args = docopt(__doc__) c = get_config() session = open_db() print "Finding 5 most active countries" countries = session.query(Speech.country, func.count(Speech.id)).filter(Speech.lang == 'en').group_by(Speech.country).order_by(desc(func.count(Speech.id))).limit(5).all() print countries country_codes = [c[0] for c in countries] print "Collecting words used by each country using 5 cores" pool = Pool(5, init_worker) try: word_sets = pool.map(country_words, country_codes) except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Collected word sets with sizes: ", map(len, word_sets) print "Computing intersection..." word_set = reduce(lambda x, y: x & y, word_sets) print "Result size: ", len(word_set) print "Subtracting stopwords..." nltk.download('stopwords') langs = ['english', 'dutch', 'french', 'italian', 'portuguese', 'swedish', 'german', 'spanish'] all_stopwords = reduce(lambda x, y: x | y, [set(nltk.corpus.stopwords.words(lng)) for lng in langs]) all_stopwords = set(map(unidecode, all_stopwords)) word_set = word_set - all_stopwords print "Resulting word set size: ", len(word_set) print "Saving..." zodb_root = open_zodb() zodb_root.all_words = word_set transaction.commit() print "Done"