def main(): args = docopt(__doc__) feature_name = args['<feature_name>'] assert feature_name == 'words' assert args['<experimentset_name>'] in EXPERIMENT_SETS, '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys()) c = get_config() experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](feature_name=feature_name) print "Computing foreground group sums using %d cores..." % c.num_cores pool = Pool(c.num_cores, init_worker) fg_groups = experiment_set.list_foreground_groups() cache = {} try: for group_name, sum_vector in progress.bar(pool.imap_unordered(ComputeForegroundGroupSumCallable(experiment_set), fg_groups), label="Progress ", expected_size=len(fg_groups)): cache[group_name] = sum_vector except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Computing background sums..." bg_groups = experiment_set.list_background_groups() for g in bg_groups: sum_vector = experiment_set.compute_background_group_sum(g, cache) cache[g] = sum_vector print "Saving sums to ZODB..." zodb_root = open_zodb(read_only=False) if getattr(zodb_root, 'group_sums', None) is None: zodb_root.group_sums = BTrees.OOBTree.OOBTree() transaction.commit() if feature_name not in zodb_root.group_sums: zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree() transaction.commit() for k, v in cache.iteritems(): zodb_root.group_sums[feature_name][k] = v transaction.commit() print "Creating output db tables..." create_db(c.resultsdb_url) session_out = open_db(c.resultsdb_url) print "Computing overrepresentation using %d cores..." 
% c.num_cores exps = experiment_set.list_experiments() cls = experiment_set.result_table_class() try: for fg, bg, results in progress.bar(pool.imap_unordered(ComputeOverrepresentedWordsCallable(experiment_set), exps), label="Progress ", expected_size=len(exps)): for w, odds, pval in results: c = cls(foreground_group_name=fg, background_group_name=bg, word=w, odds=odds, pval=pval) session_out.add(c) except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Committing..." session_out.commit() print "Done"
def country_words(country_code):
    """Return the set of all 'words' feature keys appearing in any English
    speech of the given country.

    Runs on worker processes (used via pool.map); opens its own DB/ZODB
    connections.
    """
    session = open_db()
    zodb_root = open_zodb(read_only=True)
    ids = session.query(Speech.id).filter(Speech.lang == 'en').filter(Speech.country == country_code).all()
    all_words = set()
    for (speech_id,) in ids:
        # BUGFIX/idiom: update() mutates in place; the original rebuilt the
        # whole set with union() on every iteration (quadratic copying).
        # Loop variable renamed to avoid shadowing the builtin `id`.
        all_words.update(zodb_root.features['words'][speech_id].keys())
    return all_words
def compute_foreground_group_sum(self, group_name):
    """Sum the feature vectors of all English speeches for country
    `group_name`, counting only words present in the global word set
    (zodb.all_words). Returns a Counter mapping word -> total count.
    """
    session = open_db(self.config.db_url)
    zodb = open_zodb(self.config, read_only=True)
    allowed_words = zodb.all_words
    totals = Counter()
    speech_ids = session.query(Speech.id).filter(Speech.country == group_name).filter(Speech.lang == 'en')
    for (speech_id,) in speech_ids:
        vector = zodb.features[self.feature_name][speech_id]
        for word in vector:
            if word in allowed_words:
                totals[word] += vector[word]
    return totals
def compute_foreground_group_sum(self, group_name):
    """Sum the feature vectors of all English speeches dated within the year
    `group_name` (a year as a string/int), counting only words present in
    the global word set (zodb.all_words). Returns a Counter.
    """
    session = open_db(self.config.db_url)
    zodb = open_zodb(self.config, read_only=True)
    allowed_words = zodb.all_words
    # Half-open date range covering the whole year: [Jan 1 y, Jan 1 y+1).
    year = int(group_name)
    range_start = datetime.date(year, 1, 1)
    range_end = datetime.date(year + 1, 1, 1)
    totals = Counter()
    speech_ids = session.query(Speech.id).filter(Speech.date >= range_start).filter(Speech.date < range_end).filter(Speech.lang == 'en')
    for (speech_id,) in speech_ids:
        vector = zodb.features[self.feature_name][speech_id]
        for word in vector:
            if word in allowed_words:
                totals[word] += vector[word]
    return totals
def extract_features(self, foreground_group_name, background_group_name):
    """Find words overrepresented in the foreground group relative to the
    background group.

    Runs on worker nodes. Uses zodb.group_sums[self.feature_name] data
    precomputed in previous steps. Returns a list of (word, odds, pval)
    tuples for words passing a one-sided Fisher exact test at a
    Bonferroni-style threshold of 0.01 / #foreground-words.

    NOTE(review): the contingency table assumes the background counts
    include the foreground counts (fg is subtracted out of bg) — confirm
    against how background sums are computed.
    """
    z = open_zodb(self.config, read_only=True)
    fg_counts = z.group_sums[self.feature_name][foreground_group_name]
    bg_counts = z.group_sums[self.feature_name][background_group_name]
    total_fg_count = sum(fg_counts.values())
    total_bg_count = sum(bg_counts.values())
    cutoff = 0.01 / max(1, len(fg_counts))
    result = []
    for w in fg_counts:
        # 2x2 table: rows = (this word, all other words),
        # cols = (foreground, background-without-foreground).
        in_fg = fg_counts[w]
        in_bg_only = bg_counts[w] - in_fg
        rest_fg = total_fg_count - in_fg
        rest_bg_only = (total_bg_count - total_fg_count) - in_bg_only
        odds, pval = fisher_exact([[in_fg, in_bg_only], [rest_fg, rest_bg_only]], alternative='greater')
        if pval < cutoff:
            result.append((w, odds, pval))
    return result
def main(): args = docopt(__doc__) extractor_name = args['<feature_name>'] extractor = getattr(talkofeuropewords.extract, extractor_name, None) if extractor is None: print "Unknown extractor name" sys.exit(1) c = get_config() s = open_db() print "Preparing ZODB" zodb_root = open_zodb(read_only=False) if getattr(zodb_root, 'features', None) is None: zodb_root.features = BTrees.OOBTree.OOBTree() transaction.commit() if extractor_name not in zodb_root.features: zodb_root.features[extractor_name] = BTrees.OOBTree.OOBTree() transaction.commit() runner = TaskRunner(extractor) print "Querying database..." speeches = s.query(Speech).filter(Speech.lang == 'en').all() total_speeches = len(speeches) print "Computing using %d cores..." % c.num_cores pool = Pool(c.num_cores, init_worker) try: for i, (id, result) in enumerate(progress.bar(pool.imap_unordered(runner, speeches), label='Progress ', expected_size=total_speeches, every=1000), 1): zodb_root.features[extractor_name][id] = result if i % 1000 == 0: transaction.commit() transaction.commit() except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Done"
def main(): args = docopt(__doc__) c = get_config() session = open_db() print "Finding 5 most active countries" countries = session.query(Speech.country, func.count(Speech.id)).filter(Speech.lang == 'en').group_by(Speech.country).order_by(desc(func.count(Speech.id))).limit(5).all() print countries country_codes = [c[0] for c in countries] print "Collecting words used by each country using 5 cores" pool = Pool(5, init_worker) try: word_sets = pool.map(country_words, country_codes) except KeyboardInterrupt: print "Terminating pool.." pool.terminate() pool.join() print "Collected word sets with sizes: ", map(len, word_sets) print "Computing intersection..." word_set = reduce(lambda x, y: x & y, word_sets) print "Result size: ", len(word_set) print "Subtracting stopwords..." nltk.download('stopwords') langs = ['english', 'dutch', 'french', 'italian', 'portuguese', 'swedish', 'german', 'spanish'] all_stopwords = reduce(lambda x, y: x | y, [set(nltk.corpus.stopwords.words(lng)) for lng in langs]) all_stopwords = set(map(unidecode, all_stopwords)) word_set = word_set - all_stopwords print "Resulting word set size: ", len(word_set) print "Saving..." zodb_root = open_zodb() zodb_root.all_words = word_set transaction.commit() print "Done"