コード例 #1
0
def main():
    args = docopt(__doc__)
    c = get_config()

    # create new Graph object
    g = rdflib.Graph()

    print "Loading files: ",
    sys.stdout.flush()

    # parse data files (could take a while: 4,000,000 triples will take ~10 minutes )
    for fn in files:
        print fn,
        sys.stdout.flush()
        with gzip.open(os.path.join(c.ttl_dir, fn)) as f:
            result = g.parse(f, format='turtle')

    print "\nQuerying..."

    # compile query
    qres = g.query("""
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX dcterms: <http://purl.org/dc/terms/>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX lp: <http://purl.org/linkedpolitics/>
        PREFIX lpv: <http://purl.org/linkedpolitics/vocabulary/>
        PREFIX xml: <http://www.w3.org/XML/1998/namespace>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>

        SELECT ?date ?speaker ?firstname ?lastname ?country ?text
        WHERE {
            ?sessionday dcterms:hasPart ?agendaitem.
            ?sessionday dc:date ?date.
            ?agendaitem dcterms:hasPart ?speech.
            ?speech lpv:speaker ?speaker.
            ?speaker lpv:countryOfRepresentation ?countryobj.
            ?countryobj lpv:acronym ?country.
            ?speaker foaf:givenName ?firstname.
            ?speaker foaf:familyName ?lastname.
            ?speech lpv:text ?text.
        }
    """)

    # The query is actually executed now, it takes a while (~3 min)
    print "Found %d records" % len(qres)

    # Write out csv file
    with gzip.open(os.path.join(c.textdb_dir, 'English.csv.gz'), 'wb') as csvfile:
        csv_headers = ['Date', 'SpeakerURI', 'Firstname', 'Lastname', 'Country', 'Speech']
        speechwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        speechwriter.writerow(csv_headers)

        for row in progress.bar(qres, label='Writing CSV ', expected_size=len(qres), every=1000):
            csv_line = [x.encode('utf8').strip() for x in row]
            speechwriter.writerow(csv_line)

    print 'Done'
def main():
    args = docopt(__doc__)
    feature_name = args['<feature_name>']
    assert feature_name == 'words'
    assert args['<experimentset_name>'] in EXPERIMENT_SETS, '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys())
    c = get_config()
    experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](feature_name=feature_name)

    print "Computing foreground group sums using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    fg_groups = experiment_set.list_foreground_groups()
    cache = {}
    try:
        for group_name, sum_vector in progress.bar(pool.imap_unordered(ComputeForegroundGroupSumCallable(experiment_set), fg_groups), label="Progress ", expected_size=len(fg_groups)):
            cache[group_name] = sum_vector
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Computing background sums..."
    bg_groups = experiment_set.list_background_groups()
    for g in bg_groups:
        sum_vector = experiment_set.compute_background_group_sum(g, cache)
        cache[g] = sum_vector

    print "Saving sums to ZODB..."
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'group_sums', None) is None:
        zodb_root.group_sums = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if feature_name not in zodb_root.group_sums:
        zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()
    for k, v in cache.iteritems():
        zodb_root.group_sums[feature_name][k] = v
    transaction.commit()


    print "Creating output db tables..."
    create_db(c.resultsdb_url)
    session_out = open_db(c.resultsdb_url)

    print "Computing overrepresentation using %d cores..." % c.num_cores
    exps = experiment_set.list_experiments()
    cls = experiment_set.result_table_class()
    try:
        for fg, bg, results in progress.bar(pool.imap_unordered(ComputeOverrepresentedWordsCallable(experiment_set), exps), label="Progress ", expected_size=len(exps)):
            for w, odds, pval in results:
                c = cls(foreground_group_name=fg, background_group_name=bg, word=w, odds=odds, pval=pval)
                session_out.add(c)
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Committing..."
    session_out.commit()
    print "Done"
コード例 #3
0
def open_db(db_url=None):
    """Return a new Session bound to an engine created for ``db_url``.

    When ``db_url`` is None the URL is taken from ``get_config().db_url``.
    A fresh engine and session factory are built on every call.
    """
    if db_url is None:
        # Imported here rather than at module level, as in the original.
        from talkofeuropedb.config import get_config
        db_url = get_config().db_url
    session_factory = sessionmaker(create_engine(db_url))
    return session_factory()
コード例 #4
0
def main():
    args = docopt(__doc__)
    c = get_config()
    print "Downloading files into %s" % c.ttl_dir
    for g in graph_list:
        target_file = os.path.join(c.ttl_dir, g.split('/')[-1] + '.ttl.gz')
        source_url = "http://linkedpolitics.ops.few.vu.nl/api/export_graph?graph=%s&mimetype=text%%2Fplain&format=turtle" % urllib.quote(g, '')
        print "Downloading %s..." % g
        download_gzipped(source_url, target_file)
    print "Done"
コード例 #5
0
def open_zodb(config=None, read_only=False):
    """Open the ZODB file storage and return the connection's ``root``.

    The database file is ``zodb.fs`` inside ``config.zodb_dir``. When
    ``config`` is None, ``get_config()`` supplies it. ``read_only`` is passed
    straight to the FileStorage constructor. Note that the ``root`` attribute
    itself is returned; it is not called.
    """
    if config is None:
        # Imported here rather than at module level, as in the original.
        from talkofeuropedb.config import get_config
        config = get_config()

    fs_path = os.path.join(config.zodb_dir, 'zodb.fs')
    storage = ZODB.FileStorage.FileStorage(fs_path, read_only=read_only)
    return ZODB.DB(storage).open().root
コード例 #6
0
def main():
    args = docopt(__doc__)
    c = get_config()
    session = open_db()
    speeches = session.query(Speech).all()
    total_speeches = len(speeches)  # For progress bar purposes

    print "Computing using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    try:
        for id, lang in progress.bar(pool.imap_unordered(detect_language, speeches), label='Progress ', expected_size=total_speeches, every=1000):
            session.query(Speech).get(id).lang = lang
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Committing..."
    session.commit()

    num_english_texts = session.query(Speech).filter(Speech.lang == 'en').count()
    print "Done. English texts: %d" % num_english_texts
コード例 #7
0
def main():
    args = docopt(__doc__)
    c = get_config()
    e = create_engine(c.db_url)
    Base.metadata.drop_all(e)
    Base.metadata.create_all(e)
    Session = sessionmaker(e)
    s = Session()
    with gzip.open(os.path.join(c.textdb_dir, 'English.csv.gz'), 'rb') as csv_file:
        reader = csv.reader(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        reader.next()   # Skip header
        for row in progress.mill(reader, label='Writing to DB ', expected_size=254253, every=1000):
            sp = Speech(date=datetime.strptime(row[0], '%Y-%m-%d'),
                        speaker_uri=unicode(row[1], 'utf-8'),
                        first_name=unicode(row[2], 'utf-8'),
                        last_name=unicode(row[3], 'utf-8'),
                        country=row[4],
                        speech=unicode(row[5], 'utf-8'))
            s.add(sp)
    print "Committing..."
    s.commit()
    print "Done"
コード例 #8
0
def main():
    args = docopt(__doc__)
    extractor_name = args['<feature_name>']
    extractor = getattr(talkofeuropewords.extract, extractor_name, None)
    if extractor is None:
        print "Unknown extractor name"
        sys.exit(1)
    c = get_config()
    s = open_db()

    print "Preparing ZODB"
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'features', None) is None:
        zodb_root.features = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if extractor_name not in zodb_root.features:
        zodb_root.features[extractor_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()

    runner = TaskRunner(extractor)

    print "Querying database..."
    speeches = s.query(Speech).filter(Speech.lang == 'en').all()
    total_speeches = len(speeches)

    print "Computing using %d cores..." % c.num_cores
    pool = Pool(c.num_cores, init_worker)
    try:
        for i, (id, result) in enumerate(progress.bar(pool.imap_unordered(runner, speeches), label='Progress ', expected_size=total_speeches, every=1000), 1):
            zodb_root.features[extractor_name][id] = result
            if i % 1000 == 0:
                transaction.commit()
        transaction.commit()
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Done"
コード例 #9
0
def main():
    args = docopt(__doc__)
    c = get_config()
    session = open_db()

    print "Finding 5 most active countries"
    countries = session.query(Speech.country, func.count(Speech.id)).filter(Speech.lang == 'en').group_by(Speech.country).order_by(desc(func.count(Speech.id))).limit(5).all()
    print countries
    country_codes = [c[0] for c in countries]

    print "Collecting words used by each country using 5 cores"
    pool = Pool(5, init_worker)
    try:
        word_sets = pool.map(country_words, country_codes)
    except KeyboardInterrupt:
        print "Terminating pool.."
        pool.terminate()
        pool.join()

    print "Collected word sets with sizes: ", map(len, word_sets)
    print "Computing intersection..."
    word_set = reduce(lambda x, y: x & y, word_sets)
    print "Result size: ", len(word_set)

    print "Subtracting stopwords..."
    nltk.download('stopwords')
    langs = ['english', 'dutch', 'french', 'italian', 'portuguese', 'swedish', 'german', 'spanish']
    all_stopwords = reduce(lambda x, y: x | y, [set(nltk.corpus.stopwords.words(lng)) for lng in langs])
    all_stopwords = set(map(unidecode, all_stopwords))
    word_set = word_set - all_stopwords
    print "Resulting word set size: ", len(word_set)

    print "Saving..."
    zodb_root = open_zodb()
    zodb_root.all_words = word_set
    transaction.commit()
    print "Done"
 def __init__(self, pval_cutoff=0.01, feature_name='words'):
     """Store the p-value cutoff and feature name, and load the configuration."""
     self.pval_cutoff, self.feature_name = pval_cutoff, feature_name
     # Each instance keeps the object returned by get_config().
     self.config = get_config()