    for k, v in items.items():
        # Keep 'text', 'sentence' and 'w' values as lists even when they
        # contain a single element; unwrap every other single-element list.
        if k in ('text', 'sentence', 'w'):
            result[k] = v
        elif len(v) == 1:
            result[k] = v[0]
        else:
            result[k] = v
    return result
    # Equivalent one-liner, minus the special-cased keys:
    #return { k: v[0] if len(v) == 1 else v for k, v in items.items() }
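    # For example (illustrative data):
    #   {'text': ['hi'], 'id': ['42'], 'tags': ['a', 'b']}
    # would become
    #   {'text': ['hi'], 'id': '42', 'tags': ['a', 'b']}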


# Save a parsed top-level element, given as a Python dictionary, to MongoDB
# as a JSON document. Renames the top-level attribute 'id' to the
# Mongo-specific '_id'. If the connection fails, retries up to 5 times with
# exponential backoff.
def save_to_db(user):
    user[0]["_id"] = user[0]["id"]
    for i in range(5):
        try:
            db.collection.save(user[0])
            break
        except pymongo.errors.AutoReconnect:
            # exponential backoff: wait 1, 2, 4, 8, 16 seconds between tries
            time.sleep(pow(2, i))
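

# Note: Collection.save() was removed in newer PyMongo releases. A minimal
# sketch of the same upsert-with-retry idea against the modern API; the
# save_with_retry helper and its parameters are illustrative, not part of
# the original project.
def save_with_retry(collection, doc, retries=5):
    doc["_id"] = doc["id"]  # reuse the document's own id as Mongo's key
    for i in range(retries):
        try:
            # replace_one(..., upsert=True) inserts the document or
            # overwrites the existing one with the same _id.
            collection.replace_one({"_id": doc["_id"]}, doc, upsert=True)
            return
        except pymongo.errors.AutoReconnect:
            time.sleep(2 ** i)  # back off 1, 2, 4, 8, 16 seconds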


if __name__ == "__main__":
    #db = thtdb.ThtConnection(collectionName='pldebatt_october_multi')
    #db = thtdb.ThtConnection(host='squib.de', dbName='karinas_twitter_db', collectionName='twitter-pldebatt-131006-new')
    db = thtdb.ThtConnection(dbName='local',
                             collectionName='twitter-pldebatt-130612')
    parse_xml(input_path + "twitter-pldebatt-130612.xml")
    db.client.disconnect()


'''
    total_term_frequency = map(list, zip(*total_term_frequency))
    containing_doc_number = [sum([1 for x in y if x > 0]) for y in total_term_frequency]
    is_more = [sum([1 for x in y if x > 1]) for y in total_term_frequency]
    print "Number of times the terms appear: "
    PrettyPrintList([sum(x) for x in total_term_frequency])
    print "Number of docs the term appears in: "
    PrettyPrintList(containing_doc_number)
    print "idf: "
    # Use float division: integer division would floor the N/df ratio and
    # distort the idf values.
    PrettyPrintListFloat([math.log(float(doc_number) / (x + 1), 10) for x in containing_doc_number])
    print "Number of tweets having the term frequency calculated: ", [len(x) for x in total_term_frequency]
    print "This many tweets have the term more than once: ", is_more
'''
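
# A tiny self-contained check of the smoothed idf formula used in the
# commented-out block above; doc_number and the document frequencies are
# made-up values.
import math

doc_number = 1000
containing_doc_number = [9, 99]  # assumed per-term document frequencies
# idf = log10(N / (df + 1)); the +1 avoids division by zero for terms that
# appear in no document.
print [math.log(float(doc_number) / (df + 1), 10)
       for df in containing_doc_number]  # -> [2.0, 1.0]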

if __name__ == "__main__":
    db = thtdb.ThtConnection(host='squib.de',
                             dbName='karinas_twitter_db',
                             collectionName='twitter-pldebatt-131006')
    '''
    print 'saving crime tweets...'
    #createTopicDocuments('crime')
    print 'crime tweets done.'
    print 'saving school tweets...'
    #createTopicDocuments('school')
    print 'school tweets done.'
    print 'saving climate tweets...'
    #createTopicDocuments('climate')
    print 'climate tweets done.'
    print 'saving tax tweets...'
    createTopicDocuments('tax')
    print 'tax tweets done.'
    print 'saving immigration tweets...'
    '''

    d = "2008-01-01"  # cutoff date used by the queries below
    # Without an index, the sort forces a scan; nscanned reports how many
    # documents were examined.
    print db.collection.find({
        "created": {
            "$gt": d
        }
    }).sort("username").explain()["nscanned"]

    print "find indexed users created..."
    # Compound index covering both the filter ('created') and the sort key
    # ('username').
    db.collection.create_index([("created", DESCENDING),
                                ("username", ASCENDING)])
    # With the index in place, explain() reports an index cursor and far
    # fewer scanned documents.
    print db.collection.find({
        "created": {
            "$lt": d
        }
    }).sort("username").explain()["cursor"]
    print db.collection.find({
        "created": {
            "$lt": d
        }
    }).sort("username").explain()["nscanned"]
    for user in db.collection.find({"created": {"$lt": d}}).sort("username"):
        print user["username"]
    print "aggregation example..."


if __name__ == "__main__":
    #client = Connection('localhost', 27017)
    #db = client.local
    #collection = db.test_3
    db = thtdb.ThtConnection(collectionName='test_pldebatt_june')
    querries()

    # Dump each stats dict in descending order of count by repeatedly
    # extracting and removing the current maximum key.
    while user_stat != {}:
        max_key = max(user_stat.iterkeys(), key=(lambda key: user_stat[key]))
        max_val = user_stat[max_key]
        user_stat_file.write(
            max_key.encode('utf8') + ',' + str(max_val) + '\n')
        del user_stat[max_key]

    while hash_stat != {}:
        max_key = max(hash_stat.iterkeys(), key=(lambda key: hash_stat[key]))
        max_val = hash_stat[max_key]
        hash_stat_file.write(
            max_key.encode('utf8') + ',' + str(max_val) + '\n')
        del hash_stat[max_key]

    while word_stat != {}:
        max_key = max(word_stat.iterkeys(), key=(lambda key: word_stat[key]))
        max_val = word_stat[max_key]
        word_stat_file.write(
            max_key.encode('utf8') + ',' + str(max_val) + '\n')
        del word_stat[max_key]
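
# The three loops above are O(n^2): every pass rescans the whole dict for
# its maximum. An equivalent dump can sort the items once; dump_sorted is
# an illustrative helper, not part of the original code.
def dump_sorted(stat, out_file):
    # Write "key,count" lines in descending order of count.
    for key, val in sorted(stat.items(), key=lambda kv: kv[1], reverse=True):
        out_file.write(key.encode('utf8') + ',' + str(val) + '\n')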


if __name__ == "__main__":
    #db = thtdb.ThtConnection(collectionName='test_short_1')
    db = thtdb.ThtConnection(host='squib.de',
                             dbName='karinas_twitter_db',
                             collectionName='pldebatt_october')
    querries("word_stat_october_test", "user_stat_october_test",
             "hash_stat_october_test")
    #querries_aggregated()
    #querries_mapreduce_test()


                items[elem.tag].append(temp)
            else:
                temp_dict = pt(context, elem)
                items[elem.tag].append(temp_dict)
        elif action == "end":
            text = elem.text.strip() if elem.text else ""
            break
        # Each completed <user> element is saved and removed from the
        # buffer so memory stays bounded while streaming.
        if elem.tag == 'user':
            save_to_db(items[elem.tag])
            del items[elem.tag]
        elem.clear()
        # lxml idiom: delete already-processed siblings of the cleared
        # element to keep the in-memory tree from growing.
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context
    if len(items) == 0:
        return text
    return {k: v[0] if len(v) == 1 else v for k, v in items.items()}
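
# The parsing function above is shown only from mid-body. A minimal
# self-contained sketch of the streaming pattern it relies on (lxml
# iterparse plus the clear/getprevious cleanup); parse_users and its body
# are illustrative, only the 'user' tag and the cleanup idiom come from
# the fragment.
from lxml import etree

def parse_users(path):
    # Stream <user> elements without loading the whole file into memory.
    for action, elem in etree.iterparse(path, events=("end",), tag="user"):
        yield {child.tag: child.text for child in elem}
        elem.clear()
        # Delete already-processed siblings so the in-memory tree stays
        # small while iterating.
        while elem.getprevious() is not None:
            del elem.getparent()[0]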


def save_to_db(user):
    # Rename the top-level 'id' to the Mongo-specific '_id' before saving.
    user[0]["_id"] = user[0]["id"]
    db.collection.save(user[0])


if __name__ == "__main__":
    db = thtdb.ThtConnection(collectionName='test_1b')
    #db = thtdb.ThtConnection(host='squib.de')

    parse_xml("/Users/karinabunyik/BTSync/Data/twitter-pldebatt.xml")
    db.client.disconnect()