def save_to_collection():
    """Copy matching documents of ``raw_collection`` into per-topic collections.

    For every key of ``entity_pseudos`` the topic id is read from
    ``entity_topic`` and the collection named by
    ``TOPIC_COLLECTION_NAME(topic_id)`` is checked/created in
    ``TOPIC_TWEETS_DB_NAME``.  Then, for each value listed under that key,
    the documents returned by ``raw_collection.find({ENTITIES: value})`` are
    copied into that collection.

    Operates purely through module-level state; takes no arguments and
    returns nothing.
    """
    for lower_entity, pseudos in entity_pseudos.items():
        # Nothing to copy for this key: skip before touching entity_topic or
        # the database, exactly as an empty inner loop would.
        if not pseudos:
            continue

        # The destination depends only on the key, so resolve and
        # check/create it once per key rather than once per listed value.
        topic_id = entity_topic[lower_entity]
        coll_name = TOPIC_COLLECTION_NAME(topic_id)
        check_or_create_collection(TOPIC_TWEETS_DB_NAME, coll_name, Collection.TOPIC)
        coll = topic_db[coll_name]

        for entity in pseudos:
            copy_into_collection(raw_collection.find({ENTITIES: entity}), coll)
Esempio n. 2
0
def aggregate_urls(topic_ids):
    """Run the URL map/reduce job on the collection of each given topic.

    For every id in ``topic_ids`` the source collection
    (``TOPIC_COLLECTION_NAME``) and the output collection
    (``TOPIC_URL_AGGR_COLLECTION_NAME``) are checked/created in
    ``TOPIC_TWEETS_DB_NAME``; then ``MAP_FUNCTION`` / ``REDUCE_FUNCTION`` are
    run over the source collection with the output collection as target.

    NOTE(review): ``Collection.map_reduce`` was removed in PyMongo 4.0, so
    this presumably relies on PyMongo 3.x -- confirm the pinned version.

    :param topic_ids: iterable of topic ids to process.
    """
    for current_id in topic_ids:
        source_name = TOPIC_COLLECTION_NAME(current_id)
        output_name = TOPIC_URL_AGGR_COLLECTION_NAME(current_id)

        # Source collection first, then the output one.
        required = (
            (source_name, Collection.TOPIC),
            (output_name, Collection.URL_RESULT),
        )
        for name, kind in required:
            check_or_create_collection(TOPIC_TWEETS_DB_NAME, name, kind)

        topics_db[source_name].map_reduce(MAP_FUNCTION, REDUCE_FUNCTION, output_name)
Esempio n. 3
0
def aggregate_urls(topic_ids):
    """Aggregate URLs per topic through a MongoDB map/reduce run.

    Each id in ``topic_ids`` is mapped to a source collection name and an
    output collection name; both are checked/created in
    ``TOPIC_TWEETS_DB_NAME`` and the map/reduce job
    (``MAP_FUNCTION``, ``REDUCE_FUNCTION``) is run on the source collection,
    with the output collection name as its target.

    :param topic_ids: iterable of topic ids to process.
    """
    for tid in topic_ids:
        tweets_name, aggregated_name = (
            TOPIC_COLLECTION_NAME(tid),
            TOPIC_URL_AGGR_COLLECTION_NAME(tid),
        )

        # Input collection is handled before the output one.
        check_or_create_collection(
            TOPIC_TWEETS_DB_NAME, tweets_name, Collection.TOPIC)
        check_or_create_collection(
            TOPIC_TWEETS_DB_NAME, aggregated_name, Collection.URL_RESULT)

        tweets = topics_db[tweets_name]
        tweets.map_reduce(MAP_FUNCTION, REDUCE_FUNCTION, aggregated_name)
Esempio n. 4
0
import pytz

from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.miscellaneous import display_percentage
from utilities.mongo import check_or_create_collection, insert_many
from utilities.time_management import datetime, start_timing, stop_timing
from utilities.os_util import dirname, get_dir, get_files_in_dir


# Module-level setup: everything below runs once, at import time.

# dirname() of whatever get_dir(__file__) returns for this module.
ENGINE_ROOT = dirname(get_dir(__file__))
# NOTE(review): `join` is not imported explicitly in the visible imports;
# presumably it arrives through `from utilities.constants import *` -- confirm.
TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR)

# Check/create the temp raw collection before any handle to it is taken.
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)

# MongoClient() is called without arguments, i.e. with the driver defaults.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
# Handle on the temp raw collection checked/created above.
collection = db[TEMP_RAW_COLLECTION_NAME]


def is_retweet(tweet):
    """Return True when ``tweet`` contains the ``RETWEETED_STATUS`` key.

    :param tweet: mapping to inspect (presumably a decoded tweet dict --
        confirm against callers).
    :return: ``True`` if the key is present, ``False`` otherwise.
    """
    # Test membership on the mapping itself: on Python 2, going through
    # .keys() first builds a list of all keys and then scans it linearly.
    return RETWEETED_STATUS in tweet


def extract_data(file_path):  # load json file into a list of dictionaries
    tweets = []
    tweets_file = open(file_path, 'r')
    for line in tweets_file:
        try:
Esempio n. 5
0
from utilities.constants import *

from utilities.entities.Collection import Collection
from utilities.mongo import check_or_create_collection, copy_into_collection
from utilities.os_util import get_dir
from utilities.time_management import get_today, get_date_string, start_timing, stop_timing

# Module-level setup: everything below runs once, at import time.

ROOT = get_dir(__file__)
# NOTE(review): `join` is not imported explicitly in the visible imports;
# presumably it arrives through `from utilities.constants import *` -- confirm.
JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR)

# The date is captured once at import, so the two date-suffixed collection
# names below stay fixed for the lifetime of the process.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)
COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING
RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING

# Check/create the four collections this module uses: temp raw, today's raw,
# temp results and today's results (both result collections share the
# ENTITY_RESULT kind).
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME,
                           Collection.TEMP)
check_or_create_collection(RAW_TWEETS_DB_NAME, COLLECTION_NAME, Collection.RAW)
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RESULTS_COLLECTION_NAME,
                           Collection.ENTITY_RESULT)
check_or_create_collection(RAW_TWEETS_DB_NAME, RESULTS_COLLECTION_NAME,
                           Collection.ENTITY_RESULT)

# MongoClient() is called without arguments, i.e. with the driver defaults.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
# Handles on three of the collections checked/created above; no handle is
# taken here for RESULTS_COLLECTION_NAME.
coll = db[COLLECTION_NAME]
temp_raw = db[TEMP_RAW_COLLECTION_NAME]
temp_results = db[TEMP_RESULTS_COLLECTION_NAME]


def execute():
    print 'Started Entity Aggregation... ',
Esempio n. 6
0
from utilities.entities.Collection import Collection
from utilities.mongo import check_or_create_collection, copy_into_collection
from utilities.os_util import get_dir
from utilities.time_management import get_today, get_date_string, start_timing, stop_timing


# Module-level setup: everything below runs once, at import time.

ROOT = get_dir(__file__)
# NOTE(review): `join` and the *_DIR / *_NAME / *_PREFIX constants are not
# imported in the visible lines; presumably they come from an import above
# this excerpt -- confirm.
JAVASCRIPT_PATH = join(ROOT, JAVASCRIPT_DIR)


# The date is captured once at import, so the two date-suffixed collection
# names below stay fixed for the lifetime of the process.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)
COLLECTION_NAME = RAW_COLLECTION_PREFIX + TODAY_STRING
RESULTS_COLLECTION_NAME = ENTITY_RESULTS_COLLECTION_PREFIX + TODAY_STRING

# Check/create the four collections this module uses: temp raw, today's raw,
# temp results and today's results (both result collections share the
# ENTITY_RESULT kind).
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)
check_or_create_collection(RAW_TWEETS_DB_NAME, COLLECTION_NAME, Collection.RAW)
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT)
check_or_create_collection(RAW_TWEETS_DB_NAME, RESULTS_COLLECTION_NAME, Collection.ENTITY_RESULT)

# MongoClient() is called without arguments, i.e. with the driver defaults.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
# Handles on three of the collections checked/created above; no handle is
# taken here for RESULTS_COLLECTION_NAME.
coll = db[COLLECTION_NAME]
temp_raw = db[TEMP_RAW_COLLECTION_NAME]
temp_results = db[TEMP_RESULTS_COLLECTION_NAME]


def execute():
    print 'Started Entity Aggregation... ',

    start_timing()
Esempio n. 7
0
import pytz

from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.miscellaneous import display_percentage
from utilities.mongo import check_or_create_collection, insert_many
from utilities.time_management import datetime, start_timing, stop_timing
from utilities.os_util import dirname, get_dir, get_files_in_dir


# Import-time setup for this module: paths first, then the MongoDB handles.

# Result of applying dirname() to get_dir(__file__).
ENGINE_ROOT = dirname(get_dir(__file__))
# NOTE(review): no explicit import of `join` is visible; it presumably comes
# from the star import of utilities.constants -- confirm.
TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR)

# The temp raw collection is checked/created before a handle to it is taken.
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)

# No arguments are passed to MongoClient(), so the driver defaults apply.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
# Handle on the temp raw collection checked/created above.
collection = db[TEMP_RAW_COLLECTION_NAME]


def is_retweet(tweet):
    """Tell whether ``tweet`` carries the ``RETWEETED_STATUS`` key.

    :param tweet: mapping to inspect (presumably a decoded tweet dict --
        confirm against callers).
    :return: ``True`` if the key is present, ``False`` otherwise.
    """
    # Direct mapping membership avoids materialising the key list, which
    # .keys() does on Python 2 before the linear `in` scan.
    return RETWEETED_STATUS in tweet


def extract_data(file_path):  # load json file into a list of dictionaries
    tweets = []
    tweets_file = open(file_path, 'r')
    for line in tweets_file:
        try: