Example #1
0
Save the LDA model.
"""
from os.path import join

from gensim import corpora, models

from utilities.config import NUMBER_OF_TOPICS, NUMBER_OF_PASSES, ALPHA
from utilities.constants import *
from utilities.os_util import get_dir
from utilities.time_management import start_timing, stop_timing, get_time, get_today, get_date_string


# Date stamp embedded in the names of the dictionary, corpus and model files.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)

# Every artifact path is <ROOT>/<DATA_DIR>/<prefix><date string><suffix>.
# NOTE(review): get_dir(__file__) presumably yields this module's directory -
# confirm against utilities.os_util.
ROOT = get_dir(__file__)
DICTIONARY_PATH = join(ROOT, DATA_DIR, DICTIONARY_PREFIX + TODAY_STRING + DICT)
CORPUS_PATH = join(ROOT, DATA_DIR, CORPUS_PREFIX + TODAY_STRING + MM)
LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA)

# Both objects are created at import time from today's corpus and dictionary
# paths, so importing this module touches those files.
CORPUS = corpora.MmCorpus(CORPUS_PATH)
DICTIONARY = corpora.Dictionary.load(DICTIONARY_PATH)


def execute():

    print 'Started LDA at ' + get_time() + '... ',

    start_timing()

    lda = models.LdaModel(CORPUS, id2word=DICTIONARY,
Example #2
0
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener

from utilities.config import *
from utilities.constants import *
from utilities.miscellaneous import is_json
from utilities.os_util import get_dir
from utilities.time_management import get_time

# Starts at the configured interval.
# NOTE(review): presumably the tweet count at which progress is next
# reported - confirm where display_number is updated.
display_number = DISPLAY_COMPLETED_TWEETS_INTERVAL

# Mutable module-level state; file_number is rebound through `global` in
# change_file().
# NOTE(review): the two counters look like per-file and overall tweet
# totals - confirm against the stream listener.
file_number = 1
tweets_cnt = 0
total_tweets_cnt = 0

ROOT = get_dir(__file__)

# Directories passed to get_filename(): DATA_DIR and TEMP_DIR under ROOT.
FILE_PATH = join(ROOT, DATA_DIR)
TEMP_PATH = join(ROOT, TEMP_DIR)


def get_filename(directory, number):
    """Build the path of the numbered data file inside ``directory``.

    The file name is DATA_FILE_PREFIX, followed by ``number`` rendered
    through FILE_NAME_FORMATTER, followed by the JSON suffix.
    """
    numbered_part = FILE_NAME_FORMATTER % number
    base_name = DATA_FILE_PREFIX + numbered_part + JSON
    return join(directory, base_name)


def change_file():
    global tweets_file, file_name, file_number

    tweets_file.close()
    rename(file_name, get_filename(TEMP_PATH,
Example #3
0
import json
from os import remove
from os.path import join
import pytz

from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.miscellaneous import display_percentage
from utilities.mongo import check_or_create_collection, insert_many
from utilities.time_management import datetime, start_timing, stop_timing
from utilities.os_util import dirname, get_dir, get_files_in_dir


# TEMP_PATH is <parent of this module's directory>/<EXTRACTOR_DIR>/<TEMP_DIR>.
# NOTE(review): presumably where the extractor leaves finished tweet files -
# confirm against the extractor module.
ENGINE_ROOT = dirname(get_dir(__file__))
TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR)

# Runs at import time, before the collection handle below is taken.
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)

# MongoClient() with no arguments uses pymongo's default host and port.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
collection = db[TEMP_RAW_COLLECTION_NAME]


def is_retweet(tweet):
    """Return True when ``tweet`` carries the RETWEETED_STATUS key.

    :param tweet: mapping holding one tweet (assumed to be a dict decoded
        from JSON - TODO confirm against the caller).
    """
    # Direct membership is a hash lookup; on Python 2 `.keys()` would first
    # build a list of every key and then scan it linearly.
    return RETWEETED_STATUS in tweet


def extract_data(file_path):  # load json file into a list of dictionaries
    tweets = []
from string import Template

from gensim import models
from pytagcloud import create_tag_image, create_html_data, make_tags, \
    LAYOUT_HORIZONTAL, LAYOUTS
from pytagcloud.colors import COLOR_SCHEMES
from pytagcloud.lang.counter import get_tag_counts

from utilities.constants import *
from utilities.os_util import get_dir
from utilities.time_management import get_prev_day, start_timing, stop_timing, get_today, get_date_string, get_time, get_differenced_day

# Date stamp embedded in the name of the LDA model file.
TODAY = get_today()
TODAY_STRING = get_date_string(TODAY)

# PROJECT_ROOT is get_dir applied twice more on top of ROOT.
# NOTE(review): presumably two directory levels above this module - confirm
# get_dir's behaviour when given a directory path.
ROOT = get_dir(__file__)
PROJECT_ROOT = get_dir(get_dir(ROOT))

LDA_PATH = join(ROOT, DATA_DIR, LDA_MODEL_PREFIX + TODAY_STRING + LDA)
MODEL_DATA_PATH = join(ROOT, MODEL_DATA_DIR)
WORDCLOUD_PATH = join(PROJECT_ROOT, WEBSITE_DIR, STATIC_DIR, WORDCLOUD_DIR)

# Loaded at import time from today's model path.
LDA_MODEL = models.LdaModel.load(LDA_PATH)


def normalize(arr):
    """Scale the entries of ``arr`` in place so that they sum to 1.

    Each element is replaced by its share of the total. The total is
    converted to float so that integer inputs are not truncated by
    Python 2's integer division.

    :param arr: mutable, indexable sequence of numbers; modified in place.
    :raises ZeroDivisionError: if ``arr`` is non-empty and its total is 0.
    """
    # The builtin sum() replaces a manual accumulator that shadowed it.
    total = float(sum(arr))
    for index, value in enumerate(arr):
        arr[index] = value / total
Example #5
0
import json
from os import remove
from os.path import join
import pytz

from pymongo import MongoClient

from utilities.constants import *
from utilities.entities.Collection import Collection
from utilities.miscellaneous import display_percentage
from utilities.mongo import check_or_create_collection, insert_many
from utilities.time_management import datetime, start_timing, stop_timing
from utilities.os_util import dirname, get_dir, get_files_in_dir


# TEMP_PATH is <parent of this module's directory>/<EXTRACTOR_DIR>/<TEMP_DIR>.
# NOTE(review): presumably where the extractor leaves finished tweet files -
# confirm against the extractor module.
ENGINE_ROOT = dirname(get_dir(__file__))
TEMP_PATH = join(ENGINE_ROOT, EXTRACTOR_DIR, TEMP_DIR)

# Runs at import time, before the collection handle below is taken.
check_or_create_collection(RAW_TWEETS_DB_NAME, TEMP_RAW_COLLECTION_NAME, Collection.TEMP)

# MongoClient() with no arguments uses pymongo's default host and port.
client = MongoClient()
db = client[RAW_TWEETS_DB_NAME]
collection = db[TEMP_RAW_COLLECTION_NAME]


def is_retweet(tweet):
    """Return True when ``tweet`` carries the RETWEETED_STATUS key.

    :param tweet: mapping holding one tweet (assumed to be a dict decoded
        from JSON - TODO confirm against the caller).
    """
    # Direct membership is a hash lookup; on Python 2 `.keys()` would first
    # build a list of every key and then scan it linearly.
    return RETWEETED_STATUS in tweet


def extract_data(file_path):  # load json file into a list of dictionaries
    tweets = []
Example #6
0
Total 24*60 data points
Stores in a .tsv file
"""
import os
from os.path import join

import pytz
from pymongo import MongoClient

from utilities.config import DAY_START, APPROXIMATION_RANGE, NUMBER_OF_TOPICS
from utilities.constants import *
from utilities.os_util import get_dir, get_files_in_dir
from utilities.time_management import get_today, get_prev_day, get_datetime_from_string, get_next_day, \
    get_date_time_string, convert_datetime_to_local, localize_datetime, get_differenced_day

# PROJECT_ROOT is get_dir applied three times to this file's path.
# NOTE(review): presumably three directory levels up - confirm get_dir's
# behaviour when given a directory path.
PROJECT_ROOT = get_dir(get_dir(get_dir(__file__)))
TSV_DIR_PATH = join(PROJECT_ROOT, WEBSITE_DIR, STATIC_DIR, TSV_DIR)
# NOTE(review): the offset -16 is hard-coded and the previous expression is
# left commented out at the end of the line; this looks like a temporary
# override - confirm the intended start date.
START_DATE = get_differenced_day(get_today(), -16) #get_prev_day(get_today())
# NOTE(review): this print runs on every import of the module (Python 2
# print statement); it looks like leftover debugging output.
print START_DATE
# MongoClient() with no arguments uses pymongo's default host and port.
client = MongoClient()
topics_db = client['tweets']

# Module-level list, empty at import time.
entities = []


def remove_previous_data():
    """Delete the files that get_files_in_dir reports for the TSV directory.

    NOTE(review): get_files_in_dir(TSV_DIR_PATH, TSV) presumably returns the
    bare names of files carrying the TSV extension - confirm in
    utilities.os_util.
    """
    for stale_name in get_files_in_dir(TSV_DIR_PATH, TSV):
        stale_path = join(TSV_DIR_PATH, stale_name)
        os.remove(stale_path)