Example #1
0
# Imports restored here; the project-local module paths (defaults, db,
# db_summary, publisher, stats) are assumptions inferred from the names
# used below.
import argparse
import os
import time

import defaults
from db import DB
from db_summary import DBSummary
from publisher import Publisher
from stats import Stats, DAILY_STAT_TWEET_LIMIT


def main():
    parser = argparse.ArgumentParser(
        description='Draft stats for the given day and push to cloud for approval.')
    parser.add_argument('date', metavar='yyyy-mm-dd',
                        help='the date to process')

    args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, args.date)
    db_summary = DBSummary(environment)

    date_skey = db.get_date_skey(args.date)

    actions = [{'type': 'trends'}, {'type': 'mentions'}]

    action_ind = 0
    tweets = list()
    stat_tweet_count = 0
    # Iterate by index rather than with a for loop: `actions` is handed to
    # Stats, which may append further entries (the 'trenders' branch below
    # suggests it can add one per trend), so len(actions) is re-checked.
    while action_ind < len(actions):
        action = actions[action_ind]['type']
        stats = Stats(args.date, action, db, actions, environment,
                      actions[action_ind]['trend'] if action == 'trenders' else None)

        i = 100
        is_tweetable = True

        # Every action type drafts its tweet the same way; only 'trenders'
        # applies the postability check and the daily limit.
        tweet = stats.write_tweet(i)
        if action == "trenders":
            if (not stats.is_trenders_tweet_postable(tweet)
                    or stat_tweet_count >= DAILY_STAT_TWEET_LIMIT):
                is_tweetable = False

        # Guard against a None draft: write_tweet can return None (see the
        # check below before appending).
        if is_tweetable and tweet is not None:
            db_summary.save_tweet(tweet)
            stat_tweet_count += 1

        if tweet is not None:
            tweets.append(tweet)
        if len(tweets) >= 2:
            data = {'tweets': tweets, 'date': args.date}
            Publisher.publish(environment, data, 'draft')
            tweets = list()
            time.sleep(10)
        action_ind += 1

    db_summary.disconnect()

    # Now get app metrics
    rows = db.get_tweeter_category_counts()

    metric_dict = {'date': args.date, 'other': 0}
    for cat, count in rows:
        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict[cat] = count
        else:
            metric_dict['other'] += count

    # Get count of total tweets and tweets by category
    rows = db.get_tweeter_category_tweet_counts(date_skey)

    metric_dict['tweets_total'] = 0
    metric_dict['tweets_other'] = 0
    for cat, count in rows:
        metric_dict['tweets_total'] += count

        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict['tweets_' + cat] = count
        else:
            metric_dict['tweets_other'] += count

    # Add file sizes
    metric_dict['fact_db_size'] = os.path.getsize(environment.database)
    metric_dict['dim_db_size'] = os.path.getsize(environment.dimension_database)
    metric_dict['summ_db_size'] = os.path.getsize(environment.summary_database)

    followers_count = db.get_tweeter_followers_count('pakpolstats')
    metric_dict['account_followers'] = followers_count

    data = {'tweets': tweets, 'metrics': metric_dict, 'date': args.date}
    Publisher.publish(environment, data, 'draft')
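

if __name__ == '__main__':
    # Entry-point guard added here; the original snippet defines main() but
    # never invokes it.
    main()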
Example #2
0
# Imports restored here; the project-local module paths (db, db_summary,
# secommon, util) are assumptions inferred from the names used below.
import logging
import os
import re
from time import sleep

import twitter  # python-twitter

import secommon
from db import DB, Range
from db_summary import DBSummary
from util import today, file_timestamp  # hypothetical helper module

logger = logging.getLogger(__name__)


class Words:
    def __init__(self, environment, hashtag, tags_list):
        self.env = environment
        self.hashtag = hashtag

        self.boring_words = set()
        self.banned_tags = set()
        self.data = {}

        self.t_new = 0
        self.t_foreign = 0
        self.t_skip = 0
        self.t_retweet = 0

        self.ns_score_log = []

        self.retweets = []

        self.date = today()
        self.db = DB(environment, self.date)
        self.db_summary = DBSummary(environment)
        self.load_metadata()

        self.CONSUMER_KEY = self.env.consumer_key
        self.CONSUMER_SECRET = self.env.consumer_secret
        self.current_token = -1

        self.hash_tags_re = re.compile(r'(?i)(?<!\w)#[\w\u064b-\u0657]+',
                                       re.UNICODE)
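        # (Note on the pattern: it matches '#' followed by word characters or
        # Arabic diacritics U+064B-U+0657, and the (?<!\w) lookbehind keeps a
        # '#' in the middle of a word from starting a match.)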

        self.twitters = list()
        for token in self.db_summary.get_all_tokens():
            api = twitter.Api(consumer_key=self.CONSUMER_KEY,
                              consumer_secret=self.CONSUMER_SECRET,
                              access_token_key=token.key,
                              access_token_secret=token.secret,
                              sleep_on_rate_limit=True)
            self.twitters.append(api)

        self.score_names = False
        if hashtag == 'trends':
            if os.path.isfile('metadata/name_score.csv'):
                self.score_names = True
                logger.info(
                    "metadata/name_score.csv will be used for name scoring.")
            else:
                logger.warning(
                    "metadata/name_score.csv does not exist so name scoring is disabled."
                )

        self.batch_id = self.db.get_next_batch_id()
        self.baseline_tweet_id = self.db.get_baseline_tweet_id()

        self.today_skey = self.db.get_date_skey(self.date)
        self.loop_pos = -1

        self.all_trends = None
        if hashtag == 'trends':
            self.all_trends = self.db.get_trends()
            self.loop_pos = 0
            if tags_list is None:
                tags_list = []
                for (tag, result) in self.all_trends.items():
                    if result in ('AUTO_ADD', 'MAN_ADD'):
                        tags_list.append({'tag': tag})
            orig_tags_list = tags_list
            tags_list = []
            for tagdata in orig_tags_list:
                tags_list.append(
                    self.db.get_tag_ranges(tagdata['tag'],
                                           self.baseline_tweet_id))
            logger.info('Tags_list: %s', tags_list)
            self.pull_trends(tags_list)
            self.write_data()
        elif hashtag == 'home_timeline':
            status_count = self.pull_data(hashtag)
            logger.info('{} statuses pulled.'.format(status_count))
            self.write_data()
        elif hashtag == 'lists':
            lists = self.twitters[
                self.db_summary.polling_token_index].GetLists(
                    screen_name=self.env.polling_account)
            logger.info('{} lists for account {}.'.format(
                len(lists), self.env.polling_account))
            for lst in lists:
                status_count = self.pull_data(lst.slug)
                logger.info('{} statuses pulled for list {}.'.format(
                    status_count, lst.slug))
            self.write_data()

        self.db.disconnect()

    @property
    def api(self):
        self.current_token += 1
        if self.current_token >= len(self.twitters):
            self.current_token = 0
        return self.twitters[self.current_token]
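
    # Usage note (added): each read of `self.api` advances current_token with
    # wrap-around, so successive reads rotate requests across all configured
    # tokens: twitters[0], twitters[1], ..., back to twitters[0].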

    def pull_trend(self, trend, trend_count, trend_position):
        self.ns_score_log = []
        status_count = 0
        request_count = 0
        index = 0
        # Iterate by index: the loop may insert a new Range mid-iteration.
        while index < len(trend.ranges):
            id_range = trend.ranges[index]
            if not id_range.processed:
                max_id = id_range.max_id
                since_id = id_range.min_id
                statuses = None
                logger.info('Range: {:>9} {:35} {:20} {:20}'.format(
                    '{:4d}/{:4d}'.format(trend_position,
                                         trend_count), trend.name, since_id,
                    'None' if max_id is None else max_id))
                while statuses is None or len(statuses) >= 50:
                    if request_count >= 100:
                        # Per-call request budget exhausted: checkpoint the
                        # unfinished span as a new Range so the next call can
                        # resume where this one stopped.
                        new_range = Range(min_id=since_id, max_id=max_id)
                        id_range.min_id = max_id
                        trend.ranges.insert(index + 1, new_range)
                        return request_count, status_count, False

                    statuses = self.api.GetSearch(term=trend.name,
                                                  result_type='recent',
                                                  count=100,
                                                  include_entities=False,
                                                  max_id=max_id,
                                                  since_id=since_id)
                    self.get_words(statuses, trend=trend, source=trend.name)
                    status_count += len(statuses)
                    score = trend.get_average_score(10)
                    if len(statuses) > 0:
                        max_id = statuses[-1].id - 1
                    if id_range.max_id is None and len(statuses) > 0:
                        id_range.max_id = statuses[0].id
                    id_range.processed = True
                    request_count += 1

                    logger.info(
                        '{:40}  {:20} {:20} {:3} {:5} {:5.2f} {}'.format(
                            trend.name, since_id,
                            'None' if max_id is None else max_id,
                            request_count, trend.get_status_count(10), score,
                            trend.state))

                    if score < 0.0 and status_count > 150:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Negative')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    if score < 0.5 and status_count > 1000:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Hot_Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    if score > 2.0:
                        if trend.state not in ('AUTO_ADD', 'MAN_ADD'):
                            trend.state = 'AUTO_ADD'

                    # after 500 tweets if we still haven't got an indication, give up
                    if trend.get_status_count(
                            10) > 500 and trend.state == 'AUTO_DEL':
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    # Not needed for raspberry pi
                    if request_count % 100 == 0:
                        logger.info(
                            f'Sleeping 20 seconds at {request_count} requests.'
                        )
                        sleep(20)
            index += 1

        return request_count, status_count, True
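
    # (Added note: pull_trend returns (requests_used, statuses_pulled,
    # completed). completed=False means the 100-request budget ran out after a
    # resume Range was checkpointed, so the caller should invoke it again.)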

    def pull_trends(self, trends):
        total_status_count = 0
        last_write = 0
        total_request_count = 0
        trend_count = len(trends)
        for i, trend in enumerate(trends):
            completed = False
            while not completed:
                request_count, status_count, completed = self.pull_trend(
                    trend, trend_count, i + 1)
                total_request_count += request_count
                total_status_count += status_count
                self.db.tag_history.append(trend)
                if total_request_count >= last_write + 20:
                    self.write_data()
                    self.batch_id += 1
                    last_write = total_request_count
        return total_status_count

    def load_metadata(self):
        # One entry per line; kept as sets for O(1) membership tests.
        with open('metadata/boring.txt', 'r') as f:
            for line in f:
                self.boring_words.add(line.rstrip())

        with open('metadata/banned_tags.txt', 'r') as f:
            for line in f:
                self.banned_tags.add(line.rstrip())

    def cleanup_exit(self):
        logger.info("Deleting batch %i" % self.batch_id)
        self.db.delete_batch(self.batch_id)
        self.db.disconnect()
        exit(1)

    def twitter_search(self, q, sinceid, maxid):
        # Token rotation is delegated to the `api` property; max_id is only
        # passed when the caller supplies one.
        kwargs = dict(term=q,
                      result_type='recent',
                      count='100',
                      include_entities='false',
                      since_id=sinceid)
        if maxid is not None:
            kwargs['max_id'] = maxid
        return self.api.GetSearch(**kwargs)

    # def oauthReq(self, url, key, secret, http_method="GET", post_body='',
    #              http_headers=None):
    #     consumer = oauth.Consumer(key=self.CONSUMER_KEY, secret=self.CONSUMER_SECRET)
    #     if self.hashtag == 'home_timeline':
    #         token = self.db.getDefaultToken()
    #     else:
    #         token = self.db.getNextToken()
    #     client = oauth.Client(consumer, token)
    #     resp, content = client.request(
    #         url,
    #         method=http_method,
    #         body=post_body,
    #         headers=http_headers  # , force_auth_header=True
    #     )
    #     #	print "*** %s ***" % content
    #     #	exit()
    #     return content

    def write_data(self):
        logger.info("Writing data.")

        for tweetdate, day_data in self.data.items():
            logger.info("Saving data for %s." % tweetdate)
            for tag, words in day_data['tag_words'].items():
                for word, count in words.items():
                    self.db.write_hashtag_word(tweetdate, tag, word, count)

            for tweeter, words in day_data['tweeter_words'].items():
                for word, count in words.items():
                    self.db.write_tweeter_word(tweetdate, tweeter, word, count)

            for tweeter, words in day_data['tweeter_mentions'].items():
                for mentioned, count in words.items():
                    self.db.write_tweeter_mention(tweetdate, tweeter,
                                                  mentioned, count)

            for tag, count in day_data['tags'].items():
                self.db.write_tag(tweetdate, tag, count)

            for tag, tweeters in day_data['tag_tweeters'].items():
                for tweeter, count in tweeters.items():
                    self.db.write_tag_tweeter(tweetdate, tag, tweeter, count)

            for tag, tags in day_data['tag_tags'].items():
                for tag2, count in tags.items():
                    self.db.write_tag_tag(tweetdate, tag, tag2, count)

        self.db.write_tweets()
        self.db.write_tag_history()

        self.db.commit()
        logger.info("Data saved.")
        self.data = {}

    @staticmethod
    def save_score_log(score_log, trend, reject_reason):
        filename = "log/reject_%s_%s_%s.log" % (trend, reject_reason,
                                                file_timestamp())
        secommon.save_list(score_log, filename)

    def calculate_name_score(self, status, trend):
        tweeter = status.user.screen_name
        tweeter_name = status.user.name
        score_candidate = (tweeter_name if status.retweeted_status is None else
                           status.retweeted_status.user.name)
        score_candidate_sn = (tweeter if status.retweeted_status is None else
                              status.retweeted_status.user.screen_name)
        trend.name_scores[-1].status_count += 1
        name_score = self.db.get_name_score(score_candidate,
                                            score_candidate_sn,
                                            status.user.location,
                                            status.user.time_zone)
        trend.name_scores[-1].total_score += name_score
        score3 = '{:.2f}'.format(trend.get_average_score(3))
        score6 = '{:.2f}'.format(trend.get_average_score(6))
        self.ns_score_log.append([
            score_candidate, score_candidate_sn,
            trend.name_scores[-1].status_count, name_score,
            trend.name_scores[-1].total_score, score3, score6, tweeter_name,
            tweeter, status.id
        ])

    def process_status_words(self, status_id, status_date, status_text,
                             tweeter):
        if status_date not in self.data:
            self.data[status_date] = {
                'tweeter_mentions': {},
                'tag_words': {},
                'tweeter_words': {},
                'tags': {},
                'tag_tweeters': {},
                'tag_tags': {},
            }

        # Two hashtag extractions: `relevant_hashtags` lower-cases and matches
        # ASCII tags only, while `hash_tags_re` also matches Unicode tags.
        relevant_hashtags = re.findall(r'(?<![A-Za-z0-9_])#([A-Za-z0-9_]+)',
                                       status_text.lower())

        tweet_hashtags = set(self.hash_tags_re.findall(status_text))
        tweet_tags = [ht[1:] for ht in tweet_hashtags]

        for tag in tweet_tags:
            tags = self.data[status_date]['tags']
            tags[tag] = tags.get(tag, 0) + 1

            tag_tweeters = self.data[status_date]['tag_tweeters'].setdefault(
                tag, {})
            tag_tweeters[tweeter] = tag_tweeters.get(tweeter, 0) + 1

            # Co-occurrence counts for every other tag in the same tweet.
            tag_tags = self.data[status_date]['tag_tags'].setdefault(tag, {})
            for tag2 in tweet_tags:
                if tag2 != tag:
                    tag_tags[tag2] = tag_tags.get(tag2, 0) + 1

        # remove links
        text = re.sub(r"(?<![A-Za-z0-9_])https?://[^ ,;'()\[\]<>{}]+",
                      '',
                      status_text,
                      flags=re.IGNORECASE)

        alist = re.split('[, .;\'\"(){\}\[\]<>:?/=+\\\`~!#^&*\\r\\n\-]+', text)
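        # ('#' is in the delimiter class above, so hashtags are already split
        # apart here; '@' is not, so mentions survive for the branch below.)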
        tweetwords = list()
        for item in alist:
            nitem = item.strip(' ,.-+()[]:\'\"').lower()
            if u"\u2026" in nitem:  # ignore words truncated with ellipsis (...)
                continue
            if nitem == '':
                continue
            if nitem in self.boring_words:
                continue
            if nitem[:1] == '@' and len(nitem) > 2:
                # Tweeter mentions
                mentions = self.data[status_date][
                    'tweeter_mentions'].setdefault(tweeter, {})
                mentioned = nitem[1:]
                mentions[mentioned] = mentions.get(mentioned, 0) + 1
                continue

            tweetwords.append(nitem)
            for tag in relevant_hashtags:
                tag_words = self.data[status_date]['tag_words'].setdefault(
                    tag, {})
                tag_words[nitem] = tag_words.get(nitem, 0) + 1

            # Tweeter words (tracked only for rated tweeters)
            if tweeter.lower() in self.db.rated_tweeters:
                tweeter_words = self.data[status_date][
                    'tweeter_words'].setdefault(tweeter, {})
                tweeter_words[nitem] = tweeter_words.get(nitem, 0) + 1

        tweet_words_text = u'~' + u'~'.join([
            self.db.get_word_skey(x, self.date)[1]
            for x in sorted(set(tweetwords))
        ]) + u'~'
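        # (Added note: the stored form wraps and joins the per-word keys from
        # get_word_skey with '~', e.g. '~12~57~301~' with illustrative values,
        # presumably so a single LIKE '%~<key>~%' can match any word.)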
        self.db.update_tweet_words(status_id, tweet_words_text)

    def get_words(self, statuses, trend=None, source=None):
        for status in statuses:
            tweeter = status.user.screen_name
            tweeter_name = status.user.name
            tweeter_created_at = self.env.get_local_date(
                status.user.created_at)
            tweeter_skey = self.db.get_tweeter_skey(
                screen_name=tweeter,
                name=tweeter_name,
                followers_count=status.user.followers_count,
                friends_count=status.user.friends_count,
                lang=status.user.lang,
                time_zone=status.user.time_zone,
                verified=status.user.verified,
                statuses_count=status.user.statuses_count,
                profile_image_url=status.user.profile_image_url,
                created_at=tweeter_created_at,
                location=status.user.location)

            tweet_text = status.text
            retweet_id = status.retweeted_status.id if status.retweeted_status is not None else 0

            if retweet_id != 0:
                tweet_text = "RT " + status.retweeted_status.user.screen_name + ": " + \
                             status.retweeted_status.text

            if self.score_names:
                self.calculate_name_score(status, trend)

            if (status.user.followers_count > 0
                    and tweeter.lower() in self.db.rated_tweeters):
                self.db.write_daily_followers(tweeter_skey, self.today_skey,
                                              status.user.followers_count)

            retweet_created_at = ''
            retweet_screen_name = ''
            retweet_count = status.retweet_count
            if status.retweeted_status is not None:
                self.retweets.append(status.retweeted_status)

                retweet_created_at = self.env.get_local_timestamp(
                    status.retweeted_status.created_at)
                retweet_screen_name = status.retweeted_status.user.screen_name

            # check if duplicate and insert if not duplicate
            status_date = self.env.get_local_date(status.created_at)
            status_created_at = self.env.get_local_timestamp(status.created_at)
            date_skey = self.db.get_date_skey(status_date)
            if self.db.tweet_is_duplicate(
                    id_=status.id,
                    created_at=status_created_at,
                    screen_name=status.user.screen_name,
                    text=tweet_text,
                    tweeter_skey=tweeter_skey,
                    retweet_count=retweet_count,
                    in_reply_to_status_id=status.in_reply_to_status_id,
                    date_skey=date_skey,
                    retweet_id=retweet_id,
                    retweet_created_at=retweet_created_at,
                    retweet_screen_name=retweet_screen_name,
                    batch_id=self.batch_id,
                    source=source):
                self.t_skip += 1
                continue

            self.process_status_words(status_id=status.id,
                                      status_date=status_date,
                                      status_text=tweet_text,
                                      tweeter=tweeter)

    # @timeout(7)
    def pull_data(self, list_name):
        since_id = self.db.get_baseline_tweet_id()
        max_id = None
        all_statuses = []
        statuses = None
        while statuses is None or len(statuses) > 0:
            if list_name == 'home_timeline':
                statuses = self.twitters[
                    self.db_summary.default_token_index].GetHomeTimeline(
                        count=200,
                        since_id=since_id,
                        max_id=max_id,
                        include_entities=False)
            else:
                statuses = self.twitters[
                    self.db_summary.polling_token_index].GetListTimeline(
                        owner_screen_name=self.env.polling_account,
                        slug=list_name,
                        count=200,
                        since_id=since_id,
                        max_id=max_id,
                        include_entities=False)

            if len(statuses) > 0:
                self.get_words(statuses, source=list_name)
                all_statuses.extend(statuses)
                max_id = statuses[-1].id - 1
                logger.info('{}  {}'.format(statuses[-1].id, len(statuses)))

        if len(all_statuses) > 0:
            max_id = max([status.id for status in all_statuses])
            min_id = min([status.id for status in all_statuses])
            self.db.write_list_max_id(list_name, max_id, min_id)

        return len(all_statuses)
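

# A minimal usage sketch (hypothetical; `defaults` comes from Example #1 and
# the constructor does all the work as a side effect):
#
#     env = defaults.get_environment()
#     Words(env, 'trends', None)         # discover, score and pull trends
#     Words(env, 'home_timeline', None)  # pull the authenticated timeline
#     Words(env, 'lists', None)          # pull every list on the account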
Example #3
0
# Imports restored here; the project-local module paths (db, db_copy, errors,
# util) are assumptions inferred from the names used below.
import logging
import os
import sqlite3
from datetime import datetime, timedelta

from db import DB
from db_copy import DBCopy
from errors import ArchiveFileExistsError
from util import yesterday  # hypothetical helper module

logger = logging.getLogger(__name__)


class NewDB:
    def __init__(self, environment, date):
        self.environment = environment
        old_date = yesterday(date)
        archive_file = environment.database_old.format(
            old_date.replace('-', '_'))
        if os.path.isfile(archive_file):
            raise ArchiveFileExistsError()

        self.old_db = DB(environment, old_date)

        # Don't connect to the new db via the regular helper: the global
        # connection in db.py must stay pointed at the main database.
        conn2 = sqlite3.connect(environment.database_temp)
        self.new_c = conn2.cursor()

        db_copy = DBCopy(self.old_db.c, self.new_c)

        if date == 'empty':
            self.old_db.disconnect()
            conn2.commit()
            conn2.close()
            return

        date_dt = datetime.strptime(date, '%Y-%m-%d')
        three_days_ago = (date_dt - timedelta(days=3)).strftime('%Y-%m-%d')
        recent_date = (date_dt - timedelta(days=7)).strftime('%Y-%m-%d')

        logger.info('Recent date: [' + recent_date + ']')

        date_skey = self.old_db.get_date_skey(date)

        # Special case for db_baseline
        baseline_id = 0
        logger.info('db_baseline')
        cnt = 0
        sql = """select max(id) from fact_status where date_skey < ?"""
        t = (date_skey, )
        self.old_db.c.execute(sql, t)
        row = self.old_db.c.fetchone()
        # max() yields a single row holding NULL when no rows qualify, so
        # check the value as well as the row.
        if row is not None and row[0] is not None:
            cnt += 1
            baseline_id = row[0]
            t = (row[0], date)
            self.new_c.execute(
                'insert into db_baseline (min_tweet_id, min_date) values (?, ?)',
                t)
        logger.info(f'{cnt} rows')

        # The remaining tables
        sql = """select {} from tag_history th
                 where max_id >= ?"""
        t = (baseline_id, )
        db_copy.copy_table('tag_history', sql, t)

        sql = """select {} from (select d.*
            from tag_discovery d
            left join tag_score s
            on d.tag = s.tag
            group by d.tag, d.result, d.discovery_time
            having discovery_time >= ? or sum(s.tweet_count) > ?)"""
        t = (three_days_ago, 50)
        db_copy.copy_table('tag_discovery', sql, t)

        sql = """select {}
            from tag_score
            where score_time >= ?"""
        t = (three_days_ago, )
        db_copy.copy_table('tag_score', sql, t)

        # Daily fact tables: copy everything on or after the current date.
        for table in ('fact_daily_followers',
                      'fact_daily_hashtag',
                      'fact_daily_hashtag_hashtag',
                      'fact_daily_hashtag_tweeter',
                      'fact_daily_hashtag_word',
                      'fact_daily_tweeter_mention',
                      'fact_daily_tweeter_word'):
            sql = f'select {{}} from {table} where date_skey >= ?'
            db_copy.copy_table(table, sql, (date_skey, ))

        # FACT_STATUS: yesterday's rows that were retweeted
        sql = 'select {} from fact_status where date_skey = ? and retweeted is not null'
        t = (date_skey - 1, )
        db_copy.copy_table('fact_status', sql, t)

        # FACT_STATUS
        sql = 'select {} from fact_status where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_status', sql, t)

        self.old_db.disconnect()
        conn2.commit()
        conn2.close()

        db_copy.switch_files(current_file=environment.database,
                             archive_file=archive_file,
                             new_file=environment.database_temp)
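

# A minimal usage sketch (hypothetical date value): roll the fact database
# over to a new day; ArchiveFileExistsError signals it has already run.
#
#     try:
#         NewDB(defaults.get_environment(), '2020-01-15')
#     except ArchiveFileExistsError:
#         logger.info('Archive already exists; skipping roll-over.')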