Example #1
def main():
    environment = defaults.get_environment()
    db = DB(environment, today())

    _ = FindBotsBehaviour(environment, db)

    db.disconnect()
Example #2
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)

    api = TwitterAPI(environment, db_summary)

    commands = db.get_commands(screen_names=db_summary.get_account_screen_names())
    processed_commands = db_summary.get_processed_commands(since_id=db.get_baseline_tweet_id())

    for command in commands:
        if command.id in processed_commands:
            logger.info(f'Skipping {command.id}. Already processed: {command.text}')
        else:
            m = re.match(r'\+([a-zA-Z0-9_]+) ([A-Z][AB]?)( t ([0-9]+))?( dl ([0-9]+))?', command.text)
            if m:
                screen_name = m.group(1)
                category = m.group(2)
                rt_threshold = m.group(4)
                rt_daily_limit = m.group(6)

                db.set_tweeter_category(screen_name=screen_name,
                                        category=category,
                                        rt_threshold=rt_threshold,
                                        rt_daily_limit=rt_daily_limit)

                status_text = f'+{screen_name} set to {category}'
                if rt_threshold is not None:
                    status_text += f' rt threshold {rt_threshold}'
                if rt_daily_limit is not None:
                    status_text += f' dl {rt_daily_limit}'
                save_command(command, status_text, db_summary, api.polling_api())
            elif command.text.lower()[:5] == 'add #':
                tag_name = command.text[5:]
                logger.info(f'Adding {tag_name}')
                call(['python3.7', 'words.py', tag_name])  # list form avoids shell injection via tag_name
                tag = db.get_tag_ranges(tag=f'#{tag_name}', min_override=db.get_baseline_tweet_id())
                print(tag.name_scores)
                name_score = tag.name_scores[-2] if len(tag.name_scores) > 1 else None
                score_text = '{} / {} = {:.1f}'.format(name_score.total_score,
                                                       name_score.status_count,
                                                       name_score.total_score / max(name_score.status_count, 1)
                                                       ) if name_score is not None else ''
                status_text = f'-{tag_name} added. {score_text} {tag.state}'
                save_command(command, status_text, db_summary, api.polling_api())
            else:
                if command.text[:2] not in ('To', 'RT'):
                    logger.info(f'Unknown command {command.id}: {command.text}')

    db_summary.disconnect()
    db.disconnect()
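
A quick note on the command syntax in Example #2: commands take the form "+screen_name CATEGORY [t threshold] [dl daily_limit]". Below is a minimal sketch of how the regex above parses such a command; the pattern is copied from Example #2, and the sample command text is invented for illustration.

import re

# Groups 4 and 6 capture the optional numeric settings; they are None when absent.
pattern = re.compile(r'\+([a-zA-Z0-9_]+) ([A-Z][AB]?)( t ([0-9]+))?( dl ([0-9]+))?')

m = pattern.match('+some_account C t 5 dl 20')
if m:
    print(m.group(1))  # some_account  (screen name)
    print(m.group(2))  # C             (category)
    print(m.group(4))  # 5             (optional retweet threshold)
    print(m.group(6))  # 20            (optional retweet daily limit)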
Example #3
def main():
    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)

    tl = TwitterList(environment, db, db_summary)

    tl.add_to_lists()
    tl.remove_from_lists()

    db.disconnect()
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='Find bots based on activity on the given day.')
    parser.add_argument('date',
                        metavar='yyyy-mm-dd',
                        help='the date to process')

    args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, args.date)

    date_to = args.date

    _ = FindBots(db, date_to)

    db.disconnect()
Example #5
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, today())

    _ = DraftTrends(environment, db)
Example #6
def main():
    environment = defaults.get_environment()
    db = DB(environment, today())

    start_date = str(datetime.date.today() - timedelta(days=7))
    end_date = str(datetime.date.today() - timedelta(days=1))
    logger.info(f'Dates: {start_date} {end_date}')
    words = {}
    rows = db.get_top_hashtags(start_date, end_date)
    for row in rows:
        words[row[0].lower()] = row[0]

    logger.info(f'{len(words)} words')

    for word, hashtag in words.items():
        db.set_word_hashtag(word, hashtag)
        logger.debug(f'{word:>30} {hashtag}')

    db.disconnect()
Example #8
class Words:
    def __init__(self, environment, hashtag, tags_list):
        self.env = environment
        self.hashtag = hashtag

        self.boring_words = {}
        self.banned_tags = {}
        self.data = {}

        self.t_new = 0
        self.t_foreign = 0
        self.t_skip = 0
        self.t_retweet = 0

        # self.ns_tweet_count = []
        # self.ns_total_score = []
        self.ns_score_log = []
        # self.ns_index = -1

        self.retweets = []

        self.date = today()
        self.db = DB(environment, self.date)
        # self.c = self.db.connect(self.date)
        self.db_summary = DBSummary(environment)
        self.load_metadata()

        self.CONSUMER_KEY = self.env.consumer_key
        self.CONSUMER_SECRET = self.env.consumer_secret
        self.current_token = -1

        self.hash_tags_re = re.compile(r'(?i)(?<!\w)#[\w\u064b-\u0657]+',
                                       re.UNICODE)

        self.twitters = list()
        for token in self.db_summary.get_all_tokens():
            api = twitter.Api(consumer_key=self.CONSUMER_KEY,
                              consumer_secret=self.CONSUMER_SECRET,
                              access_token_key=token.key,
                              access_token_secret=token.secret,
                              sleep_on_rate_limit=True)
            self.twitters.append(api)

        self.today_skey = 0

        self.score_names = False
        if hashtag == 'trends':
            if os.path.isfile('metadata/name_score.csv'):
                self.score_names = True
                logger.info(
                    "metadata/name_score.csv will be used for name scoring.")
            else:
                logger.info(
                    "Warning: metadata/name_score.csv does not exist so name scoring is disabled."
                )

        self.batch_id = self.db.get_next_batch_id()
        self.baseline_tweet_id = self.db.get_baseline_tweet_id()

        self.today_skey = self.db.get_date_skey(self.date)
        self.loop_pos = -1

        self.all_trends = None
        if hashtag == 'trends':
            self.all_trends = self.db.get_trends()
            self.loop_pos = 0
            if tags_list is None:
                tags_list = []
                for (tag, result) in self.all_trends.items():
                    if result in ('AUTO_ADD', 'MAN_ADD'):
                        tags_list.append({'tag': tag})
            orig_tags_list = tags_list
            tags_list = []
            for tagdata in orig_tags_list:
                tags_list.append(
                    self.db.get_tag_ranges(tagdata['tag'],
                                           self.baseline_tweet_id))
            print('Tags_list:', tags_list)
            self.pull_trends(tags_list)
            self.write_data()
        elif hashtag == 'home_timeline':
            status_count = self.pull_data(hashtag)
            logger.info('{} statuses pulled.'.format(status_count))
            self.write_data()
        elif hashtag == 'lists':
            lists = self.twitters[
                self.db_summary.polling_token_index].GetLists(
                    screen_name=self.env.polling_account)
            logger.info('{} lists for account {}.'.format(
                len(lists), self.env.polling_account))
            for l in lists:
                status_count = self.pull_data(l.slug)
                logger.info('{} statuses pulled for list {}.'.format(
                    status_count, l.slug))
            self.write_data()

        self.db.disconnect()

    @property
    def api(self):
        # Rotate round-robin through the authenticated clients so requests
        # are spread evenly across all available tokens.
        self.current_token += 1
        if self.current_token >= len(self.twitters):
            self.current_token = 0
        return self.twitters[self.current_token]

    def pull_trend(self, trend, trend_count, trend_position):
        self.ns_score_log = []
        status_count = 0
        request_count = 0
        index = 0
        while index < len(trend.ranges):
            # for id_range in trend.ranges:
            id_range = trend.ranges[index]
            if not id_range.processed:
                max_id = id_range.max_id
                since_id = id_range.min_id
                statuses = None
                logger.info('Range: {:>9} {:35} {:20} {:20}'.format(
                    '{:4d}/{:4d}'.format(trend_position,
                                         trend_count), trend.name, since_id,
                    'None' if max_id is None else max_id))
                while statuses is None or len(statuses) >= 50:
                    if request_count >= 100:
                        new_range = Range(min_id=since_id, max_id=max_id)
                        id_range.min_id = max_id
                        trend.ranges.insert(index + 1, new_range)
                        return request_count, status_count, False

                    statuses = self.api.GetSearch(term=trend.name,
                                                  result_type='recent',
                                                  count=100,
                                                  include_entities=False,
                                                  max_id=max_id,
                                                  since_id=since_id)
                    self.get_words(statuses, trend=trend, source=trend.name)
                    status_count += len(statuses)
                    score = trend.get_average_score(10)
                    if len(statuses) > 0:
                        max_id = statuses[-1].id - 1
                    if id_range.max_id is None and len(statuses) > 0:
                        id_range.max_id = statuses[0].id
                    id_range.processed = True
                    request_count += 1

                    logger.info(
                        '{:40}  {:20} {:20} {:3} {:5} {:5.2f} {}'.format(
                            trend.name, since_id,
                            'None' if max_id is None else max_id,
                            request_count, trend.get_status_count(10), score,
                            trend.state))

                    if score < 0.0 and status_count > 150:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Negative')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    if score < 0.5 and status_count > 1000:
                        trend.state = 'AUTO_DEL'
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Hot_Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    if score > 2.0:
                        if trend.state not in ('AUTO_ADD', 'MAN_ADD'):
                            trend.state = 'AUTO_ADD'

                    # after 500 tweets if we still haven't got an indication, give up
                    if trend.get_status_count(
                            10) > 500 and trend.state == 'AUTO_DEL':
                        self.save_score_log(self.ns_score_log, trend.name,
                                            'Ambiguous')
                        id_range.min_id = max_id
                        return request_count, status_count, True

                    # Not needed for raspberry pi
                    if request_count % 100 == 0:
                        logger.info(
                            f'Sleeping 20 seconds at {request_count} requests.'
                        )
                        sleep(20)
            index += 1

        return request_count, status_count, True

    def pull_trends(self, trends):
        total_status_count = 0
        last_write = 0
        total_request_count = 0
        trend_count = len(trends)
        for i, trend in enumerate(trends):
            completed = False
            while not completed:
                request_count, status_count, completed = self.pull_trend(
                    trend, trend_count, i + 1)
                total_request_count += request_count
                total_status_count += status_count
                self.db.tag_history.append(trend)
                if total_request_count >= last_write + 20:
                    self.write_data()
                    self.batch_id += 1
                    last_write = total_request_count
        return total_status_count

    def load_metadata(self):
        # Context managers ensure the metadata files are closed even on error.
        with open('metadata/boring.txt', 'r') as f:
            for line in f:
                self.boring_words[line.rstrip()] = 1

        with open('metadata/banned_tags.txt', 'r') as f:
            for line in f:
                self.banned_tags[line.rstrip()] = 1

    def cleanup_exit(self):
        logger.info("Deleting batch %i" % self.batch_id)
        self.db.delete_batch(self.batch_id)
        self.db.disconnect()
        exit(1)

    def twitter_search(self, q, sinceid, maxid):
        self.current_token += 1
        if self.current_token >= len(self.twitters):
            self.current_token = 0
        if maxid is None:
            result = self.twitters[self.current_token].GetSearch(
                term=q,
                result_type='recent',
                count='100',
                include_entities='false',
                since_id=sinceid)
        else:
            result = self.twitters[self.current_token].GetSearch(
                term=q,
                result_type='recent',
                count='100',
                include_entities='false',
                since_id=sinceid,
                max_id=maxid)
        return result

    # def oauthReq(self, url, key, secret, http_method="GET", post_body='',
    #              http_headers=None):
    #     consumer = oauth.Consumer(key=self.CONSUMER_KEY, secret=self.CONSUMER_SECRET)
    #     if self.hashtag == 'home_timeline':
    #         token = self.db.getDefaultToken()
    #     else:
    #         token = self.db.getNextToken()
    #     client = oauth.Client(consumer, token)
    #     resp, content = client.request(
    #         url,
    #         method=http_method,
    #         body=post_body,
    #         headers=http_headers  # , force_auth_header=True
    #     )
    #     #	print "*** %s ***" % content
    #     #	exit()
    #     return content

    def write_data(self):
        logger.info("Writing data.")

        for tweetdate, stats in self.data.items():
            logger.info("Saving data for %s." % tweetdate)
            for tag, words in self.data[tweetdate]['tag_words'].items():
                for i, v in words.items():
                    self.db.write_hashtag_word(tweetdate, tag, i, v)

            for tweeter, words in self.data[tweetdate]['tweeter_words'].items(
            ):
                for i, v in words.items():
                    self.db.write_tweeter_word(tweetdate, tweeter, i, v)

            for tweeter, words in self.data[tweetdate][
                    'tweeter_mentions'].items():
                for i, v in words.items():
                    self.db.write_tweeter_mention(tweetdate, tweeter, i, v)

            for tag, count in self.data[tweetdate]['tags'].items():
                self.db.write_tag(tweetdate, tag, count)

            for tag, tweeters in self.data[tweetdate]['tag_tweeters'].items():
                for i, v in tweeters.items():
                    self.db.write_tag_tweeter(tweetdate, tag, i, v)

            for tag, tags in self.data[tweetdate]['tag_tags'].items():
                for i, v in tags.items():
                    self.db.write_tag_tag(tweetdate, tag, i, v)

        self.db.write_tweets()
        self.db.write_tag_history()

        self.db.commit()
        logger.info("Data saved.")
        self.data = {}

    @staticmethod
    def save_score_log(score_log, trend, reject_reason):
        filename = "log/reject_%s_%s_%s.log" % (trend, reject_reason,
                                                file_timestamp())
        secommon.save_list(score_log, filename)

    def calculate_name_score(self, status, trend):
        tweeter = status.user.screen_name
        tweeter_name = status.user.name
        score_candidate = (tweeter_name if status.retweeted_status is None else
                           status.retweeted_status.user.name)
        score_candidate_sn = (tweeter if status.retweeted_status is None else
                              status.retweeted_status.user.screen_name)
        trend.name_scores[-1].status_count += 1
        # self.ns_tweet_count[self.ns_index] += 1
        name_score = self.db.get_name_score(score_candidate,
                                            score_candidate_sn,
                                            status.user.location,
                                            status.user.time_zone)
        # self.ns_total_score[self.ns_index] += name_score
        trend.name_scores[-1].total_score += name_score
        score3 = '{:.2f}'.format(trend.get_average_score(3))
        score6 = '{:.2f}'.format(trend.get_average_score(6))
        self.ns_score_log.append([
            score_candidate, score_candidate_sn,
            trend.name_scores[-1].status_count, name_score,
            trend.name_scores[-1].total_score, score3, score6, tweeter_name,
            tweeter, status.id
        ])

    def process_status_words(self, status_id, status_date, status_text,
                             tweeter):
        if status_date not in self.data:
            self.data[status_date] = {}
            self.data[status_date]['tweeter_mentions'] = {}
            self.data[status_date]['tag_words'] = {}
            self.data[status_date]['tweeter_words'] = {}
            self.data[status_date]['tags'] = {}
            self.data[status_date]['tag_tweeters'] = {}
            self.data[status_date]['tag_tags'] = {}

        # get all relevant hashtags
        relevant_hashtags = re.findall(r'(?<![A-Za-z0-9_])#([A-Za-z0-9_]+)',
                                       status_text.lower())

        tweet_hashtags = set(self.hash_tags_re.findall(status_text))
        tweet_tags = [ht[1:] for ht in tweet_hashtags]

        for tag in tweet_tags:
            if tag in self.data[status_date]['tags']:
                self.data[status_date]['tags'][tag] += 1
            else:
                self.data[status_date]['tags'][tag] = 1

            if tag not in self.data[status_date]['tag_tweeters']:
                self.data[status_date]['tag_tweeters'][tag] = {}
            if tweeter in self.data[status_date]['tag_tweeters'][tag]:
                self.data[status_date]['tag_tweeters'][tag][tweeter] += 1
            else:
                self.data[status_date]['tag_tweeters'][tag][tweeter] = 1

            if tag not in self.data[status_date]['tag_tags']:
                self.data[status_date]['tag_tags'][tag] = {}
            for tag2 in tweet_tags:
                if tag2 != tag:
                    if tag2 in self.data[status_date]['tag_tags'][tag]:
                        self.data[status_date]['tag_tags'][tag][tag2] += 1
                    else:
                        self.data[status_date]['tag_tags'][tag][tag2] = 1

        # remove links
        text = re.sub(r"(?<![A-Za-z0-9_])https?://[^ ,;'()\[\]<>{}]+",
                      '',
                      status_text,
                      flags=re.IGNORECASE)

        alist = re.split(r"[, .;'\"(){}\[\]<>:?/=+\\`~!#^&*\r\n-]+", text)
        tweetwords = list()
        for item in alist:
            nitem = item.strip(' ,.-+()[]:\'\"').lower()
            if u"\u2026" in nitem:  # ignore words truncated with ellipsis (...)
                continue
            if nitem == '':
                continue
            if nitem in self.boring_words:
                continue
            if nitem[:1] == '@' and len(nitem) > 2:
                # Tweeter mentions
                if tweeter not in self.data[status_date]['tweeter_mentions']:
                    self.data[status_date]['tweeter_mentions'][tweeter] = {}
                if nitem[1:] in self.data[status_date]['tweeter_mentions'][
                        tweeter]:
                    self.data[status_date]['tweeter_mentions'][tweeter][
                        nitem[1:]] += 1
                else:
                    self.data[status_date]['tweeter_mentions'][tweeter][
                        nitem[1:]] = 1
                continue

            tweetwords.append(nitem)
            for tag in relevant_hashtags:
                if tag not in self.data[status_date]['tag_words']:
                    self.data[status_date]['tag_words'][tag] = {}
                if nitem in self.data[status_date]['tag_words'][tag]:
                    self.data[status_date]['tag_words'][tag][nitem] += 1
                else:
                    self.data[status_date]['tag_words'][tag][nitem] = 1

            # Tweeter words
            if tweeter.lower() in self.db.rated_tweeters:
                if tweeter not in self.data[status_date]['tweeter_words']:
                    self.data[status_date]['tweeter_words'][tweeter] = {}
                if nitem in self.data[status_date]['tweeter_words'][tweeter]:
                    self.data[status_date]['tweeter_words'][tweeter][
                        nitem] += 1
                else:
                    self.data[status_date]['tweeter_words'][tweeter][nitem] = 1

        tweet_words_text = u'~' + u'~'.join([
            self.db.get_word_skey(x, self.date)[1]
            for x in sorted(set(tweetwords))
        ]) + u'~'
        self.db.update_tweet_words(status_id, tweet_words_text)

    def get_words(self, statuses, trend=None, source=None):
        # max_id = 0
        # min_id = MAX_STATUS_ID
        for status in statuses:
            # max_id = max(status.id, max_id)
            # min_id = min(status.id, min_id)

            tweeter = status.user.screen_name
            tweeter_name = status.user.name
            tweeter_created_at = self.env.get_local_date(
                status.user.created_at)
            tweeter_skey = self.db.get_tweeter_skey(
                screen_name=tweeter,
                name=tweeter_name,
                followers_count=status.user.followers_count,
                friends_count=status.user.friends_count,
                lang=status.user.lang,
                time_zone=status.user.time_zone,
                verified=status.user.verified,
                statuses_count=status.user.statuses_count,
                profile_image_url=status.user.profile_image_url,
                created_at=tweeter_created_at,
                location=status.user.location)

            tweet_text = status.text
            retweet_id = status.retweeted_status.id if status.retweeted_status is not None else 0

            if retweet_id != 0:
                tweet_text = "RT " + status.retweeted_status.user.screen_name + ": " + \
                             status.retweeted_status.text

            if self.score_names:
                self.calculate_name_score(status, trend)

            if status.user.followers_count > 0 and tweeter.lower(
            ) in self.db.rated_tweeters:
                self.db.write_daily_followers(tweeter_skey, self.today_skey,
                                              status.user.followers_count)

            retweet_created_at = ''
            retweet_screen_name = ''
            retweet_count = status.retweet_count
            if status.retweeted_status is not None:
                self.retweets.append(status.retweeted_status)

                retweet_created_at = self.env.get_local_timestamp(
                    status.retweeted_status.created_at)
                retweet_screen_name = status.retweeted_status.user.screen_name

            # check if duplicate and insert if not duplicate
            status_date = self.env.get_local_date(status.created_at)
            status_created_at = self.env.get_local_timestamp(status.created_at)
            date_skey = self.db.get_date_skey(status_date)
            if self.db.tweet_is_duplicate(
                    id_=status.id,
                    created_at=status_created_at,
                    screen_name=status.user.screen_name,
                    text=tweet_text,
                    tweeter_skey=tweeter_skey,
                    retweet_count=retweet_count,
                    in_reply_to_status_id=status.in_reply_to_status_id,
                    date_skey=date_skey,
                    retweet_id=retweet_id,
                    retweet_created_at=retweet_created_at,
                    retweet_screen_name=retweet_screen_name,
                    batch_id=self.batch_id,
                    source=source):
                self.t_skip += 1
                continue

            self.process_status_words(status_id=status.id,
                                      status_date=status_date,
                                      status_text=tweet_text,
                                      tweeter=tweeter)

    # @timeout(7)
    def pull_data(self, list_name):
        since_id = self.db.get_baseline_tweet_id()
        max_id = None
        all_statuses = []
        statuses = None
        while statuses is None or len(statuses) > 0:
            if list_name == 'home_timeline':
                statuses = self.twitters[
                    self.db_summary.default_token_index].GetHomeTimeline(
                        count=200,
                        since_id=since_id,
                        max_id=max_id,
                        include_entities=False)
            else:
                statuses = self.twitters[
                    self.db_summary.polling_token_index].GetListTimeline(
                        owner_screen_name=self.env.polling_account,
                        slug=list_name,
                        count=200,
                        since_id=since_id,
                        max_id=max_id,
                        include_entities=False)

            if len(statuses) > 0:
                self.get_words(statuses, source=list_name)
                all_statuses.extend(statuses)
                max_id = statuses[-1].id - 1
                logger.info('{}  {}'.format(statuses[-1].id, len(statuses)))

        if len(all_statuses) > 0:
            max_id = max([status.id for status in all_statuses])
            min_id = min([status.id for status in all_statuses])
            self.db.write_list_max_id(list_name, max_id, min_id)

        return len(all_statuses)
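
The Words class above does all of its work in the constructor: depending on the hashtag argument it pulls trends, the home timeline, or list timelines, writes the results, and disconnects. A minimal usage sketch inferred from that constructor (the mode strings are the three branches it checks; everything else here mirrors the surrounding examples):

environment = defaults.get_environment()

# Each construction runs a complete pull-and-write job as a side effect.
Words(environment, 'trends', None)         # derive tags from stored trend results and score them
Words(environment, 'home_timeline', None)  # pull the default account's home timeline
Words(environment, 'lists', None)          # pull every list owned by the polling account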
Example #9
def main():
    env = defaults.get_environment()
    db = DB(env, today())
    # db = DB(env, '2018-12-25')
    promotion = Promotion()

    demotedate_c = (datetime.date.today() -
                    timedelta(days=30)).strftime('%Y-%m-%d')
    demotedate_d = (datetime.date.today() -
                    timedelta(days=90)).strftime('%Y-%m-%d')

    # Promote to C
    logger.info("Tweeter Promotion  %s" % today())

    rows = db.get_tweeter_promotion_stats()

    # If a person has more than POWER_TWEEP followers and is negative, mark them
    # as category F. We can't do this for all tweeps because we would get too many
    # category F's, and we don't want to waste resources storing tweeps we may
    # never encounter again.
    logger.info(f'Starting loop of {len(rows)} records.')
    row_count = len(rows)
    current_row = 0
    for screen_name, pos, neg, blocked, category, relevance_score, followers_count, name, location, time_zone in rows:
        tweeter = promotion.add(screen_name=screen_name,
                                name=name,
                                category=category,
                                relevance_score=relevance_score,
                                location=location,
                                time_zone=time_zone,
                                followers_count=followers_count)
        # if relevance_score is None:
        #     relevance_score = 0
        # adjustment = 0
        if blocked > 3 and blocked > pos and relevance_score <= -10:
            tweeter.new_category = 'B'
        elif neg > pos and (category is not None or relevance_score != 0
                            or followers_count >= POWER_TWEEP):
            if neg > 3:
                tweeter.adjust_score(-2)
            else:
                tweeter.adjust_score(-1)
        else:  # pos >= neg
            if pos > 3:
                tweeter.adjust_score(2)
            elif pos > 1:
                tweeter.adjust_score(1)
        current_row += 1
        if current_row % 100 == 0:
            logger.info(
                f'{current_row:4d}/{row_count} {category} {screen_name}')

    # Promote top tweeps
    logger.info('Promoting top tweeps.')
    db.c.execute('select screen_name from dim_tweeter where category <= ?',
                 ('C', ))
    rows = db.c.fetchall()
    famous = [row[0] for row in rows]

    trenders = rank_words(
        f'{env.bot_data_directory}/trenders_published_%s.txt', 7)
    non_famous = [trender for trender in trenders if trender not in famous]
    for screen_name in non_famous[:50]:
        tweeter = promotion.add(screen_name=screen_name)
        tweeter.adjust_score(1)

    logger.info('Saving changes.')
    promotion.save_all()

    # Demote from D
    db.set_tweeter_category_by_date(date_category_was_set=demotedate_d,
                                    current_category='D',
                                    new_category='E')

    # Demote from C
    db.set_tweeter_category_by_date(date_category_was_set=demotedate_c,
                                    current_category='C',
                                    new_category='D')

    db.disconnect()
Example #10
def main():
    # parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    # parser.add_argument('date', metavar='yyyy-mm-dd',
    #                     help='the date to process')
    #
    # args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, today())
    db_summary = DBSummary(environment)

    jdata = Publisher.get_pending(environment)

    # c = db_connect(env.summary_database)
    trenders_published = list()
    trenders_all = list()
    already_processed = list()
    if 'tweets' in jdata:
        for tweet in jdata['tweets']:
            tweet_status = db_summary.get_tweet_status(tweet['t_id'])
            if tweet_status is None:
                db_summary.save_tweet(tweet)
                for item in tweet['items']:
                    db_summary.save_tweet_item(tweet, item)
                    if tweet['type'] == 'trenders' and item['selected'] == 'Y':
                        trenders_all.append(item['tweet_text'][1:])
                        if tweet['status'] == 'pend-post':
                            trenders_published.append(item['tweet_text'][1:])
            elif tweet_status in ['posted', 'rejected']:
                tweet['status'] = tweet_status
                already_processed.append(tweet)

        if len(trenders_published) > 0:
            with open(
                    f'{environment.bot_data_directory}/trenders_all_{yesterday_file()}.txt',
                    'a') as f:
                for sn in trenders_all:
                    f.write("%s\n" % sn)
            with open(
                    f'{environment.bot_data_directory}/trenders_published_{yesterday_file()}.txt',
                    'a') as f:
                for sn in trenders_published:
                    f.write("%s\n" % sn)

    db_summary.disconnect()

    trend_date = now()
    # now = now()
    # yesterday = (datetime.datetime.now() - timedelta(days=2)).strftime('%Y-%m-%d')
    if 'trends' in jdata:
        if len(jdata['trends']) > 0:
            # c = db_connect()

            # t = (yesterday,)
            # c.execute('SELECT max(max_id) max_id FROM tag_history where date <= ?', t)
            # row = c.fetchone()
            # max_id = 0
            # if row != None:
            #     max_id = row[0]

            relevant_words = db.get_relevant_words()

            generic_words = db.get_generic_words()

            trs = list()
            for trend in jdata['trends']:
                tag = '#' + trend['hashtag'].lower()
                tr = {
                    'hashtag': trend['hashtag'],
                    'status': 'posted',
                    'trend_at': trend_date
                }
                trs.append(tr)
                tag_discovery_result = db.get_tag_discovery_result(tag)
                status = nvl(tag_discovery_result, 'NONE')

                if trend['status'] == 'pend-post' and status in ('NONE',
                                                                 'AUTO_DEL',
                                                                 'MAN_DEL'):
                    logger.info('Adding: ' + tag)
                    db.save_tag_discovery(tag, 'MAN_ADD')
                elif trend['status'] == 'pend-del' and status in ('AUTO_ADD',
                                                                  'MAN_ADD'):
                    logger.info('Deleting: ' + tag)
                    db.save_tag_discovery(tag, 'MAN_DEL')

                # Trend relevance
                if 'relevance' in trend:
                    relevance = relevant_words.get(trend['hashtag'].lower(), 'neutral')
                    if trend['relevance'] != relevance:
                        new_relevance = None if trend['relevance'] == 'neutral' else trend['relevance']
                        db.set_word_relevance(trend['hashtag'], new_relevance)

                # Trend generic
                if 'generic' in trend:
                    generic = generic_words.get(trend['hashtag'].lower(), '')
                    if trend['generic'] != generic:
                        new_generic = None if trend['generic'] == 'neutral' else trend['generic']
                        db.set_word_generic(trend['hashtag'], new_generic)

            data = {'trends': trs}
            Publisher.publish(environment, data, 'trends')

            db.commit()
            open(f'{environment.temp_file_directory}/compute_daily',
                 'a').close()

    if 'categories' in jdata:
        if len(jdata['categories']) > 0:
            for cat in jdata['categories']:
                db.set_tweeter_category(cat['screen_name'], cat['category'])
                logger.info("Category for", cat['screen_name'], "changed to",
                            cat['category'])

            db.commit()

    if 'words' in jdata:
        if len(jdata['words']) > 0:
            for word in jdata['words']:
                category = word['category']
                if category == '':
                    category = None
                db.set_word_generic(word['word'], category)
                logger.info("Generic for", word['word'], "changed to",
                            category)

    db.disconnect()

    if len(already_processed) > 0:
        data = {'tweets': already_processed}
        Publisher.publish(environment, data, 'posted')
Example #11
def main():
    parser = argparse.ArgumentParser(description='Draft stats for the given day and push to cloud for approval.')
    parser.add_argument('date', metavar='yyyy-mm-dd',
                        help='the date to process')

    args = parser.parse_args()

    environment = defaults.get_environment()
    db = DB(environment, args.date)
    db_summary = DBSummary(environment)

    date_skey = db.get_date_skey(args.date)

    actions = list()
    action = {'type': 'trends'}
    actions.append(action)
    action = {'type': 'mentions'}
    actions.append(action)

    action_ind = 0
    tweets = list()
    stat_tweet_count = 0
    while action_ind < len(actions):
        action = actions[action_ind]['type']
        # tweeters = None
        stats = Stats(args.date, action, db, actions, environment,
                      actions[action_ind]['trend'] if action == 'trenders' else None)

        i = 100
        is_tweetable = True

        if action == "trenders":
            tweet = stats.write_tweet(i)
            if not stats.is_trenders_tweet_postable(tweet) or stat_tweet_count >= DAILY_STAT_TWEET_LIMIT:
                is_tweetable = False
        elif action == "trends":
            tweet = stats.write_tweet(i)
        elif action == "mentions":
            tweet = stats.write_tweet(i)

        if is_tweetable:
            db_summary.save_tweet(tweet)
            stat_tweet_count += 1

        if tweet is not None:
            tweets.append(tweet)
        if len(tweets) >= 2:
            data = {'tweets': tweets, 'date': args.date}
            Publisher.publish(environment, data, 'draft')
            tweets = list()
            time.sleep(10)
        action_ind += 1

    db_summary.disconnect()

    # Now get app metrics
    rows = db.get_tweeter_category_counts()

    metric_dict = {'date': args.date, 'other': 0}
    for cat, count in rows:
        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict[cat] = count
        else:
            metric_dict['other'] += count

    # Get count of total tweets and tweets by category
    rows = db.get_tweeter_category_tweet_counts(date_skey)

    metric_dict['tweets_total'] = 0
    metric_dict['tweets_other'] = 0
    for cat, count in rows:
        metric_dict['tweets_total'] += count

        if cat is None:
            cat = ' '
        if cat in ('A', 'B', 'C', 'D', 'E', 'F', 'R', ' '):
            metric_dict['tweets_' + cat] = count
        else:
            metric_dict['tweets_other'] += count

    # Add file sizes
    metric_dict['fact_db_size'] = os.path.getsize(environment.database)
    metric_dict['dim_db_size'] = os.path.getsize(environment.dimension_database)
    metric_dict['summ_db_size'] = os.path.getsize(environment.summary_database)

    followers_count = db.get_tweeter_followers_count('pakpolstats')
    metric_dict['account_followers'] = followers_count

    data = {'tweets': tweets, 'metrics': metric_dict, 'date': args.date}
    Publisher.publish(environment, data, 'draft')
Example #12
    def __init__(self, environment, old_date):
        self.environment = environment
        archive_file = environment.dimension_database_old.format(
            last_month().replace('-', '_'))
        if os.path.isfile(archive_file):
            logger.warning('Archive file already exists.')
            exit(0)
            # raise ArchiveFileExistsError()

        self.old_db = sqlite3.connect(environment.dimension_database)
        self.old_c = self.old_db.cursor()
        self.old_c.execute('ATTACH DATABASE ? AS se', (environment.database, ))

        date = None
        try:
            self.old_c.execute('select min_date from db_baseline')
            date = self.old_c.fetchone()[0]
        except sqlite3.OperationalError:
            logger.critical('Could not read date.')
            exit(1)

        logger.info('Date: %s', date)

        self.old_db = DB(environment, old_date)

        # Don't connect to the new DB via the regular function; the global connection in db.py should remain the main DB.
        conn2 = sqlite3.connect(environment.dimension_database_temp)
        self.new_c = conn2.cursor()

        db_copy = DBCopy(self.old_c, self.new_c)

        if date == 'empty':
            self.old_db.disconnect()
            conn2.commit()
            conn2.close()
            return

        date_dt = datetime.strptime(date, '%Y-%m-%d')
        recent_date = (date_dt - timedelta(days=30)).strftime('%Y-%m-%d')
        twitter_dt = datetime.strptime('2006-03-21 09:00:00',
                                       '%Y-%m-%d %H:%M:%S')
        future_date = (date_dt + timedelta(days=2000)).strftime('%Y-%m-%d')

        logger.info('Recent date: %s', recent_date)

        logger.info("DIM_DATE")
        cnt = 0
        dt = twitter_dt.strftime('%Y-%m-%d')
        i = 1
        while dt < future_date:
            cnt += 1
            # print "Date: ", i, dt
            t = (i, dt)
            self.new_c.execute(
                'insert into dim_date (date_skey, date) values (?, ?)', t)
            dt = (twitter_dt + timedelta(days=i)).strftime('%Y-%m-%d')
            i += 1
        logger.info('%d rows', cnt)

        # DIM_TWEETER
        t = (recent_date, )
        sql = """select {}
        from dim_tweeter 
        where category is not null
        or ifnull(relevance_score, 0) != 0
        or ifnull(bot_date, '2000-01-01') >= ? 
        or tweeter_skey in (select tweeter_skey from fact_daily_followers
            union select tweeter_skey from fact_daily_hashtag_tweeter
            union select tweeter_skey from fact_daily_tweeter_mention
            union select mentioned_tweeter_skey from fact_daily_tweeter_mention
            union select tweeter_skey from fact_daily_tweeter_word)
        """
        db_copy.copy_table('dim_tweeter', sql, t)

        # DIM_WORD
        # Word skeys were getting very large because we only carried the new day's
        # words forward, so many words that weren't used in the few hours of the
        # new day were recreated with new skeys. Using yesterday will slow the
        # skey generation.
        sql = """select {}
        from dim_word
        where generic is not null or relevance is not null
        or word_skey in (
            select word_skey from fact_daily_hashtag
            union select tag_skey from fact_daily_hashtag_hashtag
            union select other_tag_skey from fact_daily_hashtag_hashtag
            union select tag_skey from fact_daily_hashtag_tweeter
            union select tag_skey from fact_daily_hashtag_word
            union select word_skey from fact_daily_hashtag_word
            union select word_skey from fact_daily_tweeter_word)"""
        db_copy.copy_table('dim_word', sql, None)

        self.old_db.disconnect()
        conn2.commit()
        conn2.close()

        db_copy.switch_files(current_file=environment.dimension_database,
                             archive_file=archive_file,
                             new_file=environment.dimension_database_temp)
Example #13
    def __init__(self, environment, date):
        self.environment = environment
        old_date = yesterday(date)
        archive_file = environment.database_old.format(
            old_date.replace('-', '_'))
        if os.path.isfile(archive_file):
            raise ArchiveFileExistsError()

        self.old_db = DB(environment, old_date)

        # Don't connect to the new DB via the regular function; the global connection in db.py should remain the main DB.
        conn2 = sqlite3.connect(environment.database_temp)
        self.new_c = conn2.cursor()

        db_copy = DBCopy(self.old_db.c, self.new_c)

        if date == 'empty':
            self.old_db.disconnect()
            conn2.commit()
            conn2.close()
            return

        date_dt = datetime.strptime(date, '%Y-%m-%d')
        three_days_ago = (date_dt - timedelta(days=3)).strftime('%Y-%m-%d')
        recent_date = (date_dt - timedelta(days=7)).strftime('%Y-%m-%d')

        logger.info('Recent date: [' + recent_date + ']')

        date_skey = self.old_db.get_date_skey(date)

        # Special case for db_baseline
        baseline_id = 0
        logger.info('db_baseline')
        cnt = 0
        sql = """select max(id) from fact_status where date_skey < ?"""
        t = (date_skey, )
        self.old_db.c.execute(sql, t)
        row = self.old_db.c.fetchone()
        if row is not None:
            cnt += 1
            baseline_id = row[0]
            t = (row[0], date)
            self.new_c.execute(
                'insert into db_baseline (min_tweet_id, min_date) values (?, ?)',
                t)
        logger.info(f'{cnt} rows')

        # The remaining tables
        sql = """select {} from tag_history th
                 where max_id >= ?"""
        t = (baseline_id, )
        db_copy.copy_table('tag_history', sql, t)

        sql = """select {} from (select d.*
            from tag_discovery d
            left join tag_score s
            on d.tag = s.tag
            group by d.tag, d.result, d.discovery_time
            having discovery_time >= ? or sum(s.tweet_count) > ?)"""
        t = (three_days_ago, 50)
        db_copy.copy_table('tag_discovery', sql, t)

        sql = """select {}
            from tag_score
            where score_time >= ?"""
        t = (three_days_ago, )
        db_copy.copy_table('tag_score', sql, t)

        # FACT_DAILY_FOLLOWERS
        sql = 'select {} from fact_daily_followers where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_followers', sql, t)

        # FACT_DAILY_HASHTAG
        sql = 'select {} from fact_daily_hashtag where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_hashtag', sql, t)

        sql = 'select {} from fact_daily_hashtag_hashtag where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_hashtag_hashtag', sql, t)

        sql = 'select {} from fact_daily_hashtag_tweeter where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_hashtag_tweeter', sql, t)

        sql = 'select {} from fact_daily_hashtag_word where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_hashtag_word', sql, t)

        sql = """select {}
            from fact_daily_tweeter_mention where date_skey >= ?"""
        t = (date_skey, )
        db_copy.copy_table('fact_daily_tweeter_mention', sql, t)

        sql = 'select {} from fact_daily_tweeter_word where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_daily_tweeter_word', sql, t)

        # FACT_STATUS RETWEETED
        sql = 'select {} from fact_status where date_skey = ? and retweeted is not null'
        t = (date_skey - 1, )
        db_copy.copy_table('fact_status', sql, t)

        # FACT_STATUS
        sql = 'select {} from fact_status where date_skey >= ?'
        t = (date_skey, )
        db_copy.copy_table('fact_status', sql, t)

        self.old_db.disconnect()
        conn2.commit()
        conn2.close()

        db_copy.switch_files(current_file=environment.database,
                             archive_file=archive_file,
                             new_file=environment.database_temp)
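
Examples #12 and #13 hand DBCopy.copy_table a SQL template with a {} placeholder plus bind parameters. DBCopy itself is not shown in these examples; judging from the call sites, the placeholder is filled with the table's column list so the same template can drive both the select from the old database and the insert into the new one. Below is a hypothetical sketch of such a helper, purely to illustrate the calling convention; it is an assumption, not the project's actual implementation.

class DBCopy:
    def __init__(self, old_cursor, new_cursor):
        # Cursors into the source and destination SQLite databases.
        self.old_c = old_cursor
        self.new_c = new_cursor

    def copy_table(self, table, sql_template, params):
        # Hypothetical: read the column names from the source table,
        # then fill the caller's '{}' placeholder with them.
        self.old_c.execute('select * from %s limit 0' % table)
        columns = ', '.join(d[0] for d in self.old_c.description)
        self.old_c.execute(sql_template.format(columns),
                           params if params is not None else ())
        rows = self.old_c.fetchall()
        # Insert the selected rows into the same table in the destination DB.
        placeholders = ', '.join('?' for _ in columns.split(', '))
        self.new_c.executemany(
            'insert into %s (%s) values (%s)' % (table, columns, placeholders),
            rows)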