def load_data(mabed_file):
    """Return the events from *mabed_file* paired with their matching tweets.

    The result is cached on disk: if the cache file derived from
    ``TMP_EVENT_DATA_TEMPLATE`` already exists it is unpickled and returned
    directly; otherwise the events are read from the JSON file, matched
    against stored tweets, and the result is pickled to the cache file before
    being returned.

    A tweet matches an event when strictly more than 20% of the event's
    words (main words plus related words) occur in ``tweet.text``, tested
    with the ``in`` operator.

    Args:
        mabed_file: Path to a JSON file holding a sequence of events. Each
            event is read for the keys ``start_date``, ``end_date``,
            ``main_words`` and ``related_words`` (the latter a sequence of
            mappings with a ``word`` key).

    Returns:
        A list with one dict per event: ``{'event': <event>,
        'matched_tweets': [...]}``, where every matched tweet is a dict with
        the keys ``main_word``, ``related_word`` and ``tweet_id``.

    Raises:
        SystemExit: With status 1 when the JSON file holds no events.
    """
    mabed_filename = get_file_name(mabed_file)
    # NOTE(review): get_file_name is applied a second time here. Kept as-is
    # so cache files written by earlier runs keep resolving to the same path.
    tmp_file = TMP_EVENT_DATA_TEMPLATE % get_file_name(mabed_filename)

    if os.path.isfile(tmp_file):
        # NOTE(security): unpickling can execute arbitrary code; this is only
        # safe as long as the cache location is trusted.
        with open(tmp_file, 'rb') as tmp_result:
            return pickle.load(tmp_result)

    with open(mabed_file, 'r') as events_in:
        events = json.load(events_in)

    if not events:
        sys.exit(1)

    # The tweet query window is extended 61 days (in seconds) past end_date.
    window_extension = 86400 * 61

    event_results = []
    for event_no, event in enumerate(events):
        print('event number %s ' % event_no)
        start_ts = _get_timestamp_from_date(event['start_date'])
        end_ts = _get_timestamp_from_date(event['end_date'])
        tweets = Tweet.objects(
            created_at__gte=datetime.datetime.fromtimestamp(start_ts),
            created_at__lte=datetime.datetime.fromtimestamp(
                end_ts + window_extension))

        # Everything below depends only on the event, so compute it once
        # instead of once per tweet.
        main_words = event['main_words']
        related_words = [related['word'] for related in event['related_words']]
        min_hits = (len(main_words) + len(related_words)) * 0.2

        matched_tweets = []
        for tweet in tweets:
            text = tweet.text
            main_words_found = [word for word in main_words if word in text]
            related_words_found = [
                word for word in related_words if word in text
            ]

            # Strict comparison: exactly 20% of the words is not enough.
            if len(main_words_found) + len(related_words_found) > min_hits:
                matched_tweets.append({
                    'main_word': main_words_found,
                    'related_word': related_words_found,
                    'tweet_id': tweet.twitter_id,
                })

        event_results.append({
            'event': event,
            'matched_tweets': matched_tweets,
        })

    with open(tmp_file, 'wb') as tmp_result:
        pickle.dump(event_results, tmp_result)

    return event_results
Ejemplo n.º 2
0
def menu():
    click.clear()
    print("What do you want to do now?")
    print("\t1 - Extract tweets")
    print("\t2 - Generate rank of existing tweets (if existing)")
    print("\t9 - Exit")

    click.echo(
        f"\n{click.style('Warning:', fg='black', bg='red', bold=True)} some options are destructive and will lead to unexpected results. (e.g. overwrite existing tweets)")
    option = click.prompt("Enter the number of the option: ", confirmation_prompt=True,
                          type=click.Choice(['1', '2', '9'], False))

    if option == '1':
        query = click.prompt("Query for the tweets, or hashtag", type=str)
        with_retweets = click.prompt("Include retweets?", type=click.Choice(['Yes', 'No'], False), default='No')
        if with_retweets.lower() == 'no':
            query += " -filter:retweets"

        print("There are 4 different modes for the extraction: (3 of them for movies and shows)")
        print(" - Strict mode: Only include each preference if there is only one result from The Movie Database.")
        print(" - Order by popularity: Only include the first result ordered by popularity.")
        print(" - TMDB: Include the first result based on TMBD criteria.")
        print(" - Keep user text: Do not use The Movie Database service. Useful for lists that aren't movies or shows.")

        mode = click.prompt("Select the mode",
                            type=click.Choice(['strict', 'order_by_popularity', 'tmdb', 'keep_user_text'], False))
        max_tweets = click.prompt("How many tweets do you want to process? (Not final amount of extracted tweets)",
                                  type=int)
        tweets_extractor.extract(query, mode, max_tweets)
    elif option == '2':
        n_ranking = click.prompt("How many results should be in the ranking?", type=int)
        n_samples = click.prompt("How many  of the top preferences should each tweet have to be considered?", type=int)

        votes = {}

        for tweet in Tweet.objects():
            for i in range(len(tweet.preferences)):
                points = n_samples - i
                if points < 1:
                    break
                votes[tweet.preferences[i]] = votes.get(tweet.preferences[i], 0) + points

        print({k: v for k, v in sorted(votes.items(), key=lambda item: item[1], reverse=True)})

        """pipeline = [