# A tweet matches an event when the number of event words found in its text
# is strictly greater than this fraction of the event's total word count
# (main words + related words).
_MATCH_RATIO = 0.2
# The tweet search window extends this many days past the event's end date.
_LOOKAHEAD_DAYS = 61
_SECONDS_PER_DAY = 86400


def _find_words(words, text):
    """Return the entries of *words* that occur as substrings of *text*."""
    return [word for word in words if word in text]


def _match_event_tweets(event):
    """Return one match record per stored tweet that fits *event*.

    Tweets are taken from ``Tweet.objects`` restricted to ``created_at``
    values between the event's start date and its end date plus
    ``_LOOKAHEAD_DAYS`` days. A tweet is kept when the count of event words
    contained in ``tweet.text`` exceeds ``_MATCH_RATIO`` of the event's
    total word count.
    """
    start_ts = _get_timestamp_from_date(event['start_date'])
    end_ts = _get_timestamp_from_date(event['end_date'])
    window_end_ts = end_ts + _SECONDS_PER_DAY * _LOOKAHEAD_DAYS
    tweets = Tweet.objects(
        created_at__gte=datetime.datetime.fromtimestamp(start_ts),
        created_at__lte=datetime.datetime.fromtimestamp(window_end_ts))

    # Everything below depends only on the event, so compute it once instead
    # of once per tweet.
    main_words = event['main_words']
    related_words = [related['word'] for related in event['related_words']]
    threshold = (len(main_words) + len(related_words)) * _MATCH_RATIO

    matched_tweets = []
    for tweet in tweets:
        text = tweet.text
        main_words_found = _find_words(main_words, text)
        related_words_found = _find_words(related_words, text)
        if threshold < len(main_words_found) + len(related_words_found):
            matched_tweets.append({
                'main_word': main_words_found,
                'related_word': related_words_found,
                'tweet_id': tweet.twitter_id
            })
    return matched_tweets


def load_data(mabed_file):
    """Load events from *mabed_file* and attach the stored tweets matching them.

    Results are cached as a pickle at ``TMP_EVENT_DATA_TEMPLATE`` formatted
    with the file name; when that cache file exists it is returned directly
    and *mabed_file* is not read.

    Args:
        mabed_file: Path to a JSON file holding a list of events. Each event
            is read for ``start_date``, ``end_date``, ``main_words`` and
            ``related_words`` (whose items carry a ``word`` key).

    Returns:
        A list with one dict per event: ``{'event': <event>,
        'matched_tweets': [...]}``, where every matched tweet is a dict with
        the keys ``main_word``, ``related_word`` and ``tweet_id``.

    Raises:
        SystemExit: With exit code 1 when the JSON file contains no events.
    """
    # NOTE(review): get_file_name is applied twice here; kept as-is so the
    # cache path stays identical to the one existing cache files were
    # written under.
    mabed_filename = get_file_name(mabed_file)
    tmp_file = TMP_EVENT_DATA_TEMPLATE % get_file_name(mabed_filename)

    if os.path.isfile(tmp_file):
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # This is only safe while the cache location is writable by trusted
        # users alone.
        with open(tmp_file, 'rb') as tmp_result:
            return pickle.load(tmp_result)

    with open(mabed_file, 'r') as events_in:
        events = json.load(events_in)
    if not events:
        # Say why we are terminating instead of exiting silently.
        print('no events found in %s' % mabed_file, file=sys.stderr)
        sys.exit(1)

    event_results = []
    for event_no, event in enumerate(events):
        print('event number %s ' % event_no)
        event_results.append({
            'event': event,
            'matched_tweets': _match_event_tweets(event)
        })

    with open(tmp_file, 'wb') as tmp_result:
        pickle.dump(event_results, tmp_result)
    return event_results
def menu(): click.clear() print("What do you want to do now?") print("\t1 - Extract tweets") print("\t2 - Generate rank of existing tweets (if existing)") print("\t9 - Exit") click.echo( f"\n{click.style('Warning:', fg='black', bg='red', bold=True)} some options are destructive and will lead to unexpected results. (e.g. overwrite existing tweets)") option = click.prompt("Enter the number of the option: ", confirmation_prompt=True, type=click.Choice(['1', '2', '9'], False)) if option == '1': query = click.prompt("Query for the tweets, or hashtag", type=str) with_retweets = click.prompt("Include retweets?", type=click.Choice(['Yes', 'No'], False), default='No') if with_retweets.lower() == 'no': query += " -filter:retweets" print("There are 4 different modes for the extraction: (3 of them for movies and shows)") print(" - Strict mode: Only include each preference if there is only one result from The Movie Database.") print(" - Order by popularity: Only include the first result ordered by popularity.") print(" - TMDB: Include the first result based on TMBD criteria.") print(" - Keep user text: Do not use The Movie Database service. Useful for lists that aren't movies or shows.") mode = click.prompt("Select the mode", type=click.Choice(['strict', 'order_by_popularity', 'tmdb', 'keep_user_text'], False)) max_tweets = click.prompt("How many tweets do you want to process? 
(Not final amount of extracted tweets)", type=int) tweets_extractor.extract(query, mode, max_tweets) elif option == '2': n_ranking = click.prompt("How many results should be in the ranking?", type=int) n_samples = click.prompt("How many of the top preferences should each tweet have to be considered?", type=int) votes = {} for tweet in Tweet.objects(): for i in range(len(tweet.preferences)): points = n_samples - i if points < 1: break votes[tweet.preferences[i]] = votes.get(tweet.preferences[i], 0) + points print({k: v for k, v in sorted(votes.items(), key=lambda item: item[1], reverse=True)}) """pipeline = [